diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..0c4474f
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,17 @@
+# Changelog
+
+## Unreleased
+
+### Added
+
+- Added `raghilda.crawl`, including `CrawlScope`, `FetchedSource`,
+  `DirectoryCrawler`, `WebCrawler`, and `CloudflareCrawler`, for discovering
+  directory, web, and Cloudflare sources and converting them to markdown
+  documents.
+- Added `BaseStore.ingest()` and `IngestSummary` for bulk document ingestion
+  with optional document preparation, parallel writes, and inserted, replaced,
+  and skipped counts.
+
+### Fixed
+
+- Fixed sitemap URL extraction so each `<loc>` entry is collected as one URL.
diff --git a/great-docs.yml b/great-docs.yml
index 05245cf..ee074e1 100644
--- a/great-docs.yml
+++ b/great-docs.yml
@@ -75,6 +75,16 @@ reference:
       - name: store.OpenAIStore
       - name: store.PostgreSQLStore
 
+  - title: Crawl
+    desc: Crawlers for discovering and converting source documents
+    contents:
+      - crawl.CrawlScope
+      - crawl.FetchedSource
+      - crawl.BaseCrawler
+      - crawl.DirectoryCrawler
+      - crawl.WebCrawler
+      - crawl.CloudflareCrawler
+
   - title: Embedding
     desc: Embedding providers for generating vector representations
     contents:
diff --git a/src/raghilda/__init__.py b/src/raghilda/__init__.py
index b123fc6..768b9e8 100644
--- a/src/raghilda/__init__.py
+++ b/src/raghilda/__init__.py
@@ -1,6 +1,7 @@
-from . import embedding, store, types, chunk, chunker, document, read, scrape
+from . import crawl, embedding, store, types, chunk, chunker, document, read, scrape
 
 __all__ = [
+    "crawl",
     "embedding",
     "store",
     "types",
diff --git a/src/raghilda/_postgres_store.py b/src/raghilda/_postgres_store.py
index 6314122..6aadd29 100644
--- a/src/raghilda/_postgres_store.py
+++ b/src/raghilda/_postgres_store.py
@@ -1,5 +1,6 @@
 from ._store import BaseStore, WriteResult
 import json
+import threading
 from .embedding import EmbeddingProvider, EmbedInputType, embedding_from_config
 from .document import Document, ChunkedMarkdownDocument
 from .chunk import Chunk, MarkdownChunk, RetrievedChunk, Metric
@@ -137,6 +138,7 @@ def __init__(
         self.con = con
         self._metadata = metadata
         self._schema = psycopg2.extensions.quote_ident(schema, con)
+        self._ingest_upsert_lock = threading.Lock()
 
     def close(self) -> None:
         """Close the store's database connection."""
@@ -539,6 +541,13 @@ def upsert(
             replaced_document=replaced_document,
         )
 
+    def _ingest_upsert(
+        self,
+        document: Document,
+    ) -> WriteResult[ChunkedMarkdownDocument]:
+        with self._ingest_upsert_lock:  # one upsert at a time for ingest() workers
+            return self.upsert(document)
+
     def _load_document_snapshot(
         self, *, origin: str, text: str
     ) -> ChunkedMarkdownDocument:
diff --git a/src/raghilda/_store.py b/src/raghilda/_store.py
index e6fdd44..c4440a1 100644
--- a/src/raghilda/_store.py
+++ b/src/raghilda/_store.py
@@ -1,13 +1,16 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
+from concurrent.futures import FIRST_COMPLETED, CancelledError, ThreadPoolExecutor, wait
 from dataclasses import dataclass
-from typing import Generic, Literal, Sequence, TypeVar
+import threading
+from typing import Any, Callable, Generic, Iterable, Literal, Sequence, TypeVar
 
 from .chunk import RetrievedChunk
 from .document import Document
 
 TDocument = TypeVar("TDocument", bound=Document, covariant=True)
+_RECENT_INGEST_ORIGIN_WINDOW = 10_000
 
 
 @dataclass(frozen=True)
@@ -17,6 +20,13 @@ class WriteResult(Generic[TDocument]):
     replaced_document: TDocument | None = None
 
 
+@dataclass(frozen=True)
+class IngestSummary:
+    inserted: int  # number of write results whose action was "inserted"
+    replaced: int  # number of write results whose action was "replaced"
+    skipped: int  # number of write results whose action was "skipped"
+
+
 class BaseStore(ABC):
     """Abstract base class for vector stores.
 
@@ -77,6 +87,121 @@ def upsert(
         """
         pass
 
+    def _ingest_upsert(self, document: Document) -> WriteResult[Document]:
+        return self.upsert(document)  # write hook ingest() calls; stores may override
+
+    def ingest(
+        self,
+        documents: Iterable[Any],
+        *,
+        prepare: Callable[[Any], Document] | None = None,
+        max_workers: int = 1,
+    ) -> IngestSummary:
+        """Prepare and upsert a stream of documents.
+
+        Inputs are consumed lazily and submitted incrementally. After
+        ``prepare`` is applied, recent non-empty string origins are checked for
+        duplicates as the stream is consumed. Duplicate detection is best
+        effort: a duplicate raises ``ValueError`` when encountered, after any
+        writes already in flight complete. No rollback is attempted.
+
+        Parameters
+        ----------
+        documents
+            Items to ingest; at most ``max_workers`` are in flight at once.
+        prepare
+            Optional callable that turns each item into a ``Document``.
+        max_workers
+            Size of the worker thread pool; must be at least 1.
+
+        Returns
+        -------
+        IngestSummary
+            Aggregate counts for inserted, replaced, and skipped documents.
+            Call ``upsert()`` directly when per-document ``WriteResult`` values
+            are needed.
+        """
+        assert max_workers >= 1
+        stop_event = threading.Event()
+        recent_origins: dict[str, None] = {}
+        recent_origins_lock = threading.Lock()
+
+        def remember_origin(origin: str | None) -> None:
+            if not isinstance(origin, str) or not origin:
+                return
+            with recent_origins_lock:
+                if origin in recent_origins:
+                    raise ValueError(f"Duplicate origin during ingest: {origin}")
+                recent_origins[origin] = None
+                if len(recent_origins) > _RECENT_INGEST_ORIGIN_WINDOW:
+                    # dict preserves insertion order, so the first key is the oldest.
+                    recent_origins.pop(next(iter(recent_origins)))
+
+        def process_document(item: Any) -> WriteResult[Document]:
+            if stop_event.is_set():
+                raise CancelledError()
+            document = prepare(item) if prepare is not None else item
+            if stop_event.is_set():
+                raise CancelledError()
+            remember_origin(document.origin)
+            if stop_event.is_set():
+                raise CancelledError()
+            return self._ingest_upsert(document)
+
+        iterator = iter(documents)
+        pending = set()
+        inserted = replaced = skipped = 0
+        exhausted = False
+        executor = ThreadPoolExecutor(max_workers=max_workers)
+
+        def submit_more() -> None:
+            # Top the in-flight set back up to ``max_workers`` futures.
+            nonlocal exhausted
+            while not exhausted and len(pending) < max_workers:
+                try:
+                    item = next(iterator)
+                except StopIteration:
+                    exhausted = True
+                else:
+                    pending.add(executor.submit(process_document, item))
+
+        try:
+            submit_more()
+            while pending:
+                done, pending = wait(pending, return_when=FIRST_COMPLETED)
+                results, cancelled_errors, errors = [], [], []
+                for future in done:
+                    try:
+                        results.append(future.result())
+                    except CancelledError as exc:
+                        cancelled_errors.append(exc)
+                    except Exception as exc:
+                        errors.append(exc)
+                if errors:
+                    raise errors[0]
+                if cancelled_errors and not stop_event.is_set():
+                    raise cancelled_errors[0]
+                for result in results:
+                    if result.action == "inserted":
+                        inserted += 1
+                    elif result.action == "replaced":
+                        replaced += 1
+                    elif result.action == "skipped":
+                        skipped += 1
+                    else:
+                        raise ValueError(f"Unknown write action: {result.action}")
+                submit_more()
+        except BaseException:
+            # Catch BaseException so Ctrl-C/SystemExit also stop and join the workers.
+            stop_event.set()
+            for future in pending:
+                future.cancel()
+            executor.shutdown(wait=True, cancel_futures=True)
+            raise
+
+        executor.shutdown(wait=True, cancel_futures=False)
+        return IngestSummary(inserted=inserted, replaced=replaced, skipped=skipped)
+
     @abstractmethod
     def retrieve(
         self, text: str, top_k: int, *args, **kwargs
diff --git a/src/raghilda/crawl.py b/src/raghilda/crawl.py
new file mode 100644
index 0000000..feb361c
--- /dev/null
+++ b/src/raghilda/crawl.py
@@ -0,0 +1,2156 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from collections import deque
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+import hashlib
+import json
+import mimetypes
+import os
+from pathlib import Path
+import re
+import shutil
+import tempfile
+import time
+from typing import Any, Callable, Iterable, Iterator, Mapping, Sequence, TypeVar
+import threading
+import unicodedata
+from urllib.parse import urldefrag, urljoin, urlparse, urlunparse
+from urllib.request import url2pathname
+
+import requests
+
+from .document import MarkdownDocument
+from .read import _convert_to_markdown
+from .scrape import _extract_links
+
+try:
+    from magika import Magika
+except ImportError:  # pragma: no cover - optional at runtime
+    Magika = None
+
+__all__ = [
+    "BaseCrawler",
+    "CrawlScope",
+    "FetchedSource",
+    "WebCrawler",
+    "DirectoryCrawler",
+    "CloudflareCrawler",
+]
+
+_TYPE_ALIASES = {
+    ".htm": "html",
+    ".html": "html",
+    ".ipynb": "jupyter-notebook",
+    ".markdown": "markdown",
+    ".md": "markdown",
+    ".pdf": "pdf",
+    ".py": "python",
+    ".rst": "rst",
+    ".txt": "text",
+}
+_CONTENT_TYPE_LABELS = {
+    "application/json": "json",
+    "application/pdf": "pdf",
+    "application/xml": "xml",
+    "text/html": "html",
+    "text/markdown": "markdown",
+    "text/plain": "text",
+    "text/x-python": "python",
+    "text/xml": "xml",
+}
+_MAGIKA_LABELS = {
+    "html": "html",
+    "ipynb": "jupyter-notebook",
+    "markdown": "markdown",
+    "pdf": "pdf",
+    "python": "python",
+    "rst": "rst",
+    "txt": "text",
+}
+_TERMINAL_CLOUDFLARE_STATUSES = {
+    "cancelled_by_user",
+    "cancelled_due_to_limits",
+    "cancelled_due_to_timeout",
+    "completed",
+    "errored",
+}
+_MAGIKA = Magika() if Magika is not None else None
+_DEFAULT_CRAWL_DEPTH = 100_000
+
+RootInput = str | Path
+RootsInput = RootInput | Sequence[RootInput]
+CacheValue = tuple[Path | None, dict[str, Any] | None]
+CacheEntry = tuple[str, Path | None, dict[str, Any] | None]
+WebOriginKey = tuple[str, str, int | None]
+TInput = TypeVar("TInput")
+TOutput = TypeVar("TOutput")
+
+
+@dataclass(frozen=True)
+class CrawlScope:
+    roots: RootsInput
+    include_patterns: Sequence[str] | None = None
+    exclude_patterns: Sequence[str] | None = None
+    depth: int | None = None
+    limit: int | None = None
+    include_types: Sequence[str] | None = None
+    exclude_types: Sequence[str] | None = None
+    include_external_links: bool = False
+    include_subdomains: bool = False
+
+    def __post_init__(self) -> None:
+        if self.depth is not None:
+            assert self.depth >= 0
+        if self.limit is not None:
+            assert self.limit >= 0
+
+
+@dataclass(frozen=True)
+class FetchedSource:
+    origin: str
+    body_path: Path
+    resolved_origin: str | None = None
+    content_type: str | None = None
+    status_code: int | None = None
+    metadata: dict[str, Any] | None = None
+    fetched_at: datetime | None = None
+    revalidated_at: datetime | None = None
+    markdown_path: Path | None = None
+
+
+@dataclass(frozen=True)
+class _CloudflareRootCacheEntry:
+    fetched_at: datetime
+    records: list[dict[str, Any]]
+
+
+@dataclass(frozen=True)
+class _CloudflareRecordCacheEntry:
+    fetched_at: datetime
+    record: dict[str, Any]
+
+
+@dataclass(frozen=True)
+class _ResolvedCrawlScope:
+    roots: list[RootInput]
+    include_patterns: list[str]
+    exclude_patterns: list[str]
+    depth: int
+    limit: int | None
+    include_types: set[str]
+    exclude_types: set[str]
+    include_external_links: bool
+    include_subdomains: bool
+
+
+@dataclass
+class _EntryLockState:
+    lock: threading.RLock
+    users: int = 0
+
+
+class _FilesystemCrawlerCache:
+    """
+    Filesystem-backed cache rooted at one directory.
+
+    Each logical key is stored as:
+        <root>/<sanitized-key>--<hash>.metadata.json
+        <root>/<sanitized-key>--<hash><ext>
+
+    The metadata file is the source of truth and stores:
+        {
+            "key": <original unsanitized key>,
+            "content_path": <basename of content file, or null>,
+            "metadata": <user metadata dict, or null>,
+        }
+    """
+
+    _METADATA_SUFFIX = ".metadata.json"
+    _HASH_LEN = 12
+    _MAX_STEM_LEN = 180
+
+    _WINDOWS_RESERVED = {
+        "CON",
+        "PRN",
+        "AUX",
+        "NUL",
+        "COM1",
+        "COM2",
+        "COM3",
+        "COM4",
+        "COM5",
+        "COM6",
+        "COM7",
+        "COM8",
+        "COM9",
+        "LPT1",
+        "LPT2",
+        "LPT3",
+        "LPT4",
+        "LPT5",
+        "LPT6",
+        "LPT7",
+        "LPT8",
+        "LPT9",
+    }
+
+    def __init__(self, root: Path | None) -> None:
+        """Create a filesystem-backed cache rooted at one directory."""
+        self.root = root
+        self._entry_locks_guard = threading.Lock()
+        self._entry_locks: dict[str, _EntryLockState] = {}
+        if self.root is not None:
+            self.root.mkdir(parents=True, exist_ok=True)
+
+    def fetch(self, key: str) -> CacheValue | None:
+        """
+        Return the materialized cache entry for one key, if present.
+
+        This method does not lock for normal reads. If it encounters a broken
+        metadata file, it triggers a locked re-check and best-effort cleanup,
+        then returns None.
+        """
+        if self.root is None:
+            return None
+
+        metadata_path = self._metadata_path_for_key(key)
+        if not metadata_path.exists():
+            return None
+
+        record = self._read_record(metadata_path)
+        if record is None:
+            self._cleanup_broken_metadata_path(metadata_path)
+            return None
+        if record["key"] != key:
+            self._cleanup_mismatched_metadata_path(metadata_path, key)
+            return None
+
+        content_path: Path | None = None
+        content_name = record["content_path"]
+        if content_name is not None:
+            candidate = self.root / content_name
+            if candidate.exists():
+                content_path = candidate
+
+        return content_path, record["metadata"]
+
+    def upsert(
+        self,
+        key: str,
+        *,
+        content: bytes | str | Path | None,
+        metadata: Mapping[str, Any] | None,
+        content_ext: str | None,
+    ) -> CacheValue | None:
+        """
+        Create or replace one cache entry.
+
+        Semantics:
+        - content=None means no content file for this entry
+        - metadata=None means no user metadata for this entry
+        - the metadata sidecar is always written, unless both are None
+        - (content=None, metadata=None) deletes the entry and returns None
+        """
+        if self.root is None:
+            return None
+
+        if content is None and metadata is None:
+            self.delete(key)
+            return None
+
+        base = self._base_for_key(key)
+        metadata_path = self.root / f"{base}{self._METADATA_SUFFIX}"
+        stored_metadata = dict(metadata) if metadata is not None else None
+        new_content_path: Path | None = None
+        new_content_name: str | None = None
+        if content is not None:
+            ext = self._choose_content_ext(
+                content=content,
+                content_ext=content_ext,
+            )
+            new_content_path = self.root / f"{base}{ext}"
+            new_content_name = new_content_path.name
+        record = {
+            "key": key,
+            "content_path": new_content_name,
+            "metadata": stored_metadata,
+        }
+        keep = {metadata_path.name}
+        if new_content_name is not None:
+            keep.add(new_content_name)
+
+        with self._locked_base(base):
+            if metadata_path.exists() and self._read_record(metadata_path) is None:
+                self._delete_base_files_locked(base)
+
+            if content is not None:
+                assert new_content_path is not None
+                self._write_content(new_content_path, content)
+
+            self._write_json(metadata_path, record)
+            self._delete_extra_base_files_locked(base, keep=keep)
+
+            return new_content_path, stored_metadata
+
+    def delete(self, key: str) -> int:
+        """
+        Delete one cache entry.
+
+        Returns the number of files removed.
+        """
+        if self.root is None:
+            return 0
+
+        base = self._base_for_key(key)
+        with self._locked_base(base):
+            return self._delete_base_files_locked(base)
+
+    def entries(self) -> Iterable[CacheEntry]:
+        """
+        Yield all cache entries currently described by metadata files.
+
+        This method does not lock for normal reads. Broken metadata files are
+        re-checked under the write lock and cleaned up if still invalid.
+        """
+        if self.root is None:
+            return
+
+        for metadata_path in sorted(self.root.glob(f"*{self._METADATA_SUFFIX}")):
+            record = self._read_record(metadata_path)
+            if record is None:
+                self._cleanup_broken_metadata_path(metadata_path)
+                continue
+
+            content_path: Path | None = None
+            content_name = record["content_path"]
+            if content_name is not None:
+                candidate = self.root / content_name
+                if candidate.exists():
+                    content_path = candidate
+
+            yield record["key"], content_path, record["metadata"]
+
+    def _metadata_path_for_key(self, key: str) -> Path:
+        """Return the deterministic metadata path for one logical key."""
+        assert self.root is not None
+        return self.root / f"{self._base_for_key(key)}{self._METADATA_SUFFIX}"
+
+    def _base_for_key(self, key: str) -> str:
+        """Build the shared basename for the metadata file and content file."""
+        return f"{self._sanitize_stem(key)}--{self._hash_fragment(key)}"
+
+    def _hash_fragment(self, key: str) -> str:
+        """Return a stable hash fragment of the original key."""
+        return hashlib.sha256(key.encode("utf-8")).hexdigest()[: self._HASH_LEN]
+
+    def _sanitize_stem(self, key: str) -> str:
+        """Derive a readable filename stem from *key*, safe enough for Windows.
+
+        Separators and unsafe characters collapse to ``_``, names listed in
+        ``_WINDOWS_RESERVED`` get a ``_`` prefix, and over-long stems keep both
+        ends around a ``..`` marker. An empty result falls back to ``"entry"``.
+        """
+        stem = unicodedata.normalize("NFC", key)
+        for old, new in (("://", "__"), ("\\", "_"), ("/", "_")):
+            stem = stem.replace(old, new)
+        for pattern in (r'[\x00-\x1f<>:"|?*]+', r"\s+", r"[^A-Za-z0-9._-]+"):
+            stem = re.sub(pattern, "_", stem)
+        stem = stem.strip(" ._") or "entry"
+
+        device = stem.partition(".")[0].rstrip(" .").upper()
+        if device in self._WINDOWS_RESERVED:
+            stem = "_" + stem
+
+        limit = self._MAX_STEM_LEN
+        if len(stem) > limit:
+            keep_head = limit // 2 - 2
+            keep_tail = limit - keep_head - 2
+            stem = stem[:keep_head] + ".." + stem[-keep_tail:]
+
+        return stem.rstrip(" .") or "entry"
+
+    def _choose_content_ext(
+        self,
+        *,
+        content: bytes | str | Path,
+        content_ext: str | None,
+    ) -> str:
+        """Choose the content file extension."""
+        ext = self._normalize_ext(content_ext)
+        if ext is not None:
+            return ext
+
+        if isinstance(content, Path):
+            ext = self._normalize_ext(content.suffix)
+            if ext is not None:
+                return ext
+
+        ext = self._infer_ext_with_magika(content)
+        if ext is not None:
+            return ext
+
+        if isinstance(content, str):
+            return ".txt"
+
+        return ".raw"
+
+    def _infer_ext_with_magika(self, content: bytes | str | Path) -> str | None:
+        """Best-effort extension inference using Magika."""
+        if _MAGIKA is None:
+            return None
+
+        if isinstance(content, Path):
+            if not content.exists():
+                return None
+            result = _MAGIKA.identify_path(content)
+        elif isinstance(content, str):
+            result = _MAGIKA.identify_bytes(content.encode("utf-8"))
+        else:
+            result = _MAGIKA.identify_bytes(content)
+
+        extensions = getattr(result.output, "extensions", None)
+        if not extensions:
+            return None
+        return self._normalize_ext(extensions[0])
+
+    def _normalize_ext(self, ext: str | None) -> str | None:
+        """Return *ext* as a canonical lowercase ``.a.b`` suffix, or ``None``.
+
+        Surrounding whitespace is ignored and a leading dot is optional. Each
+        dot-separated piece is reduced to ASCII letters, digits, ``_`` and
+        ``-`` and lowercased; pieces that end up empty are dropped, so
+        ``"TAR..GZ"`` becomes ``".tar.gz"``.
+
+        Returns ``None`` when *ext* is ``None``, blank, or reduces to nothing
+        usable.
+        """
+        if ext is None:
+            return None
+
+        # Splitting on "." and discarding empty pieces makes a leading dot,
+        # doubled dots and blank input all fall out naturally.
+        raw_pieces = ext.strip().split(".")
+        tokens = [
+            re.sub(r"[^A-Za-z0-9_-]+", "", piece).lower()
+            for piece in raw_pieces
+        ]
+        kept = [token for token in tokens if token]
+        if not kept:
+            return None
+
+        return "".join(f".{token}" for token in kept)
+
+    def _read_record(self, path: Path) -> dict[str, Any] | None:
+        """Read and validate one metadata JSON file.
+
+        Returns the normalized record (``key``, ``content_path``, ``metadata``)
+        or ``None`` when the file is unreadable or fails validation.
+        """
+        try:
+            with path.open("r", encoding="utf-8") as handle:
+                obj = json.load(handle)
+        except (OSError, ValueError):
+            # ValueError covers JSONDecodeError and UnicodeDecodeError, so a
+            # sidecar with invalid UTF-8 is reported as broken, not raised.
+            return None
+
+        if not isinstance(obj, dict):
+            return None
+
+        key = obj.get("key")
+        content_path = obj.get("content_path")
+        metadata = obj.get("metadata")
+
+        if not isinstance(key, str):
+            return None
+        if content_path is not None:
+            if not isinstance(content_path, str) or content_path in {"", ".", ".."}:
+                return None
+            if Path(content_path).name != content_path or "\\" in content_path:
+                return None
+        if metadata is not None and not isinstance(metadata, dict):
+            return None
+
+        return {"key": key, "content_path": content_path, "metadata": metadata}
+
+    def _cleanup_broken_metadata_path(self, metadata_path: Path) -> None:
+        """Best-effort cleanup for a broken metadata file."""
+        if self.root is None:
+            return
+        if not metadata_path.name.endswith(self._METADATA_SUFFIX):
+            return
+
+        base = metadata_path.name[: -len(self._METADATA_SUFFIX)]
+        with self._locked_base(base):
+            if not metadata_path.exists():
+                return
+            if self._read_record(metadata_path) is not None:
+                return
+
+            self._delete_base_files_locked(base)
+
+    def _cleanup_mismatched_metadata_path(
+        self,
+        metadata_path: Path,
+        key: str,
+    ) -> None:
+        """Best-effort cleanup for a metadata file stored under the wrong key."""
+        if self.root is None:
+            return
+
+        base = self._base_for_key(key)
+        with self._locked_base(base):
+            if not metadata_path.exists():
+                return
+            record = self._read_record(metadata_path)
+            if record is not None and record["key"] == key:
+                return
+
+            self._delete_base_files_locked(base)
+
+    @contextmanager
+    def _locked_base(self, base: str) -> Iterator[None]:
+        state = self._acquire_entry_lock_state(base)
+        state.lock.acquire()
+        try:
+            yield
+        finally:
+            self._release_entry_lock_state(base, state)
+
+    def _acquire_entry_lock_state(self, base: str) -> _EntryLockState:
+        with self._entry_locks_guard:
+            state = self._entry_locks.get(base)
+            if state is None:
+                state = _EntryLockState(lock=threading.RLock())
+                self._entry_locks[base] = state
+            state.users += 1
+            return state
+
+    def _release_entry_lock_state(self, base: str, state: _EntryLockState) -> None:
+        state.lock.release()
+        with self._entry_locks_guard:
+            current = self._entry_locks.get(base)
+            assert current is state
+            state.users -= 1
+            if state.users == 0:
+                del self._entry_locks[base]
+
+    def _delete_base_files_locked(self, base: str) -> int:
+        """Remove every regular file of *base*; return how many were unlinked."""
+        assert self.root is not None
+
+        removed = 0
+        for candidate in self.root.iterdir():
+            owned = self._belongs_to_base(candidate.name, base)
+            if not (owned and candidate.is_file()):
+                continue
+            try:
+                candidate.unlink()
+            except FileNotFoundError:
+                # Already gone by the time we got here; do not count it.
+                continue
+            removed += 1
+        return removed
+
+    def _delete_extra_base_files_locked(self, base: str, *, keep: set[str]) -> None:
+        """Unlink regular files of *base* whose names are not in *keep*."""
+        assert self.root is not None
+
+        for entry in self.root.iterdir():
+            # Same ownership/regular-file filter as a full delete, except that
+            # names listed in *keep* survive.
+            owned = self._belongs_to_base(entry.name, base)
+            if not (owned and entry.is_file()) or entry.name in keep:
+                continue
+            try:
+                entry.unlink()
+            except FileNotFoundError:
+                # Already removed; nothing left to do.
+                pass
+
+    def _belongs_to_base(self, name: str, base: str) -> bool:
+        """Tell whether file *name* is the sidecar or a content file of *base*."""
+        if name == base + self._METADATA_SUFFIX:
+            return True
+        # A "--" after the dot is rejected (presumably another entry's base).
+        head, remainder = name[: len(base) + 1], name[len(base) + 1 :]
+        return head == f"{base}." and "--" not in remainder
+
+    def _write_content(self, content_path: Path, content: bytes | str | Path) -> None:
+        """Write *content* to *content_path* via a temporary sibling file."""
+        if isinstance(content, Path) and content == content_path:
+            return
+
+        staged: Path | None = None
+        try:
+            # Stage the bytes next to the target, then move them into place.
+            with tempfile.NamedTemporaryFile(
+                "wb",
+                dir=content_path.parent,
+                prefix=f".{content_path.name}.",
+                delete=False,
+            ) as sink:
+                staged = Path(sink.name)
+                if isinstance(content, Path):
+                    with content.open("rb") as source:
+                        shutil.copyfileobj(source, sink)
+                elif isinstance(content, (bytes, str)):
+                    is_text = isinstance(content, str)
+                    sink.write(content.encode("utf-8") if is_text else content)
+                else:
+                    raise TypeError(f"Unsupported content type: {type(content)!r}")
+            os.replace(staged, content_path)
+        finally:
+            if staged is not None:
+                staged.unlink(missing_ok=True)
+
+    def _write_json(self, path: Path, obj: Mapping[str, Any]) -> None:
+        """Serialize *obj* as indented, key-sorted JSON straight into *path*."""
+        payload = json.dumps(obj, ensure_ascii=False, sort_keys=True, indent=2)
+        # Plain in-place write (no temp file); a trailing newline ends the file.
+        path.write_text(f"{payload}\n", encoding="utf-8")
+
+
+class _DirectoryCrawlerCache(_FilesystemCrawlerCache):
+    pass
+
+
+class _WebCrawlerCache(_FilesystemCrawlerCache):
+    pass
+
+
+class _CloudflareCrawlerCache(_FilesystemCrawlerCache):
+    pass
+
+
+def _map_ordered(
+    items: Iterable[TInput],
+    *,
+    max_workers: int,
+    fn: Callable[[TInput], TOutput],
+) -> Iterator[TOutput]:
+    """Lazily apply *fn* to *items*, yielding results in input order.
+
+    With one worker the calls run inline. Otherwise up to ``max_workers``
+    calls are kept submitted to a thread pool and the oldest is awaited first.
+    """
+    assert max_workers >= 1
+    source = iter(items)
+    if max_workers == 1:
+        for item in source:
+            yield fn(item)
+        return
+
+    done_marker = object()  # returned by next() once ``source`` is exhausted
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        window: deque[Any] = deque()
+        for item in source:
+            window.append(pool.submit(fn, item))
+            if len(window) == max_workers:
+                break
+        while window:
+            yield window.popleft().result()
+            item = next(source, done_marker)
+            if item is not done_marker:
+                window.append(pool.submit(fn, item))
+
+
+class BaseCrawler(ABC):
+    max_workers: int
+
+    @abstractmethod
+    def origins(
+        self,
+        scope: CrawlScope,
+        *,
+        progress: bool = True,
+        cache_force_refresh: bool = False,
+    ) -> Iterator[str]:
+        """Yield the origin strings selected by *scope*."""
+
+    @abstractmethod
+    def fetch_raw(
+        self,
+        origin: str,
+        *,
+        cache_force_refresh: bool = False,
+    ) -> FetchedSource:
+        """Fetch one origin and return it as a ``FetchedSource``."""
+
+    def fetch_markdown(
+        self,
+        origin: str,
+        *,
+        convert: Callable[[FetchedSource], MarkdownDocument] | None = None,
+        cache_force_refresh: bool = False,
+    ) -> MarkdownDocument:
+        source = self.fetch_raw(origin, cache_force_refresh=cache_force_refresh)
+        converter = convert or self._default_convert
+        return converter(source)
+
+    def _fetch_markdown_after_origin_discovery(
+        self,
+        origin: str,
+        *,
+        convert: Callable[[FetchedSource], MarkdownDocument] | None = None,
+    ) -> MarkdownDocument:
+        source = self._fetch_raw_after_origin_discovery(origin)
+        converter = convert or self._default_convert
+        return converter(source)
+
+    def _fetch_raw_after_origin_discovery(self, origin: str) -> FetchedSource:
+        return self.fetch_raw(origin, cache_force_refresh=False)
+
+    def markdown_documents(
+        self,
+        scope: CrawlScope,
+        *,
+        convert: Callable[[FetchedSource], MarkdownDocument] | None = None,
+        progress: bool = True,
+        cache_force_refresh: bool = False,
+    ) -> Iterator[MarkdownDocument]:
+        origins = self.origins(
+            scope,
+            progress=progress,
+            cache_force_refresh=cache_force_refresh,
+        )
+        yield from _map_ordered(
+            origins,
+            max_workers=self.max_workers,
+            fn=lambda origin: self._fetch_markdown_after_origin_discovery(
+                origin,
+                convert=convert,
+            ),
+        )
+
+    def _default_convert(self, source: FetchedSource) -> MarkdownDocument:
+        raise NotImplementedError
+
+
+class DirectoryCrawler(BaseCrawler):
+    """Crawl local files and optionally cache converted markdown.
+
+    Directory traversal always reads the current filesystem state. The cache
+    stores converted markdown per file origin and is reused only when the
+    current file hash and modification time still match the cached metadata.
+    """
+
+    def __init__(
+        self,
+        *,
+        cache_dir: bool | str | Path | None = None,
+        max_workers: int = 1,
+    ) -> None:
+        """Configure the crawler.
+
+        ``cache_dir`` is normalised by ``_resolve_cache_dir``; the default
+        factory passed here returns ``None``. ``max_workers`` must be at
+        least 1 and is the worker count handed to ``_map_ordered`` by
+        ``markdown_documents``.
+        """
+        assert max_workers >= 1
+        self.cache_dir = _resolve_cache_dir(
+            cache_dir,
+            backend_name="directory",
+            default_factory=lambda: None,
+        )
+        self.max_workers = max_workers
+        self._cache = _DirectoryCrawlerCache(self.cache_dir)
+
+    def origins(
+        self,
+        scope: CrawlScope,
+        *,
+        progress: bool = True,
+        cache_force_refresh: bool = False,
+    ) -> Iterator[str]:
+        """Yield the URI of every file selected by ``scope``.
+
+        Each root may be a single file or a directory (walked with
+        ``_iter_directory_files`` down to ``scope.depth``). Files located
+        under the crawler's own cache directory are skipped, each origin is
+        yielded at most once, and iteration stops once ``scope.limit`` origins
+        have been produced. ``progress`` and ``cache_force_refresh`` are
+        accepted for interface compatibility and ignored.
+
+        Raises ``AssertionError`` when a root does not exist.
+        """
+        del progress, cache_force_refresh
+        resolved_scope = _resolve_crawl_scope(scope)
+        if resolved_scope.limit == 0:
+            return
+        cache_root = self.cache_dir.resolve() if self.cache_dir is not None else None
+        count = 0
+        yielded_origins: set[str] = set()
+        for root in resolved_scope.roots:
+            path = _to_directory_path(root)
+            assert path.exists(), f"Root does not exist: {path}"
+            # A file root is handled as a one-element listing so that file and
+            # directory roots share a single filter/yield pipeline.
+            if path.is_file():
+                candidates = (path,)
+            else:
+                candidates = _iter_directory_files(
+                    path,
+                    max_depth=resolved_scope.depth,
+                )
+            for file_path in candidates:
+                resolved_file_path = file_path.resolve()
+                # Never report the crawler's own cache files as sources.
+                if cache_root is not None and resolved_file_path.is_relative_to(
+                    cache_root
+                ):
+                    continue
+                origin = resolved_file_path.as_uri()
+                if origin in yielded_origins:
+                    continue
+                if not self._include_path(
+                    file_path,
+                    origin,
+                    include_patterns=resolved_scope.include_patterns,
+                    exclude_patterns=resolved_scope.exclude_patterns,
+                    include_types=resolved_scope.include_types,
+                    exclude_types=resolved_scope.exclude_types,
+                ):
+                    continue
+                yielded_origins.add(origin)
+                yield origin
+                count += 1
+                if resolved_scope.limit is not None and count >= resolved_scope.limit:
+                    return
+
+    def fetch_raw(
+        self,
+        origin: str,
+        *,
+        cache_force_refresh: bool = False,
+    ) -> FetchedSource:
+        """Describe the local file behind ``origin``.
+
+        The returned source points ``body_path`` at the file itself. When a
+        cache directory is configured and ``cache_force_refresh`` is false,
+        ``markdown_path`` is set to the cached markdown, but only if the cached
+        ``source_hash`` and ``mtime_ns`` both match the current file.
+
+        Raises ``AssertionError`` when the origin does not resolve to an
+        existing file.
+        """
+        path = _path_from_file_origin(origin).resolve()
+        assert path.is_file(), f"File origin must exist: {origin}"
+        canonical_origin = path.as_uri()
+        content_type = mimetypes.guess_type(path.name)[0]
+        type_label = _detect_type_label(path=path, content_type=content_type)
+        source_hash = _sha256_path(path)
+        # Stat once so the cache validation and the reported metadata describe
+        # the same filesystem snapshot.
+        stat_result = path.stat()
+        markdown_path: Path | None = None
+        if self.cache_dir is not None and not cache_force_refresh:
+            cached_entry = self._cache.fetch(canonical_origin)
+            if cached_entry is not None:
+                cached_markdown_path, cached_meta = cached_entry
+                if (
+                    cached_markdown_path is not None
+                    and cached_meta is not None
+                    and cached_meta.get("source_hash") == source_hash
+                    and cached_meta.get("mtime_ns") == stat_result.st_mtime_ns
+                ):
+                    markdown_path = cached_markdown_path
+        return FetchedSource(
+            origin=canonical_origin,
+            resolved_origin=canonical_origin,
+            content_type=content_type,
+            status_code=None,
+            metadata={
+                "mtime_ns": stat_result.st_mtime_ns,
+                "size": stat_result.st_size,
+                "source_hash": source_hash,
+                "type_label": type_label,
+            },
+            fetched_at=datetime.fromtimestamp(stat_result.st_mtime, tz=timezone.utc),
+            body_path=path,
+            markdown_path=markdown_path,
+        )
+
+    def markdown_documents(
+        self,
+        scope: CrawlScope,
+        *,
+        convert: Callable[[FetchedSource], MarkdownDocument] | None = None,
+        progress: bool = True,
+        cache_force_refresh: bool = False,
+    ) -> Iterator[MarkdownDocument]:
+        """Yield a markdown document for every file selected by ``scope``.
+
+        Unlike the base implementation, each origin goes through
+        ``fetch_markdown`` so that ``cache_force_refresh`` also applies to the
+        per-file markdown cache lookup.
+        """
+        origins = self.origins(
+            scope,
+            progress=progress,
+            cache_force_refresh=cache_force_refresh,
+        )
+        yield from _map_ordered(
+            origins,
+            max_workers=self.max_workers,
+            fn=lambda origin: self.fetch_markdown(
+                origin,
+                convert=convert,
+                cache_force_refresh=cache_force_refresh,
+            ),
+        )
+
+    def _default_convert(self, source: FetchedSource) -> MarkdownDocument:
+        """Convert ``source`` to markdown, using and filling the cache.
+
+        A still-existing ``markdown_path`` is returned as-is. Otherwise files
+        labelled ``"markdown"`` are read verbatim and everything else goes
+        through ``_convert_to_markdown``; the result is written to the cache
+        when a cache directory is configured.
+        """
+        if source.markdown_path is not None and source.markdown_path.exists():
+            markdown = source.markdown_path.read_text(encoding="utf-8")
+            return MarkdownDocument(origin=source.origin, content=markdown)
+
+        type_label = (source.metadata or {}).get("type_label")
+        if type_label == "markdown":
+            markdown = source.body_path.read_text(encoding="utf-8")
+        else:
+            markdown = _convert_to_markdown(str(source.body_path))
+
+        if self.cache_dir is not None:
+            # Store the hash/mtime pair that fetch_raw later validates against.
+            self._cache.upsert(
+                source.origin,
+                content=markdown,
+                metadata={
+                    "origin": source.origin,
+                    "mtime_ns": (source.metadata or {}).get("mtime_ns"),
+                    "source_hash": (source.metadata or {}).get("source_hash"),
+                },
+                content_ext=".md",
+            )
+
+        return MarkdownDocument(origin=source.origin, content=markdown)
+
+    def _include_path(
+        self,
+        path: Path,
+        origin: str,
+        *,
+        include_patterns: Sequence[str],
+        exclude_patterns: Sequence[str],
+        include_types: set[str],
+        exclude_types: set[str],
+    ) -> bool:
+        """Return whether a file passes the scope's pattern and type filters.
+
+        Patterns are matched against ``origin``. The type label is only
+        detected when at least one type filter is set.
+        """
+        if not _matches_patterns(
+            origin,
+            include_patterns=include_patterns,
+            exclude_patterns=exclude_patterns,
+        ):
+            return False
+        if not include_types and not exclude_types:
+            return True
+        label = _detect_type_label(
+            path=path, content_type=mimetypes.guess_type(path.name)[0]
+        )
+        return _matches_types(
+            label,
+            include_types=include_types,
+            exclude_types=exclude_types,
+        )
+
+
+class WebCrawler(BaseCrawler):
+    """Crawl web pages over HTTP(S) with an on-disk response cache.
+
+    Pages are fetched with a ``requests`` session, stored through
+    ``_WebCrawlerCache`` and revalidated with conditional requests once a
+    cached entry is older than ``cache_stale_after``.
+    """
+
+    def __init__(
+        self,
+        *,
+        session: requests.Session | None = None,
+        cache_dir: bool | str | Path | None = None,
+        cache_stale_after: timedelta | None = None,
+        max_workers: int = 1,
+    ) -> None:
+        """Configure the crawler.
+
+        When the caller supplies its own ``session``, cache entries are tagged
+        with a context string derived from ``id(session)``; entries written
+        under a different context are ignored by ``fetch_raw``. ``cache_dir``
+        is normalised by ``_resolve_cache_dir``; the default factory passed
+        here creates a fresh temporary directory. ``cache_stale_after=None``
+        means cached entries never go stale. ``max_workers`` must be >= 1.
+        """
+        assert max_workers >= 1
+        self.session = requests.Session() if session is None else session
+        self._cache_context = None if session is None else f"session:{id(self.session)}"
+        self.cache_dir = _resolve_cache_dir(
+            cache_dir,
+            backend_name="web",
+            default_factory=lambda: Path(
+                tempfile.mkdtemp(prefix="raghilda-web-cache-")
+            ),
+        )
+        self.cache_stale_after = cache_stale_after
+        self.max_workers = max_workers
+        self._cache = _WebCrawlerCache(self.cache_dir)
+
+    def origins(
+        self,
+        scope: CrawlScope,
+        *,
+        progress: bool = True,
+        cache_force_refresh: bool = False,
+    ) -> Iterator[str]:
+        """Yield page URLs reachable from the scope roots, level by level.
+
+        Every candidate that passes the host-scope and exclude-pattern checks
+        is fetched through ``fetch_raw`` (so its type label is known and its
+        links can be followed). A URL is yielded when it also matches the
+        include/exclude patterns and type filters and has not been yielded
+        before. Links are followed until ``scope.depth`` levels below the
+        roots, and iteration stops after ``scope.limit`` URLs. ``progress`` is
+        accepted for interface compatibility and ignored.
+
+        Raises ``AssertionError`` for roots that cannot be canonicalised or
+        are not http/https.
+        """
+        del progress
+        resolved_scope = _resolve_crawl_scope(scope)
+        if resolved_scope.limit == 0:
+            return
+        # Visits are keyed by URL *and* the scope it was reached under, so the
+        # same URL can be reconsidered when reached from a different scope.
+        visited: set[tuple[str, WebOriginKey, str]] = set()
+        yielded_origins: set[str] = set()
+        yielded = 0
+        frontier: list[tuple[str, WebOriginKey, str]] = []
+
+        for root in resolved_scope.roots:
+            canonical_root = _canonicalize_web_url(str(root))
+            assert canonical_root is not None
+            parsed = urlparse(canonical_root)
+            assert parsed.scheme in {"http", "https"}
+            root_host = parsed.hostname or ""
+            frontier.append(
+                (canonical_root, _web_origin_key(canonical_root), root_host)
+            )
+
+        current_depth = 0
+        while frontier:
+            # Step 1: reduce this level's frontier to unvisited, in-scope,
+            # non-excluded candidates.
+            batch: list[tuple[str, WebOriginKey, str]] = []
+            for origin, scope_origin, root_host in frontier:
+                visit_key = (origin, scope_origin, root_host)
+                if visit_key in visited:
+                    continue
+                if not self._allow_origin(
+                    origin,
+                    scope_origin,
+                    root_host,
+                    include_external_links=resolved_scope.include_external_links,
+                    include_subdomains=resolved_scope.include_subdomains,
+                ):
+                    continue
+                if _matches_exclude_patterns(
+                    origin,
+                    exclude_patterns=resolved_scope.exclude_patterns,
+                ):
+                    continue
+                visited.add(visit_key)
+                batch.append((origin, scope_origin, root_host))
+
+            # Step 2: fetch the batch in windows capped at the number of
+            # origins that may still be yielded, so a limit bounds fetching.
+            next_frontier: list[tuple[str, WebOriginKey, str]] = []
+            offset = 0
+            while offset < len(batch):
+                remaining = (
+                    None
+                    if resolved_scope.limit is None
+                    else resolved_scope.limit - yielded
+                )
+                if remaining == 0:
+                    return
+                chunk_size = len(batch) - offset
+                if remaining is not None:
+                    chunk_size = min(chunk_size, remaining)
+                window = batch[offset : offset + chunk_size]
+                fetched_sources = _map_ordered(
+                    window,
+                    max_workers=min(self.max_workers, len(window)),
+                    fn=lambda item: (
+                        item,
+                        self.fetch_raw(
+                            item[0],
+                            cache_force_refresh=cache_force_refresh,
+                        ),
+                    ),
+                )
+                for (origin, scope_origin, root_host), source in fetched_sources:
+                    type_label = (source.metadata or {}).get("type_label")
+                    matches_patterns = _matches_patterns(
+                        origin,
+                        include_patterns=resolved_scope.include_patterns,
+                        exclude_patterns=resolved_scope.exclude_patterns,
+                    )
+                    matches_types = _matches_types(
+                        type_label,
+                        include_types=resolved_scope.include_types,
+                        exclude_types=resolved_scope.exclude_types,
+                    )
+                    if (
+                        matches_patterns
+                        and matches_types
+                        and origin not in yielded_origins
+                    ):
+                        yield origin
+                        yielded_origins.add(origin)
+                        yielded += 1
+                        if (
+                            resolved_scope.limit is not None
+                            and yielded >= resolved_scope.limit
+                        ):
+                            return
+                    # Pages on the deepest level are not scanned for links.
+                    if current_depth >= resolved_scope.depth:
+                        continue
+
+                    text = _read_text(source.body_path)
+                    resolved_origin = source.resolved_origin or origin
+                    resolved_origin_key = _web_origin_key(resolved_origin)
+                    origin_key = _web_origin_key(origin)
+                    child_root_host = root_host
+                    # Children inherit the parent's scope only when subdomains
+                    # are included and the fetch stayed on the requested
+                    # origin; otherwise the resolved (possibly redirected)
+                    # origin becomes the scope for the links found here.
+                    if (
+                        resolved_scope.include_subdomains
+                        and resolved_origin_key == origin_key
+                    ):
+                        child_scope_origin = scope_origin
+                    else:
+                        child_scope_origin = resolved_origin_key
+                        child_root_host = (
+                            urlparse(resolved_origin).hostname or root_host
+                        )
+                    for link in sorted(_extract_links(text)):
+                        canonical = _canonicalize_web_url(link, base=resolved_origin)
+                        if canonical is None:
+                            continue
+                        parsed = urlparse(canonical)
+                        if parsed.scheme not in {"http", "https"}:
+                            continue
+                        next_frontier.append(
+                            (canonical, child_scope_origin, child_root_host)
+                        )
+                offset += chunk_size
+            frontier = next_frontier
+            current_depth += 1
+
+    def fetch_raw(
+        self,
+        origin: str,
+        *,
+        cache_force_refresh: bool = False,
+    ) -> FetchedSource:
+        """Return the response body for ``origin``, using the cache if possible.
+
+        A cached entry written under the same cache context is returned
+        directly while it is fresh. A stale entry is revalidated with
+        ``If-None-Match`` / ``If-Modified-Since``; on ``304`` the entry's
+        ``revalidated_at`` is updated and the cached body reused. Otherwise
+        the response is stored as a new cache entry. ``cache_force_refresh``
+        skips both the cache hit and the conditional headers.
+
+        Raises ``AssertionError`` for URLs that cannot be canonicalised or are
+        not http/https, and whatever ``raise_for_status`` raises for error
+        responses.
+        """
+        canonical_origin = _canonicalize_web_url(origin)
+        assert canonical_origin is not None
+        parsed = urlparse(canonical_origin)
+        assert parsed.scheme in {"http", "https"}
+
+        cached_entry = self._cache.fetch(canonical_origin)
+        body_path: Path | None = None
+        cached_meta: dict[str, Any] | None = None
+        if cached_entry is not None:
+            body_path, cached_meta = cached_entry
+        has_cache = (
+            body_path is not None
+            and cached_meta is not None
+            and self._cache_context_matches(cached_meta)
+        )
+        now = _utcnow()
+
+        if has_cache and not cache_force_refresh:
+            assert cached_meta is not None
+            assert body_path is not None
+            if self._is_fresh(cached_meta, now):
+                return self._source_from_meta(cached_meta, body_path=body_path)
+
+        # Stale entry: ask the server whether the cached body is still valid.
+        headers: dict[str, str] = {}
+        if has_cache and not cache_force_refresh:
+            assert cached_meta is not None
+            etag = cached_meta.get("etag")
+            last_modified = cached_meta.get("last_modified")
+            if etag:
+                headers["If-None-Match"] = etag
+            if last_modified:
+                headers["If-Modified-Since"] = last_modified
+
+        response = self.session.get(canonical_origin, headers=headers, timeout=30.0)
+        if response.status_code == 304 and has_cache:
+            assert cached_meta is not None
+            assert body_path is not None
+            cached_meta["revalidated_at"] = now.isoformat()
+            cached_entry = self._cache.upsert(
+                canonical_origin,
+                content=body_path,
+                metadata=cached_meta,
+                content_ext=None,
+            )
+            assert cached_entry is not None
+            body_path, cached_meta = cached_entry
+            assert body_path is not None
+            assert cached_meta is not None
+            return self._source_from_meta(cached_meta, body_path=body_path)
+
+        response.raise_for_status()
+        content_type = response.headers.get("Content-Type")
+        resolved_origin = (
+            _canonicalize_web_url(response.url, base=canonical_origin) or response.url
+        )
+        # First guess of the type from the URL and Content-Type header only.
+        type_label = _detect_type_label(
+            path=_type_hint_path(canonical_origin, content_type=content_type),
+            content_type=content_type,
+        )
+        meta = {
+            "origin": canonical_origin,
+            "resolved_origin": resolved_origin,
+            "content_type": content_type,
+            "status_code": response.status_code,
+            "etag": response.headers.get("ETag"),
+            "last_modified": response.headers.get("Last-Modified"),
+            "type_label": type_label,
+            "fetched_at": now.isoformat(),
+            "revalidated_at": None,
+            "cache_context": self._cache_context,
+        }
+        cached_entry = self._cache.upsert(
+            canonical_origin,
+            content=response.content,
+            metadata=meta,
+            content_ext=_known_body_suffix(
+                canonical_origin,
+                content_type=content_type,
+            ),
+        )
+        assert cached_entry is not None
+        body_path, meta = cached_entry
+        assert body_path is not None
+        assert meta is not None
+        # Re-detect the type from the stored body file and persist the label
+        # again if it differs from the header-based guess.
+        actual_type_label = _detect_type_label(
+            path=body_path,
+            content_type=content_type,
+        )
+        if actual_type_label != meta.get("type_label"):
+            meta["type_label"] = actual_type_label
+            cached_entry = self._cache.upsert(
+                canonical_origin,
+                content=body_path,
+                metadata=meta,
+                content_ext=None,
+            )
+            assert cached_entry is not None
+            body_path, meta = cached_entry
+            assert body_path is not None
+            assert meta is not None
+        return self._source_from_meta(meta, body_path=body_path)
+
+    def _fetch_raw_after_origin_discovery(self, origin: str) -> FetchedSource:
+        """Return the cached source for an origin without any network access.
+
+        ``origins`` fetches every origin it yields, so the cache entry is
+        expected to exist; a missing entry fails an assertion.
+        """
+        canonical_origin = _canonicalize_web_url(origin)
+        assert canonical_origin is not None
+        cached_entry = self._cache.fetch(canonical_origin)
+        assert cached_entry is not None
+        body_path, cached_meta = cached_entry
+        assert body_path is not None
+        assert cached_meta is not None
+        return self._source_from_meta(cached_meta, body_path=body_path)
+
+    def _default_convert(self, source: FetchedSource) -> MarkdownDocument:
+        """Convert a fetched body to markdown.
+
+        Bodies labelled ``"markdown"`` are read verbatim. Everything else goes
+        through ``_convert_to_markdown``. When the cached body file has no
+        suffix, it is first copied to a temporary file carrying the suffix
+        from ``_body_suffix``; the temporary file is always removed.
+        """
+        type_label = (source.metadata or {}).get("type_label")
+        if type_label == "markdown":
+            markdown = _read_text(source.body_path)
+        elif source.body_path.suffix == "":
+            suffix = _body_suffix(
+                source.origin,
+                content_type=source.content_type,
+            )
+            temporary_file = tempfile.NamedTemporaryFile(
+                prefix="raghilda-convert-",
+                suffix=suffix,
+                delete=False,
+            )
+            temporary_path = Path(temporary_file.name)
+            try:
+                # Copy inside the try block so that a failed read or write
+                # cannot leave the delete=False temporary file behind.
+                with temporary_file:
+                    temporary_file.write(source.body_path.read_bytes())
+                markdown = _convert_to_markdown(str(temporary_path))
+            finally:
+                temporary_path.unlink(missing_ok=True)
+        else:
+            markdown = _convert_to_markdown(str(source.body_path))
+        return MarkdownDocument(origin=source.origin, content=markdown)
+
+    def _source_from_meta(
+        self,
+        meta: dict[str, Any],
+        *,
+        body_path: Path,
+    ) -> FetchedSource:
+        """Build a ``FetchedSource`` from a cache metadata dict and body path."""
+        return FetchedSource(
+            origin=meta["origin"],
+            resolved_origin=meta.get("resolved_origin"),
+            content_type=meta.get("content_type"),
+            status_code=meta.get("status_code"),
+            metadata={
+                "etag": meta.get("etag"),
+                "last_modified": meta.get("last_modified"),
+                "type_label": meta.get("type_label"),
+            },
+            fetched_at=_parse_datetime(meta.get("fetched_at")),
+            revalidated_at=_parse_datetime(meta.get("revalidated_at")),
+            body_path=body_path,
+        )
+
+    def _is_fresh(self, cached_meta: dict[str, Any], now: datetime) -> bool:
+        """Return whether a cache entry can be used without revalidation.
+
+        With ``cache_stale_after`` unset every entry is fresh. Otherwise the
+        most recent of ``fetched_at`` and ``revalidated_at`` must be no older
+        than ``cache_stale_after``; an entry with neither timestamp is stale.
+        """
+        if self.cache_stale_after is None:
+            return True
+        timestamps = [
+            _parse_datetime(cached_meta.get("fetched_at")),
+            _parse_datetime(cached_meta.get("revalidated_at")),
+        ]
+        freshest_cache_time = max(
+            (timestamp for timestamp in timestamps if timestamp is not None),
+            default=None,
+        )
+        if freshest_cache_time is None:
+            return False
+        return now - freshest_cache_time <= self.cache_stale_after
+
+    def _cache_context_matches(self, cached_meta: dict[str, Any]) -> bool:
+        """Return whether the entry was written under this crawler's context."""
+        return cached_meta.get("cache_context") == self._cache_context
+
+    def _allow_origin(
+        self,
+        origin: str,
+        scope_origin: WebOriginKey,
+        root_host: str,
+        *,
+        include_external_links: bool,
+        include_subdomains: bool,
+    ) -> bool:
+        """Delegate the host-scope decision for ``origin`` to ``_allow_web_origin``."""
+        return _allow_web_origin(
+            origin,
+            scope_origin,
+            root_host,
+            include_external_links=include_external_links,
+            include_subdomains=include_subdomains,
+        )
+
+
+class CloudflareCrawler(BaseCrawler):
+    def __init__(
+        self,
+        *,
+        account_id: str,
+        api_token: str,
+        cache_dir: bool | str | Path | None = None,
+        session: requests.Session | Any | None = None,
+        source: str = "all",
+        render: bool = True,
+        cache_stale_after: timedelta | None = None,
+        modified_since: int | None = None,
+        poll_interval: float = 5.0,
+        max_poll_attempts: int = 60,
+        max_workers: int = 1,
+        base_url: str = "https://api.cloudflare.com/client/v4",
+    ) -> None:
+        """Store the crawl configuration and set up the caches.
+
+        ``cache_dir`` is normalised by ``_resolve_cache_dir``; the default
+        factory passed here creates a fresh temporary directory. A
+        ``requests.Session`` is created when no (truthy) ``session`` is given.
+        Trailing slashes are stripped from ``base_url``. ``max_workers`` must
+        be at least 1.
+        """
+        assert max_workers >= 1
+
+        def _temporary_cache_dir() -> Path:
+            return Path(tempfile.mkdtemp(prefix="raghilda-cloudflare-cache-"))
+
+        # Credentials.
+        self.account_id = account_id
+        self.api_token = api_token
+
+        # Collaborators that may need to be created.
+        self.cache_dir = _resolve_cache_dir(
+            cache_dir,
+            backend_name="cloudflare",
+            default_factory=_temporary_cache_dir,
+        )
+        self.session = session or requests.Session()
+
+        # Plain crawl options, kept exactly as passed in.
+        self.max_workers = max_workers
+        self.max_poll_attempts = max_poll_attempts
+        self.poll_interval = poll_interval
+        self.modified_since = modified_since
+        self.cache_stale_after = cache_stale_after
+        self.render = render
+        self.source = source
+        self.base_url = base_url.rstrip("/")
+
+        # In-memory caches sit in front of the on-disk cache.
+        self._records: dict[str, _CloudflareRecordCacheEntry] = {}
+        self._roots: dict[tuple[Any, ...], _CloudflareRootCacheEntry] = {}
+        self._cache = _CloudflareCrawlerCache(self.cache_dir)
+
+    def origins(
+        self,
+        scope: CrawlScope,
+        *,
+        progress: bool = True,
+        cache_force_refresh: bool = False,
+    ) -> Iterator[str]:
+        """Yield the URLs of the records returned for each root in ``scope``.
+
+        Each distinct canonical root is crawled once through ``_crawl_root``.
+        Record URLs are yielded at most once and iteration stops after
+        ``scope.limit`` URLs. ``progress`` is accepted for interface
+        compatibility and ignored.
+
+        Raises ``AssertionError`` when a root cannot be canonicalised.
+        """
+        del progress
+        resolved_scope = _resolve_crawl_scope(scope)
+        # The content type used for type filtering is fixed to text/markdown,
+        # so the filter has a single answer for the whole crawl: evaluate it
+        # once instead of once per record.
+        markdown_label = _detect_type_label(
+            path=None,
+            content_type="text/markdown",
+        )
+        types_allowed = _matches_types(
+            markdown_label,
+            include_types=resolved_scope.include_types,
+            exclude_types=resolved_scope.exclude_types,
+        )
+        yielded = 0
+        yielded_origins: set[str] = set()
+        crawled_roots: set[str] = set()
+        for root in resolved_scope.roots:
+            if resolved_scope.limit is not None and yielded >= resolved_scope.limit:
+                return
+            canonical_root = _canonicalize_web_url(str(root))
+            assert canonical_root is not None
+            if canonical_root in crawled_roots:
+                continue
+            crawled_roots.add(canonical_root)
+            remaining = (
+                None if resolved_scope.limit is None else resolved_scope.limit - yielded
+            )
+            # Only the crawl issued before anything was yielded is capped at
+            # the remaining limit; later crawls run uncapped.
+            # NOTE(review): presumably because their records may duplicate
+            # already-yielded origins - confirm.
+            root_limit = remaining if not yielded_origins else None
+            records = self._crawl_root(
+                canonical_root,
+                cache_force_refresh=cache_force_refresh,
+                depth=resolved_scope.depth,
+                include_patterns=resolved_scope.include_patterns,
+                exclude_patterns=resolved_scope.exclude_patterns,
+                include_external_links=resolved_scope.include_external_links,
+                include_subdomains=resolved_scope.include_subdomains,
+                limit=root_limit,
+            )
+            for record in records:
+                origin = record["url"]
+                if origin in yielded_origins or not types_allowed:
+                    continue
+                yielded_origins.add(origin)
+                yield origin
+                yielded += 1
+                if resolved_scope.limit is not None and yielded >= resolved_scope.limit:
+                    return
+
+    def fetch_raw(
+        self,
+        origin: str,
+        *,
+        cache_force_refresh: bool = False,
+    ) -> FetchedSource:
+        """Return the crawl record for a single URL as a ``FetchedSource``.
+
+        Unless ``cache_force_refresh`` is set, a fresh in-memory record is
+        preferred, then a record loaded via ``_load_record_cache_entry``.
+        Failing both, a depth-0, limit-1 crawl of the URL is issued.
+
+        Raises ``ValueError`` when that crawl returns no usable record and
+        ``AssertionError`` when the URL cannot be canonicalised.
+        """
+        canonical_origin = _canonicalize_web_url(origin)
+        assert canonical_origin is not None
+
+        entry = None
+        if not cache_force_refresh:
+            entry = self._records.get(canonical_origin)
+            # A stale in-memory record is treated like a miss.
+            if entry is not None and not self._cloudflare_cache_is_fresh(
+                entry.fetched_at
+            ):
+                entry = None
+            if entry is None:
+                entry = self._load_record_cache_entry(canonical_origin)
+                if entry is not None:
+                    self._records[canonical_origin] = entry
+
+        if entry is None:
+            crawled = self._crawl_root(
+                canonical_origin,
+                cache_force_refresh=cache_force_refresh,
+                depth=0,
+                limit=1,
+                apply_patterns=False,
+                include_external_links=False,
+                include_subdomains=False,
+            )
+            match = None
+            for candidate in crawled:
+                if candidate["url"] == canonical_origin:
+                    match = candidate
+                    break
+            else:
+                # No exact URL match: accept a lone record as the answer.
+                if len(crawled) == 1:
+                    match = crawled[0]
+            if match is None:
+                raise ValueError(f"Cloudflare crawl did not return record for {origin}")
+            entry = self._records.get(match["url"])
+            assert entry is not None
+            # Alias the record under the requested URL as well.
+            self._records[canonical_origin] = entry
+
+        assert entry is not None
+        return self._source_from_record_entry(canonical_origin, entry)
+
+    def _fetch_raw_after_origin_discovery(self, origin: str) -> FetchedSource:
+        """Return the already-crawled record for ``origin`` without crawling.
+
+        The in-memory records are consulted first, then
+        ``_load_record_cache_entry``; a record missing from both fails an
+        assertion.
+        """
+        url = _canonicalize_web_url(origin)
+        assert url is not None
+        entry = self._records.get(url)
+        if entry is not None:
+            return self._source_from_record_entry(url, entry)
+        entry = self._load_record_cache_entry(url)
+        assert entry is not None
+        self._records[url] = entry
+        return self._source_from_record_entry(url, entry)
+
+    def _source_from_record_entry(
+        self,
+        canonical_origin: str,
+        record_entry: _CloudflareRecordCacheEntry,
+    ) -> FetchedSource:
+        """Persist a record entry and wrap it in a ``FetchedSource``.
+
+        The entry is written through ``_store_record_cache_entry``; the path
+        it returns serves as both ``body_path`` and ``markdown_path``.
+        """
+        stored_path, _ = self._store_record_cache_entry(
+            canonical_origin,
+            record=record_entry.record,
+            fetched_at=record_entry.fetched_at,
+        )
+        assert stored_path is not None
+        payload = record_entry.record
+        page_meta = payload.get("metadata", {})
+        extra = {
+            "crawler_status": payload.get("status"),
+            "title": page_meta.get("title"),
+            "type_label": "markdown",
+        }
+        return FetchedSource(
+            origin=canonical_origin,
+            resolved_origin=page_meta.get("url", canonical_origin),
+            content_type="text/markdown",
+            status_code=page_meta.get("status"),
+            metadata=extra,
+            fetched_at=record_entry.fetched_at,
+            body_path=stored_path,
+            markdown_path=stored_path,
+        )
+
+    def _default_convert(self, source: FetchedSource) -> MarkdownDocument:
+        """Read the stored body as UTF-8 and return it unchanged as markdown."""
+        return MarkdownDocument(
+            origin=source.origin,
+            content=source.body_path.read_text(encoding="utf-8"),
+        )
+
+    def _crawl_root(
+        self,
+        root: str,
+        *,
+        cache_force_refresh: bool,
+        depth: int | None = None,
+        include_patterns: Sequence[str] | None = None,
+        exclude_patterns: Sequence[str] | None = None,
+        include_external_links: bool,
+        include_subdomains: bool,
+        limit: int | None = None,
+        apply_patterns: bool = True,
+    ) -> list[dict[str, Any]]:
+        resolved_depth = _DEFAULT_CRAWL_DEPTH if depth is None else depth
+        resolved_include_patterns = list(include_patterns or [])
+        resolved_exclude_patterns = list(exclude_patterns or [])
+        resolved_limit = limit
+        cache_key = (
+            root,
+            resolved_depth,
+            resolved_limit,
+            apply_patterns,
+            tuple(resolved_include_patterns),
+            tuple(resolved_exclude_patterns),
+            include_external_links,
+            include_subdomains,
+        )
+        cached_entry = self._roots.get(cache_key)
+        if (
+            not cache_force_refresh
+            and cached_entry is not None
+            and self._cloudflare_cache_is_fresh(cached_entry.fetched_at)
+        ):
+            return cached_entry.records
+        if not cache_force_refresh and apply_patterns:
+            cached_entry = self._load_root_cache_entry(cache_key)
+            if cached_entry is not None:
+                self._roots[cache_key] = cached_entry
+                return cached_entry.records
+
+        endpoint = f"{self.base_url}/accounts/{self.account_id}/browser-rendering/crawl"
+        payload = self._crawl_payload(
+            root,
+            depth=resolved_depth,
+            limit=resolved_limit,
+            include_patterns=resolved_include_patterns,
+            exclude_patterns=resolved_exclude_patterns,
+            include_external_links=include_external_links,
+            include_subdomains=include_subdomains,
+            cache_force_refresh=cache_force_refresh,
+            apply_patterns=apply_patterns,
+        )
+        headers = {
+            "Authorization": f"Bearer {self.api_token}",
+            "Content-Type": "application/json",
+        }
+        response = self.session.post(
+            endpoint,
+            json=payload,
+            headers=headers,
+            timeout=30.0,
+        )
+        response.raise_for_status()
+        response_payload = response.json()
+        job_id = response_payload["result"]
+
+        result: dict[str, Any] | None = None
+        for _ in range(self.max_poll_attempts):
+            poll_response = self.session.get(
+                f"{endpoint}/{job_id}",
+                headers={"Authorization": f"Bearer {self.api_token}"},
+                params={"limit": 1},
+                timeout=30.0,
+            )
+            poll_response.raise_for_status()
+            result = poll_response.json()["result"]
+            assert result is not None
+            status = result["status"]
+            if status == "running":
+                if self.poll_interval > 0:
+                    time.sleep(self.poll_interval)
+                continue
+            if status not in _TERMINAL_CLOUDFLARE_STATUSES:
+                raise ValueError(f"Unexpected Cloudflare crawl status: {status}")
+            if status != "completed":
+                raise ValueError(f"Cloudflare crawl ended with status '{status}'")
+            break
+        else:
+            raise TimeoutError("Cloudflare crawl did not complete within the timeout")
+
+        assert result is not None
+        full_response = self.session.get(
+            f"{endpoint}/{job_id}",
+            headers={"Authorization": f"Bearer {self.api_token}"},
+            params=None,
+            timeout=30.0,
+        )
+        full_response.raise_for_status()
+        result = full_response.json()["result"]
+        assert result is not None
+
+        records = list(result.get("records") or [])
+        cursor = result.get("cursor")
+        while cursor is not None:
+            page_response = self.session.get(
+                f"{endpoint}/{job_id}",
+                headers={"Authorization": f"Bearer {self.api_token}"},
+                params={"cursor": cursor, "status": "completed"},
+                timeout=30.0,
+            )
+            page_response.raise_for_status()
+            page_result = page_response.json()["result"]
+            records.extend(page_result.get("records") or [])
+            cursor = page_result.get("cursor")
+
+        scope_origin = _web_origin_key(root)
+        root_host = urlparse(root).hostname or ""
+        completed_records = []
+        for record in records:
+            if record.get("status") != "completed":
+                continue
+            canonical_url = _canonicalize_web_url(record["url"])
+            if canonical_url is None:
+                continue
+            if (
+                apply_patterns
+                and not _allow_web_origin(
+                    canonical_url,
+                    scope_origin,
+                    root_host,
+                    include_external_links=include_external_links,
+                    include_subdomains=include_subdomains,
+                )
+                and not _is_cloudflare_seed_redirect_target(root, canonical_url)
+            ):
+                continue
+            if canonical_url != record["url"]:
+                record = dict(record)
+                record["url"] = canonical_url
+            completed_records.append(record)
+        if apply_patterns:
+            completed_records = [
+                record
+                for record in completed_records
+                if _matches_cloudflare_patterns(
+                    record["url"],
+                    include_patterns=resolved_include_patterns,
+                    exclude_patterns=resolved_exclude_patterns,
+                )
+            ]
+        fetched_at = _utcnow()
+        self._roots[cache_key] = _CloudflareRootCacheEntry(
+            fetched_at=fetched_at,
+            records=completed_records,
+        )
+        if apply_patterns:
+            self._store_root_cache_entry(
+                cache_key,
+                records=completed_records,
+                fetched_at=fetched_at,
+            )
+        for record in completed_records:
+            self._records[record["url"]] = _CloudflareRecordCacheEntry(
+                fetched_at=fetched_at,
+                record=record,
+            )
+            self._store_record_cache_entry(
+                record["url"],
+                record=record,
+                fetched_at=fetched_at,
+            )
+        return completed_records
+
+    def _crawl_payload(
+        self,
+        root: str,
+        *,
+        depth: int,
+        limit: int | None,
+        include_patterns: Sequence[str],
+        exclude_patterns: Sequence[str],
+        include_external_links: bool,
+        include_subdomains: bool,
+        cache_force_refresh: bool,
+        apply_patterns: bool,
+    ) -> dict[str, Any]:
+        """Build the JSON body posted to the Cloudflare crawl endpoint.
+
+        Include/exclude patterns are forwarded only when ``apply_patterns`` is
+        set, and a forced refresh sends a ``maxAge`` of 0.
+        """
+        options: dict[str, Any] = {
+            "includeExternalLinks": include_external_links,
+            "includeSubdomains": include_subdomains,
+        }
+        if apply_patterns and include_patterns:
+            options["includePatterns"] = list(include_patterns)
+        if apply_patterns and exclude_patterns:
+            options["excludePatterns"] = list(exclude_patterns)
+        body: dict[str, Any] = {"url": root, "depth": depth, "formats": ["markdown"]}
+        body.update(render=self.render, source=self.source, options=options)
+        if limit is not None:
+            body["limit"] = limit
+        if self.modified_since is not None:
+            body["modifiedSince"] = self.modified_since
+        if cache_force_refresh:
+            body["maxAge"] = 0
+        elif self.cache_stale_after is not None:
+            body["maxAge"] = int(self.cache_stale_after.total_seconds())
+        return body
+
+    def _record_cache_signature(self) -> dict[str, Any]:
+        """Return the settings fingerprint stored alongside cached entries.
+
+        It is written into cache metadata and embedded in root cache keys, and
+        the cache loaders discard entries whose stored signature differs.
+        """
+        names = ("account_id", "base_url", "render", "source", "modified_since")
+        return {name: getattr(self, name) for name in names}
+
+    def _root_cache_key(self, cache_key: tuple[Any, ...]) -> str:
+        """Return the persistent-cache key for one root crawl."""
+        # Compact, key-sorted JSON makes equal arguments and settings produce the
+        # same key text.
+        key_data = {"cache_key": cache_key, "signature": self._record_cache_signature()}
+        encoded = json.dumps(key_data, sort_keys=True, separators=(",", ":"))
+        return "cloudflare-root:" + encoded
+
+    def _load_root_cache_entry(
+        self,
+        cache_key: tuple[Any, ...],
+    ) -> _CloudflareRootCacheEntry | None:
+        """Load a fresh, signature-matching root entry from ``self._cache``.
+
+        Returns ``None`` on a miss, a settings mismatch, or a stale entry. On a
+        hit, each record is also placed in ``self._records``.
+        """
+        stored = self._cache.fetch(self._root_cache_key(cache_key))
+        if stored is None:
+            return None
+        _, meta = stored
+        # A missing payload or one written under other settings is a miss.
+        if meta is None or meta.get("signature") != self._record_cache_signature():
+            return None
+        fetched_at = _parse_datetime(meta.get("fetched_at"))
+        if fetched_at is None or not self._cloudflare_cache_is_fresh(fetched_at):
+            return None
+        records = meta["records"]
+        # Seed the per-URL record cache from the cached root listing.
+        for record in records:
+            entry = _CloudflareRecordCacheEntry(fetched_at=fetched_at, record=record)
+            self._records[record["url"]] = entry
+        return _CloudflareRootCacheEntry(fetched_at=fetched_at, records=records)
+
+    def _store_root_cache_entry(
+        self,
+        cache_key: tuple[Any, ...],
+        *,
+        records: list[dict[str, Any]],
+        fetched_at: datetime,
+    ) -> None:
+        """Store a root crawl's records as cache metadata, with no content body."""
+        storage_key = self._root_cache_key(cache_key)
+        metadata = {
+            "fetched_at": fetched_at.isoformat(),
+            "records": records,
+            "signature": self._record_cache_signature(),
+        }
+        self._cache.upsert(
+            storage_key, content=None, metadata=metadata, content_ext=None
+        )
+
+    def _load_record_cache_entry(
+        self,
+        origin: str,
+    ) -> _CloudflareRecordCacheEntry | None:
+        """Load a fresh, signature-matching record for ``origin`` from the cache.
+
+        Returns ``None`` on a miss, a settings mismatch, or a stale entry.
+        """
+        stored = self._cache.fetch(origin)
+        if stored is None:
+            return None
+        _, meta = stored
+        # A missing payload or one written under other settings is a miss.
+        if meta is None or meta.get("signature") != self._record_cache_signature():
+            return None
+        fetched_at = _parse_datetime(meta.get("fetched_at"))
+        if fetched_at is None or not self._cloudflare_cache_is_fresh(fetched_at):
+            return None
+        payload = meta["record"]
+        return _CloudflareRecordCacheEntry(fetched_at=fetched_at, record=payload)
+
+    def _store_record_cache_entry(
+        self,
+        origin: str,
+        *,
+        record: dict[str, Any],
+        fetched_at: datetime,
+    ) -> CacheValue:
+        """Write one record's markdown and metadata to the cache; return the entry."""
+        markdown = record["markdown"]
+        metadata = {
+            "origin": origin,
+            "fetched_at": fetched_at.isoformat(),
+            "record": record,
+            "signature": self._record_cache_signature(),
+        }
+        stored = self._cache.upsert(
+            origin, content=markdown, metadata=metadata, content_ext=".md"
+        )
+        assert stored is not None
+        return stored
+
+    def _cloudflare_cache_is_fresh(self, fetched_at: datetime) -> bool:
+        """Return True when no staleness limit is set or the entry is within it."""
+        max_age = self.cache_stale_after
+        return max_age is None or _utcnow() - fetched_at <= max_age
+
+
+def _coerce_roots(roots: RootsInput) -> list[RootInput]:
+    """Return ``roots`` as a list, wrapping a single string or path."""
+    is_single = isinstance(roots, (str, Path))
+    return [roots] if is_single else list(roots)
+
+
+def _resolve_crawl_scope(scope: CrawlScope) -> _ResolvedCrawlScope:
+    return _ResolvedCrawlScope(  # normalised copy of the user-facing scope
+        roots=_coerce_roots(scope.roots),  # always a list
+        include_patterns=_coerce_string_sequence(scope.include_patterns),
+        exclude_patterns=_coerce_string_sequence(scope.exclude_patterns),
+        depth=_DEFAULT_CRAWL_DEPTH if scope.depth is None else scope.depth,
+        limit=scope.limit,  # passed through; None is kept as-is
+        include_types=_normalize_types(scope.include_types),  # lower-cased set
+        exclude_types=_normalize_types(scope.exclude_types),
+        include_external_links=scope.include_external_links,
+        include_subdomains=scope.include_subdomains,
+    )
+
+
+def _coerce_string_sequence(values: Sequence[str] | str | None) -> list[str]:
+    """Return ``values`` as a new list; ``None`` gives an empty list."""
+    if values is None:
+        return []
+    # A bare string is one item, not an iterable of characters.
+    return [values] if isinstance(values, str) else list(values)
+
+
+def _canonicalize_web_url(target: str, *, base: str | None = None) -> str | None:
+    """Return the canonical http(s) form of ``target``, or ``None`` if unusable.
+
+    Resolves against ``base``, drops the fragment, normalises the netloc via
+    ``_canonical_netloc`` and strips a bare ``/`` path. Malformed URLs (bad
+    IPv6 literal, bad port, missing host) yield ``None`` instead of raising.
+    """
+    try:
+        url, _ = urldefrag(urljoin(base, target) if base else target)
+        parsed = urlparse(url)
+        parsed.port  # the property raises ValueError for an invalid port
+    except ValueError:
+        return None
+    scheme = parsed.scheme.lower()
+    if scheme != parsed.scheme:
+        parsed = parsed._replace(scheme=scheme)
+    if parsed.scheme not in {"http", "https"} or not parsed.hostname:
+        return None
+    netloc = _canonical_netloc(parsed)
+    if netloc != parsed.netloc:
+        parsed = parsed._replace(netloc=netloc)
+    if parsed.path == "/" and not parsed.params:
+        parsed = parsed._replace(path="")
+    return urlunparse(parsed)
+
+
+def _canonical_netloc(parsed: Any) -> str:
+    """Rebuild the netloc from ``parsed.hostname`` without a default port.
+
+    Userinfo is kept verbatim, a host containing ``:`` is bracketed, and port
+    80 (http) or 443 (https) is omitted.
+    """
+    prefix = "".join(parsed.netloc.rpartition("@")[:2])
+    hostname = parsed.hostname or ""
+    if ":" in hostname and not hostname.startswith("["):
+        hostname = f"[{hostname}]"
+    port = parsed.port
+    default_ports = {("http", 80), ("https", 443)}
+    if port is None or (parsed.scheme, port) in default_ports:
+        return prefix + hostname
+    return f"{prefix}{hostname}:{port}"
+
+
+def _web_origin_key(origin: str) -> WebOriginKey:
+    """Return ``(scheme, host, port)`` for ``origin`` with default ports filled in."""
+    parts = urlparse(origin)
+    scheme = parts.scheme.lower()
+    port = parts.port
+    if port is None:
+        # Only http and https have an implied port; other schemes keep None.
+        port = {"http": 80, "https": 443}.get(scheme)
+    return scheme, parts.hostname or "", port
+
+
+def _allow_web_origin(
+    origin: str,
+    scope_origin: WebOriginKey,
+    root_host: str,
+    *,
+    include_external_links: bool,
+    include_subdomains: bool,
+) -> bool:
+    """Decide whether ``origin`` is within crawl scope.
+
+    The scope origin itself always passes, and ``include_external_links``
+    admits any URL with a host. Otherwise only subdomains of ``root_host`` on
+    the scope's scheme and port pass, and only with ``include_subdomains``.
+    """
+    host = urlparse(origin).hostname or ""
+    if not host:
+        return False
+    candidate = _web_origin_key(origin)
+    if candidate == scope_origin or include_external_links:
+        return True
+    if not include_subdomains:
+        return False
+    same_service = (candidate[0], candidate[2]) == (scope_origin[0], scope_origin[2])
+    return same_service and host.endswith("." + root_host)
+
+
+def _is_cloudflare_seed_redirect_target(root: str, target: str) -> bool:
+    """Return True when ``root`` and ``target`` share a host, ignoring ``www.``.
+
+    Both must be http(s) URLs without an explicit port; schemes may differ.
+    """
+    seed, landing = urlparse(root), urlparse(target)
+    if any(url.scheme not in {"http", "https"} for url in (seed, landing)):
+        return False
+    if any(url.port is not None for url in (seed, landing)):
+        return False
+    seed_host = _redirect_host_key(seed.hostname or "")
+    landing_host = _redirect_host_key(landing.hostname or "")
+    return seed_host != "" and seed_host == landing_host
+
+
+def _redirect_host_key(host: str) -> str:
+    """Return ``host`` lower-cased with one leading ``www.`` label removed."""
+    lowered = host.lower()
+    prefix = "www."
+    return lowered[len(prefix) :] if lowered.startswith(prefix) else lowered
+
+
+def _resolve_cache_dir(
+    cache_dir: bool | str | Path | None,
+    *,
+    backend_name: str,
+    default_factory: Callable[[], Path | None],
+) -> Path | None:
+    if cache_dir is None:  # unset: defer to the backend's own default
+        return default_factory()
+    if cache_dir is True:  # opt in to the per-backend directory under the cwd
+        return Path.cwd().joinpath(".raghilda", "cache", backend_name)
+    if cache_dir is False:
+        raise TypeError("cache_dir must be None, True, or a filesystem path")
+    return Path(cache_dir).resolve()
+
+
+def _to_directory_path(root: str | Path) -> Path:
+    """Turn a directory root (path, string or ``file:`` URI) into a ``Path``."""
+    if isinstance(root, Path):
+        return root
+    text = str(root)
+    # Check for a drive prefix such as ``C:/`` before URL parsing sees the colon.
+    scheme = "" if re.match(r"^[A-Za-z]:(?:[\\/]|$)", text) else urlparse(text).scheme
+    if scheme == "file":
+        return _path_from_file_uri(text)
+    assert scheme in {"", "file"}
+    return Path(text)
+
+
+def _iter_directory_files(root: Path, *, max_depth: int) -> Iterator[Path]:
+    """Yield the files under ``root``, descending at most ``max_depth`` levels."""
+    real_root = root.resolve()
+    walker = _iter_directory_files_from(
+        root, root=root, resolved_root=real_root, max_depth=max_depth
+    )
+    yield from walker
+
+
+def _iter_directory_files_from(
+    directory: Path,
+    *,
+    root: Path,
+    resolved_root: Path,
+    max_depth: int,
+) -> Iterator[Path]:
+    """Yield files below ``directory`` in sorted order, recursing depth-limited.
+
+    Entries that resolve outside ``resolved_root`` and symlinked directories
+    are skipped.
+    """
+    for entry in sorted(directory.iterdir()):
+        # Ignore anything whose real location is outside the crawl root.
+        if not entry.resolve().is_relative_to(resolved_root):
+            continue
+        if entry.is_file():
+            yield entry
+        # Symlinked directories are never descended into.
+        elif not entry.is_symlink() and entry.is_dir():
+            depth = len(entry.relative_to(root).parts) - 1
+            if depth < max_depth:
+                yield from _iter_directory_files_from(
+                    entry, root=root, resolved_root=resolved_root, max_depth=max_depth
+                )
+
+
+def _path_from_file_uri(origin: str) -> Path:
+    """Convert a ``file:`` URI to a ``Path``; a non-local host becomes ``//host``."""
+    uri = urlparse(origin)
+    assert uri.scheme == "file"
+    has_remote_host = uri.netloc not in ("", "localhost")
+    location = f"//{uri.netloc}{uri.path}" if has_remote_host else uri.path
+    return Path(url2pathname(location))
+
+
+def _path_from_file_origin(origin: str) -> Path:
+    """Return the path for ``origin``, decoding it first when it is a ``file:`` URI."""
+    if urlparse(origin).scheme != "file":
+        return Path(origin)
+    return _path_from_file_uri(origin)
+
+
+def _normalize_types(types: Sequence[str] | None) -> set[str]:
+    """Return the stripped, lower-cased set of type labels (``None`` -> empty)."""
+    if types is None:
+        return set()
+    labels = (types,) if isinstance(types, str) else types
+    return {label.strip().lower() for label in labels}
+
+
+def _matches_patterns(
+    origin: str,
+    *,
+    include_patterns: Sequence[str],
+    exclude_patterns: Sequence[str],
+) -> bool:
+    """Return True unless excluded; with include patterns, one must also match."""
+    if _matches_exclude_patterns(origin, exclude_patterns=exclude_patterns):
+        return False
+    included = (re.search(regex, origin) is not None for regex in include_patterns)
+    return not include_patterns or any(included)
+
+
+def _matches_exclude_patterns(  # regex search anywhere in origin, not a full match
+    origin: str,
+    *,
+    exclude_patterns: Sequence[str],
+) -> bool:
+    return any(re.search(regex, origin) is not None for regex in exclude_patterns)
+
+
+def _matches_cloudflare_patterns(
+    origin: str,
+    *,
+    include_patterns: Sequence[str],
+    exclude_patterns: Sequence[str],
+) -> bool:
+    """Apply wildcard filters: any exclude match rejects, then includes (if any)."""
+    if any(_wildcard_matches(origin, glob) for glob in exclude_patterns):
+        return False
+    if include_patterns:
+        return any(_wildcard_matches(origin, glob) for glob in include_patterns)
+    return True
+
+
+def _wildcard_matches(origin: str, pattern: str) -> bool:
+    """Return True if wildcard ``pattern`` (``*``, ``**``) matches all of ``origin``."""
+    marker = "\0"  # protects "**" from the single-"*" substitution
+    rules = [(r"/\*\*", "(?:/.*)?"), (r"\*\*", marker), (r"\*", "[^/]*")]
+    regex = re.escape(pattern)
+    for escaped_glob, replacement in [*rules, (marker, ".*")]:
+        regex = regex.replace(escaped_glob, replacement)
+    return re.fullmatch(regex, origin) is not None
+
+
+def _matches_types(
+    label: str | None,
+    *,
+    include_types: set[str],
+    exclude_types: set[str],
+) -> bool:
+    """Return whether ``label`` passes the type filters; excludes take priority."""
+    if label is None:
+        # An unknown type cannot be excluded, and passes only without includes.
+        return not include_types
+    kind = label.lower()
+    return kind not in exclude_types and (not include_types or kind in include_types)
+
+
+def _detect_type_label(
+    *,
+    path: Path | None,
+    content_type: str | None,
+) -> str | None:
+    """Return a type label from the suffix alias, content type, or ``_MAGIKA`` probe."""
+    alias = None if path is None else _TYPE_ALIASES.get(path.suffix.lower())
+    if alias is not None:
+        return alias
+    mime = _normalize_content_type(content_type)
+    if mime in _CONTENT_TYPE_LABELS:
+        return _CONTENT_TYPE_LABELS[mime]
+    if path is None or not path.exists() or _MAGIKA is None:
+        return None
+    detected = _MAGIKA.identify_path(path).output.label
+    return _MAGIKA_LABELS.get(detected, detected)
+
+
+def _normalize_content_type(content_type: str | None) -> str | None:
+    """Return the bare, lower-cased media type; ``None`` passes through."""
+    media_type = None if content_type is None else content_type.partition(";")[0]
+    return None if media_type is None else media_type.strip().lower()
+
+
+def _sha256_path(path: Path) -> str:
+    """Return the hex SHA-256 digest of the file at ``path``."""
+    hasher = hashlib.sha256()
+    with path.open("rb") as stream:
+        # Hash in 8 KiB blocks so a large file is never loaded whole;
+        # an empty read marks end of file.
+        while block := stream.read(8192):
+            hasher.update(block)
+    return hasher.hexdigest()
+
+
+def _read_text(path: Path) -> str:  # UTF-8 decode; undecodable bytes are dropped
+    return path.read_text(encoding="utf-8", errors="ignore")
+
+
+def _known_body_suffix(origin: str, *, content_type: str | None) -> str | None:
+    """Pick a file suffix for a fetched body, or ``None`` when nothing is known.
+
+    A recognised content type wins; otherwise the URL path's suffix is used.
+    """
+    suffix_by_media_type = {
+        "text/html": ".html",
+        "text/markdown": ".md",
+        "text/plain": ".txt",
+        "application/xml": ".xml",
+        "text/xml": ".xml",
+        "text/x-python": ".py",
+        "application/json": ".json",
+        "application/pdf": ".pdf",
+    }
+    media_type = _normalize_content_type(content_type)
+    if media_type in suffix_by_media_type:
+        return suffix_by_media_type[media_type]
+    # Fall back to whatever extension the URL path carries.
+    url_suffix = Path(urlparse(origin).path).suffix
+    return url_suffix or None
+
+
+def _body_suffix(origin: str, *, content_type: str | None) -> str:
+    """Return the known body suffix, defaulting to ``.bin`` when there is none."""
+    known = _known_body_suffix(origin, content_type=content_type)
+    fallback = ".bin"
+    return fallback if known is None else known
+
+
+def _type_hint_path(origin: str, *, content_type: str | None) -> Path:
+    """Return the relative path ``source<suffix>`` for the body's suffix."""
+    return Path("source").with_suffix(_body_suffix(origin, content_type=content_type))
+
+
+def _utcnow() -> datetime:  # timezone-aware current time in UTC
+    return datetime.now(timezone.utc)
+
+
+def _parse_datetime(value: str | None) -> datetime | None:
+    """Parse an ISO-format timestamp; ``None`` is passed through unchanged."""
+    has_value = value is not None
+    return datetime.fromisoformat(value) if has_value else None
diff --git a/src/raghilda/scrape.py b/src/raghilda/scrape.py
index 15a81cc..bf07c37 100644
--- a/src/raghilda/scrape.py
+++ b/src/raghilda/scrape.py
@@ -38,7 +38,7 @@ def _extract_links(txt: str) -> set[str]:
         root = ET.fromstring(txt)
         for loc in root.findall(".//{*}url/{*}loc"):
             if loc is not None and loc.text:
-                links.update(loc.text.strip())
+                links.add(loc.text.strip())
     except Exception:
         pass
 
diff --git a/src/raghilda/store.py b/src/raghilda/store.py
index d790399..8847339 100644
--- a/src/raghilda/store.py
+++ b/src/raghilda/store.py
@@ -1,4 +1,4 @@
-from ._store import BaseStore, WriteResult
+from ._store import BaseStore, IngestSummary, WriteResult
 from ._duckdb_store import DuckDBStore
 from ._openai_store import OpenAIStore
 from ._chroma_store import ChromaDBStore
@@ -6,6 +6,7 @@
 __all__ = [
     "BaseStore",
     "WriteResult",
+    "IngestSummary",
     "DuckDBStore",
     "OpenAIStore",
     "ChromaDBStore",
diff --git a/tests/test_api_contract.py b/tests/test_api_contract.py
index 79d4095..df1b53c 100644
--- a/tests/test_api_contract.py
+++ b/tests/test_api_contract.py
@@ -1,11 +1,27 @@
+import inspect
 from types import SimpleNamespace
 
 import pytest
 
+import raghilda.crawl as crawl_module
 from raghilda.chunk import MarkdownChunk
+from raghilda.crawl import (
+    BaseCrawler,
+    CrawlScope,
+    CloudflareCrawler,
+    DirectoryCrawler,
+    FetchedSource,
+    WebCrawler,
+)
 from raghilda.document import Document, MarkdownDocument
 import raghilda.store as store_module
-from raghilda.store import ChromaDBStore, DuckDBStore, OpenAIStore, WriteResult
+from raghilda.store import (
+    ChromaDBStore,
+    DuckDBStore,
+    IngestSummary,
+    OpenAIStore,
+    WriteResult,
+)
 
 
 def test_document_uses_origin_field_not_id():
@@ -15,23 +31,81 @@ def test_document_uses_origin_field_not_id():
     assert not hasattr(doc, "id")
 
 
-def test_store_api_uses_upsert_not_insert():
+def test_store_api_uses_upsert_and_ingest_not_insert():
     assert hasattr(DuckDBStore, "upsert")
     assert hasattr(ChromaDBStore, "upsert")
     assert hasattr(OpenAIStore, "upsert")
-    assert not hasattr(DuckDBStore, "ingest")
-    assert not hasattr(ChromaDBStore, "ingest")
-    assert not hasattr(OpenAIStore, "ingest")
+    assert hasattr(DuckDBStore, "ingest")
+    assert hasattr(ChromaDBStore, "ingest")
+    assert hasattr(OpenAIStore, "ingest")
     assert not hasattr(DuckDBStore, "insert")
     assert not hasattr(ChromaDBStore, "insert")
     assert not hasattr(OpenAIStore, "insert")
 
 
-def test_store_exports_write_result_not_insert_result():
+def test_store_exports_write_and_ingest_results_not_insert_result():
     assert WriteResult is store_module.WriteResult
+    assert IngestSummary is store_module.IngestSummary
     assert not hasattr(store_module, "InsertResult")
 
 
+def test_store_exports_postgres_store_when_dependency_is_installed():
+    pytest.importorskip("psycopg2")
+    # Reached only when psycopg2 imports; otherwise the test is skipped.
+    assert hasattr(store_module, "PostgreSQLStore")
+    assert "PostgreSQLStore" in store_module.__all__
+
+
+def test_crawl_exports_public_crawler_types():
+    assert crawl_module.BaseCrawler is BaseCrawler  # module attribute and import agree
+    assert crawl_module.CrawlScope is CrawlScope
+    assert crawl_module.DirectoryCrawler is DirectoryCrawler
+    assert crawl_module.WebCrawler is WebCrawler
+    assert crawl_module.CloudflareCrawler is CloudflareCrawler
+    assert crawl_module.FetchedSource is FetchedSource
+
+
+def test_crawl_scope_owns_traversal_policy() -> None:
+    assert tuple(inspect.signature(CrawlScope).parameters) == (  # exact names, in order
+        "roots",
+        "include_patterns",
+        "exclude_patterns",
+        "depth",
+        "limit",
+        "include_types",
+        "exclude_types",
+        "include_external_links",
+        "include_subdomains",
+    )
+
+
+def test_crawler_constructors_keep_backend_and_cache_configuration_only() -> None:
+    assert tuple(inspect.signature(DirectoryCrawler).parameters) == (  # exact order
+        "cache_dir",
+        "max_workers",
+    )
+    assert tuple(inspect.signature(WebCrawler).parameters) == (
+        "session",
+        "cache_dir",
+        "cache_stale_after",
+        "max_workers",
+    )
+    assert tuple(inspect.signature(CloudflareCrawler).parameters) == (
+        "account_id",
+        "api_token",
+        "cache_dir",
+        "session",
+        "source",
+        "render",
+        "cache_stale_after",
+        "modified_since",
+        "poll_interval",
+        "max_poll_attempts",
+        "max_workers",
+        "base_url",
+    )
+
+
 def test_openai_upsert_rejects_chunked_document():
     class _SinglePage:
         def __init__(self):
diff --git a/tests/test_crawl.py b/tests/test_crawl.py
new file mode 100644
index 0000000..15317a8
--- /dev/null
+++ b/tests/test_crawl.py
@@ -0,0 +1,3129 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from datetime import datetime, timedelta, timezone
+import fnmatch
+import hashlib
+import http.server
+import json
+import os
+from pathlib import Path
+import re
+import socketserver
+import threading
+from typing import Any
+import unicodedata
+
+import pytest
+import raghilda.crawl as crawl_module
+from raghilda.crawl import (
+    CrawlScope,
+    CloudflareCrawler,
+    DirectoryCrawler,
+    FetchedSource,
+    WebCrawler,
+)
+from raghilda.document import MarkdownDocument
+
+_WINDOWS_RESERVED = {  # upper-case names checked by _expected_cache_base
+    "CON",
+    "PRN",
+    "AUX",
+    "NUL",
+    "COM1",
+    "COM2",
+    "COM3",
+    "COM4",
+    "COM5",
+    "COM6",
+    "COM7",
+    "COM8",
+    "COM9",
+    "LPT1",
+    "LPT2",
+    "LPT3",
+    "LPT4",
+    "LPT5",
+    "LPT6",
+    "LPT7",
+    "LPT8",
+    "LPT9",
+}
+
+
+def _write(tmp_path: Path, relative: str, contents: str) -> Path:
+    path = tmp_path / relative
+    path.parent.mkdir(parents=True, exist_ok=True)  # create missing directories
+    path.write_text(contents, encoding="utf-8")
+    return path  # handed back so tests can derive the file's URI
+
+
+def _expected_cache_base(key: str) -> str:
+    value = unicodedata.normalize("NFC", key)  # test-side oracle for cache file names
+    value = value.replace("://", "__")
+    value = value.replace("\\", "_")
+    value = value.replace("/", "_")
+    value = re.sub(r'[\x00-\x1f<>:"|?*]+', "_", value)
+    value = re.sub(r"\s+", "_", value)
+    value = re.sub(r"[^A-Za-z0-9._-]+", "_", value)
+    value = value.strip(" ._")
+    # An empty result falls back to a fixed placeholder name.
+    if not value:
+        value = "entry"
+    # Prefix names whose first dot-separated part is in _WINDOWS_RESERVED.
+    root = value.split(".", 1)[0].rstrip(" .").upper()
+    if root in _WINDOWS_RESERVED:
+        value = f"_{value}"
+    # Over-long names keep their head and tail, joined by "..".
+    if len(value) > 180:
+        head = 180 // 2 - 2
+        tail = 180 - head - 2
+        value = f"{value[:head]}..{value[-tail:]}"
+    # A 12-hex-digit SHA-256 prefix of the original key is appended.
+    value = value.rstrip(" .")
+    stem = value or "entry"
+    digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:12]
+    return f"{stem}--{digest}"
+
+
+def test_directory_crawler_discovers_and_converts_markdown_documents(
+    tmp_path: Path,
+) -> None:
+    markdown = _write(tmp_path, "docs/readme.md", "# Hello\n\nDirectory crawler")
+    _write(tmp_path, "docs/skip.py", "print('skip')")
+    notebook = _write(
+        tmp_path,
+        "docs/notebook.ipynb",
+        json.dumps(
+            {
+                "cells": [],
+                "metadata": {},
+                "nbformat": 4,
+                "nbformat_minor": 5,
+            }
+        ),
+    )
+    # Scope: regex patterns plus a type allow-list.
+    crawler = DirectoryCrawler()
+    scope = CrawlScope(
+        roots=[tmp_path],
+        depth=3,
+        include_patterns=[r".*/docs/.*"],
+        exclude_patterns=[r".*/skip\.py$"],
+        include_types=["markdown", "jupyter-notebook"],
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+    # Origins are reported as file URIs of the resolved paths.
+    assert markdown.resolve().as_uri() in origins
+    assert notebook.resolve().as_uri() in origins
+    assert all(not origin.endswith("skip.py") for origin in origins)
+    # A raw fetch points at the file on disk and carries no HTTP status.
+    source = crawler.fetch_raw(markdown.resolve().as_uri())
+    assert isinstance(source, FetchedSource)
+    assert source.origin == markdown.resolve().as_uri()
+    assert source.body_path == markdown.resolve()
+    assert source.status_code is None
+    # Markdown sources come back with their content unchanged.
+    fetched_markdown = crawler.fetch_markdown(markdown.resolve().as_uri())
+    assert fetched_markdown == MarkdownDocument(
+        origin=markdown.resolve().as_uri(),
+        content="# Hello\n\nDirectory crawler",
+    )
+
+
+def test_directory_crawler_convert_override_receives_fetched_source(
+    tmp_path: Path,
+) -> None:
+    markdown = _write(tmp_path, "docs/readme.md", "# Hello\n\nDirectory crawler")
+    seen: list[FetchedSource] = []
+
+    crawler = DirectoryCrawler()
+
+    converted = crawler.fetch_markdown(
+        markdown.resolve().as_uri(),
+        convert=lambda source: _record_directory_conversion(source, seen),
+    )
+    # The override saw exactly one source and its result is returned as-is.
+    assert [item.origin for item in seen] == [markdown.resolve().as_uri()]
+    assert converted == MarkdownDocument(
+        origin=markdown.resolve().as_uri(),
+        content="# Converted\n",
+    )
+
+
+def test_directory_crawler_cache_dir_uses_hashed_file_pair(
+    tmp_path: Path,
+) -> None:
+    markdown = _write(tmp_path, "docs/readme.md", "# Hello\n")
+    cache_dir = tmp_path / "cache"
+    crawler = DirectoryCrawler(cache_dir=cache_dir)
+
+    origin = markdown.resolve().as_uri()
+    document = crawler.fetch_markdown(origin)
+    # Expect one content file and one metadata sidecar sharing a base name.
+    base = _expected_cache_base(origin)
+    metadata_path = cache_dir / f"{base}.metadata.json"
+    content_path = cache_dir / f"{base}.md"
+    assert document == MarkdownDocument(origin=origin, content="# Hello\n")
+    assert sorted(path.name for path in cache_dir.iterdir()) == [
+        content_path.name,
+        metadata_path.name,
+    ]
+    assert json.loads(metadata_path.read_text(encoding="utf-8")) == {
+        "content_path": content_path.name,
+        "key": origin,
+        "metadata": {
+            "mtime_ns": markdown.stat().st_mtime_ns,
+            "origin": origin,
+            "source_hash": hashlib.sha256(markdown.read_bytes()).hexdigest(),
+        },
+    }
+
+
+def test_directory_crawler_cache_dir_true_uses_default_backend_directory(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:
+    markdown = _write(tmp_path, "docs/readme.md", "# Hello\n")
+    monkeypatch.chdir(tmp_path)  # the default cache location is cwd-relative
+    crawler = DirectoryCrawler(cache_dir=True)
+
+    origin = markdown.resolve().as_uri()
+    crawler.fetch_markdown(origin)
+    # cache_dir=True is expected to land in .raghilda/cache/directory under cwd.
+    cache_dir = tmp_path / ".raghilda" / "cache" / "directory"
+    base = _expected_cache_base(origin)
+    assert sorted(path.name for path in cache_dir.iterdir()) == [
+        f"{base}.md",
+        f"{base}.metadata.json",
+    ]
+
+
+def _record_directory_conversion(
+    source: FetchedSource, seen: list[FetchedSource]
+) -> MarkdownDocument:
+    seen.append(source)  # record the call so the test can assert on it
+    return MarkdownDocument(origin=source.origin, content="# Converted\n")
+
+
+class _ThreadingHTTPServer(socketserver.ThreadingMixIn, http.server.HTTPServer):
+    daemon_threads = True  # per-request threads are daemonic
+
+
+class _RequestHandler(http.server.BaseHTTPRequestHandler):
+    def do_GET(self) -> None:
+        path = self.path.split("?", 1)[0]  # routes are keyed by path, query ignored
+        routes = self.server.routes  # type: ignore[attr-defined]
+        route = routes[path]  # an unregistered path raises KeyError here
+        self.server.requests.append(  # type: ignore[attr-defined]
+            {"path": path, "headers": dict(self.headers.items())}
+        )
+        if route["etag"] and self.headers.get("If-None-Match") == route["etag"]:
+            self.send_response(304)  # conditional request matched the route's ETag
+            self.send_header("ETag", route["etag"])
+            self.end_headers()
+            return
+        # Otherwise send the full body with the route's content type.
+        body = route["body"].encode("utf-8")
+        self.send_response(200)
+        self.send_header("Content-Type", route["content_type"])
+        self.send_header("Content-Length", str(len(body)))
+        if route["etag"]:
+            self.send_header("ETag", route["etag"])
+        self.end_headers()
+        self.wfile.write(body)
+
+    def log_message(self, format: str, *args: Any) -> None:
+        return  # suppress per-request logging
+
+
+class _FakeWebResponse:  # minimal response double returned by the fake sessions
+    def __init__(
+        self,
+        *,
+        body: str,
+        url: str,
+        content_type: str = "text/html; charset=utf-8",
+        status_code: int = 200,
+        headers: dict[str, str] | None = None,
+    ) -> None:
+        self.url = url
+        self.content = body.encode("utf-8")  # body is exposed as UTF-8 bytes
+        self.headers = {"Content-Type": content_type, **(headers or {})}
+        self.status_code = status_code
+
+    def raise_for_status(self) -> None:
+        assert self.status_code < 400  # AssertionError for any status >= 400
+
+
+class _FakeWebSession:  # session double serving canned routes keyed by exact URL
+    def __init__(self, routes: dict[str, dict[str, Any]]) -> None:
+        self.routes = routes
+        self.requests: list[tuple[str, dict[str, str]]] = []  # (url, headers) log
+
+    def get(
+        self,
+        url: str,
+        *,
+        headers: dict[str, str],
+        timeout: float,
+    ) -> _FakeWebResponse:
+        del timeout  # accepted but deliberately unused
+        self.requests.append((url, headers))  # logged before the route lookup
+        route = self.routes[url]  # unrouted URLs raise KeyError
+        return _FakeWebResponse(
+            body=route["body"],
+            url=route.get("resolved_url", url),  # lets a route report another URL
+            content_type=route.get("content_type", "text/html; charset=utf-8"),
+            status_code=route.get("status_code", 200),
+            headers=route.get("headers"),
+        )
+
+
+@contextmanager
+def _serve(routes: dict[str, dict[str, str | None]]):  # yields a running server
+    server = _ThreadingHTTPServer(("127.0.0.1", 0), _RequestHandler)
+    server.routes = routes  # type: ignore[attr-defined]
+    server.requests = []  # type: ignore[attr-defined]
+    thread = threading.Thread(target=server.serve_forever)
+    thread.start()  # serve in the background while the with-body runs
+    try:
+        yield server
+    finally:
+        server.shutdown()  # stops serve_forever so the join below can return
+        thread.join()
+        server.server_close()  # release the listening socket
+
+
+def test_web_crawler_discovers_origins_and_revalidates_cache(tmp_path: Path) -> None:
+    with _serve(  # three pages: root links to /guide, /skip and an external host
+        {
+            "/": {
+                "body": """
+                <html><body>
+                  <main>
+                    <a href="/guide">Guide</a>
+                    <a href="/skip">Skip</a>
+                    <a href="http://external.test/path">External</a>
+                  </main>
+                </body></html>
+                """,
+                "content_type": "text/html; charset=utf-8",
+                "etag": "root-v1",
+            },
+            "/guide": {
+                "body": "<html><body><main><h1>Guide</h1><p>Hello</p></main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+                "etag": "guide-v1",
+            },
+            "/skip": {
+                "body": "<html><body><main><h1>Skip</h1></main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+                "etag": "skip-v1",
+            },
+        }
+    ) as server:
+        root_url = f"http://127.0.0.1:{server.server_port}/"
+        root_origin = root_url.rstrip("/")  # form the root is asserted under
+        crawler = WebCrawler(
+            cache_dir=tmp_path / "cache",
+            cache_stale_after=timedelta(seconds=0),  # presumably: stale at once
+        )
+        scope = CrawlScope(
+            roots=[root_url],
+            depth=1,
+            include_patterns=[rf"^{re.escape(root_origin)}(?:/.*)?$"],
+            exclude_patterns=[r".*/skip$"],
+        )
+
+        origins = list(crawler.origins(scope, progress=False))
+
+        assert root_origin in origins
+        assert f"{root_url}guide" in origins
+        assert all(not origin.endswith("/skip") for origin in origins)
+        assert all("external.test" not in origin for origin in origins)
+
+        first = crawler.fetch_raw(root_url)
+        second = crawler.fetch_raw(root_url)  # must send If-None-Match (below)
+        third = crawler.fetch_raw(root_url, cache_force_refresh=True)
+        server_requests = getattr(server, "requests")  # attribute set in _serve
+        root_requests = [
+            request for request in server_requests if request["path"] == "/"
+        ]
+
+        assert first.body_path == second.body_path == third.body_path
+        assert second.revalidated_at is not None
+        assert root_requests[-2]["headers"]["If-None-Match"] == "root-v1"
+        assert "If-None-Match" not in root_requests[-1]["headers"]  # forced fetch
+
+        guide_doc = crawler.fetch_markdown(f"{root_url}guide")
+        assert guide_doc.origin == f"{root_url}guide"
+        assert "Guide" in guide_doc.content
+
+
+def test_web_crawler_resolves_relative_links_from_redirect_target(
+    tmp_path: Path,
+) -> None:  # relative hrefs must resolve against the URL the response reports
+    session: Any = _FakeWebSession(
+        {
+            "https://example.com/docs": {
+                "body": '<html><body><a href="page">Page</a></body></html>',
+                "resolved_url": "https://example.com/docs/",  # note trailing slash
+            },
+            "https://example.com/docs/page": {
+                "body": "<html><body><main>Page</main></body></html>",
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "redirect-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=["https://example.com/docs"], depth=1)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert "https://example.com/docs" in origins
+    assert "https://example.com/docs/page" in origins
+    assert "https://example.com/page" not in origins  # would be the /docs base
+
+
+def test_web_crawler_follows_links_after_redirect_to_different_host(
+    tmp_path: Path,
+) -> None:  # links on a root that resolved to another host are still crawled
+    session: Any = _FakeWebSession(
+        {
+            "https://example.com": {
+                "body": '<html><body><a href="/about">About</a></body></html>',
+                "resolved_url": "https://www.example.com/landing",
+            },
+            "https://www.example.com/about": {
+                "body": "<html><body><main>About</main></body></html>",
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "redirect-host-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=["https://example.com"], depth=1)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert "https://example.com" in origins  # root keeps its requested form
+    assert "https://www.example.com/about" in origins
+
+
+def test_web_crawler_include_subdomains_uses_redirect_scope(
+    tmp_path: Path,
+) -> None:  # http root resolving to https: the https link must stay in scope
+    root = "http://example.com"
+    page = "https://example.com/page"
+    session: Any = _FakeWebSession(
+        {
+            root: {
+                "body": '<html><body><a href="/page">Page</a></body></html>',
+                "resolved_url": "https://example.com/landing",
+            },
+            page: {
+                "body": "<html><body><main>Page</main></body></html>",
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "redirect-subdomain-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=[root], depth=1, include_subdomains=True)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [root, page]
+
+
+def test_web_crawler_include_subdomains_stays_within_requested_host_tree(
+    tmp_path: Path,
+) -> None:  # only hosts below the root host qualify, not its parent or others
+    root = "https://docs.example.co.uk/start"
+    allowed = "https://api.docs.example.co.uk/page"  # subdomain of the root host
+    disallowed_parent = "https://example.co.uk/root"  # parent of the root host
+    disallowed_sibling = "https://other.co.uk/page"  # shares only the co.uk suffix
+    session: Any = _FakeWebSession(
+        {
+            root: {
+                "body": (
+                    f'<html><body><a href="{allowed}">Allowed</a>'
+                    f'<a href="{disallowed_parent}">Parent</a>'
+                    f'<a href="{disallowed_sibling}">Sibling</a></body></html>'
+                ),
+            },
+            allowed: {"body": "<html><body><main>Allowed</main></body></html>"},
+            disallowed_parent: {
+                "body": "<html><body><main>Parent</main></body></html>"
+            },
+            disallowed_sibling: {
+                "body": "<html><body><main>Sibling</main></body></html>"
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "subdomain-cache",
+        session=session,
+    )
+    scope = CrawlScope(
+        roots=[root],
+        depth=1,
+        include_subdomains=True,
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert root in origins
+    assert allowed in origins
+    assert disallowed_parent not in origins
+    assert disallowed_sibling not in origins
+
+
+def test_web_crawler_include_subdomains_keeps_original_scope_host(
+    tmp_path: Path,
+) -> None:  # scope stays tied to the root host while crawling via a subdomain
+    root = "https://docs.example.com/start"
+    api = "https://api.docs.example.com/page"
+    cdn = "https://cdn.docs.example.com/asset"  # sibling of api, linked from api
+    session: Any = _FakeWebSession(
+        {
+            root: {
+                "body": f'<html><body><a href="{api}">API</a></body></html>',
+            },
+            api: {
+                "body": f'<html><body><a href="{cdn}">CDN</a></body></html>',
+            },
+            cdn: {"body": "<html><body><main>CDN</main></body></html>"},
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "subdomain-root-host-cache",
+        session=session,
+    )
+    scope = CrawlScope(
+        roots=[root],
+        depth=2,  # root -> api -> cdn needs two hops
+        include_subdomains=True,
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [root, api, cdn]
+
+
+def test_web_crawler_excludes_same_host_different_port_by_default(
+    tmp_path: Path,
+) -> None:  # same host on another port is neither listed nor fetched
+    root = "http://127.0.0.1:8000"
+    other_port = "http://127.0.0.1:9000/page"
+    session: Any = _FakeWebSession(
+        {
+            root: {
+                "body": f'<html><body><a href="{other_port}">Other</a></body></html>',
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "same-host-port-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=[root], depth=1)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [root]
+    assert session.requests == [(root, {})]  # only the root was requested
+
+
+def test_web_crawler_include_subdomains_excludes_same_host_different_port(
+    tmp_path: Path,
+) -> None:  # include_subdomains=True must not admit a different port either
+    root = "http://127.0.0.1:8000"
+    other_port = "http://127.0.0.1:9000/page"
+    session: Any = _FakeWebSession(
+        {
+            root: {
+                "body": f'<html><body><a href="{other_port}">Other</a></body></html>',
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "subdomain-same-host-port-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=[root], depth=1, include_subdomains=True)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [root]
+    assert session.requests == [(root, {})]  # only the root was requested
+
+
+def test_web_crawler_treats_explicit_default_port_as_same_origin(
+    tmp_path: Path,
+) -> None:  # a root given with :80 must behave like the port-less http URL
+    root = "http://example.com"
+    explicit_root = "http://example.com:80"
+    child = "http://example.com/about"
+    session: Any = _FakeWebSession(
+        {
+            root: {  # routed only without the port; a ":80" request would KeyError
+                "body": f'<html><body><a href="{child}">About</a></body></html>',
+            },
+            child: {
+                "body": "<html><body><main>About</main></body></html>",
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "default-port-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=[explicit_root], depth=1)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [root, child]
+
+
+def test_web_crawler_deduplicates_explicit_default_port_variants(
+    tmp_path: Path,
+) -> None:  # https roots with and without :443 collapse into one origin
+    root = "https://example.com"
+    session: Any = _FakeWebSession(
+        {
+            root: {
+                "body": "<html><body><main>Root</main></body></html>",
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "default-port-variant-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=[root, "https://example.com:443"], depth=0)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [root]
+    assert session.requests == [(root, {})]  # fetched once, not once per variant
+
+
+def test_web_crawler_normalizes_uppercase_url_schemes(tmp_path: Path) -> None:
+    origin = "http://example.com"  # lowercase forms expected in the output
+    page = "https://example.com/page"
+    session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": (
+                    '<html><body><a href="HTTPS://example.com/page">'
+                    "Page</a></body></html>"
+                ),
+            },
+            page: {
+                "body": "<html><body><main>Page</main></body></html>",
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "uppercase-scheme-cache",
+        session=session,
+    )
+    scope = CrawlScope(  # root and link both use uppercase schemes
+        roots=["HTTP://example.com"], depth=1, include_external_links=True
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [origin, page]
+
+
+def test_web_crawler_preserves_url_credentials(tmp_path: Path) -> None:
+    origin = "https://user:pass@example.com/private"  # userinfo must survive
+    session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": "<html><body><main>Private</main></body></html>",
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "credential-url-cache",
+        session=session,
+    )
+
+    source = crawler.fetch_raw(origin)
+
+    assert source.origin == origin
+    assert session.requests == [(origin, {})]  # requested with credentials intact
+
+
+def test_web_crawler_discovers_urls_from_xml_sitemap(tmp_path: Path) -> None:
+    sitemap = "https://example.com/sitemap.xml"
+    page = "https://example.com/docs/page"  # only referenced via <loc>
+    session: Any = _FakeWebSession(
+        {
+            sitemap: {
+                "body": (
+                    '<?xml version="1.0" encoding="UTF-8"?>'
+                    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
+                    f"<url><loc>{page}</loc></url>"
+                    "</urlset>"
+                ),
+                "content_type": "application/xml",  # served as XML, not HTML
+            },
+            page: {
+                "body": "<html><body><main>Page</main></body></html>",
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "sitemap-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=[sitemap], depth=1)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [sitemap, page]
+
+
+def test_web_crawler_allows_later_in_scope_occurrence_of_same_url(
+    tmp_path: Path,
+) -> None:  # a URL linked from two roots must still be listed via the second
+    first_root = "https://alpha.example.com/start"
+    second_root = "https://docs.example.com/start"
+    shared = "https://api.docs.example.com/page"  # subdomain of second_root only
+    session: Any = _FakeWebSession(
+        {
+            first_root: {
+                "body": f'<html><body><a href="{shared}">Shared</a></body></html>',
+            },
+            second_root: {
+                "body": f'<html><body><a href="{shared}">Shared</a></body></html>',
+            },
+            shared: {"body": "<html><body><main>Shared</main></body></html>"},
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "multi-root-visited-cache",
+        session=session,
+    )
+    scope = CrawlScope(
+        roots=[first_root, second_root],
+        depth=1,
+        include_subdomains=True,
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [first_root, second_root, shared]
+
+
+def test_web_crawler_revisits_shared_page_for_broader_subdomain_scope(
+    tmp_path: Path,
+) -> None:  # a page reached from two roots must yield links for the wider root
+    narrow_root = "https://api.docs.example.com/start"
+    broad_root = "https://docs.example.com/start"
+    shared = "https://api.docs.example.com/shared"  # linked from both roots
+    sibling = "https://cdn.docs.example.com/asset"  # under broad_root's host only
+    session: Any = _FakeWebSession(
+        {
+            narrow_root: {
+                "body": f'<html><body><a href="{shared}">Shared</a></body></html>',
+            },
+            broad_root: {
+                "body": f'<html><body><a href="{shared}">Shared</a></body></html>',
+            },
+            shared: {
+                "body": f'<html><body><a href="{sibling}">Sibling</a></body></html>',
+            },
+            sibling: {"body": "<html><body><main>Sibling</main></body></html>"},
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "multi-root-subdomain-cache",
+        session=session,
+    )
+    scope = CrawlScope(
+        roots=[narrow_root, broad_root],
+        depth=2,
+        include_subdomains=True,
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [narrow_root, broad_root, shared, sibling]
+
+
+def test_web_crawler_discovers_matching_descendants_from_filtered_seed(
+    tmp_path: Path,
+) -> None:  # a root rejected by include_patterns is still crawled for links
+    with _serve(
+        {
+            "/": {
+                "body": '<html><body><a href="/docs/guide">Guide</a></body></html>',
+                "content_type": "text/html; charset=utf-8",
+                "etag": None,
+            },
+            "/docs/guide": {
+                "body": "<html><body><main>Guide</main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+                "etag": None,
+            },
+        }
+    ) as server:
+        root_url = f"http://127.0.0.1:{server.server_port}/"
+        crawler = WebCrawler(
+            cache_dir=tmp_path / "filtered-seed-cache",
+        )
+        scope = CrawlScope(
+            roots=[root_url],
+            depth=1,
+            include_patterns=[rf"^{re.escape(root_url)}docs/.*"],  # excludes root
+        )
+
+        origins = list(crawler.origins(scope, progress=False))
+
+        assert root_url not in origins
+        assert f"{root_url}docs/guide" in origins
+
+
+def test_web_crawler_does_not_fetch_excluded_origins(tmp_path: Path) -> None:
+    root = "https://example.com"
+    admin = "https://example.com/admin"  # matched by exclude_patterns below
+    session: Any = _FakeWebSession(
+        {
+            root: {
+                "body": f'<html><body><a href="{admin}">Admin</a></body></html>',
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "excluded-origin-cache",
+        session=session,
+    )
+    scope = CrawlScope(
+        roots=[root],
+        depth=1,
+        exclude_patterns=[r"/admin$"],
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [root]
+    assert session.requests == [(root, {})]  # admin was never requested
+
+
+def test_web_crawler_deduplicates_root_url_with_and_without_slash(
+    tmp_path: Path,
+) -> None:  # a "/" self-link on the root must not become a second origin
+    root = "https://example.com"
+    session: Any = _FakeWebSession(
+        {
+            root: {
+                "body": '<html><body><a href="/">Root</a></body></html>',
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "root-slash-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=[root], depth=1)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [root]
+    assert session.requests == [(root, {})]  # no extra fetch for the "/" link
+
+
+def test_web_crawler_deduplicates_root_scope_variants(tmp_path: Path) -> None:
+    root = "https://example.com"
+    session: Any = _FakeWebSession(
+        {
+            root: {
+                "body": "<html><body><main>Root</main></body></html>",
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "root-variant-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=[root, f"{root}/"], depth=0)  # same root, two forms
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [root]
+    assert session.requests == [(root, {})]  # fetched once under the bare form
+
+
+def test_web_crawler_deduplicates_queried_root_scope_variants(
+    tmp_path: Path,
+) -> None:  # "?x=1" and "/?x=1" on the bare host are one origin
+    root = "https://example.com?x=1"
+    session: Any = _FakeWebSession(
+        {
+            root: {
+                "body": "<html><body><main>Root</main></body></html>",
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "queried-root-variant-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=[root, "https://example.com/?x=1"], depth=0)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [root]
+    assert session.requests == [(root, {})]  # fetched once, slash-less form
+
+
+def test_web_crawler_normalizes_root_links_from_non_root_pages(
+    tmp_path: Path,
+) -> None:  # a "/" link found on /docs is reported as the slash-less root
+    root = "https://example.com"
+    page = "https://example.com/docs"
+    session: Any = _FakeWebSession(
+        {
+            page: {
+                "body": '<html><body><a href="/">Root</a></body></html>',
+            },
+            root: {  # routed without a trailing slash only
+                "body": "<html><body><main>Root</main></body></html>",
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "non-root-root-link-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=[page], depth=1)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [page, root]
+    assert session.requests == [(page, {}), (root, {})]
+
+
+def test_web_crawler_skips_links_with_malformed_ports(tmp_path: Path) -> None:
+    root = "https://example.com"
+    session: Any = _FakeWebSession(
+        {
+            root: {
+                "body": (  # the only link has a non-numeric port (":bad")
+                    '<html><body><a href="http://example.com:bad/path">'
+                    "Bad</a></body></html>"
+                ),
+            },
+        }
+    )
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "bad-port-cache",
+        session=session,
+    )
+    scope = CrawlScope(roots=[root], depth=1)
+
+    origins = list(crawler.origins(scope, progress=False))  # must not raise
+
+    assert origins == [root]
+    assert session.requests == [(root, {})]  # bad link was never requested
+
+
+def test_web_crawler_accepts_crawl_scope_for_roots_and_patterns(
+    tmp_path: Path,
+) -> None:  # origins() and markdown_documents() honour the same CrawlScope
+    with _serve(
+        {
+            "/": {
+                "body": '<html><body><a href="/docs/guide">Guide</a></body></html>',
+                "content_type": "text/html; charset=utf-8",
+                "etag": None,
+            },
+            "/docs/guide": {
+                "body": "<html><body><main>Guide</main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+                "etag": None,
+            },
+        }
+    ) as server:
+        root_url = f"http://127.0.0.1:{server.server_port}/"
+        crawler = WebCrawler(cache_dir=tmp_path / "scope-cache")
+        scope = CrawlScope(
+            roots=[root_url],
+            depth=1,
+            include_patterns=[rf"^{re.escape(root_url)}docs/.*"],  # drops the root
+        )
+
+        origins = list(crawler.origins(scope, progress=False))
+        documents = list(crawler.markdown_documents(scope, progress=False))
+
+        assert origins == [f"{root_url}docs/guide"]
+        assert documents == [
+            MarkdownDocument(origin=f"{root_url}docs/guide", content="Guide")
+        ]
+
+
+def test_web_markdown_documents_reuses_refreshed_sources(
+    tmp_path: Path,
+) -> None:  # a forced refresh must hit the server once, not once per phase
+    with _serve(
+        {
+            "/": {
+                "body": "<html><body><main>Root</main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+                "etag": None,
+            }
+        }
+    ) as server:
+        root_url = f"http://127.0.0.1:{server.server_port}/"
+        root_origin = root_url.rstrip("/")
+        crawler = WebCrawler(
+            cache_dir=tmp_path / "markdown-docs-cache",
+        )
+        scope = CrawlScope(roots=[root_url], depth=0)
+
+        documents = list(crawler.markdown_documents(scope, cache_force_refresh=True))
+        root_requests = [
+            request for request in getattr(server, "requests") if request["path"] == "/"
+        ]
+
+        assert documents == [MarkdownDocument(origin=root_origin, content="Root")]
+        assert len(root_requests) == 1  # exactly one GET for the single origin
+
+
+def test_web_markdown_documents_reuses_immediately_stale_discovery_cache(
+    tmp_path: Path,
+) -> None:  # zero staleness window must still cost only one GET per origin
+    with _serve(
+        {
+            "/": {
+                "body": "<html><body><main>Root</main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+                "etag": None,
+            }
+        }
+    ) as server:
+        root_url = f"http://127.0.0.1:{server.server_port}/"
+        root_origin = root_url.rstrip("/")
+        crawler = WebCrawler(
+            cache_dir=tmp_path / "stale-markdown-docs-cache",
+            cache_stale_after=timedelta(seconds=0),  # presumably: stale at once
+        )
+        scope = CrawlScope(roots=[root_url], depth=0)
+
+        documents = list(crawler.markdown_documents(scope, progress=False))
+        root_requests = [
+            request for request in getattr(server, "requests") if request["path"] == "/"
+        ]
+
+        assert documents == [MarkdownDocument(origin=root_origin, content="Root")]
+        assert len(root_requests) == 1  # no second fetch during conversion
+
+
+def test_web_crawler_fetches_same_depth_frontier_concurrently(tmp_path: Path) -> None:
+    root = "https://example.com/docs"
+    first = "https://example.com/docs/one"
+    second = "https://example.com/docs/two"
+
+    class _ConcurrentWebSession:  # session double that tracks overlapping get()s
+        def __init__(self) -> None:
+            self.requests: list[tuple[str, dict[str, str]]] = []
+            self._lock = threading.Lock()  # guards the request log and counters
+            self._barrier = threading.Barrier(2)  # both child fetches must meet
+            self.in_flight = 0
+            self.max_in_flight = 0  # high-water mark of concurrent get() calls
+
+        def get(
+            self, url: str, headers: dict[str, str], timeout: float
+        ) -> _FakeWebResponse:
+            del timeout
+            with self._lock:
+                self.requests.append((url, headers))
+                self.in_flight += 1
+                self.max_in_flight = max(self.max_in_flight, self.in_flight)
+            try:
+                if url == root:
+                    return _FakeWebResponse(
+                        body=(
+                            f'<html><body><a href="{first}">One</a>'
+                            f'<a href="{second}">Two</a></body></html>'
+                        ),
+                        url=url,
+                    )
+                if url in {first, second}:
+                    self._barrier.wait(timeout=1.0)  # breaks if fetched serially
+                    return _FakeWebResponse(
+                        body="<html><body><main>Child</main></body></html>",
+                        url=url,
+                    )
+                raise AssertionError(f"Unexpected url: {url}")
+            finally:
+                with self._lock:
+                    self.in_flight -= 1  # runs on return and on error alike
+
+    session: Any = _ConcurrentWebSession()
+    crawler = WebCrawler(
+        cache_dir=tmp_path / "frontier-cache",
+        session=session,
+        max_workers=2,
+    )
+    scope = CrawlScope(roots=[root], depth=1)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [root, first, second]  # order is stable despite threads
+    assert session.max_in_flight == 2
+
+
+def test_web_crawler_treats_304_revalidation_as_fresh_cache_hit(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:  # after a 304 the entry must count as fresh for the next fetch
+    with _serve(
+        {
+            "/": {
+                "body": "<html><body><main>Cached</main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+                "etag": "root-v1",
+            }
+        }
+    ) as server:
+        root_url = f"http://127.0.0.1:{server.server_port}/"
+        times = iter(  # fake clock values: t0, t0 + 2s, t0 + 2.5s
+            [
+                datetime(2026, 1, 1, tzinfo=timezone.utc),
+                datetime(2026, 1, 1, 0, 0, 2, tzinfo=timezone.utc),
+                datetime(2026, 1, 1, 0, 0, 2, 500000, tzinfo=timezone.utc),
+            ]
+        )
+        monkeypatch.setattr(crawl_module, "_utcnow", lambda: next(times))
+        crawler = WebCrawler(
+            cache_dir=tmp_path / "fresh-cache",
+            cache_stale_after=timedelta(seconds=1),
+        )
+
+        first = crawler.fetch_raw(root_url)
+        second = crawler.fetch_raw(root_url)  # presumably at t0 + 2s: stale
+        third = crawler.fetch_raw(root_url)  # presumably 0.5s after the 304
+        root_requests = [
+            request for request in getattr(server, "requests") if request["path"] == "/"
+        ]
+
+        assert first.body_path == second.body_path == third.body_path
+        assert second.revalidated_at is not None
+        assert len(root_requests) == 2  # the third fetch made no request
+        assert root_requests[1]["headers"]["If-None-Match"] == "root-v1"
+
+
+def test_web_crawler_cache_dir_uses_hashed_file_pair(
+    tmp_path: Path,
+) -> None:  # one fetch leaves exactly a body file plus a metadata file
+    with _serve(
+        {
+            "/": {
+                "body": "<html><body><main>Root</main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+                "etag": None,
+            }
+        }
+    ) as server:
+        root_url = f"http://127.0.0.1:{server.server_port}/"
+        root_origin = root_url.rstrip("/")
+        cache_dir = tmp_path / "cache"
+        crawler = WebCrawler(cache_dir=cache_dir)
+
+        document = crawler.fetch_markdown(root_url)
+
+        base = _expected_cache_base(root_origin)  # keyed by the slash-less origin
+        metadata_path = cache_dir / f"{base}.metadata.json"
+        content_path = cache_dir / f"{base}.html"
+        assert document == MarkdownDocument(origin=root_origin, content="Root")
+        assert sorted(path.name for path in cache_dir.iterdir()) == [
+            content_path.name,
+            metadata_path.name,
+        ]
+        record = json.loads(metadata_path.read_text(encoding="utf-8"))
+        assert record["key"] == root_origin
+        assert record["content_path"] == content_path.name  # name only, no dirs
+        assert record["metadata"]["content_type"] == "text/html; charset=utf-8"
+        assert record["metadata"]["origin"] == root_origin
+
+
+def test_web_crawler_rejects_cache_metadata_content_path_outside_cache(
+    tmp_path: Path,
+) -> None:  # metadata pointing outside cache_dir must not be served
+    origin = "https://example.com/poison"
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
+    outside = tmp_path / "outside.html"  # sibling of cache_dir, not inside it
+    outside.write_text("<html><body>Poison</body></html>", encoding="utf-8")
+    base = _expected_cache_base(origin)
+    metadata_path = cache_dir / f"{base}.metadata.json"
+    metadata_path.write_text(
+        json.dumps(
+            {
+                "key": origin,
+                "content_path": "../outside.html",  # escapes cache_dir
+                "metadata": {
+                    "origin": origin,
+                    "resolved_origin": origin,
+                    "content_type": "text/html",
+                    "status_code": 200,
+                    "etag": None,
+                    "last_modified": None,
+                    "type_label": "html",
+                    "fetched_at": "2026-01-01T00:00:00+00:00",
+                    "revalidated_at": None,
+                },
+            }
+        ),
+        encoding="utf-8",
+    )
+    session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": "<html><body><main>Fresh</main></body></html>",
+            }
+        }
+    )
+    crawler = WebCrawler(cache_dir=cache_dir, session=session)
+
+    source = crawler.fetch_raw(origin)
+
+    assert source.body_path.parent == cache_dir
+    assert source.body_path != outside
+    assert session.requests == [(origin, {})]  # refetched, no validators sent
+
+
+def test_web_crawler_rejects_cache_metadata_with_mismatched_key(
+    tmp_path: Path,
+) -> None:  # an entry recorded for another origin must not be served
+    origin = "https://example.com/requested"
+    stale_origin = "https://example.com/stale"
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
+    base = _expected_cache_base(origin)  # files sit at the requested origin's slot
+    content_path = cache_dir / f"{base}.html"
+    content_path.write_text("<html><body>Stale</body></html>", encoding="utf-8")
+    metadata_path = cache_dir / f"{base}.metadata.json"
+    metadata_path.write_text(
+        json.dumps(
+            {
+                "key": stale_origin,  # differs from the origin being requested
+                "content_path": content_path.name,
+                "metadata": {
+                    "origin": stale_origin,
+                    "resolved_origin": stale_origin,
+                    "content_type": "text/html",
+                    "status_code": 200,
+                    "etag": None,
+                    "last_modified": None,
+                    "type_label": "html",
+                    "fetched_at": "2026-01-01T00:00:00+00:00",
+                    "revalidated_at": None,
+                },
+            }
+        ),
+        encoding="utf-8",
+    )
+    session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": "<html><body><main>Fresh</main></body></html>",
+            }
+        }
+    )
+    crawler = WebCrawler(cache_dir=cache_dir, session=session)
+
+    source = crawler.fetch_raw(origin)
+
+    assert source.origin == origin
+    assert source.body_path.read_text(encoding="utf-8") != (
+        "<html><body>Stale</body></html>"
+    )
+    assert session.requests == [(origin, {})]  # refetched, no validators sent
+
+
+def test_web_crawler_cache_dir_true_uses_default_backend_directory(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:
+    monkeypatch.chdir(tmp_path)  # cwd is tmp_path for the rest of the test
+    with _serve(
+        {
+            "/": {
+                "body": "<html><body><main>Root</main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+                "etag": None,
+            }
+        }
+    ) as server:
+        root_url = f"http://127.0.0.1:{server.server_port}/"
+        root_origin = root_url.rstrip("/")  # same URL without the trailing slash
+        crawler = WebCrawler(cache_dir=True)  # True instead of an explicit path
+
+        crawler.fetch_markdown(root_url)
+
+    cache_dir = tmp_path / ".raghilda" / "cache" / "web"  # expected location under cwd
+    base = _expected_cache_base(root_origin)
+    assert sorted(path.name for path in cache_dir.iterdir()) == [  # exactly two files
+        f"{base}.html",
+        f"{base}.metadata.json",
+    ]
+
+
+def test_web_crawler_relative_cache_dir_is_anchored_at_construction(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:
+    monkeypatch.chdir(tmp_path)  # cwd is tmp_path when the crawler is built
+    origin = "https://example.com/page"
+    session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": "<html><body><main>Page</main></body></html>",
+            }
+        }
+    )
+    crawler = WebCrawler(cache_dir="cache", session=session)  # relative cache path
+    other_cwd = tmp_path / "other"
+    other_cwd.mkdir()
+    monkeypatch.chdir(other_cwd)  # cwd changes after construction, before the fetch
+
+    source = crawler.fetch_raw(origin)
+
+    assert source.body_path.parent == tmp_path / "cache"  # not other_cwd / "cache"
+
+
+def test_web_crawler_scopes_fresh_cache_hits_to_custom_session(
+    tmp_path: Path,
+) -> None:
+    origin = "https://example.com/private"
+    cache_dir = tmp_path / "session-cache"
+    first_session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": "<html><body><main>First</main></body></html>",
+            },
+        }
+    )
+    second_session: Any = _FakeWebSession(  # same origin, different body
+        {
+            origin: {
+                "body": "<html><body><main>Second</main></body></html>",
+            },
+        }
+    )
+    first_crawler = WebCrawler(cache_dir=cache_dir, session=first_session)
+    second_crawler = WebCrawler(cache_dir=cache_dir, session=second_session)  # shared dir
+
+    first = first_crawler.fetch_raw(origin)
+    first_body = first.body_path.read_text(encoding="utf-8")  # read before second fetch
+    second = second_crawler.fetch_raw(origin)
+
+    assert "First" in first_body
+    assert "Second" in second.body_path.read_text(encoding="utf-8")
+    assert second_session.requests == [(origin, {})]  # second session made its own request
+
+
+def test_web_crawler_disambiguates_colliding_sanitized_cache_prefixes(
+    tmp_path: Path,
+) -> None:
+    first_origin = "https://example.com/docs/page"
+    second_origin = "https://example.com/docs:page"  # differs from first only by ':'
+    third_origin = "https://example.com/docs?page"  # differs from first only by '?'
+    session: Any = _FakeWebSession(
+        {
+            first_origin: {"body": "<html><body><main>One</main></body></html>"},
+            second_origin: {"body": "<html><body><main>Two</main></body></html>"},
+            third_origin: {"body": "<html><body><main>Three</main></body></html>"},
+        }
+    )
+    cache_dir = tmp_path / "collision-cache"
+    crawler = WebCrawler(cache_dir=cache_dir, session=session)
+
+    crawler.fetch_raw(first_origin)
+    crawler.fetch_raw(second_origin)
+    crawler.fetch_raw(third_origin)
+
+    first_base = _expected_cache_base(first_origin)
+    second_base = _expected_cache_base(second_origin)
+    third_base = _expected_cache_base(third_origin)
+    cached_names = {path.name for path in cache_dir.iterdir()}
+    assert {
+        f"{first_base}.html",
+        f"{first_base}.metadata.json",
+        f"{second_base}.html",
+        f"{second_base}.metadata.json",
+        f"{third_base}.html",
+        f"{third_base}.metadata.json",
+    }.issubset(cached_names)
+    assert len(cached_names) == 6  # one body file plus one metadata file per origin
+
+    second_session: Any = _FakeWebSession(  # fresh session over the same cache_dir
+        {
+            first_origin: {"body": "<html><body><main>One</main></body></html>"},
+            second_origin: {"body": "<html><body><main>Two</main></body></html>"},
+            third_origin: {"body": "<html><body><main>Three</main></body></html>"},
+        }
+    )
+    second_crawler = WebCrawler(cache_dir=cache_dir, session=second_session)
+
+    assert second_crawler.fetch_raw(first_origin).body_path.exists()
+    assert second_crawler.fetch_raw(second_origin).body_path.exists()
+    assert second_crawler.fetch_raw(third_origin).body_path.exists()
+    assert second_session.requests == [  # each origin requested once, in fetch order
+        (first_origin, {}),
+        (second_origin, {}),
+        (third_origin, {}),
+    ]
+
+
+def test_web_crawler_refresh_deletes_only_exact_cache_base(tmp_path: Path) -> None:
+    first_origin = "https://example.com"
+    first_base = _expected_cache_base(first_origin)  # text after its last '--' is reused
+    second_origin = f"https://example.com--{first_base.rsplit('--', 1)[1]}.child"
+    session: Any = _FakeWebSession(
+        {
+            first_origin: {"body": "<html><body><main>One</main></body></html>"},
+            second_origin: {"body": "<html><body><main>Two</main></body></html>"},
+        }
+    )
+    crawler = WebCrawler(cache_dir=tmp_path / "exact-delete-cache", session=session)
+
+    crawler.fetch_raw(first_origin)
+    crawler.fetch_raw(second_origin)
+    crawler.fetch_raw(first_origin, cache_force_refresh=True)  # refresh only the first
+    session.requests.clear()  # count only requests made after the forced refresh
+    crawler.fetch_raw(second_origin)
+
+    assert session.requests == []  # fetching second_origin again issued no request
+
+
+def test_web_crawler_refresh_replaces_cached_body_atomically(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:
+    origin = "https://example.com"
+    session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": "<html><body><main>First</main></body></html>",
+            },
+        }
+    )
+    crawler = WebCrawler(cache_dir=tmp_path / "atomic-cache", session=session)
+    first = crawler.fetch_raw(origin)
+    session.routes[origin]["body"] = "<html><body><main>Second</main></body></html>"
+    replacements: list[tuple[Path, Path]] = []  # (src, dst) of each os.replace call
+    replace = crawl_module.os.replace  # keep the real function before patching it
+
+    def track_replace(src: str | Path, dst: str | Path) -> None:
+        replacements.append((Path(src), Path(dst)))  # record, then delegate
+        replace(src, dst)
+
+    monkeypatch.setattr(crawl_module.os, "replace", track_replace)
+
+    second = crawler.fetch_raw(origin, cache_force_refresh=True)
+
+    assert first.body_path == second.body_path  # refresh reuses the same body path
+    assert second.body_path.read_text(encoding="utf-8") == (
+        "<html><body><main>Second</main></body></html>"
+    )
+    assert replacements[-1][1] == second.body_path  # last os.replace targeted the body
+
+
+def test_web_crawler_cache_writes_for_different_keys_do_not_contend(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:
+    first_origin = "https://example.com/docs/one"
+    second_origin = "https://example.com/docs/two"
+    session: Any = _FakeWebSession(
+        {
+            first_origin: {"body": "<html><body><main>One</main></body></html>"},
+            second_origin: {"body": "<html><body><main>Two</main></body></html>"},
+        }
+    )
+    cache_dir = tmp_path / "concurrency-cache"
+    crawler = WebCrawler(cache_dir=cache_dir, session=session)
+
+    first_content_path = cache_dir / f"{_expected_cache_base(first_origin)}.html"
+    second_content_path = cache_dir / f"{_expected_cache_base(second_origin)}.html"
+    first_started = threading.Event()  # set when the first body write is reached
+    release_first = threading.Event()  # set by the test to let the first write go on
+    second_finished = threading.Event()  # set after the second body write returns
+    errors: list[BaseException] = []  # exceptions raised inside worker threads
+    original_write_content = crawl_module._FilesystemCrawlerCache._write_content
+
+    def blocking_write_content(
+        self,
+        path: Path,
+        content: bytes | str | Path,
+    ) -> None:
+        if path == first_content_path and not first_started.is_set():
+            first_started.set()
+            assert release_first.wait(timeout=2.0)  # hold the first write until released
+        original_write_content(self, path, content)
+        if path == second_content_path:
+            second_finished.set()
+
+    monkeypatch.setattr(
+        crawl_module._FilesystemCrawlerCache,
+        "_write_content",
+        blocking_write_content,
+    )
+
+    def fetch(origin: str) -> None:
+        try:
+            crawler.fetch_raw(origin)
+        except BaseException as exc:  # also catches AssertionError from the patched writer
+            errors.append(exc)
+
+    first_thread = threading.Thread(target=fetch, args=(first_origin,))
+    second_thread = threading.Thread(target=fetch, args=(second_origin,))
+    first_thread.start()
+    assert first_started.wait(timeout=1.0)  # first write is now parked inside the patch
+    second_thread.start()
+    try:
+        assert second_finished.wait(timeout=1.0)  # must finish while first is still held
+    finally:
+        release_first.set()  # always unblock the first thread, even on failure
+        first_thread.join(timeout=1.0)
+        second_thread.join(timeout=1.0)
+
+    assert errors == []
+
+
+def test_web_crawler_uses_magika_when_no_explicit_ext_is_available(
+    tmp_path: Path,
+) -> None:
+    origin = "https://example.com/download"  # URL path has no file suffix
+    session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": "<html><body><main>Download</main></body></html>",
+                "content_type": "application/octet-stream",  # generic type, HTML body
+            }
+        }
+    )
+    cache_dir = tmp_path / "magika-cache"
+    crawler = WebCrawler(cache_dir=cache_dir, session=session)
+
+    source = crawler.fetch_raw(origin)
+
+    base = _expected_cache_base(origin)
+    assert source.body_path == cache_dir / f"{base}.html"  # cached with .html suffix
+
+
+def test_web_crawler_type_filters_use_sniffed_cache_extension(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:
+    class _FakeMagikaOutput:  # fixed detection output: always "html"
+        label = "html"
+        extensions = ["html"]
+
+    class _FakeMagikaResult:
+        output = _FakeMagikaOutput()
+
+    class _FakeMagika:  # stand-in exposing identify_bytes and identify_path
+        def identify_bytes(self, content: bytes) -> _FakeMagikaResult:
+            assert content.startswith(b"<html>")
+            return _FakeMagikaResult()
+
+        def identify_path(self, path: Path) -> _FakeMagikaResult:
+            assert path.suffix == ".html"
+            return _FakeMagikaResult()
+
+    origin = "https://example.com/download"
+    session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": "<html><body><main>Download</main></body></html>",
+                "content_type": "application/octet-stream",  # generic type, HTML body
+            }
+        }
+    )
+    monkeypatch.setattr(crawl_module, "_MAGIKA", _FakeMagika())  # swap in the fake
+    crawler = WebCrawler(cache_dir=tmp_path / "sniffed-type-cache", session=session)
+    scope = CrawlScope(roots=[origin], depth=0, include_types=["html"])
+
+    origins = list(crawler.origins(scope, progress=False))
+    source = crawler.fetch_raw(origin)
+
+    assert origins == [origin]  # origin passes the include_types=["html"] filter
+    assert source.metadata == {
+        "etag": None,
+        "last_modified": None,
+        "type_label": "html",
+    }
+
+
+def test_web_crawler_prefers_content_type_over_misleading_url_suffix(
+    tmp_path: Path,
+) -> None:
+    origin = "https://example.com/README.md"  # .md suffix, but served as text/html
+    session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": "<html><body><main>Rendered Readme</main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+            }
+        }
+    )
+    cache_dir = tmp_path / "content-type-cache"
+    crawler = WebCrawler(cache_dir=cache_dir, session=session)
+
+    source = crawler.fetch_raw(origin)
+    document = crawler.fetch_markdown(origin)
+
+    base = _expected_cache_base(origin)
+    assert source.body_path == cache_dir / f"{base}.html"  # .html, not .md
+    assert document == MarkdownDocument(origin=origin, content="Rendered Readme")
+
+
+def test_web_crawler_prefers_text_content_type_over_url_suffix(
+    tmp_path: Path,
+) -> None:
+    origin = "https://example.com/plain.html"  # .html suffix, but served as text/plain
+    session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": "plain text",
+                "content_type": "text/plain; charset=utf-8",
+            }
+        }
+    )
+    cache_dir = tmp_path / "text-content-type-cache"
+    crawler = WebCrawler(cache_dir=cache_dir, session=session)
+    scope = CrawlScope(roots=[origin], depth=0, include_types=["text"])
+
+    origins = list(crawler.origins(scope, progress=False))
+    source = crawler.fetch_raw(origin)
+
+    assert origins == [origin]  # origin passes the include_types=["text"] filter
+    assert source.body_path == cache_dir / f"{_expected_cache_base(origin)}.txt"
+    assert (source.metadata or {})["type_label"] == "text"  # tolerate metadata=None
+
+
+def test_web_crawler_preserves_reserved_escapes_in_requested_origin(
+    tmp_path: Path,
+) -> None:
+    origin = "https://example.com/a%2Fb"  # %2F is a percent-encoded '/'
+    session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": "<html><body><main>Escaped</main></body></html>",
+            }
+        }
+    )
+    cache_dir = tmp_path / "escaped-cache"
+    crawler = WebCrawler(cache_dir=cache_dir, session=session)
+
+    source = crawler.fetch_raw(origin)
+
+    assert session.requests == [(origin, {})]  # requested URL still contains %2F
+    assert source.origin == origin
+    assert source.body_path == cache_dir / f"{_expected_cache_base(origin)}.html"
+
+
+def test_web_crawler_falls_back_to_raw_when_magika_is_unavailable(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:
+    origin = "https://example.com/download"  # URL path has no file suffix
+    session: Any = _FakeWebSession(
+        {
+            origin: {
+                "body": "opaque payload",
+                "content_type": "application/octet-stream",
+            }
+        }
+    )
+    cache_dir = tmp_path / "raw-cache"
+    monkeypatch.setattr(crawl_module, "_MAGIKA", None)  # module-level _MAGIKA is None
+    crawler = WebCrawler(cache_dir=cache_dir, session=session)
+
+    source = crawler.fetch_raw(origin)
+
+    base = _expected_cache_base(origin)
+    assert source.body_path == cache_dir / f"{base}.raw"  # cached with .raw suffix
+
+
+class _CloudflareResponse:  # minimal response fake: json(), raise_for_status(), status_code
+    def __init__(self, payload: dict[str, Any]):
+        self.payload = payload  # returned as-is (same object) by json()
+        self.status_code = 200  # every instance reports 200
+
+    def json(self) -> dict[str, Any]:
+        return self.payload
+
+    def raise_for_status(self) -> None:  # no-op: never raises
+        return
+
+
+class _CloudflareSession:  # session fake: one fixed job id, scripted poll responses
+    def __init__(self) -> None:
+        self.post_calls: list[tuple[str, dict[str, Any], dict[str, str]]] = []
+        self.get_calls: list[tuple[str, dict[str, Any] | None]] = []
+        self._poll_count = 0  # number of get() calls made so far
+
+    def post(
+        self,
+        url: str,
+        *,
+        json: dict[str, Any],
+        headers: dict[str, str],
+        timeout: float,
+    ) -> _CloudflareResponse:
+        self.post_calls.append((url, json, headers))  # timeout is accepted, not recorded
+        return _CloudflareResponse({"success": True, "result": "job-123"})
+
+    def get(
+        self,
+        url: str,
+        *,
+        headers: dict[str, str],
+        params: dict[str, Any] | None = None,
+        timeout: float,
+    ) -> _CloudflareResponse:
+        self.get_calls.append((url, params))  # headers and timeout are not recorded
+        self._poll_count += 1
+        if self._poll_count == 1:  # the very first get() reports the job as running
+            return _CloudflareResponse(
+                {"success": True, "result": {"id": "job-123", "status": "running"}}
+            )
+        if params == {"limit": 1}:  # later limit-1 calls: completed status, no records
+            return _CloudflareResponse(
+                {"success": True, "result": {"id": "job-123", "status": "completed"}}
+            )
+        return _CloudflareResponse(  # any other later call: completed job with two records
+            {
+                "success": True,
+                "result": {
+                    "id": "job-123",
+                    "status": "completed",
+                    "records": [
+                        {
+                            "url": "https://example.com/docs",
+                            "status": "completed",
+                            "markdown": "# Docs\n",
+                            "metadata": {
+                                "status": 200,
+                                "title": "Docs",
+                                "url": "https://example.com/docs",
+                            },
+                        },
+                        {
+                            "url": "https://example.com/docs/page",
+                            "status": "completed",
+                            "markdown": "## Page\n",
+                            "metadata": {
+                                "status": 200,
+                                "title": "Page",
+                                "url": "https://example.com/docs/page",
+                            },
+                        },
+                    ],
+                },
+            }
+        )
+
+
+class _ParameterizedCloudflareSession:  # session fake: records derive from each job's payload
+    def __init__(self) -> None:
+        self.post_calls: list[tuple[str, dict[str, Any], dict[str, str]]] = []
+        self.get_calls: list[tuple[str, dict[str, Any] | None]] = []
+        self._jobs: dict[str, dict[str, Any]] = {}  # job id -> posted JSON payload
+
+    def post(
+        self,
+        url: str,
+        *,
+        json: dict[str, Any],
+        headers: dict[str, str],
+        timeout: float,
+    ) -> _CloudflareResponse:
+        del timeout
+        job_id = f"job-{len(self.post_calls) + 1}"  # job-1, job-2, ... in post order
+        self.post_calls.append((url, json, headers))
+        self._jobs[job_id] = json  # kept so get() can build records for this job
+        return _CloudflareResponse({"success": True, "result": job_id})
+
+    def get(
+        self,
+        url: str,
+        *,
+        headers: dict[str, str],
+        params: dict[str, Any] | None = None,
+        timeout: float,
+    ) -> _CloudflareResponse:
+        del headers, timeout
+        self.get_calls.append((url, params))
+        job_id = url.rsplit("/", 1)[-1]  # job id is the last path segment of the URL
+        payload = self._jobs[job_id]  # KeyError if that id was never posted
+        if params == {"limit": 1}:  # limit-1 calls: completed status, no records
+            return _CloudflareResponse(
+                {"success": True, "result": {"id": job_id, "status": "completed"}}
+            )
+        records = [  # always one record for the job's own URL
+            {
+                "url": payload["url"],
+                "status": "completed",
+                "markdown": "# Docs\n",
+                "metadata": {
+                    "status": 200,
+                    "title": "Docs",
+                    "url": payload["url"],
+                },
+            }
+        ]
+        if payload["depth"] > 0:  # add a "/page" child record only when depth > 0
+            records.append(
+                {
+                    "url": f"{payload['url']}/page",
+                    "status": "completed",
+                    "markdown": "## Page\n",
+                    "metadata": {
+                        "status": 200,
+                        "title": "Page",
+                        "url": f"{payload['url']}/page",
+                    },
+                }
+            )
+        return _CloudflareResponse(
+            {
+                "success": True,
+                "result": {
+                    "id": job_id,
+                    "status": "completed",
+                    "records": records,
+                },
+            }
+        )
+
+
+class _DiscoveryFilteringCloudflareSession(_ParameterizedCloudflareSession):
+    def get(
+        self,
+        url: str,
+        *,
+        headers: dict[str, str],
+        params: dict[str, Any] | None = None,
+        timeout: float,
+    ) -> _CloudflareResponse:
+        response = super().get(url, headers=headers, params=params, timeout=timeout)
+        if params == {"limit": 1}:  # limit-1 responses pass through unchanged
+            return response
+
+        payload = self._jobs[url.rsplit("/", 1)[-1]]
+        include_patterns = payload["options"].get("includePatterns", [])
+        exclude_patterns = payload["options"].get("excludePatterns", [])
+        records = response.json()["result"]["records"]
+        filtered_records = [  # keep records matching an include (if any) and no exclude
+            record
+            for record in records
+            if (
+                (
+                    not include_patterns  # an empty include list lets every URL through
+                    or any(
+                        fnmatch.fnmatchcase(record["url"], pattern)
+                        for pattern in include_patterns
+                    )
+                )
+                and not any(
+                    fnmatch.fnmatchcase(record["url"], pattern)
+                    for pattern in exclude_patterns
+                )
+            )
+        ]
+        return _CloudflareResponse(
+            {
+                "success": True,
+                "result": {
+                    "id": url.rsplit("/", 1)[-1],
+                    "status": "completed",
+                    "records": filtered_records,
+                },
+            }
+        )
+
+
+class _OverlappingLimitedCloudflareSession(_ParameterizedCloudflareSession):
+    def get(  # full override: does not call the parent get()
+        self,
+        url: str,
+        *,
+        headers: dict[str, str],
+        params: dict[str, Any] | None = None,
+        timeout: float,
+    ) -> _CloudflareResponse:
+        del headers, timeout
+        self.get_calls.append((url, params))
+        job_id = url.rsplit("/", 1)[-1]
+        payload = self._jobs[job_id]
+        if params == {"limit": 1}:  # limit-1 calls: completed status, no records
+            return _CloudflareResponse(
+                {"success": True, "result": {"id": job_id, "status": "completed"}}
+            )
+        records = [  # every job reports the same "shared" URL first
+            {
+                "url": "https://example.com/shared",
+                "status": "completed",
+                "markdown": "# Shared\n",
+                "metadata": {
+                    "status": 200,
+                    "title": "Shared",
+                    "url": "https://example.com/shared",
+                },
+            }
+        ]
+        if payload["url"] == "https://example.com/root-b":  # only root-b gets an extra record
+            records.append(
+                {
+                    "url": "https://example.com/root-b/unique",
+                    "status": "completed",
+                    "markdown": "# Unique\n",
+                    "metadata": {
+                        "status": 200,
+                        "title": "Unique",
+                        "url": "https://example.com/root-b/unique",
+                    },
+                }
+            )
+        if "limit" in payload:
+            records = records[: payload["limit"]]  # truncate to the posted limit
+        return _CloudflareResponse(
+            {
+                "success": True,
+                "result": {
+                    "id": job_id,
+                    "status": "completed",
+                    "records": records,
+                },
+            }
+        )
+
+
+class _TrailingSlashCloudflareSession(_ParameterizedCloudflareSession):
+    def get(
+        self,
+        url: str,
+        *,
+        headers: dict[str, str],
+        params: dict[str, Any] | None = None,
+        timeout: float,
+    ) -> _CloudflareResponse:
+        response = super().get(url, headers=headers, params=params, timeout=timeout)
+        if params == {"limit": 1}:  # limit-1 responses pass through unchanged
+            return response
+        payload = self._jobs[url.rsplit("/", 1)[-1]]
+        records = response.json()["result"]["records"]  # list inside the response payload
+        records[0]["url"] = f"{payload['url'].rstrip('/')}/"  # first URL ends in one '/'
+        records[0]["metadata"]["url"] = records[0]["url"]
+        return response  # same response object; its first record was edited in place
+
+
+class _OutOfScopeCloudflareSession(_ParameterizedCloudflareSession):
+    def get(
+        self,
+        url: str,
+        *,
+        headers: dict[str, str],
+        params: dict[str, Any] | None = None,
+        timeout: float,
+    ) -> _CloudflareResponse:
+        response = super().get(url, headers=headers, params=params, timeout=timeout)
+        if params == {"limit": 1}:  # limit-1 responses pass through unchanged
+            return response
+        job_id = url.rsplit("/", 1)[-1]
+        payload = self._jobs[job_id]
+        root = payload["url"]
+        records = [  # fixed set built here; the parent's records are not used
+            {
+                "url": root,
+                "status": "completed",
+                "markdown": "# Root\n",
+                "metadata": {
+                    "status": 200,
+                    "title": "Root",
+                    "url": root,
+                },
+            },
+            {
+                "url": "https://example.com/page",
+                "status": "completed",
+                "markdown": "# Page\n",
+                "metadata": {
+                    "status": 200,
+                    "title": "Page",
+                    "url": "https://example.com/page",
+                },
+            },
+            {
+                "url": "https://docs.example.com/page",  # subdomain of example.com
+                "status": "completed",
+                "markdown": "# Subdomain\n",
+                "metadata": {
+                    "status": 200,
+                    "title": "Subdomain",
+                    "url": "https://docs.example.com/page",
+                },
+            },
+            {
+                "url": "https://external.test/page",  # different host entirely
+                "status": "completed",
+                "markdown": "# External\n",
+                "metadata": {
+                    "status": 200,
+                    "title": "External",
+                    "url": "https://external.test/page",
+                },
+            },
+        ]
+        return _CloudflareResponse(
+            {
+                "success": True,
+                "result": {
+                    "id": job_id,
+                    "status": "completed",
+                    "records": records,
+                },
+            }
+        )
+
+
+class _ExternalFirstCloudflareSession(_ParameterizedCloudflareSession):
+    def get(
+        self,
+        url: str,
+        *,
+        headers: dict[str, str],
+        params: dict[str, Any] | None = None,
+        timeout: float,
+    ) -> _CloudflareResponse:
+        response = super().get(url, headers=headers, params=params, timeout=timeout)
+        if params == {"limit": 1}:  # limit-1 responses pass through unchanged
+            return response
+        job_id = url.rsplit("/", 1)[-1]
+        root = self._jobs[job_id]["url"]
+        return _CloudflareResponse(  # parent's records are replaced by the two below
+            {
+                "success": True,
+                "result": {
+                    "id": job_id,
+                    "status": "completed",
+                    "records": [
+                        {
+                            "url": "https://external.test/page",  # listed before the root
+                            "status": "completed",
+                            "markdown": "# External\n",
+                            "metadata": {
+                                "status": 200,
+                                "title": "External",
+                                "url": "https://external.test/page",
+                            },
+                        },
+                        {
+                            "url": root,
+                            "status": "completed",
+                            "markdown": "# Root\n",
+                            "metadata": {
+                                "status": 200,
+                                "title": "Root",
+                                "url": root,
+                            },
+                        },
+                    ],
+                },
+            }
+        )
+
+
+class _RedirectCloudflareSession(_ParameterizedCloudflareSession):
+    def get(
+        self,
+        url: str,
+        *,
+        headers: dict[str, str],
+        params: dict[str, Any] | None = None,
+        timeout: float,
+    ) -> _CloudflareResponse:
+        response = super().get(url, headers=headers, params=params, timeout=timeout)
+        if params == {"limit": 1}:  # limit-1 responses pass through unchanged
+            return response
+        job_id = url.rsplit("/", 1)[-1]
+        root = self._jobs[job_id]["url"]
+        final_url = f"{root.rstrip('/')}/landing"  # "/landing" under the posted root URL
+        return _CloudflareResponse(  # single record whose URL is final_url, not root
+            {
+                "success": True,
+                "result": {
+                    "id": job_id,
+                    "status": "completed",
+                    "records": [
+                        {
+                            "url": final_url,
+                            "status": "completed",
+                            "markdown": "# Landing\n",
+                            "metadata": {
+                                "status": 200,
+                                "title": "Landing",
+                                "url": final_url,
+                            },
+                        }
+                    ],
+                },
+            }
+        )
+
+
+class _CrossOriginRedirectCloudflareSession(_ParameterizedCloudflareSession):
+    def get(
+        self,
+        url: str,
+        *,
+        headers: dict[str, str],
+        params: dict[str, Any] | None = None,
+        timeout: float,
+    ) -> _CloudflareResponse:
+        response = super().get(url, headers=headers, params=params, timeout=timeout)
+        if params == {"limit": 1}:  # limit-1 responses pass through unchanged
+            return response
+        job_id = url.rsplit("/", 1)[-1]
+        final_url = "https://example.com/landing"  # fixed URL, independent of the job root
+        return _CloudflareResponse(  # single record whose URL is always final_url
+            {
+                "success": True,
+                "result": {
+                    "id": job_id,
+                    "status": "completed",
+                    "records": [
+                        {
+                            "url": final_url,
+                            "status": "completed",
+                            "markdown": "# Landing\n",
+                            "metadata": {
+                                "status": 200,
+                                "title": "Landing",
+                                "url": final_url,
+                            },
+                        }
+                    ],
+                },
+            }
+        )
+
+
+def test_cloudflare_crawler_polls_job_and_uses_markdown_records(
+    tmp_path: Path,
+) -> None:
+    session = _CloudflareSession()  # its first get() reports the job as "running"
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-cache",
+        render=False,
+        session=session,
+        poll_interval=0,
+    )
+    scope = CrawlScope(
+        roots=["https://example.com/docs"],
+        depth=2,
+        limit=25,
+        include_patterns=["https://example.com/docs/**"],
+        exclude_patterns=["https://example.com/docs/archive/**"],
+        include_external_links=True,
+        include_subdomains=True,
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [  # the two record URLs the fake session returns
+        "https://example.com/docs",
+        "https://example.com/docs/page",
+    ]
+    assert len(session.post_calls) == 1  # a single crawl job was posted
+    post_url, payload, headers = session.post_calls[0]
+    assert post_url.endswith("/accounts/account-123/browser-rendering/crawl")
+    assert headers["Authorization"] == "Bearer token-123"
+    assert payload["formats"] == ["markdown"]
+    assert payload["depth"] == 2  # scope fields appear in the posted payload
+    assert payload["limit"] == 25
+    assert payload["render"] is False
+    assert payload["options"]["includePatterns"] == ["https://example.com/docs/**"]
+    assert payload["options"]["excludePatterns"] == [
+        "https://example.com/docs/archive/**"
+    ]
+    assert payload["options"]["includeExternalLinks"] is True
+    assert payload["options"]["includeSubdomains"] is True
+
+    page_source = crawler.fetch_raw("https://example.com/docs/page")
+    assert page_source.status_code == 200
+    assert page_source.markdown_path is not None
+    assert page_source.markdown_path.read_text(encoding="utf-8") == "## Page\n"
+
+    page_doc = crawler.fetch_markdown("https://example.com/docs/page")
+    assert page_doc == MarkdownDocument(
+        origin="https://example.com/docs/page",
+        content="## Page\n",
+    )
+    assert len(session.post_calls) == 1  # the fetches above posted no further job
+
+
+def test_cloudflare_markdown_documents_reuses_immediately_stale_discovery_cache(
+    tmp_path: Path,
+) -> None:  # With a zero staleness window, markdown_documents() still POSTs only once.
+    session = _ParameterizedCloudflareSession()  # defined outside this chunk
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-stale-cache",
+        session=session,
+        cache_stale_after=timedelta(seconds=0),  # zero-length staleness window
+        poll_interval=0,
+    )
+    scope = CrawlScope(roots=["https://example.com"], depth=0)
+
+    documents = list(crawler.markdown_documents(scope, progress=False))
+
+    assert documents == [
+        MarkdownDocument(origin="https://example.com", content="# Docs\n")
+    ]
+    assert len(session.post_calls) == 1  # discovery and document fetch shared one POST
+
+
+def test_cloudflare_crawler_accepts_crawl_scope_for_roots_and_patterns(
+    tmp_path: Path,
+) -> None:  # The scope's depth and include_patterns appear in the POST payload.
+    session = _ParameterizedCloudflareSession()  # defined outside this chunk
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-scope-cache",
+        session=session,
+        poll_interval=0,
+    )
+    scope = CrawlScope(
+        roots=["https://example.com/docs"],
+        depth=1,
+        include_patterns=["https://example.com/docs/**"],
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [
+        "https://example.com/docs",
+        "https://example.com/docs/page",
+    ]
+    assert session.post_calls[0][1]["depth"] == 1  # index 1 of a call is its payload
+    assert session.post_calls[0][1]["options"]["includePatterns"] == [
+        "https://example.com/docs/**"
+    ]
+
+
+def test_cloudflare_crawler_filters_returned_records_to_web_scope(
+    tmp_path: Path,
+) -> None:  # With external links and subdomains off, only two example.com URLs are listed.
+    session = _OutOfScopeCloudflareSession()  # defined outside this chunk; presumably also returns out-of-scope records
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-returned-scope-cache",
+        session=session,
+        poll_interval=0,
+    )
+    scope = CrawlScope(
+        roots=["https://example.com/root"],
+        depth=1,
+        include_external_links=False,
+        include_subdomains=False,
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [
+        "https://example.com/root",
+        "https://example.com/page",
+    ]
+
+
+def test_cloudflare_crawler_does_not_treat_external_first_record_as_seed(
+    tmp_path: Path,
+) -> None:  # With limit=1 and external links off, only the requested root is listed.
+    session = _ExternalFirstCloudflareSession()  # defined outside this chunk; presumably lists an external record first
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-external-first-cache",
+        session=session,
+        poll_interval=0,
+    )
+    scope = CrawlScope(
+        roots=["https://example.com/root"],
+        depth=1,
+        limit=1,
+        include_external_links=False,
+        include_subdomains=False,
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == ["https://example.com/root"]
+
+
+def test_cloudflare_markdown_documents_keeps_cross_origin_redirected_seed(
+    tmp_path: Path,
+) -> None:  # An http:// root at depth 0 still yields the https:// landing document.
+    session = _CrossOriginRedirectCloudflareSession()  # defined outside this chunk
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-cross-origin-seed-cache",
+        session=session,
+        poll_interval=0,
+    )
+    scope = CrawlScope(
+        roots=["http://example.com"],  # http scheme; the expected document below is https
+        depth=0,
+        include_external_links=False,
+        include_subdomains=False,
+    )
+
+    documents = list(crawler.markdown_documents(scope, progress=False))
+
+    assert documents == [
+        MarkdownDocument(
+            origin="https://example.com/landing",  # the landing URL, not the requested root
+            content="# Landing\n",
+        )
+    ]
+
+
+def test_cloudflare_crawler_cache_key_includes_crawl_parameters(
+    tmp_path: Path,
+) -> None:  # fetch_raw() and origins() on the same URL each POST with their own depth/limit.
+    session = _ParameterizedCloudflareSession()  # defined outside this chunk
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-cache",
+        session=session,
+        poll_interval=0,
+    )
+    scope = CrawlScope(
+        roots=["https://example.com/docs"],
+        depth=2,
+        limit=25,
+    )
+
+    source = crawler.fetch_raw("https://example.com/docs")  # expected to produce post_calls[0]
+    origins = list(crawler.origins(scope, progress=False))  # expected to produce post_calls[1]
+
+    assert source.origin == "https://example.com/docs"
+    assert origins == [
+        "https://example.com/docs",
+        "https://example.com/docs/page",
+    ]
+    assert len(session.post_calls) == 2  # the earlier single-page fetch was not reused for the crawl
+    assert session.post_calls[0][1]["depth"] == 0
+    assert session.post_calls[0][1]["limit"] == 1
+    assert session.post_calls[1][1]["depth"] == 2
+    assert session.post_calls[1][1]["limit"] == 25
+
+
+def test_cloudflare_crawler_rechecks_stale_in_memory_records(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:  # After the scripted clock jumps past the staleness window, fetch_raw() POSTs again.
+    session = _ParameterizedCloudflareSession()  # defined outside this chunk
+    times = iter(
+        [
+            datetime(2026, 1, 1, tzinfo=timezone.utc),  # first clock reading
+            datetime(2026, 1, 1, 0, 0, 2, tzinfo=timezone.utc),  # every later reading is 2 s on
+            datetime(2026, 1, 1, 0, 0, 2, tzinfo=timezone.utc),
+            datetime(2026, 1, 1, 0, 0, 2, tzinfo=timezone.utc),
+        ]
+    )
+    monkeypatch.setattr(crawl_module, "_utcnow", lambda: next(times))  # scripted clock; a fifth reading raises StopIteration
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-cache",
+        cache_stale_after=timedelta(seconds=1),  # shorter than the 2 s clock jump
+        session=session,
+        poll_interval=0,
+    )
+    scope = CrawlScope(roots=["https://example.com/docs"], depth=2)
+
+    origins = list(crawler.origins(scope, progress=False))
+    page_source = crawler.fetch_raw("https://example.com/docs/page")
+
+    assert origins == [
+        "https://example.com/docs",
+        "https://example.com/docs/page",
+    ]
+    assert page_source.origin == "https://example.com/docs/page"
+    assert len(session.post_calls) == 2
+    assert session.post_calls[1][1]["url"] == "https://example.com/docs/page"  # second POST targets the page itself
+    assert session.post_calls[1][1]["depth"] == 0
+    assert session.post_calls[1][1]["limit"] == 1
+
+
+def test_cloudflare_fetch_raw_ignores_discovery_patterns_for_explicit_origin(
+    tmp_path: Path,
+) -> None:  # The POST made by fetch_raw() carries no includePatterns option.
+    session = _DiscoveryFilteringCloudflareSession()  # defined outside this chunk
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-cache",
+        session=session,
+        poll_interval=0,
+    )
+
+    source = crawler.fetch_raw("https://example.com/docs")
+
+    assert source.origin == "https://example.com/docs"
+    assert source.status_code == 200
+    assert "includePatterns" not in session.post_calls[0][1]["options"]
+
+
+def test_cloudflare_fetch_raw_accepts_redirected_record(
+    tmp_path: Path,
+) -> None:  # origin stays as requested, resolved_origin is the landing URL, and both survive a cache reload.
+    cache = tmp_path / "cloudflare-redirect-cache"  # shared by both crawler instances below
+    session = _RedirectCloudflareSession()  # defined outside this chunk
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=cache,
+        session=session,
+        poll_interval=0,
+    )
+
+    source = crawler.fetch_raw("https://example.com")
+
+    assert source.origin == "https://example.com"
+    assert source.resolved_origin == "https://example.com/landing"
+    assert source.body_path.read_text(encoding="utf-8") == "# Landing\n"
+    assert len(session.post_calls) == 1
+
+    cached_session = _RedirectCloudflareSession()  # fresh session so any new POST would be visible
+    cached_crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=cache,
+        session=cached_session,
+        poll_interval=0,
+    )
+    cached_source = cached_crawler.fetch_raw("https://example.com")
+
+    assert cached_source.resolved_origin == "https://example.com/landing"
+    assert cached_session.post_calls == []  # the second crawler made no POST
+
+
+def test_cloudflare_fetch_raw_accepts_cross_origin_redirected_record(
+    tmp_path: Path,
+) -> None:  # An http:// request whose record is the https:// landing page is still returned.
+    session = _CrossOriginRedirectCloudflareSession()  # defined outside this chunk
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-cross-origin-redirect-cache",
+        session=session,
+        poll_interval=0,
+    )
+
+    source = crawler.fetch_raw("http://example.com")
+
+    assert source.origin == "http://example.com"  # the requested URL is kept as origin
+    assert source.resolved_origin == "https://example.com/landing"
+    assert source.body_path.read_text(encoding="utf-8") == "# Landing\n"
+
+
+def test_cloudflare_fetch_raw_reuses_cache_directory_across_instances(
+    tmp_path: Path,
+) -> None:  # A second crawler on the same cache_dir returns the page without any POST.
+    cache = tmp_path / "cloudflare-cache"  # shared by both crawler instances below
+    first_session = _ParameterizedCloudflareSession()  # defined outside this chunk
+    first_crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=cache,
+        session=first_session,
+        poll_interval=0,
+    )
+
+    first = first_crawler.fetch_raw("https://example.com/docs")
+
+    second_session = _ParameterizedCloudflareSession()  # separate session so POST counts are per crawler
+    second_crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=cache,
+        session=second_session,
+        poll_interval=0,
+    )
+
+    second = second_crawler.fetch_raw("https://example.com/docs")
+
+    assert first.body_path.read_text(encoding="utf-8") == "# Docs\n"
+    assert second.body_path.read_text(encoding="utf-8") == "# Docs\n"
+    assert second.status_code == 200
+    assert len(first_session.post_calls) == 1
+    assert second_session.post_calls == []
+
+
+def test_cloudflare_origins_reuses_root_cache_directory_across_instances(
+    tmp_path: Path,
+) -> None:  # A second crawler on the same cache_dir lists origins and fetches a page without POSTing.
+    cache = tmp_path / "cloudflare-cache"  # shared by both crawler instances below
+    scope = CrawlScope(roots=["https://example.com/docs"], depth=1)
+    first_session = _ParameterizedCloudflareSession()  # defined outside this chunk
+    first_crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=cache,
+        session=first_session,
+        poll_interval=0,
+    )
+
+    first_origins = list(first_crawler.origins(scope, progress=False))
+
+    second_session = _ParameterizedCloudflareSession()  # separate session so POST counts are per crawler
+    second_crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=cache,
+        session=second_session,
+        poll_interval=0,
+    )
+
+    second_origins = list(second_crawler.origins(scope, progress=False))
+    page_source = second_crawler.fetch_raw("https://example.com/docs/page")  # a discovered page, not the root
+
+    assert first_origins == [
+        "https://example.com/docs",
+        "https://example.com/docs/page",
+    ]
+    assert second_origins == first_origins
+    assert page_source.origin == "https://example.com/docs/page"
+    assert len(first_session.post_calls) == 1
+    assert second_session.post_calls == []
+
+
+def test_cloudflare_markdown_documents_canonicalizes_record_urls(
+    tmp_path: Path,
+) -> None:  # The document origin is expected as "https://example.com", matching the root.
+    session = _TrailingSlashCloudflareSession()  # defined outside this chunk; presumably returns the URL with a trailing slash
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-record-url-cache",
+        session=session,
+        poll_interval=0,
+    )
+    scope = CrawlScope(roots=["https://example.com"], depth=0)
+
+    documents = list(crawler.markdown_documents(scope, progress=False))
+
+    assert documents == [
+        MarkdownDocument(origin="https://example.com", content="# Docs\n")
+    ]
+
+
+def test_cloudflare_crawler_cache_dir_uses_hashed_file_pair(
+    tmp_path: Path,
+) -> None:  # One fetch leaves exactly <base>.md and <base>.metadata.json in the cache dir.
+    session = _ParameterizedCloudflareSession()  # defined outside this chunk
+    cache_dir = tmp_path / "cloudflare-cache"
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=cache_dir,
+        session=session,
+        poll_interval=0,
+    )
+
+    source = crawler.fetch_raw("https://example.com/docs")
+
+    base = _expected_cache_base(source.origin)  # helper defined outside this chunk
+    metadata_path = cache_dir / f"{base}.metadata.json"
+    content_path = cache_dir / f"{base}.md"
+    assert source.body_path.read_text(encoding="utf-8") == "# Docs\n"
+    assert sorted(path.name for path in cache_dir.iterdir()) == [
+        content_path.name,  # ".md" sorts before ".metadata.json"
+        metadata_path.name,
+    ]
+    record = json.loads(metadata_path.read_text(encoding="utf-8"))
+    assert record["key"] == source.origin
+    assert record["content_path"] == content_path.name  # stored as a bare file name
+    assert record["metadata"]["record"]["url"] == source.origin
+
+
+def test_cloudflare_crawler_cache_dir_true_uses_default_backend_directory(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:  # cache_dir=True writes under .raghilda/cache/cloudflare in the working directory.
+    monkeypatch.chdir(tmp_path)  # so the expected cache path below lands inside tmp_path
+    session = _ParameterizedCloudflareSession()  # defined outside this chunk
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=True,
+        session=session,
+        poll_interval=0,
+    )
+
+    source = crawler.fetch_raw("https://example.com/docs")
+
+    cache_dir = tmp_path / ".raghilda" / "cache" / "cloudflare"
+    base = _expected_cache_base(source.origin)  # helper defined outside this chunk
+    assert sorted(path.name for path in cache_dir.iterdir()) == [
+        f"{base}.md",
+        f"{base}.metadata.json",
+    ]
+
+
+def test_cloudflare_fetch_raw_scopes_cache_to_account_id(
+    tmp_path: Path,
+) -> None:  # A crawler with a different account_id POSTs again despite sharing the cache_dir.
+    cache = tmp_path / "cloudflare-cache"  # shared by both crawler instances below
+    first_session = _ParameterizedCloudflareSession()  # defined outside this chunk
+    first_crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=cache,
+        session=first_session,
+        poll_interval=0,
+    )
+    first_crawler.fetch_raw("https://example.com/docs")
+
+    second_session = _ParameterizedCloudflareSession()
+    second_crawler = CloudflareCrawler(
+        account_id="account-456",  # the only constructor difference besides the session
+        api_token="token-123",
+        cache_dir=cache,
+        session=second_session,
+        poll_interval=0,
+    )
+
+    second_crawler.fetch_raw("https://example.com/docs")
+
+    assert len(first_session.post_calls) == 1
+    assert len(second_session.post_calls) == 1
+    assert "/accounts/account-456/" in second_session.post_calls[0][0]  # index 0 of a call is the URL
+
+
+def test_cloudflare_fetch_raw_scopes_cache_to_api_base(
+    tmp_path: Path,
+) -> None:  # A crawler with a different base_url POSTs again despite sharing the cache_dir.
+    cache = tmp_path / "cloudflare-cache"  # shared by both crawler instances below
+    first_session = _ParameterizedCloudflareSession()  # defined outside this chunk
+    first_crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=cache,
+        session=first_session,
+        poll_interval=0,
+        base_url="https://prod.example/api",
+    )
+    first_crawler.fetch_raw("https://example.com/docs")
+
+    second_session = _ParameterizedCloudflareSession()
+    second_crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=cache,
+        session=second_session,
+        poll_interval=0,
+        base_url="https://staging.example/api",  # the only constructor difference besides the session
+    )
+
+    second_crawler.fetch_raw("https://example.com/docs")
+
+    assert len(first_session.post_calls) == 1
+    assert len(second_session.post_calls) == 1
+    assert second_session.post_calls[0][0].startswith(
+        "https://staging.example/api/accounts/account-123/"
+    )
+
+
+def test_cloudflare_crawler_applies_limit_across_all_roots(
+    tmp_path: Path,
+) -> None:  # With two roots and limit=1, only the first root is listed and POSTed.
+    session = _ParameterizedCloudflareSession()  # defined outside this chunk
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-cache",
+        session=session,
+        poll_interval=0,
+    )
+    scope = CrawlScope(
+        roots=["https://example.com/docs-a", "https://example.com/docs-b"],
+        limit=1,
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == ["https://example.com/docs-a"]
+    assert len(session.post_calls) == 1  # no crawl was started for docs-b
+
+
+def test_cloudflare_crawler_deduplicates_roots_before_counting_limit(
+    tmp_path: Path,
+) -> None:  # A repeated root does not use up the limit or cause a second POST.
+    session = _ParameterizedCloudflareSession()  # defined outside this chunk
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-dedupe-cache",
+        session=session,
+        poll_interval=0,
+    )
+    scope = CrawlScope(
+        roots=[
+            "https://example.com/docs-a",
+            "https://example.com/docs-a",  # deliberate duplicate of the first root
+            "https://example.com/docs-b",
+        ],
+        depth=0,
+        limit=2,
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [
+        "https://example.com/docs-a",
+        "https://example.com/docs-b",
+    ]
+    assert [call[1]["url"] for call in session.post_calls] == [
+        "https://example.com/docs-a",
+        "https://example.com/docs-b",
+    ]
+
+
+def test_cloudflare_crawler_applies_limit_after_deduplication(
+    tmp_path: Path,
+) -> None:  # With limit=2 over two roots, the result is two distinct URLs.
+    session = _OverlappingLimitedCloudflareSession()  # defined outside this chunk; presumably both roots return a shared URL
+    crawler = CloudflareCrawler(
+        account_id="account-123",
+        api_token="token-123",
+        cache_dir=tmp_path / "cloudflare-overlap-cache",
+        session=session,
+        poll_interval=0,
+    )
+    scope = CrawlScope(
+        roots=[
+            "https://example.com/root-a",
+            "https://example.com/root-b",
+        ],
+        limit=2,
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [
+        "https://example.com/shared",
+        "https://example.com/root-b/unique",
+    ]
+    assert "limit" not in session.post_calls[1][1]  # the second POST payload has no "limit" key
+
+
+def test_directory_crawler_counts_file_roots_toward_limit(tmp_path: Path) -> None:
+    first = _write(tmp_path, "a.md", "# First")  # _write is a helper defined outside this chunk
+    second = _write(tmp_path, "b.md", "# Second")
+    crawler = DirectoryCrawler()
+    scope = CrawlScope(roots=[first, second], limit=1)  # both roots are files, not directories
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [first.resolve().as_uri()]  # limit=1 keeps only the first file root
+
+
+def test_directory_crawler_deduplicates_roots_before_counting_limit(
+    tmp_path: Path,
+) -> None:  # A file that is also inside a directory root counts once toward the limit.
+    docs = tmp_path / "docs"
+    first = _write(docs, "a.md", "# First")  # _write is a helper defined outside this chunk
+    second = _write(docs, "b.md", "# Second")
+    crawler = DirectoryCrawler()
+    scope = CrawlScope(roots=[first, docs], limit=2)  # "first" is also contained in "docs"
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [
+        first.resolve().as_uri(),
+        second.resolve().as_uri(),
+    ]
+
+
+def test_directory_crawler_applies_limit_without_prewalking_tree(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:  # origins() must produce its result without ever calling Path.rglob.
+    first = _write(tmp_path, "a.md", "# First")  # _write is a helper defined outside this chunk
+    _write(tmp_path, "z/b.md", "# Second")  # a second file, one directory below the root
+    crawler = DirectoryCrawler()
+
+    def fail_rglob(self: Path, pattern: str):
+        del self, pattern  # arguments are deliberately unused
+        raise AssertionError("DirectoryCrawler should not prewalk with rglob")
+
+    monkeypatch.setattr(Path, "rglob", fail_rglob)  # any rglob call now fails the test
+
+    origins = list(
+        crawler.origins(CrawlScope(roots=[tmp_path], depth=0, limit=1), progress=False)
+    )
+
+    assert origins == [first.resolve().as_uri()]
+
+
+def test_directory_crawler_does_not_follow_symlinked_directories_outside_root(
+    tmp_path: Path,
+) -> None:  # Files reachable only through a directory symlink to outside the root are not listed.
+    root = tmp_path / "root"
+    inside = _write(root, "inside.md", "# Inside")  # _write is a helper defined outside this chunk
+    external_dir = tmp_path / "external"  # sibling of root, not inside it
+    outside = _write(external_dir, "outside.md", "# Outside")
+    link = root / "linked"
+    try:
+        link.symlink_to(external_dir, target_is_directory=True)
+    except OSError as exc:
+        pytest.skip(f"Symlink creation failed: {exc}")  # skip when symlink creation raises OSError
+    crawler = DirectoryCrawler()
+
+    origins = list(crawler.origins(CrawlScope(roots=[root], depth=2), progress=False))
+
+    assert origins == [inside.resolve().as_uri()]
+    assert outside.resolve().as_uri() not in origins
+
+
+def test_directory_crawler_skips_type_sniffing_without_type_filters(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:  # With no include/exclude types in the scope, identify_path() must never be called.
+    document = _write(tmp_path, "extensionless", "# Document")  # file name has no extension
+
+    class _FailingMagika:  # stand-in whose only method fails the test when invoked
+        def identify_path(self, path: Path):
+            raise AssertionError(f"Unexpected type sniff for {path}")
+
+    monkeypatch.setattr(crawl_module, "_MAGIKA", _FailingMagika())
+    crawler = DirectoryCrawler()
+
+    origins = list(crawler.origins(CrawlScope(roots=[tmp_path]), progress=False))
+
+    assert origins == [document.resolve().as_uri()]
+
+
+def test_directory_crawler_coerces_scalar_patterns_and_types(
+    tmp_path: Path,
+) -> None:  # Plain strings (not lists) are accepted for patterns and type filters.
+    docs = tmp_path / "docs"
+    readme = _write(docs, "readme.md", "# Readme")  # _write is a helper defined outside this chunk
+    _write(docs, "skip.py", "print('skip')")  # under docs/ but absent from the expected result
+    _write(tmp_path, "notes.md", "# Notes")  # outside docs/ and absent from the expected result
+    crawler = DirectoryCrawler()
+    scope = CrawlScope(
+        roots=[tmp_path],
+        include_patterns=r".*/docs/.*",  # a single string rather than a list
+        include_types="markdown",
+        exclude_types="python",
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == [readme.resolve().as_uri()]
+
+
+def test_directory_crawler_accepts_crawl_scope_for_roots_and_patterns(
+    tmp_path: Path,
+) -> None:  # origins() and markdown_documents() both honour the scope's include pattern.
+    docs = _write(tmp_path, "docs/readme.md", "# Hello")  # path of the written file, despite the name
+    _write(tmp_path, "notes/todo.md", "# Skip")  # does not match the include pattern
+    crawler = DirectoryCrawler()
+    scope = CrawlScope(
+        roots=[tmp_path],
+        depth=1,
+        include_patterns=[r".*/docs/.*"],
+    )
+
+    origins = list(crawler.origins(scope, progress=False))
+    documents = list(crawler.markdown_documents(scope, progress=False))
+
+    assert origins == [docs.resolve().as_uri()]
+    assert documents == [
+        MarkdownDocument(origin=docs.resolve().as_uri(), content="# Hello")
+    ]
+
+
+def test_directory_crawler_returns_no_origins_when_limit_is_zero(
+    tmp_path: Path,
+) -> None:  # limit=0 yields nothing even though the root is an existing file.
+    markdown = _write(tmp_path, "a.md", "# First")  # _write is a helper defined outside this chunk
+    crawler = DirectoryCrawler()
+    scope = CrawlScope(roots=[markdown], limit=0)
+
+    origins = list(crawler.origins(scope, progress=False))
+
+    assert origins == []
+
+
+def test_directory_crawler_fetch_markdown_refreshes_when_file_changes(
+    tmp_path: Path,
+) -> None:  # With caching on, editing the source file changes what fetch_markdown() returns.
+    markdown = _write(tmp_path, "docs/readme.md", "# Hello")  # _write is a helper defined outside this chunk
+    cache = tmp_path / "cache"
+    crawler = DirectoryCrawler(cache_dir=cache)
+
+    origin = markdown.resolve().as_uri()
+    first = crawler.fetch_markdown(origin)
+    markdown.write_text("# Updated\n", encoding="utf-8")  # modify the source after the first fetch
+
+    refreshed = crawler.fetch_markdown(origin)
+
+    assert first == MarkdownDocument(origin=origin, content="# Hello")
+    assert refreshed == MarkdownDocument(origin=origin, content="# Updated\n")
+
+
+def test_directory_crawler_excludes_own_cache_files_from_directory_walk(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:  # With cache_dir=True and the cwd as root, only the source file is ever listed.
+    markdown = _write(tmp_path, "docs/readme.md", "# Hello")  # _write is a helper defined outside this chunk
+    monkeypatch.chdir(tmp_path)  # cwd becomes the crawled root; presumably the default cache lands under it
+    crawler = DirectoryCrawler(cache_dir=True)
+    scope = CrawlScope(roots=[tmp_path])
+
+    documents = list(crawler.markdown_documents(scope, progress=False))
+    origins = list(crawler.origins(scope, progress=False))  # listed after the document pass has run
+
+    assert documents == [
+        MarkdownDocument(origin=markdown.resolve().as_uri(), content="# Hello")
+    ]
+    assert origins == [markdown.resolve().as_uri()]
+
+
+def test_directory_crawler_fetch_markdown_force_refresh_rebuilds_cached_markdown(
+    tmp_path: Path,
+) -> None:  # cache_force_refresh=True returns source content, not the tampered cache file.
+    markdown = _write(tmp_path, "docs/readme.md", "# Hello")  # _write is a helper defined outside this chunk
+    cache = tmp_path / "cache"
+    crawler = DirectoryCrawler(cache_dir=cache)
+
+    origin = markdown.resolve().as_uri()
+    first = crawler.fetch_markdown(origin)
+    cached_markdown = next(  # first cache entry that is not a ".metadata.json" file
+        path for path in cache.iterdir() if not path.name.endswith(".metadata.json")
+    )
+    cached_markdown.write_text("# Stale\n", encoding="utf-8")  # tamper with the cached copy
+
+    refreshed = crawler.fetch_markdown(origin, cache_force_refresh=True)
+
+    assert first.content == "# Hello"
+    assert refreshed.content == "# Hello"
+
+
+def test_directory_crawler_markdown_documents_force_refresh_rebuilds_cache(
+    tmp_path: Path,
+) -> None:  # markdown_documents(cache_force_refresh=True) ignores a tampered cache file.
+    markdown = _write(tmp_path, "docs/readme.md", "# Hello")  # _write is a helper defined outside this chunk
+    cache = tmp_path / "cache"
+    crawler = DirectoryCrawler(cache_dir=cache)
+    root = tmp_path / "docs"
+    scope = CrawlScope(roots=[root])
+
+    documents = list(crawler.markdown_documents(scope, progress=False))
+    cached_markdown = next(  # first cache entry that is not a ".metadata.json" file
+        path for path in cache.iterdir() if not path.name.endswith(".metadata.json")
+    )
+    cached_markdown.write_text("# Stale\n", encoding="utf-8")  # tamper with the cached copy
+
+    refreshed = list(
+        crawler.markdown_documents(
+            scope,
+            progress=False,
+            cache_force_refresh=True,
+        )
+    )
+
+    assert documents == [
+        MarkdownDocument(origin=markdown.resolve().as_uri(), content="# Hello")
+    ]
+    assert refreshed == [
+        MarkdownDocument(origin=markdown.resolve().as_uri(), content="# Hello")
+    ]
+
+
+def test_directory_crawler_markdown_documents_converts_in_parallel(
+    tmp_path: Path,
+) -> None:  # Two convert() calls must overlap in time, yet results keep a.md-then-b.md order.
+    first = _write(tmp_path, "docs/a.md", "# First")  # _write is a helper defined outside this chunk
+    second = _write(tmp_path, "docs/b.md", "# Second")
+    crawler = DirectoryCrawler(max_workers=2)
+    scope = CrawlScope(roots=[tmp_path / "docs"])
+    barrier = threading.Barrier(2)  # passes only once two convert() calls are waiting together
+    lock = threading.Lock()  # guards the two counters below
+    in_flight = 0
+    max_in_flight = 0
+
+    def convert(source: FetchedSource) -> MarkdownDocument:
+        nonlocal in_flight, max_in_flight
+        with lock:
+            in_flight += 1
+            max_in_flight = max(max_in_flight, in_flight)
+        try:
+            barrier.wait(timeout=1.0)  # raises BrokenBarrierError if no second call arrives within 1 s
+            return MarkdownDocument(
+                origin=source.origin,
+                content=source.body_path.read_text(encoding="utf-8"),
+            )
+        finally:
+            with lock:
+                in_flight -= 1
+
+    documents = list(crawler.markdown_documents(scope, progress=False, convert=convert))
+
+    assert documents == [
+        MarkdownDocument(origin=first.resolve().as_uri(), content="# First"),
+        MarkdownDocument(origin=second.resolve().as_uri(), content="# Second"),
+    ]
+    assert max_in_flight == 2
+
+
+def test_directory_crawler_reopens_origins_with_uri_escaped_characters(
+    tmp_path: Path,
+) -> None:  # An origin containing %20 can be passed back to fetch_markdown().
+    root = tmp_path / "My Docs"  # space in the directory name
+    markdown = _write(root, "read me.md", "# Hello")  # space in the file name
+    crawler = DirectoryCrawler()
+
+    origin = next(crawler.origins(CrawlScope(roots=[root]), progress=False))
+    document = crawler.fetch_markdown(origin)
+
+    assert "%20" in origin  # the spaces are percent-encoded in the origin
+    assert document == MarkdownDocument(
+        origin=markdown.resolve().as_uri(),
+        content="# Hello",
+    )
+
+
+def test_directory_crawler_accepts_percent_escaped_file_uri_roots(
+    tmp_path: Path,
+) -> None:  # A root given as a file:// URI string (with escaped spaces) is crawled like a path.
+    root = tmp_path / "My Docs"  # space in the directory name
+    markdown = _write(root, "read me.md", "# Hello")  # _write is a helper defined outside this chunk
+    crawler = DirectoryCrawler()
+
+    origins = list(
+        crawler.origins(CrawlScope(roots=[root.resolve().as_uri()]), progress=False)
+    )
+
+    assert origins == [markdown.resolve().as_uri()]
+
+
+def test_directory_crawler_accepts_windows_drive_letter_string_roots(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:  # The string root "C:\docs" is resolved as a filesystem path.
+    root = tmp_path / "C:\\docs"  # NOTE(review): on Windows this join likely yields C:\docs outside tmp_path - confirm/skip there
+    markdown = _write(root, "readme.md", "# Hello")  # _write is a helper defined outside this chunk
+    monkeypatch.chdir(tmp_path)  # so a relative "C:\docs" root would resolve under tmp_path
+    crawler = DirectoryCrawler()
+
+    origins = list(crawler.origins(CrawlScope(roots=["C:\\docs"]), progress=False))
+
+    assert origins == [markdown.resolve().as_uri()]
+
+
+@pytest.mark.skipif(os.name != "nt", reason="Windows-specific file URI handling")
+def test_directory_crawler_round_trips_windows_file_uris(
+    tmp_path: Path,
+) -> None:  # On Windows, file URIs work both as roots and as fetch_raw() origins.
+    root = tmp_path / "My Docs"  # space in the directory name
+    markdown = _write(root, "read me.md", "# Hello")  # _write is a helper defined outside this chunk
+    crawler = DirectoryCrawler()
+
+    root_uri = root.resolve().as_uri()
+    origin = markdown.resolve().as_uri()
+
+    origins = list(crawler.origins(CrawlScope(roots=[root_uri]), progress=False))
+    source = crawler.fetch_raw(origin)
+
+    assert origins == [origin]
+    assert source.origin == origin
+    assert source.body_path == markdown.resolve()  # body_path is the source file itself
+
+
+def test_web_crawler_returns_no_origins_or_requests_when_limit_is_zero(
+    tmp_path: Path,
+) -> None:  # limit=0 yields no origins and the local server records no requests.
+    with _serve(  # _serve is a helper defined outside this chunk; presumably starts a local HTTP server
+        {
+            "/": {
+                "body": "<html><body><main>Root</main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+                "etag": None,
+            }
+        }
+    ) as server:
+        root_url = f"http://127.0.0.1:{server.server_port}/"
+        crawler = WebCrawler(
+            cache_dir=tmp_path / "zero-limit-cache",
+        )
+        scope = CrawlScope(roots=[root_url], depth=0, limit=0)
+
+        origins = list(crawler.origins(scope, progress=False))
+
+        assert origins == []
+        assert getattr(server, "requests") == []  # attribute presumably added by _serve
+
+
+def test_web_crawler_does_not_fetch_extra_root_once_limit_is_reached(
+    tmp_path: Path,
+) -> None:  # With limit=1 and two roots, only /first is requested from the server.
+    with _serve(  # _serve is a helper defined outside this chunk; presumably starts a local HTTP server
+        {
+            "/first": {
+                "body": "<html><body><main>First</main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+                "etag": None,
+            },
+            "/second": {
+                "body": "<html><body><main>Second</main></body></html>",
+                "content_type": "text/html; charset=utf-8",
+                "etag": None,
+            },
+        }
+    ) as server:
+        root_url = f"http://127.0.0.1:{server.server_port}"
+        crawler = WebCrawler(
+            cache_dir=tmp_path / "limit-cache",
+            max_workers=2,  # two workers, yet /second must still never be requested
+        )
+        scope = CrawlScope(
+            roots=[f"{root_url}/first", f"{root_url}/second"],
+            depth=0,
+            limit=1,
+        )
+
+        origins = list(crawler.origins(scope, progress=False))
+        requests = [request["path"] for request in getattr(server, "requests")]
+
+        assert origins == [f"{root_url}/first"]
+        assert requests == ["/first"]
diff --git a/tests/test_store_ingest.py b/tests/test_store_ingest.py
new file mode 100644
index 0000000..1b2ad61
--- /dev/null
+++ b/tests/test_store_ingest.py
@@ -0,0 +1,495 @@
+from __future__ import annotations
+
+from concurrent.futures import CancelledError
+from dataclasses import replace
+from pathlib import Path
+import threading
+import time
+from types import SimpleNamespace
+from typing import Any
+
+import pytest
+
+import raghilda._store as store_module
+from raghilda.chunker import MarkdownChunker
+from raghilda.document import Document, MarkdownDocument
+from raghilda.store import (
+    BaseStore,
+    ChromaDBStore,
+    DuckDBStore,
+    IngestSummary,
+    OpenAIStore,
+    WriteResult,
+)
+
+
+class _RecordingStore(BaseStore):
+    """Test double whose ``upsert`` records which origins ran and how concurrently.
+
+    ``doc-1`` sleeps briefly, ``doc-2`` raises ``RuntimeError("boom")``, and any
+    other origin returns immediately. The reported action is read from the
+    document's ``action`` attribute and defaults to ``"inserted"`` when the
+    document has no attributes.
+    """
+
+    def __init__(self) -> None:
+        # NOTE(review): BaseStore.__init__ is not called here; presumably the
+        # base class needs no initialisation -- confirm.
+        # Guards started_origins and the two in-flight counters below.
+        self.lock = threading.Lock()
+        # Origin -> thread id; filled by the tests' own ``prepare`` callables.
+        self.prepare_thread_ids: dict[str, int] = {}
+        # Origin -> thread id of the upsert call that completed without raising.
+        self.upsert_thread_ids: dict[str, int] = {}
+        # Every origin whose upsert was entered, in start order.
+        self.started_origins: list[str] = []
+        # Highest number of upsert calls observed running at the same time.
+        self.max_in_flight = 0
+        self.in_flight = 0
+
+    @staticmethod
+    def connect(*args, **kwargs) -> "_RecordingStore":
+        """Return a fresh instance; all arguments are ignored."""
+        return _RecordingStore()
+
+    @staticmethod
+    def create(*args, **kwargs) -> "_RecordingStore":
+        """Return a fresh instance; all arguments are ignored."""
+        return _RecordingStore()
+
+    def upsert(
+        self,
+        document: Document,
+        *,
+        skip_if_unchanged: bool = True,
+    ) -> WriteResult[Document]:
+        """Record the call and return a ``WriteResult`` (or raise for ``doc-2``).
+
+        ``skip_if_unchanged`` is accepted but not used.
+        """
+        origin = document.origin
+        assert isinstance(origin, str)
+        with self.lock:
+            self.started_origins.append(origin)
+            self.in_flight += 1
+            self.max_in_flight = max(self.max_in_flight, self.in_flight)
+        try:
+            # Only doc-1 takes measurable time; every other origin is instant.
+            time.sleep(0.02 if origin == "doc-1" else 0)
+            if origin == "doc-2":
+                raise RuntimeError("boom")
+            # Reached only on success, so failed origins never appear here.
+            self.upsert_thread_ids[origin] = threading.get_ident()
+            action = (
+                document.attributes["action"] if document.attributes else "inserted"
+            )
+            return WriteResult(action=action, document=document)
+        finally:
+            # Decrement on success and on failure alike.
+            with self.lock:
+                self.in_flight -= 1
+
+    def retrieve(self, text: str, top_k: int, *args, **kwargs):
+        """Return no results; retrieval is not exercised by these tests."""
+        return []
+
+    def size(self) -> int:
+        """Return the number of upsert calls that were started."""
+        return len(self.started_origins)
+
+
+class _BlockingFailureStore(BaseStore):
+    """Test double where ``doc-1`` blocks while ``doc-2`` fails.
+
+    The three events let a test observe when the blocking upsert started,
+    release it, and see that it ran to completion.
+    """
+
+    def __init__(self) -> None:
+        # Set when the doc-1 upsert has been entered.
+        self.blocking_started = threading.Event()
+        # Set by the test to let the doc-1 upsert continue.
+        self.release_blocked = threading.Event()
+        # Set by the doc-1 upsert just before it returns.
+        self.blocking_finished = threading.Event()
+
+    @staticmethod
+    def connect(*args, **kwargs) -> "_BlockingFailureStore":
+        """Return a fresh instance; all arguments are ignored."""
+        return _BlockingFailureStore()
+
+    @staticmethod
+    def create(*args, **kwargs) -> "_BlockingFailureStore":
+        """Return a fresh instance; all arguments are ignored."""
+        return _BlockingFailureStore()
+
+    def upsert(
+        self,
+        document: Document,
+        *,
+        skip_if_unchanged: bool = True,
+    ) -> WriteResult[Document]:
+        """Block for ``doc-1``, raise for ``doc-2``, succeed for anything else."""
+        del skip_if_unchanged
+        origin = document.origin
+        assert isinstance(origin, str)
+        if origin == "doc-1":
+            self.blocking_started.set()
+            # Waits at most one second; the result of wait() is not checked,
+            # so a timeout falls through and still reports success.
+            self.release_blocked.wait(timeout=1.0)
+            self.blocking_finished.set()
+            return WriteResult(action="inserted", document=document)
+        if origin == "doc-2":
+            # Fail only once doc-1 is known to be running.
+            assert self.blocking_started.wait(timeout=1.0)
+            raise RuntimeError("boom")
+        return WriteResult(action="inserted", document=document)
+
+    def retrieve(self, text: str, top_k: int, *args, **kwargs):
+        """Return no results; retrieval is not exercised by these tests."""
+        return []
+
+    def size(self) -> int:
+        """Always report an empty store."""
+        return 0
+
+
+class _CancelledSiblingStore(BaseStore):
+    """Test double whose ``doc-1`` upsert signals an event and then fails.
+
+    NOTE(review): no test in this module references this class, and
+    ``release_cancelled`` is never set or waited on -- confirm whether the
+    class is still needed.
+    """
+
+    def __init__(self) -> None:
+        # Set by the doc-1 upsert immediately before it raises.
+        self.allow_failure = threading.Event()
+        # Created but not used by any method of this class.
+        self.release_cancelled = threading.Event()
+
+    @staticmethod
+    def connect(*args, **kwargs) -> "_CancelledSiblingStore":
+        """Return a fresh instance; all arguments are ignored."""
+        return _CancelledSiblingStore()
+
+    @staticmethod
+    def create(*args, **kwargs) -> "_CancelledSiblingStore":
+        """Return a fresh instance; all arguments are ignored."""
+        return _CancelledSiblingStore()
+
+    def upsert(
+        self,
+        document: Document,
+        *,
+        skip_if_unchanged: bool = True,
+    ) -> WriteResult[Document]:
+        """Raise ``RuntimeError("boom")`` for ``doc-1``; succeed otherwise."""
+        del skip_if_unchanged
+        assert isinstance(document.origin, str)
+        if document.origin == "doc-1":
+            self.allow_failure.set()
+            raise RuntimeError("boom")
+        return WriteResult(action="inserted", document=document)
+
+    def retrieve(self, text: str, top_k: int, *args, **kwargs):
+        """Return no results; retrieval is not exercised by these tests."""
+        return []
+
+    def size(self) -> int:
+        """Always report an empty store."""
+        return 0
+
+
+def test_base_store_ingest_returns_summary_and_applies_prepare_before_upsert() -> None:
+    """Check the summary counts and that ``upsert`` sees the *prepared* document.
+
+    The input documents carry no attributes, so ``_RecordingStore.upsert``
+    would report ``"inserted"`` for both of them. Only ``prepare`` attaches the
+    ``action`` attribute, which means the ``skipped=1`` count can only appear
+    when ``ingest`` hands the output of ``prepare`` to ``upsert``.
+    """
+    store = _RecordingStore()
+    main_thread_id = threading.get_ident()
+    documents = [
+        MarkdownDocument(origin="doc-1", content="# One"),
+        MarkdownDocument(origin="doc-3", content="# Three"),
+    ]
+    # Action each origin should report once it has been through prepare().
+    prepared_actions = {"doc-1": "inserted", "doc-3": "skipped"}
+
+    def prepare(document: MarkdownDocument) -> MarkdownDocument:
+        assert document.origin is not None
+        store.prepare_thread_ids[document.origin] = threading.get_ident()
+        return replace(
+            document,
+            content=document.content + "\nprepared",
+            attributes={"action": prepared_actions[document.origin]},
+        )
+
+    summary = store.ingest(documents, prepare=prepare, max_workers=2)
+
+    # skipped=1 proves that upsert received the prepared doc-3, not the raw one.
+    assert summary == IngestSummary(inserted=1, replaced=0, skipped=1)
+    assert set(store.prepare_thread_ids) == {"doc-1", "doc-3"}
+    assert set(store.upsert_thread_ids) == {"doc-1", "doc-3"}
+    # prepare must never run on the calling thread.
+    assert main_thread_id not in store.prepare_thread_ids.values()
+
+
+def test_base_store_ingest_runs_prepare_in_worker_pool_concurrently() -> None:
+    """Check that two ``prepare`` calls overlap when ``max_workers=2``.
+
+    Both calls have to meet at a two-party barrier within its one-second
+    timeout, so the test can only pass when ``prepare`` runs in parallel.
+    """
+    store = _RecordingStore()
+    barrier = threading.Barrier(2)
+    lock = threading.Lock()
+    # Counters shared with prepare() below; guarded by ``lock``.
+    in_prepare = 0
+    max_in_prepare = 0
+
+    documents = [
+        MarkdownDocument(origin="doc-1", content="# One"),
+        MarkdownDocument(origin="doc-3", content="# Three"),
+    ]
+
+    def prepare(document: MarkdownDocument) -> MarkdownDocument:
+        nonlocal in_prepare, max_in_prepare
+        with lock:
+            in_prepare += 1
+            max_in_prepare = max(max_in_prepare, in_prepare)
+        try:
+            # Blocks until the other prepare call arrives (or the timeout hits).
+            barrier.wait(timeout=1.0)
+            return document
+        finally:
+            with lock:
+                in_prepare -= 1
+
+    summary = store.ingest(documents, prepare=prepare, max_workers=2)
+
+    assert summary == IngestSummary(inserted=2, replaced=0, skipped=0)
+    assert max_in_prepare == 2
+
+
+def test_base_store_ingest_starts_writes_before_input_is_exhausted() -> None:
+    """Check that ``ingest`` starts upserting while the input is still yielding.
+
+    The input generator refuses to produce its second document until the store
+    has seen an upsert, so an ``ingest`` that drained the input first would
+    trip the generator's assertion.
+    """
+
+    class _StreamingStore(BaseStore):
+        """Store that flags the first upsert and records the origins it saw."""
+
+        def __init__(self) -> None:
+            # Set as soon as any upsert call has been entered.
+            self.started = threading.Event()
+            self.started_origins: list[str] = []
+
+        @staticmethod
+        def connect(*args, **kwargs) -> "_StreamingStore":
+            return _StreamingStore()
+
+        @staticmethod
+        def create(*args, **kwargs) -> "_StreamingStore":
+            return _StreamingStore()
+
+        def upsert(
+            self,
+            document: Document,
+            *,
+            skip_if_unchanged: bool = True,
+        ) -> WriteResult[Document]:
+            del skip_if_unchanged
+            assert isinstance(document.origin, str)
+            self.started_origins.append(document.origin)
+            self.started.set()
+            return WriteResult(action="inserted", document=document)
+
+        def retrieve(self, text: str, top_k: int, *args, **kwargs):
+            return []
+
+        def size(self) -> int:
+            return len(self.started_origins)
+
+    store = _StreamingStore()
+
+    def documents():
+        yield MarkdownDocument(origin="doc-1", content="# One")
+        # Hold back the second document until the first write has begun.
+        assert store.started.wait(timeout=1.0)
+        yield MarkdownDocument(origin="doc-2", content="# Two")
+
+    summary = store.ingest(documents(), max_workers=1)
+
+    assert summary == IngestSummary(inserted=2, replaced=0, skipped=0)
+    assert store.started_origins == ["doc-1", "doc-2"]
+
+
+def test_base_store_ingest_raises_on_duplicate_after_streaming_started() -> None:
+    """A repeated origin raises ``ValueError`` after only the first was upserted."""
+    recording_store = _RecordingStore()
+    origin_and_content = (
+        ("dup", "# One"),
+        ("dup", "# Two"),
+        ("doc-3", "# Three"),
+    )
+    batch = [
+        MarkdownDocument(origin=origin, content=content)
+        for origin, content in origin_and_content
+    ]
+
+    with pytest.raises(ValueError, match="Duplicate origin during ingest: dup"):
+        recording_store.ingest(batch, max_workers=1)
+
+    # Only the first "dup" document reached upsert.
+    assert recording_store.started_origins == ["dup"]
+
+
+def test_base_store_ingest_fails_fast_and_bounds_worker_count() -> None:
+    """A failing upsert stops ingestion early; at most two upserts run at once."""
+    recording_store = _RecordingStore()
+    origin_and_content = (
+        ("doc-1", "# One"),
+        ("doc-2", "# Two"),
+        ("doc-3", "# Three"),
+    )
+    batch = [
+        MarkdownDocument(origin=origin, content=content)
+        for origin, content in origin_and_content
+    ]
+
+    with pytest.raises(RuntimeError, match="boom"):
+        recording_store.ingest(batch, max_workers=2)
+
+    started = recording_store.started_origins
+    assert "doc-3" not in started
+    assert recording_store.max_in_flight <= 2
+
+
+def test_base_store_ingest_waits_for_running_workers_before_raising() -> None:
+    """Check that ``ingest`` raises only after the still-running upsert finishes.
+
+    ``doc-2`` fails while ``doc-1`` is blocked. A helper thread releases
+    ``doc-1`` 0.2 s after it started; by the time ``ingest`` has raised, the
+    blocked upsert must have been released and must have completed.
+    """
+    store = _BlockingFailureStore()
+    documents = [
+        MarkdownDocument(origin="doc-1", content="# One"),
+        MarkdownDocument(origin="doc-2", content="# Two"),
+    ]
+
+    def release_blocked() -> None:
+        assert store.blocking_started.wait(timeout=1.0)
+        # Keep doc-1 blocked long enough for the doc-2 failure to be noticed.
+        time.sleep(0.2)
+        store.release_blocked.set()
+
+    releaser = threading.Thread(target=release_blocked)
+    releaser.start()
+
+    try:
+        with pytest.raises(RuntimeError, match="boom"):
+            store.ingest(documents, max_workers=2)
+
+        assert store.release_blocked.is_set()
+        assert store.blocking_finished.is_set()
+    finally:
+        # Always reap the helper thread, even when an assertion above fails.
+        releaser.join()
+
+
+def test_base_store_ingest_ignores_cancelled_sibling_when_worker_failed(
+    monkeypatch,
+) -> None:
+    """Check that a real worker error wins over a cancelled sibling future.
+
+    ``ThreadPoolExecutor`` and ``wait`` in ``raghilda._store`` are replaced by
+    fakes that report a cancelled future first and a future failing with
+    ``RuntimeError("boom")`` second. ``ingest`` must surface the
+    ``RuntimeError`` rather than the ``CancelledError``.
+    """
+
+    class _FakeFuture:
+        """Minimal future that returns a fixed result or raises a fixed error."""
+
+        def __init__(
+            self,
+            *,
+            result: WriteResult[Document] | None = None,
+            error: BaseException | None = None,
+        ) -> None:
+            self._result = result
+            self._error = error
+
+        def result(self) -> WriteResult[Document]:
+            if self._error is not None:
+                raise self._error
+            assert self._result is not None
+            return self._result
+
+        def cancel(self) -> None:
+            return None
+
+    class _FakeExecutor:
+        """Executor stand-in that never runs ``fn`` and hands out canned futures."""
+
+        def __init__(self, *, max_workers: int) -> None:
+            del max_workers
+            # Returned in order by submit(): cancelled first, failed second.
+            self._submissions = [
+                _FakeFuture(error=CancelledError()),
+                _FakeFuture(error=RuntimeError("boom")),
+            ]
+
+        def submit(self, fn, arg):
+            del fn, arg
+            return self._submissions.pop(0)
+
+        def shutdown(self, *, wait: bool, cancel_futures: bool) -> None:
+            del wait, cancel_futures
+            return None
+
+    def fake_wait(pending, return_when):
+        # Reports two fresh futures as done and nothing as pending; these are
+        # not the objects returned by _FakeExecutor.submit().
+        del pending, return_when
+        return (
+            [
+                _FakeFuture(error=CancelledError()),
+                _FakeFuture(error=RuntimeError("boom")),
+            ],
+            set(),
+        )
+
+    monkeypatch.setattr(store_module, "ThreadPoolExecutor", _FakeExecutor)
+    monkeypatch.setattr(store_module, "wait", fake_wait)
+
+    store = _RecordingStore()
+    documents = [
+        MarkdownDocument(origin="doc-1", content="# One"),
+        MarkdownDocument(origin="doc-2", content="# Two"),
+    ]
+
+    with pytest.raises(RuntimeError, match="boom"):
+        store.ingest(documents, max_workers=2)
+
+
+def test_base_store_ingest_propagates_worker_cancelled_error() -> None:
+    """A ``CancelledError`` raised by ``prepare`` reaches the ``ingest`` caller."""
+
+    def prepare(document: MarkdownDocument) -> MarkdownDocument:
+        del document
+        raise CancelledError("prepare cancelled")
+
+    recording_store = _RecordingStore()
+    single_document = [MarkdownDocument(origin="doc-1", content="# One")]
+
+    with pytest.raises(CancelledError, match="prepare cancelled"):
+        recording_store.ingest(single_document, prepare=prepare, max_workers=1)
+
+
+def test_postgresql_store_ingest_serializes_upsert_calls() -> None:
+    """Check that ``PostgreSQLStore.ingest`` never runs two upserts at once.
+
+    The store is built with ``__new__`` so no database connection is made, and
+    its ``upsert`` is swapped for a stub that tracks how many calls overlap.
+    Skipped when ``psycopg2`` is not installed.
+    """
+    pytest.importorskip("psycopg2")
+    from raghilda._postgres_store import PostgreSQLStore
+
+    store = PostgreSQLStore.__new__(PostgreSQLStore)
+    # __init__ was bypassed, so the lock has to be provided by hand.
+    # NOTE(review): presumably ingest() takes this lock around upsert -- confirm.
+    store._ingest_upsert_lock = threading.Lock()
+    # Guards the two overlap counters below (separate from the store's lock).
+    lock = threading.Lock()
+    in_flight = 0
+    max_in_flight = 0
+
+    def upsert(
+        document: Document,
+        *,
+        skip_if_unchanged: bool = True,
+    ) -> WriteResult[Document]:
+        del skip_if_unchanged
+        nonlocal in_flight, max_in_flight
+        with lock:
+            in_flight += 1
+            max_in_flight = max(max_in_flight, in_flight)
+        try:
+            # Stay "in flight" long enough for a concurrent call to overlap.
+            time.sleep(0.02)
+            return WriteResult(action="inserted", document=document)
+        finally:
+            with lock:
+                in_flight -= 1
+
+    store.upsert = upsert  # type: ignore[method-assign]
+    documents = [
+        MarkdownDocument(origin="doc-1", content="# One"),
+        MarkdownDocument(origin="doc-2", content="# Two"),
+    ]
+
+    summary = store.ingest(documents, max_workers=2)
+
+    assert summary == IngestSummary(inserted=2, replaced=0, skipped=0)
+    # Two workers were requested, yet upsert calls never overlapped.
+    assert max_in_flight == 1
+
+
+def test_duckdb_store_ingest_prepares_chunked_documents() -> None:
+    """Ingest two documents into an in-memory DuckDB store with a chunker."""
+    duckdb_store = DuckDBStore.create(
+        location=":memory:",
+        embed=None,
+        overwrite=True,
+        name="duckdb_ingest",
+    )
+    batch = [
+        MarkdownDocument(origin="doc-1", content="# One\n\nHello"),
+        MarkdownDocument(origin="doc-2", content="# Two\n\nWorld"),
+    ]
+    chunker = MarkdownChunker(chunk_size=32, target_overlap=0)
+
+    summary = duckdb_store.ingest(batch, prepare=chunker.chunk, max_workers=2)
+
+    expected = IngestSummary(inserted=2, replaced=0, skipped=0)
+    assert summary == expected
+    assert duckdb_store.size() == 2
+
+
+def test_chromadb_store_ingest_prepares_chunked_documents(tmp_path: Path) -> None:
+    """Ingest two documents into an on-disk ChromaDB store with a chunker."""
+    chroma_store = ChromaDBStore.create(
+        location=tmp_path / "chroma",
+        overwrite=True,
+        name="chroma_ingest",
+        embed=None,
+    )
+    batch = [
+        MarkdownDocument(origin="doc-1", content="# One\n\nHello"),
+        MarkdownDocument(origin="doc-2", content="# Two\n\nWorld"),
+    ]
+    chunker = MarkdownChunker(chunk_size=32, target_overlap=0)
+
+    summary = chroma_store.ingest(batch, prepare=chunker.chunk, max_workers=2)
+
+    expected = IngestSummary(inserted=2, replaced=0, skipped=0)
+    assert summary == expected
+    assert chroma_store.size() == 2
+
+
+class _SinglePage:
+    """Stand-in for one page of listing results that has no following page."""
+
+    def __init__(self, data: list[Any]):
+        # Items exposed to callers through ``.data``.
+        self.data = data
+
+    def has_next_page(self) -> bool:
+        """Report that there is never another page."""
+        return False
+
+
+class _FakeVectorStoreFiles:
+    """Fake vector-store files API that records uploads and forbids deletes."""
+
+    def __init__(self) -> None:
+        # Keyword arguments of every upload_and_poll() call, in call order.
+        self.uploads: list[dict[str, Any]] = []
+
+    def list(self, **kwargs):
+        """Return an empty single page, whatever the filters."""
+        no_files: list[Any] = []
+        return _SinglePage(no_files)
+
+    def upload_and_poll(self, **kwargs):
+        """Record the upload and return an object with a sequential ``id``."""
+        self.uploads.append(kwargs)
+        upload_number = len(self.uploads)
+        return SimpleNamespace(id=f"file-{upload_number}")
+
+    def delete(self, **kwargs):
+        """Fail the test: nothing should be deleted."""
+        raise AssertionError("delete should not be called")
+
+
+def test_openai_store_ingest_accepts_markdown_documents_without_prepare() -> None:
+    """Ingest two unprepared documents through a fake OpenAI client."""
+    fake_files = _FakeVectorStoreFiles()
+    fake_vector_stores = SimpleNamespace(files=fake_files)
+    openai_store = OpenAIStore(
+        client=SimpleNamespace(vector_stores=fake_vector_stores),
+        store_id="vs_test",
+    )
+    batch = [
+        MarkdownDocument(origin="doc-1", content="# One"),
+        MarkdownDocument(origin="doc-2", content="# Two"),
+    ]
+
+    summary = openai_store.ingest(batch, max_workers=2)
+
+    expected = IngestSummary(inserted=2, replaced=0, skipped=0)
+    assert summary == expected
+    # One upload per document.
+    assert len(fake_files.uploads) == 2
diff --git a/user_guide/04-crawling-and-ingestion.qmd b/user_guide/04-crawling-and-ingestion.qmd
new file mode 100644
index 0000000..d877d16
--- /dev/null
+++ b/user_guide/04-crawling-and-ingestion.qmd
@@ -0,0 +1,268 @@
+---
+title: "Crawling and Ingestion"
+guide-section: "Getting Started"
+---
+
+raghilda's core workflow is intentionally sequential: find a source, read it,
+chunk it, and upsert it. That is the recommended first path for building a store
+because every step is visible, easy to inspect, and easy to change.
+
+As your source collection grows, store creation can become mostly waiting on
+network requests, file conversion, chunking, and writes. The crawling and
+ingestion API is the next step when you want that work to run concurrently. It
+can make store creation substantially faster while still letting you inspect the
+origins, fetched sources, converted Markdown documents, and final ingest
+summary.
+
+The tradeoff is a few extra concepts. Use this API when the simple sequential
+workflow is too slow, or when you need a repeatable refresh job for a larger
+site, document collection, or codebase. The API has three parts:
+
+- `CrawlScope` describes what to crawl.
+- A crawler discovers sources and returns `MarkdownDocument` objects.
+- `store.ingest()` prepares and upserts the stream.
+
+## Crawl a website
+
+Use `WebCrawler` when you want raghilda to fetch pages directly with
+`requests`. The crawler starts from one or more roots, follows links up to
+`depth`, and yields matching pages as Markdown documents.
+
+```{python}
+#| eval: false
+from datetime import timedelta
+
+from raghilda.chunker import MarkdownChunker
+from raghilda.crawl import CrawlScope, WebCrawler
+from raghilda.embedding import EmbeddingOpenAI
+from raghilda.store import DuckDBStore
+
+store = DuckDBStore.create(
+    location="docs.db",
+    embed=EmbeddingOpenAI(),
+    name="docs",
+    overwrite=True,
+)
+
+crawler = WebCrawler(
+    cache_dir=True,
+    cache_stale_after=timedelta(days=1),
+    max_workers=4,
+)
+scope = CrawlScope(
+    roots=["https://quarto.org/docs/guide/"],
+    depth=2,
+    include_patterns=[r"^https://quarto\.org/docs/guide/"],
+    exclude_patterns=[r"/reference/"],
+    include_types=["html"],
+)
+
+chunker = MarkdownChunker(chunk_size=1600, target_overlap=0.5)
+
+summary = store.ingest(
+    crawler.markdown_documents(scope),
+    prepare=chunker.chunk,
+    max_workers=4,
+)
+store.build_index()
+
+print(summary)
+```
+
+`CrawlScope` owns traversal policy:
+
+| Field | Description |
+|-------|-------------|
+| `roots` | Starting files, directories, or URLs. |
+| `depth` | Number of link or directory levels to follow. `0` means only the roots. |
+| `limit` | Maximum number of origins to yield. |
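+| `include_patterns` | Regular expressions that origins must match. `CloudflareCrawler` uses wildcard patterns instead. |
+| `exclude_patterns` | Regular expressions that remove origins from the crawl. `CloudflareCrawler` uses wildcard patterns instead. |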
+| `include_types` | Type labels to include, such as `html`, `markdown`, `pdf`, `python`, or `text`. |
+| `exclude_types` | Type labels to skip. |
+| `include_external_links` | Allow links outside the root origin. Defaults to `False`. |
+| `include_subdomains` | Allow subdomains under the root host. Defaults to `False`. |
+
+`WebCrawler(cache_dir=True)` stores fetched response bodies under
+`.raghilda/cache/web`. With `cache_stale_after`, fresh cached responses are
+reused, and stale responses are revalidated with `ETag` or `Last-Modified`
+headers when the server provides them. Pass `cache_force_refresh=True` to
+`origins()`, `fetch_raw()`, `fetch_markdown()`, or `markdown_documents()` when a
+run must bypass the cache.
+
+## Crawl local files
+
+Use `DirectoryCrawler` for local Markdown, notebooks, PDFs, text files, and
+other files supported by `read_as_markdown()`.
+
+```{python}
+#| eval: false
+from raghilda.chunker import MarkdownChunker
+from raghilda.crawl import CrawlScope, DirectoryCrawler
+from raghilda.store import DuckDBStore
+
+store = DuckDBStore.create(
+    location="local-docs.db",
+    embed=None,
+    name="local_docs",
+    overwrite=True,
+)
+
+crawler = DirectoryCrawler(cache_dir=True, max_workers=4)
+scope = CrawlScope(
+    roots=["docs"],
+    depth=3,
+    include_patterns=[r".*\.(md|qmd|ipynb|pdf)$"],
+    exclude_patterns=[r".*/_site/.*", r".*/\.venv/.*"],
+)
+
+chunker = MarkdownChunker()
+summary = store.ingest(
+    crawler.markdown_documents(scope),
+    prepare=chunker.chunk,
+    max_workers=4,
+)
+
+print(summary)
+```
+
+Directory crawling always reads the current filesystem tree. If you enable
+`cache_dir`, converted Markdown is reused only when the source file hash and
+modification time still match the cached metadata. The crawler also skips its
+own cache directory when the cache is inside a crawled root.
+
+## Inspect before ingesting
+
+The crawler interface is useful even when you are not ready to write to a
+store. Use `origins()` to inspect what the scope discovers, or use
+`fetch_markdown()` to convert one source.
+
+```{python}
+#| eval: false
+from raghilda.crawl import CrawlScope, WebCrawler
+
+crawler = WebCrawler(cache_dir=True)
+scope = CrawlScope(
+    roots=["https://example.com/docs/"],
+    depth=1,
+    limit=10,
+)
+
+for origin in crawler.origins(scope):
+    print(origin)
+
+doc = crawler.fetch_markdown("https://example.com/docs/")
+print(doc.origin)
+print(doc.content[:500])
+```
+
+All crawler classes implement the same public methods:
+
+| Method | Returns |
+|--------|---------|
+| `origins(scope)` | A lazy iterator of source origins. |
+| `fetch_raw(origin)` | A `FetchedSource` with the cached body path and metadata. |
+| `fetch_markdown(origin)` | One `MarkdownDocument`. |
+| `markdown_documents(scope)` | A lazy iterator of `MarkdownDocument` objects. |
+
+## Customize conversion
+
+By default, crawlers convert fetched sources with raghilda's Markdown reader.
+Pass a `convert` function when a site or file collection needs custom cleanup.
+The function receives a `FetchedSource` and returns a `MarkdownDocument`.
+
+```{python}
+#| eval: false
+from raghilda.crawl import CrawlScope, FetchedSource, WebCrawler
+from raghilda.document import MarkdownDocument
+from raghilda.read import read_as_markdown
+
+
+def convert_reference_page(source: FetchedSource) -> MarkdownDocument:
+    doc = read_as_markdown(str(source.body_path))
+    markdown = doc.content
+    markdown = markdown.replace("Edit this page", "")
+    return MarkdownDocument(origin=source.origin, content=markdown)
+
+
+crawler = WebCrawler(cache_dir=True)
+scope = CrawlScope(roots=["https://example.com/reference/"], depth=1)
+documents = crawler.markdown_documents(scope, convert=convert_reference_page)
+```
+
+Keep chunking in `store.ingest(prepare=...)`, not in the converter. The
+converter should return one unchunked Markdown document per origin; `prepare`
+can then apply the same chunking policy to every document.
+
+## Use Cloudflare crawling
+
+Use `CloudflareCrawler` when you want Cloudflare to perform the browser-rendered
+crawl and return Markdown records. This is useful for sites that need browser
+rendering, or when you want Cloudflare's crawl service to manage discovery.
+
+```{python}
+#| eval: false
+import os
+from datetime import timedelta
+
+from raghilda.chunker import MarkdownChunker
+from raghilda.crawl import CloudflareCrawler, CrawlScope
+from raghilda.store import DuckDBStore
+
+store = DuckDBStore.create(
+    location="rendered-docs.db",
+    embed=None,
+    name="rendered_docs",
+    overwrite=True,
+)
+
+crawler = CloudflareCrawler(
+    account_id=os.environ["CLOUDFLARE_ACCOUNT_ID"],
+    api_token=os.environ["CLOUDFLARE_API_TOKEN"],
+    cache_dir=True,
+    cache_stale_after=timedelta(days=1),
+    render=True,
+)
+scope = CrawlScope(
+    roots=["https://example.com/docs/"],
+    depth=2,
+    include_patterns=["https://example.com/docs/**"],
+    exclude_patterns=["https://example.com/docs/archive/**"],
+    limit=250,
+)
+
+summary = store.ingest(
+    crawler.markdown_documents(scope),
+    prepare=MarkdownChunker().chunk,
+    max_workers=4,
+)
+
+print(summary)
+```
+
+For Cloudflare crawls, `include_patterns` and `exclude_patterns` use
+Cloudflare-style wildcard patterns, such as `https://example.com/docs/**`.
+`include_external_links` and `include_subdomains` are passed through to the
+Cloudflare crawl request.
+
+## Refresh a store
+
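+`store.ingest()` upserts each prepared document and returns an `IngestSummary`
+with counts for inserted, replaced, and skipped documents. The input stream is
+consumed lazily, and `prepare` runs in the worker pool. The example below reuses
+the `store`, `crawler`, `scope`, and `chunker` objects from the earlier examples
+and passes `cache_force_refresh=True` so that the crawl bypasses the cache.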
+
+```{python}
+#| eval: false
+summary = store.ingest(
+    crawler.markdown_documents(scope, cache_force_refresh=True),
+    prepare=chunker.chunk,
+    max_workers=4,
+)
+
+print(f"Inserted: {summary.inserted}")
+print(f"Replaced: {summary.replaced}")
+print(f"Skipped: {summary.skipped}")
+```
+
+Use `upsert()` directly when you need per-document `WriteResult` objects.
+Use `ingest()` when you want one aggregate summary for a crawl or refresh job.