diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..0c4474f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,17 @@ +# Changelog + +## Unreleased + +### Added + +- Added `raghilda.crawl`, including `CrawlScope`, `FetchedSource`, + `DirectoryCrawler`, `WebCrawler`, and `CloudflareCrawler`, for discovering + directory, web, and Cloudflare sources and converting them to markdown + documents. +- Added `BaseStore.ingest()` and `IngestSummary` for bulk document ingestion + with optional document preparation, parallel writes, and inserted, replaced, + and skipped counts. + +### Fixed + +- Fixed sitemap URL extraction so each `<loc>` entry is collected as one URL. diff --git a/great-docs.yml b/great-docs.yml index 05245cf..ee074e1 100644 --- a/great-docs.yml +++ b/great-docs.yml @@ -75,6 +75,16 @@ reference: - name: store.OpenAIStore - name: store.PostgreSQLStore + - title: Crawl + desc: Crawlers for discovering and converting source documents + contents: + - crawl.CrawlScope + - crawl.FetchedSource + - crawl.BaseCrawler + - crawl.DirectoryCrawler + - crawl.WebCrawler + - crawl.CloudflareCrawler + - title: Embedding desc: Embedding providers for generating vector representations contents: diff --git a/src/raghilda/__init__.py b/src/raghilda/__init__.py index b123fc6..768b9e8 100644 --- a/src/raghilda/__init__.py +++ b/src/raghilda/__init__.py @@ -1,6 +1,7 @@ -from . import embedding, store, types, chunk, chunker, document, read, scrape +from . 
import crawl, embedding, store, types, chunk, chunker, document, read, scrape __all__ = [ + "crawl", "embedding", "store", "types", diff --git a/src/raghilda/_postgres_store.py b/src/raghilda/_postgres_store.py index 6314122..6aadd29 100644 --- a/src/raghilda/_postgres_store.py +++ b/src/raghilda/_postgres_store.py @@ -1,5 +1,6 @@ from ._store import BaseStore, WriteResult import json +import threading from .embedding import EmbeddingProvider, EmbedInputType, embedding_from_config from .document import Document, ChunkedMarkdownDocument from .chunk import Chunk, MarkdownChunk, RetrievedChunk, Metric @@ -137,6 +138,7 @@ def __init__( self.con = con self._metadata = metadata self._schema = psycopg2.extensions.quote_ident(schema, con) + self._ingest_upsert_lock = threading.Lock() def close(self) -> None: """Close the store's database connection.""" @@ -539,6 +541,13 @@ def upsert( replaced_document=replaced_document, ) + def _ingest_upsert( + self, + document: Document, + ) -> WriteResult[ChunkedMarkdownDocument]: + with self._ingest_upsert_lock: + return self.upsert(document) + def _load_document_snapshot( self, *, origin: str, text: str ) -> ChunkedMarkdownDocument: diff --git a/src/raghilda/_store.py b/src/raghilda/_store.py index e6fdd44..c4440a1 100644 --- a/src/raghilda/_store.py +++ b/src/raghilda/_store.py @@ -1,13 +1,16 @@ from __future__ import annotations from abc import ABC, abstractmethod +from concurrent.futures import FIRST_COMPLETED, CancelledError, ThreadPoolExecutor, wait from dataclasses import dataclass -from typing import Generic, Literal, Sequence, TypeVar +import threading +from typing import Any, Callable, Generic, Iterable, Literal, Sequence, TypeVar from .chunk import RetrievedChunk from .document import Document TDocument = TypeVar("TDocument", bound=Document, covariant=True) +_RECENT_INGEST_ORIGIN_WINDOW = 10_000 @dataclass(frozen=True) @@ -17,6 +20,13 @@ class WriteResult(Generic[TDocument]): replaced_document: TDocument | None = None 
+@dataclass(frozen=True)
+class IngestSummary:
+    inserted: int
+    replaced: int
+    skipped: int
+
+
 class BaseStore(ABC):
     """Abstract base class for vector stores.
 
@@ -77,6 +87,144 @@ def upsert(
         """
         pass
 
+    def _ingest_upsert(self, document: Document) -> WriteResult[Document]:
+        return self.upsert(document)
+
+    def ingest(
+        self,
+        documents: Iterable[Any],
+        *,
+        prepare: Callable[[Any], Document] | None = None,
+        max_workers: int = 1,
+    ) -> IngestSummary:
+        """Prepare and upsert a stream of documents.
+
+        Inputs are consumed lazily and submitted incrementally. After
+        ``prepare`` is applied, recent non-empty string origins are checked for
+        duplicates as the stream is consumed. Duplicate detection is best
+        effort: a duplicate raises ``ValueError`` when encountered, after any
+        writes already in flight complete. No rollback is attempted.
+
+        Parameters
+        ----------
+        documents
+            Items to ingest. Each one is passed through ``prepare`` when it
+            is given; otherwise the item itself is upserted.
+        prepare
+            Optional callable turning one input item into a ``Document``.
+            It is invoked on the worker threads.
+        max_workers
+            Thread-pool size and the maximum number of documents in flight
+            at once. Must be at least 1.
+
+        Returns
+        -------
+        IngestSummary
+            Aggregate counts for inserted, replaced, and skipped documents.
+            Call ``upsert()`` directly when per-document ``WriteResult`` values
+            are needed.
+
+        Raises
+        ------
+        ValueError
+            If an origin repeats within the recent-origin window, or a write
+            reports an unknown action.
+        Exception
+            The first error raised by ``prepare``, the store, or the input
+            iterable is re-raised after in-flight work has finished.
+        """
+        assert max_workers >= 1
+        stop_event = threading.Event()
+        recent_origins: dict[str, None] = {}
+        recent_origins_lock = threading.Lock()
+
+        def remember_origin(origin: str | None) -> None:
+            if not isinstance(origin, str) or not origin:
+                return
+            with recent_origins_lock:
+                if origin in recent_origins:
+                    raise ValueError(f"Duplicate origin during ingest: {origin}")
+                recent_origins[origin] = None
+                if len(recent_origins) > _RECENT_INGEST_ORIGIN_WINDOW:
+                    # dict preserves insertion order, so the first key is the oldest.
+                    recent_origins.pop(next(iter(recent_origins)))
+
+        def process_document(item: Any) -> WriteResult[Document]:
+            if stop_event.is_set():
+                raise CancelledError()
+            document = prepare(item) if prepare is not None else item
+            if stop_event.is_set():
+                raise CancelledError()
+            remember_origin(document.origin)
+            if stop_event.is_set():
+                raise CancelledError()
+            return self._ingest_upsert(document)
+
+        iterator = iter(documents)
+        pending = set()
+        inserted = 0
+        replaced = 0
+        skipped = 0
+        exhausted = False
+        executor = ThreadPoolExecutor(max_workers=max_workers)
+
+        def submit_until_full() -> None:
+            # Top up the pool to ``max_workers`` in-flight items so the input
+            # stream is only consumed as fast as it is processed.
+            nonlocal exhausted
+            while not exhausted and len(pending) < max_workers:
+                try:
+                    item = next(iterator)
+                except StopIteration:
+                    exhausted = True
+                else:
+                    pending.add(executor.submit(process_document, item))
+
+        try:
+            submit_until_full()
+            while pending:
+                done, pending = wait(pending, return_when=FIRST_COMPLETED)
+                results = []
+                cancelled_errors = []
+                errors = []
+                for future in done:
+                    try:
+                        results.append(future.result())
+                    except CancelledError as exc:
+                        cancelled_errors.append(exc)
+                    except Exception as exc:
+                        errors.append(exc)
+                if errors:
+                    raise errors[0]
+                if cancelled_errors and not stop_event.is_set():
+                    raise cancelled_errors[0]
+                for result in results:
+                    if result.action == "inserted":
+                        inserted += 1
+                    elif result.action == "replaced":
+                        replaced += 1
+                    elif result.action == "skipped":
+                        skipped += 1
+                    else:
+                        raise ValueError(f"Unknown write action: {result.action}")
+
+                submit_until_full()
+        except BaseException:
+            # BaseException rather than Exception: a KeyboardInterrupt must also
+            # stop the workers and shut the pool down instead of leaking it.
+            stop_event.set()
+            for future in pending:
+                future.cancel()
+            executor.shutdown(wait=True, cancel_futures=True)
+            raise
+
+        executor.shutdown(wait=True, cancel_futures=False)
+        return IngestSummary(
+            inserted=inserted,
+            replaced=replaced,
+            skipped=skipped,
+        )
+
     @abstractmethod
     def retrieve(
         self, text: str, top_k: int, *args, 
**kwargs diff --git a/src/raghilda/crawl.py b/src/raghilda/crawl.py new file mode 100644 index 0000000..feb361c --- /dev/null +++ b/src/raghilda/crawl.py @@ -0,0 +1,2156 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from collections import deque +from concurrent.futures import ThreadPoolExecutor +from contextlib import contextmanager +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +import hashlib +import json +import mimetypes +import os +from pathlib import Path +import re +import shutil +import tempfile +import time +from typing import Any, Callable, Iterable, Iterator, Mapping, Sequence, TypeVar +import threading +import unicodedata +from urllib.parse import urldefrag, urljoin, urlparse, urlunparse +from urllib.request import url2pathname + +import requests + +from .document import MarkdownDocument +from .read import _convert_to_markdown +from .scrape import _extract_links + +try: + from magika import Magika +except ImportError: # pragma: no cover - optional at runtime + Magika = None + +__all__ = [ + "BaseCrawler", + "CrawlScope", + "FetchedSource", + "WebCrawler", + "DirectoryCrawler", + "CloudflareCrawler", +] + +_TYPE_ALIASES = { + ".htm": "html", + ".html": "html", + ".ipynb": "jupyter-notebook", + ".markdown": "markdown", + ".md": "markdown", + ".pdf": "pdf", + ".py": "python", + ".rst": "rst", + ".txt": "text", +} +_CONTENT_TYPE_LABELS = { + "application/json": "json", + "application/pdf": "pdf", + "application/xml": "xml", + "text/html": "html", + "text/markdown": "markdown", + "text/plain": "text", + "text/x-python": "python", + "text/xml": "xml", +} +_MAGIKA_LABELS = { + "html": "html", + "ipynb": "jupyter-notebook", + "markdown": "markdown", + "pdf": "pdf", + "python": "python", + "rst": "rst", + "txt": "text", +} +_TERMINAL_CLOUDFLARE_STATUSES = { + "cancelled_by_user", + "cancelled_due_to_limits", + "cancelled_due_to_timeout", + "completed", + "errored", +} +_MAGIKA = Magika() 
if Magika is not None else None +_DEFAULT_CRAWL_DEPTH = 100_000 + +RootInput = str | Path +RootsInput = RootInput | Sequence[RootInput] +CacheValue = tuple[Path | None, dict[str, Any] | None] +CacheEntry = tuple[str, Path | None, dict[str, Any] | None] +WebOriginKey = tuple[str, str, int | None] +TInput = TypeVar("TInput") +TOutput = TypeVar("TOutput") + + +@dataclass(frozen=True) +class CrawlScope: + roots: RootsInput + include_patterns: Sequence[str] | None = None + exclude_patterns: Sequence[str] | None = None + depth: int | None = None + limit: int | None = None + include_types: Sequence[str] | None = None + exclude_types: Sequence[str] | None = None + include_external_links: bool = False + include_subdomains: bool = False + + def __post_init__(self) -> None: + if self.depth is not None: + assert self.depth >= 0 + if self.limit is not None: + assert self.limit >= 0 + + +@dataclass(frozen=True) +class FetchedSource: + origin: str + body_path: Path + resolved_origin: str | None = None + content_type: str | None = None + status_code: int | None = None + metadata: dict[str, Any] | None = None + fetched_at: datetime | None = None + revalidated_at: datetime | None = None + markdown_path: Path | None = None + + +@dataclass(frozen=True) +class _CloudflareRootCacheEntry: + fetched_at: datetime + records: list[dict[str, Any]] + + +@dataclass(frozen=True) +class _CloudflareRecordCacheEntry: + fetched_at: datetime + record: dict[str, Any] + + +@dataclass(frozen=True) +class _ResolvedCrawlScope: + roots: list[RootInput] + include_patterns: list[str] + exclude_patterns: list[str] + depth: int + limit: int | None + include_types: set[str] + exclude_types: set[str] + include_external_links: bool + include_subdomains: bool + + +@dataclass +class _EntryLockState: + lock: threading.RLock + users: int = 0 + + +class _FilesystemCrawlerCache: + """ + Filesystem-backed cache rooted at one directory. 
+ + Each logical key is stored as: + <root>/<sanitized-key>--<key-hash>.metadata.json + <root>/<sanitized-key>--<key-hash><ext> + + The metadata file is the source of truth and stores: + { + "key": <original key>, + "content_path": <content file name or null>, + "metadata": <user metadata or null>, + } + """ + + _METADATA_SUFFIX = ".metadata.json" + _HASH_LEN = 12 + _MAX_STEM_LEN = 180 + + _WINDOWS_RESERVED = { + "CON", + "PRN", + "AUX", + "NUL", + "COM1", + "COM2", + "COM3", + "COM4", + "COM5", + "COM6", + "COM7", + "COM8", + "COM9", + "LPT1", + "LPT2", + "LPT3", + "LPT4", + "LPT5", + "LPT6", + "LPT7", + "LPT8", + "LPT9", + } + + def __init__(self, root: Path | None) -> None: + """Create a filesystem-backed cache rooted at one directory.""" + self.root = root + self._entry_locks_guard = threading.Lock() + self._entry_locks: dict[str, _EntryLockState] = {} + if self.root is not None: + self.root.mkdir(parents=True, exist_ok=True) + + def fetch(self, key: str) -> CacheValue | None: + """ + Return the materialized cache entry for one key, if present. + + This method does not lock for normal reads. If it encounters a broken + metadata file, it triggers a locked re-check and best-effort cleanup, + then returns None. + """ + if self.root is None: + return None + + metadata_path = self._metadata_path_for_key(key) + if not metadata_path.exists(): + return None + + record = self._read_record(metadata_path) + if record is None: + self._cleanup_broken_metadata_path(metadata_path) + return None + if record["key"] != key: + self._cleanup_mismatched_metadata_path(metadata_path, key) + return None + + content_path: Path | None = None + content_name = record["content_path"] + if content_name is not None: + candidate = self.root / content_name + if candidate.exists(): + content_path = candidate + + return content_path, record["metadata"] + + def upsert( + self, + key: str, + *, + content: bytes | str | Path | None, + metadata: Mapping[str, Any] | None, + content_ext: str | None, + ) -> CacheValue | None: + """ + Create or replace one cache entry. 
+ + Semantics: + - content=None means no content file for this entry + - metadata=None means no user metadata for this entry + - the metadata sidecar is always written, unless both are None + - (content=None, metadata=None) deletes the entry and returns None + """ + if self.root is None: + return None + + if content is None and metadata is None: + self.delete(key) + return None + + base = self._base_for_key(key) + metadata_path = self.root / f"{base}{self._METADATA_SUFFIX}" + stored_metadata = dict(metadata) if metadata is not None else None + new_content_path: Path | None = None + new_content_name: str | None = None + if content is not None: + ext = self._choose_content_ext( + content=content, + content_ext=content_ext, + ) + new_content_path = self.root / f"{base}{ext}" + new_content_name = new_content_path.name + record = { + "key": key, + "content_path": new_content_name, + "metadata": stored_metadata, + } + keep = {metadata_path.name} + if new_content_name is not None: + keep.add(new_content_name) + + with self._locked_base(base): + if metadata_path.exists() and self._read_record(metadata_path) is None: + self._delete_base_files_locked(base) + + if content is not None: + assert new_content_path is not None + self._write_content(new_content_path, content) + + self._write_json(metadata_path, record) + self._delete_extra_base_files_locked(base, keep=keep) + + return new_content_path, stored_metadata + + def delete(self, key: str) -> int: + """ + Delete one cache entry. + + Returns the number of files removed. + """ + if self.root is None: + return 0 + + base = self._base_for_key(key) + with self._locked_base(base): + return self._delete_base_files_locked(base) + + def entries(self) -> Iterable[CacheEntry]: + """ + Yield all cache entries currently described by metadata files. + + This method does not lock for normal reads. Broken metadata files are + re-checked under the write lock and cleaned up if still invalid. 
+ """ + if self.root is None: + return + + for metadata_path in sorted(self.root.glob(f"*{self._METADATA_SUFFIX}")): + record = self._read_record(metadata_path) + if record is None: + self._cleanup_broken_metadata_path(metadata_path) + continue + + content_path: Path | None = None + content_name = record["content_path"] + if content_name is not None: + candidate = self.root / content_name + if candidate.exists(): + content_path = candidate + + yield record["key"], content_path, record["metadata"] + + def _metadata_path_for_key(self, key: str) -> Path: + """Return the deterministic metadata path for one logical key.""" + assert self.root is not None + return self.root / f"{self._base_for_key(key)}{self._METADATA_SUFFIX}" + + def _base_for_key(self, key: str) -> str: + """Build the shared basename for the metadata file and content file.""" + return f"{self._sanitize_stem(key)}--{self._hash_fragment(key)}" + + def _hash_fragment(self, key: str) -> str: + """Return a stable hash fragment of the original key.""" + return hashlib.sha256(key.encode("utf-8")).hexdigest()[: self._HASH_LEN] + + def _sanitize_stem(self, key: str) -> str: + """Make the key visible in the filename, but safe enough for Windows.""" + value = unicodedata.normalize("NFC", key) + value = value.replace("://", "__") + value = value.replace("\\", "_") + value = value.replace("/", "_") + value = re.sub(r'[\x00-\x1f<>:"|?*]+', "_", value) + value = re.sub(r"\s+", "_", value) + value = re.sub(r"[^A-Za-z0-9._-]+", "_", value) + value = value.strip(" ._") + + if not value: + value = "entry" + + root = value.split(".", 1)[0].rstrip(" .").upper() + if root in self._WINDOWS_RESERVED: + value = f"_{value}" + + if len(value) > self._MAX_STEM_LEN: + head = self._MAX_STEM_LEN // 2 - 2 + tail = self._MAX_STEM_LEN - head - 2 + value = f"{value[:head]}..{value[-tail:]}" + + value = value.rstrip(" .") + return value or "entry" + + def _choose_content_ext( + self, + *, + content: bytes | str | Path, + content_ext: str 
| None, + ) -> str: + """Choose the content file extension.""" + ext = self._normalize_ext(content_ext) + if ext is not None: + return ext + + if isinstance(content, Path): + ext = self._normalize_ext(content.suffix) + if ext is not None: + return ext + + ext = self._infer_ext_with_magika(content) + if ext is not None: + return ext + + if isinstance(content, str): + return ".txt" + + return ".raw" + + def _infer_ext_with_magika(self, content: bytes | str | Path) -> str | None: + """Best-effort extension inference using Magika.""" + if _MAGIKA is None: + return None + + if isinstance(content, Path): + if not content.exists(): + return None + result = _MAGIKA.identify_path(content) + elif isinstance(content, str): + result = _MAGIKA.identify_bytes(content.encode("utf-8")) + else: + result = _MAGIKA.identify_bytes(content) + + extensions = getattr(result.output, "extensions", None) + if not extensions: + return None + return self._normalize_ext(extensions[0]) + + def _normalize_ext(self, ext: str | None) -> str | None: + """Normalize an extension string into a safe canonical form.""" + if ext is None: + return None + + ext = ext.strip() + if not ext: + return None + + if not ext.startswith("."): + ext = "." 
+ ext + + parts = [part for part in ext.split(".") if part] + if not parts: + return None + + cleaned: list[str] = [] + for part in parts: + token = re.sub(r"[^A-Za-z0-9_-]+", "", part) + if token: + cleaned.append(token.lower()) + + if not cleaned: + return None + + return "".join(f".{part}" for part in cleaned) + + def _read_record(self, path: Path) -> dict[str, Any] | None: + """Read and validate one metadata JSON file.""" + try: + with path.open("r", encoding="utf-8") as handle: + obj = json.load(handle) + except (OSError, json.JSONDecodeError): + return None + + if not isinstance(obj, dict): + return None + + key = obj.get("key") + content_path = obj.get("content_path") + metadata = obj.get("metadata") + + if not isinstance(key, str): + return None + if content_path is not None and not isinstance(content_path, str): + return None + if content_path is not None: + if content_path in {"", ".", ".."}: + return None + if Path(content_path).name != content_path or "\\" in content_path: + return None + if metadata is not None and not isinstance(metadata, dict): + return None + + return { + "key": key, + "content_path": content_path, + "metadata": metadata, + } + + def _cleanup_broken_metadata_path(self, metadata_path: Path) -> None: + """Best-effort cleanup for a broken metadata file.""" + if self.root is None: + return + if not metadata_path.name.endswith(self._METADATA_SUFFIX): + return + + base = metadata_path.name[: -len(self._METADATA_SUFFIX)] + with self._locked_base(base): + if not metadata_path.exists(): + return + if self._read_record(metadata_path) is not None: + return + + self._delete_base_files_locked(base) + + def _cleanup_mismatched_metadata_path( + self, + metadata_path: Path, + key: str, + ) -> None: + """Best-effort cleanup for a metadata file stored under the wrong key.""" + if self.root is None: + return + + base = self._base_for_key(key) + with self._locked_base(base): + if not metadata_path.exists(): + return + record = 
self._read_record(metadata_path) + if record is not None and record["key"] == key: + return + + self._delete_base_files_locked(base) + + @contextmanager + def _locked_base(self, base: str) -> Iterator[None]: + state = self._acquire_entry_lock_state(base) + state.lock.acquire() + try: + yield + finally: + self._release_entry_lock_state(base, state) + + def _acquire_entry_lock_state(self, base: str) -> _EntryLockState: + with self._entry_locks_guard: + state = self._entry_locks.get(base) + if state is None: + state = _EntryLockState(lock=threading.RLock()) + self._entry_locks[base] = state + state.users += 1 + return state + + def _release_entry_lock_state(self, base: str, state: _EntryLockState) -> None: + state.lock.release() + with self._entry_locks_guard: + current = self._entry_locks.get(base) + assert current is state + state.users -= 1 + if state.users == 0: + del self._entry_locks[base] + + def _delete_base_files_locked(self, base: str) -> int: + """Delete all files belonging to one logical base.""" + assert self.root is not None + + deleted = 0 + for path in self.root.iterdir(): + if not self._belongs_to_base(path.name, base): + continue + if not path.is_file(): + continue + try: + path.unlink() + deleted += 1 + except FileNotFoundError: + pass + return deleted + + def _delete_extra_base_files_locked(self, base: str, *, keep: set[str]) -> None: + """Delete stale files for one base, keeping the current pair.""" + assert self.root is not None + + for path in self.root.iterdir(): + if not self._belongs_to_base(path.name, base): + continue + if not path.is_file(): + continue + if path.name in keep: + continue + try: + path.unlink() + except FileNotFoundError: + pass + + def _belongs_to_base(self, name: str, base: str) -> bool: + if name == f"{base}{self._METADATA_SUFFIX}": + return True + prefix = f"{base}." 
+ if not name.startswith(prefix): + return False + return "--" not in name[len(prefix) :] + + def _write_content(self, content_path: Path, content: bytes | str | Path) -> None: + if isinstance(content, Path): + if content == content_path: + return + + temporary_path: Path | None = None + try: + with tempfile.NamedTemporaryFile( + "wb", + dir=content_path.parent, + prefix=f".{content_path.name}.", + delete=False, + ) as handle: + temporary_path = Path(handle.name) + if isinstance(content, bytes): + handle.write(content) + elif isinstance(content, str): + handle.write(content.encode("utf-8")) + elif isinstance(content, Path): + with content.open("rb") as source: + shutil.copyfileobj(source, handle) + else: + raise TypeError(f"Unsupported content type: {type(content)!r}") + os.replace(temporary_path, content_path) + finally: + if temporary_path is not None: + temporary_path.unlink(missing_ok=True) + + def _write_json(self, path: Path, obj: Mapping[str, Any]) -> None: + """Write metadata JSON directly to its destination path.""" + text = json.dumps(obj, indent=2, sort_keys=True, ensure_ascii=False) + "\n" + with path.open("w", encoding="utf-8") as handle: + handle.write(text) + + +class _DirectoryCrawlerCache(_FilesystemCrawlerCache): + pass + + +class _WebCrawlerCache(_FilesystemCrawlerCache): + pass + + +class _CloudflareCrawlerCache(_FilesystemCrawlerCache): + pass + + +def _map_ordered( + items: Iterable[TInput], + *, + max_workers: int, + fn: Callable[[TInput], TOutput], +) -> Iterator[TOutput]: + assert max_workers >= 1 + iterator = iter(items) + if max_workers == 1: + for item in iterator: + yield fn(item) + return + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + pending: deque[Any] = deque() + while len(pending) < max_workers: + try: + item = next(iterator) + except StopIteration: + break + pending.append(executor.submit(fn, item)) + + while pending: + future = pending.popleft() + yield future.result() + try: + item = next(iterator) + except 
StopIteration: + continue + pending.append(executor.submit(fn, item)) + + +class BaseCrawler(ABC): + max_workers: int + + @abstractmethod + def origins( + self, + scope: CrawlScope, + *, + progress: bool = True, + cache_force_refresh: bool = False, + ) -> Iterator[str]: + pass + + @abstractmethod + def fetch_raw( + self, + origin: str, + *, + cache_force_refresh: bool = False, + ) -> FetchedSource: + pass + + def fetch_markdown( + self, + origin: str, + *, + convert: Callable[[FetchedSource], MarkdownDocument] | None = None, + cache_force_refresh: bool = False, + ) -> MarkdownDocument: + source = self.fetch_raw(origin, cache_force_refresh=cache_force_refresh) + converter = convert or self._default_convert + return converter(source) + + def _fetch_markdown_after_origin_discovery( + self, + origin: str, + *, + convert: Callable[[FetchedSource], MarkdownDocument] | None = None, + ) -> MarkdownDocument: + source = self._fetch_raw_after_origin_discovery(origin) + converter = convert or self._default_convert + return converter(source) + + def _fetch_raw_after_origin_discovery(self, origin: str) -> FetchedSource: + return self.fetch_raw(origin, cache_force_refresh=False) + + def markdown_documents( + self, + scope: CrawlScope, + *, + convert: Callable[[FetchedSource], MarkdownDocument] | None = None, + progress: bool = True, + cache_force_refresh: bool = False, + ) -> Iterator[MarkdownDocument]: + origins = self.origins( + scope, + progress=progress, + cache_force_refresh=cache_force_refresh, + ) + yield from _map_ordered( + origins, + max_workers=self.max_workers, + fn=lambda origin: self._fetch_markdown_after_origin_discovery( + origin, + convert=convert, + ), + ) + + def _default_convert(self, source: FetchedSource) -> MarkdownDocument: + raise NotImplementedError + + +class DirectoryCrawler(BaseCrawler): + """Crawl local files and optionally cache converted markdown. + + Directory traversal always reads the current filesystem state. 
The cache + stores converted markdown per file origin and is reused only when the + current file hash and modification time still match the cached metadata. + """ + + def __init__( + self, + *, + cache_dir: bool | str | Path | None = None, + max_workers: int = 1, + ) -> None: + assert max_workers >= 1 + self.cache_dir = _resolve_cache_dir( + cache_dir, + backend_name="directory", + default_factory=lambda: None, + ) + self.max_workers = max_workers + self._cache = _DirectoryCrawlerCache(self.cache_dir) + + def origins( + self, + scope: CrawlScope, + *, + progress: bool = True, + cache_force_refresh: bool = False, + ) -> Iterator[str]: + del progress, cache_force_refresh + resolved_scope = _resolve_crawl_scope(scope) + if resolved_scope.limit == 0: + return + cache_root = self.cache_dir.resolve() if self.cache_dir is not None else None + count = 0 + yielded_origins: set[str] = set() + for root in resolved_scope.roots: + path = _to_directory_path(root) + assert path.exists(), f"Root does not exist: {path}" + if path.is_file(): + resolved_path = path.resolve() + if cache_root is not None and resolved_path.is_relative_to(cache_root): + continue + origin = resolved_path.as_uri() + if origin in yielded_origins: + continue + if self._include_path( + path, + origin, + include_patterns=resolved_scope.include_patterns, + exclude_patterns=resolved_scope.exclude_patterns, + include_types=resolved_scope.include_types, + exclude_types=resolved_scope.exclude_types, + ): + yielded_origins.add(origin) + yield origin + count += 1 + if ( + resolved_scope.limit is not None + and count >= resolved_scope.limit + ): + return + continue + for file_path in _iter_directory_files( + path, + max_depth=resolved_scope.depth, + ): + resolved_file_path = file_path.resolve() + if cache_root is not None and resolved_file_path.is_relative_to( + cache_root + ): + continue + origin = resolved_file_path.as_uri() + if origin in yielded_origins: + continue + if not self._include_path( + file_path, + 
origin, + include_patterns=resolved_scope.include_patterns, + exclude_patterns=resolved_scope.exclude_patterns, + include_types=resolved_scope.include_types, + exclude_types=resolved_scope.exclude_types, + ): + continue + yielded_origins.add(origin) + yield origin + count += 1 + if resolved_scope.limit is not None and count >= resolved_scope.limit: + return + + def fetch_raw( + self, + origin: str, + *, + cache_force_refresh: bool = False, + ) -> FetchedSource: + path = _path_from_file_origin(origin).resolve() + assert path.is_file(), f"File origin must exist: {origin}" + canonical_origin = path.as_uri() + content_type = mimetypes.guess_type(path.name)[0] + type_label = _detect_type_label(path=path, content_type=content_type) + source_hash = _sha256_path(path) + markdown_path: Path | None = None + if self.cache_dir is not None and not cache_force_refresh: + cached_entry = self._cache.fetch(canonical_origin) + if cached_entry is not None: + cached_markdown_path, cached_meta = cached_entry + if ( + cached_markdown_path is not None + and cached_meta is not None + and ( + cached_meta.get("source_hash") == source_hash + and cached_meta.get("mtime_ns") == path.stat().st_mtime_ns + ) + ): + markdown_path = cached_markdown_path + return FetchedSource( + origin=canonical_origin, + resolved_origin=canonical_origin, + content_type=content_type, + status_code=None, + metadata={ + "mtime_ns": path.stat().st_mtime_ns, + "size": path.stat().st_size, + "source_hash": source_hash, + "type_label": type_label, + }, + fetched_at=datetime.fromtimestamp(path.stat().st_mtime, tz=timezone.utc), + body_path=path, + markdown_path=markdown_path, + ) + + def markdown_documents( + self, + scope: CrawlScope, + *, + convert: Callable[[FetchedSource], MarkdownDocument] | None = None, + progress: bool = True, + cache_force_refresh: bool = False, + ) -> Iterator[MarkdownDocument]: + origins = self.origins( + scope, + progress=progress, + cache_force_refresh=cache_force_refresh, + ) + yield from 
_map_ordered( + origins, + max_workers=self.max_workers, + fn=lambda origin: self.fetch_markdown( + origin, + convert=convert, + cache_force_refresh=cache_force_refresh, + ), + ) + + def _default_convert(self, source: FetchedSource) -> MarkdownDocument: + if source.markdown_path is not None and source.markdown_path.exists(): + markdown = source.markdown_path.read_text(encoding="utf-8") + return MarkdownDocument(origin=source.origin, content=markdown) + + type_label = (source.metadata or {}).get("type_label") + if type_label == "markdown": + markdown = source.body_path.read_text(encoding="utf-8") + else: + markdown = _convert_to_markdown(str(source.body_path)) + + if self.cache_dir is not None: + self._cache.upsert( + source.origin, + content=markdown, + metadata={ + "origin": source.origin, + "mtime_ns": (source.metadata or {}).get("mtime_ns"), + "source_hash": (source.metadata or {}).get("source_hash"), + }, + content_ext=".md", + ) + + return MarkdownDocument(origin=source.origin, content=markdown) + + def _include_path( + self, + path: Path, + origin: str, + *, + include_patterns: Sequence[str], + exclude_patterns: Sequence[str], + include_types: set[str], + exclude_types: set[str], + ) -> bool: + if not _matches_patterns( + origin, + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + ): + return False + if not include_types and not exclude_types: + return True + label = _detect_type_label( + path=path, content_type=mimetypes.guess_type(path.name)[0] + ) + return _matches_types( + label, + include_types=include_types, + exclude_types=exclude_types, + ) + + +class WebCrawler(BaseCrawler): + def __init__( + self, + *, + session: requests.Session | None = None, + cache_dir: bool | str | Path | None = None, + cache_stale_after: timedelta | None = None, + max_workers: int = 1, + ) -> None: + assert max_workers >= 1 + self.session = requests.Session() if session is None else session + self._cache_context = None if session is None else 
f"session:{id(self.session)}" + self.cache_dir = _resolve_cache_dir( + cache_dir, + backend_name="web", + default_factory=lambda: Path( + tempfile.mkdtemp(prefix="raghilda-web-cache-") + ), + ) + self.cache_stale_after = cache_stale_after + self.max_workers = max_workers + self._cache = _WebCrawlerCache(self.cache_dir) + + def origins( + self, + scope: CrawlScope, + *, + progress: bool = True, + cache_force_refresh: bool = False, + ) -> Iterator[str]: + del progress + resolved_scope = _resolve_crawl_scope(scope) + if resolved_scope.limit == 0: + return + visited: set[tuple[str, WebOriginKey, str]] = set() + yielded_origins: set[str] = set() + yielded = 0 + frontier: list[tuple[str, WebOriginKey, str]] = [] + + for root in resolved_scope.roots: + canonical_root = _canonicalize_web_url(str(root)) + assert canonical_root is not None + parsed = urlparse(canonical_root) + assert parsed.scheme in {"http", "https"} + root_host = parsed.hostname or "" + frontier.append( + (canonical_root, _web_origin_key(canonical_root), root_host) + ) + + current_depth = 0 + while frontier: + batch: list[tuple[str, WebOriginKey, str]] = [] + for origin, scope_origin, root_host in frontier: + visit_key = (origin, scope_origin, root_host) + if visit_key in visited: + continue + if not self._allow_origin( + origin, + scope_origin, + root_host, + include_external_links=resolved_scope.include_external_links, + include_subdomains=resolved_scope.include_subdomains, + ): + continue + if _matches_exclude_patterns( + origin, + exclude_patterns=resolved_scope.exclude_patterns, + ): + continue + visited.add(visit_key) + batch.append((origin, scope_origin, root_host)) + + next_frontier: list[tuple[str, WebOriginKey, str]] = [] + offset = 0 + while offset < len(batch): + remaining = ( + None + if resolved_scope.limit is None + else resolved_scope.limit - yielded + ) + if remaining == 0: + return + chunk_size = len(batch) - offset + if remaining is not None: + chunk_size = min(chunk_size, remaining) + 
window = batch[offset : offset + chunk_size] + fetched_sources = _map_ordered( + window, + max_workers=min(self.max_workers, len(window)), + fn=lambda item: ( + item, + self.fetch_raw( + item[0], + cache_force_refresh=cache_force_refresh, + ), + ), + ) + for (origin, scope_origin, root_host), source in fetched_sources: + type_label = (source.metadata or {}).get("type_label") + matches_patterns = _matches_patterns( + origin, + include_patterns=resolved_scope.include_patterns, + exclude_patterns=resolved_scope.exclude_patterns, + ) + matches_types = _matches_types( + type_label, + include_types=resolved_scope.include_types, + exclude_types=resolved_scope.exclude_types, + ) + if ( + matches_patterns + and matches_types + and origin not in yielded_origins + ): + yield origin + yielded_origins.add(origin) + yielded += 1 + if ( + resolved_scope.limit is not None + and yielded >= resolved_scope.limit + ): + return + if current_depth >= resolved_scope.depth: + continue + + text = _read_text(source.body_path) + resolved_origin = source.resolved_origin or origin + resolved_origin_key = _web_origin_key(resolved_origin) + origin_key = _web_origin_key(origin) + child_root_host = root_host + if ( + resolved_scope.include_subdomains + and resolved_origin_key == origin_key + ): + child_scope_origin = scope_origin + else: + child_scope_origin = resolved_origin_key + child_root_host = ( + urlparse(resolved_origin).hostname or root_host + ) + for link in sorted(_extract_links(text)): + canonical = _canonicalize_web_url(link, base=resolved_origin) + if canonical is None: + continue + parsed = urlparse(canonical) + if parsed.scheme not in {"http", "https"}: + continue + next_frontier.append( + (canonical, child_scope_origin, child_root_host) + ) + offset += chunk_size + frontier = next_frontier + current_depth += 1 + + def fetch_raw( + self, + origin: str, + *, + cache_force_refresh: bool = False, + ) -> FetchedSource: + canonical_origin = _canonicalize_web_url(origin) + assert 
canonical_origin is not None + parsed = urlparse(canonical_origin) + assert parsed.scheme in {"http", "https"} + + cached_entry = self._cache.fetch(canonical_origin) + body_path: Path | None = None + cached_meta: dict[str, Any] | None = None + if cached_entry is not None: + body_path, cached_meta = cached_entry + has_cache = ( + body_path is not None + and cached_meta is not None + and self._cache_context_matches(cached_meta) + ) + now = _utcnow() + + if has_cache and not cache_force_refresh: + assert cached_meta is not None + assert body_path is not None + if self._is_fresh(cached_meta, now): + return self._source_from_meta(cached_meta, body_path=body_path) + + headers: dict[str, str] = {} + if has_cache and not cache_force_refresh: + assert cached_meta is not None + etag = cached_meta.get("etag") + last_modified = cached_meta.get("last_modified") + if etag: + headers["If-None-Match"] = etag + if last_modified: + headers["If-Modified-Since"] = last_modified + + response = self.session.get(canonical_origin, headers=headers, timeout=30.0) + if response.status_code == 304 and has_cache: + assert cached_meta is not None + assert body_path is not None + cached_meta["revalidated_at"] = now.isoformat() + cached_entry = self._cache.upsert( + canonical_origin, + content=body_path, + metadata=cached_meta, + content_ext=None, + ) + assert cached_entry is not None + body_path, cached_meta = cached_entry + assert body_path is not None + assert cached_meta is not None + return self._source_from_meta(cached_meta, body_path=body_path) + + response.raise_for_status() + content_type = response.headers.get("Content-Type") + resolved_origin = ( + _canonicalize_web_url(response.url, base=canonical_origin) or response.url + ) + type_label = _detect_type_label( + path=_type_hint_path(canonical_origin, content_type=content_type), + content_type=content_type, + ) + meta = { + "origin": canonical_origin, + "resolved_origin": resolved_origin, + "content_type": content_type, + "status_code": 
response.status_code, + "etag": response.headers.get("ETag"), + "last_modified": response.headers.get("Last-Modified"), + "type_label": type_label, + "fetched_at": now.isoformat(), + "revalidated_at": None, + "cache_context": self._cache_context, + } + cached_entry = self._cache.upsert( + canonical_origin, + content=response.content, + metadata=meta, + content_ext=_known_body_suffix( + canonical_origin, + content_type=content_type, + ), + ) + assert cached_entry is not None + body_path, meta = cached_entry + assert body_path is not None + assert meta is not None + actual_type_label = _detect_type_label( + path=body_path, + content_type=content_type, + ) + if actual_type_label != meta.get("type_label"): + meta["type_label"] = actual_type_label + cached_entry = self._cache.upsert( + canonical_origin, + content=body_path, + metadata=meta, + content_ext=None, + ) + assert cached_entry is not None + body_path, meta = cached_entry + assert body_path is not None + assert meta is not None + return self._source_from_meta(meta, body_path=body_path) + + def _fetch_raw_after_origin_discovery(self, origin: str) -> FetchedSource: + canonical_origin = _canonicalize_web_url(origin) + assert canonical_origin is not None + cached_entry = self._cache.fetch(canonical_origin) + assert cached_entry is not None + body_path, cached_meta = cached_entry + assert body_path is not None + assert cached_meta is not None + return self._source_from_meta(cached_meta, body_path=body_path) + + def _default_convert(self, source: FetchedSource) -> MarkdownDocument: + type_label = (source.metadata or {}).get("type_label") + if type_label == "markdown": + markdown = _read_text(source.body_path) + else: + path_for_conversion = source.body_path + if source.body_path.suffix == "": + suffix = _body_suffix( + source.origin, + content_type=source.content_type, + ) + with tempfile.NamedTemporaryFile( + prefix="raghilda-convert-", + suffix=suffix, + delete=False, + ) as temporary_file: + temporary_path = 
Path(temporary_file.name) + temporary_file.write(source.body_path.read_bytes()) + try: + path_for_conversion = temporary_path + markdown = _convert_to_markdown(str(path_for_conversion)) + finally: + temporary_path.unlink(missing_ok=True) + else: + markdown = _convert_to_markdown(str(path_for_conversion)) + return MarkdownDocument(origin=source.origin, content=markdown) + + def _source_from_meta( + self, + meta: dict[str, Any], + *, + body_path: Path, + ) -> FetchedSource: + return FetchedSource( + origin=meta["origin"], + resolved_origin=meta.get("resolved_origin"), + content_type=meta.get("content_type"), + status_code=meta.get("status_code"), + metadata={ + "etag": meta.get("etag"), + "last_modified": meta.get("last_modified"), + "type_label": meta.get("type_label"), + }, + fetched_at=_parse_datetime(meta.get("fetched_at")), + revalidated_at=_parse_datetime(meta.get("revalidated_at")), + body_path=body_path, + ) + + def _is_fresh(self, cached_meta: dict[str, Any], now: datetime) -> bool: + if self.cache_stale_after is None: + return True + timestamps = [ + _parse_datetime(cached_meta.get("fetched_at")), + _parse_datetime(cached_meta.get("revalidated_at")), + ] + freshest_cache_time = max( + (timestamp for timestamp in timestamps if timestamp is not None), + default=None, + ) + if freshest_cache_time is None: + return False + return now - freshest_cache_time <= self.cache_stale_after + + def _cache_context_matches(self, cached_meta: dict[str, Any]) -> bool: + return cached_meta.get("cache_context") == self._cache_context + + def _allow_origin( + self, + origin: str, + scope_origin: WebOriginKey, + root_host: str, + *, + include_external_links: bool, + include_subdomains: bool, + ) -> bool: + return _allow_web_origin( + origin, + scope_origin, + root_host, + include_external_links=include_external_links, + include_subdomains=include_subdomains, + ) + + +class CloudflareCrawler(BaseCrawler): + def __init__( + self, + *, + account_id: str, + api_token: str, + 
cache_dir: bool | str | Path | None = None, + session: requests.Session | Any | None = None, + source: str = "all", + render: bool = True, + cache_stale_after: timedelta | None = None, + modified_since: int | None = None, + poll_interval: float = 5.0, + max_poll_attempts: int = 60, + max_workers: int = 1, + base_url: str = "https://api.cloudflare.com/client/v4", + ) -> None: + assert max_workers >= 1 + self.account_id = account_id + self.api_token = api_token + self.cache_dir = _resolve_cache_dir( + cache_dir, + backend_name="cloudflare", + default_factory=lambda: Path( + tempfile.mkdtemp(prefix="raghilda-cloudflare-cache-") + ), + ) + self.session = session or requests.Session() + self.source = source + self.render = render + self.cache_stale_after = cache_stale_after + self.modified_since = modified_since + self.poll_interval = poll_interval + self.max_poll_attempts = max_poll_attempts + self.max_workers = max_workers + self.base_url = base_url.rstrip("/") + self._records: dict[str, _CloudflareRecordCacheEntry] = {} + self._roots: dict[tuple[Any, ...], _CloudflareRootCacheEntry] = {} + self._cache = _CloudflareCrawlerCache(self.cache_dir) + + def origins( + self, + scope: CrawlScope, + *, + progress: bool = True, + cache_force_refresh: bool = False, + ) -> Iterator[str]: + del progress + resolved_scope = _resolve_crawl_scope(scope) + yielded = 0 + yielded_origins: set[str] = set() + crawled_roots: set[str] = set() + for root in resolved_scope.roots: + if resolved_scope.limit is not None and yielded >= resolved_scope.limit: + return + canonical_root = _canonicalize_web_url(str(root)) + assert canonical_root is not None + if canonical_root in crawled_roots: + continue + crawled_roots.add(canonical_root) + remaining = ( + None if resolved_scope.limit is None else resolved_scope.limit - yielded + ) + root_limit = remaining if not yielded_origins else None + records = self._crawl_root( + canonical_root, + cache_force_refresh=cache_force_refresh, + 
depth=resolved_scope.depth, + include_patterns=resolved_scope.include_patterns, + exclude_patterns=resolved_scope.exclude_patterns, + include_external_links=resolved_scope.include_external_links, + include_subdomains=resolved_scope.include_subdomains, + limit=root_limit, + ) + for record in records: + origin = record["url"] + if origin in yielded_origins: + continue + label = _detect_type_label( + path=None, + content_type="text/markdown", + ) + if not _matches_types( + label, + include_types=resolved_scope.include_types, + exclude_types=resolved_scope.exclude_types, + ): + continue + yielded_origins.add(origin) + yield origin + yielded += 1 + if resolved_scope.limit is not None and yielded >= resolved_scope.limit: + return + + def fetch_raw( + self, + origin: str, + *, + cache_force_refresh: bool = False, + ) -> FetchedSource: + canonical_origin = _canonicalize_web_url(origin) + assert canonical_origin is not None + record_entry = ( + None if cache_force_refresh else self._records.get(canonical_origin) + ) + if record_entry is not None and not self._cloudflare_cache_is_fresh( + record_entry.fetched_at + ): + record_entry = None + if record_entry is None and not cache_force_refresh: + record_entry = self._load_record_cache_entry(canonical_origin) + if record_entry is not None: + self._records[canonical_origin] = record_entry + if record_entry is None or cache_force_refresh: + records = self._crawl_root( + canonical_origin, + cache_force_refresh=cache_force_refresh, + depth=0, + limit=1, + apply_patterns=False, + include_external_links=False, + include_subdomains=False, + ) + record = next( + (item for item in records if item["url"] == canonical_origin), + None, + ) + if record is None and len(records) == 1: + record = records[0] + if record is None: + raise ValueError(f"Cloudflare crawl did not return record for {origin}") + record_entry = self._records.get(record["url"]) + assert record_entry is not None + self._records[canonical_origin] = record_entry + + assert 
record_entry is not None + return self._source_from_record_entry(canonical_origin, record_entry) + + def _fetch_raw_after_origin_discovery(self, origin: str) -> FetchedSource: + canonical_origin = _canonicalize_web_url(origin) + assert canonical_origin is not None + record_entry = self._records.get(canonical_origin) + if record_entry is None: + record_entry = self._load_record_cache_entry(canonical_origin) + assert record_entry is not None + self._records[canonical_origin] = record_entry + return self._source_from_record_entry(canonical_origin, record_entry) + + def _source_from_record_entry( + self, + canonical_origin: str, + record_entry: _CloudflareRecordCacheEntry, + ) -> FetchedSource: + content_path, _ = self._store_record_cache_entry( + canonical_origin, + record=record_entry.record, + fetched_at=record_entry.fetched_at, + ) + assert content_path is not None + record = record_entry.record + return FetchedSource( + origin=canonical_origin, + resolved_origin=record.get("metadata", {}).get("url", canonical_origin), + content_type="text/markdown", + status_code=record.get("metadata", {}).get("status"), + metadata={ + "crawler_status": record.get("status"), + "title": record.get("metadata", {}).get("title"), + "type_label": "markdown", + }, + fetched_at=record_entry.fetched_at, + body_path=content_path, + markdown_path=content_path, + ) + + def _default_convert(self, source: FetchedSource) -> MarkdownDocument: + markdown = source.body_path.read_text(encoding="utf-8") + return MarkdownDocument(origin=source.origin, content=markdown) + + def _crawl_root( + self, + root: str, + *, + cache_force_refresh: bool, + depth: int | None = None, + include_patterns: Sequence[str] | None = None, + exclude_patterns: Sequence[str] | None = None, + include_external_links: bool, + include_subdomains: bool, + limit: int | None = None, + apply_patterns: bool = True, + ) -> list[dict[str, Any]]: + resolved_depth = _DEFAULT_CRAWL_DEPTH if depth is None else depth + 
resolved_include_patterns = list(include_patterns or []) + resolved_exclude_patterns = list(exclude_patterns or []) + resolved_limit = limit + cache_key = ( + root, + resolved_depth, + resolved_limit, + apply_patterns, + tuple(resolved_include_patterns), + tuple(resolved_exclude_patterns), + include_external_links, + include_subdomains, + ) + cached_entry = self._roots.get(cache_key) + if ( + not cache_force_refresh + and cached_entry is not None + and self._cloudflare_cache_is_fresh(cached_entry.fetched_at) + ): + return cached_entry.records + if not cache_force_refresh and apply_patterns: + cached_entry = self._load_root_cache_entry(cache_key) + if cached_entry is not None: + self._roots[cache_key] = cached_entry + return cached_entry.records + + endpoint = f"{self.base_url}/accounts/{self.account_id}/browser-rendering/crawl" + payload = self._crawl_payload( + root, + depth=resolved_depth, + limit=resolved_limit, + include_patterns=resolved_include_patterns, + exclude_patterns=resolved_exclude_patterns, + include_external_links=include_external_links, + include_subdomains=include_subdomains, + cache_force_refresh=cache_force_refresh, + apply_patterns=apply_patterns, + ) + headers = { + "Authorization": f"Bearer {self.api_token}", + "Content-Type": "application/json", + } + response = self.session.post( + endpoint, + json=payload, + headers=headers, + timeout=30.0, + ) + response.raise_for_status() + response_payload = response.json() + job_id = response_payload["result"] + + result: dict[str, Any] | None = None + for _ in range(self.max_poll_attempts): + poll_response = self.session.get( + f"{endpoint}/{job_id}", + headers={"Authorization": f"Bearer {self.api_token}"}, + params={"limit": 1}, + timeout=30.0, + ) + poll_response.raise_for_status() + result = poll_response.json()["result"] + assert result is not None + status = result["status"] + if status == "running": + if self.poll_interval > 0: + time.sleep(self.poll_interval) + continue + if status not in 
_TERMINAL_CLOUDFLARE_STATUSES: + raise ValueError(f"Unexpected Cloudflare crawl status: {status}") + if status != "completed": + raise ValueError(f"Cloudflare crawl ended with status '{status}'") + break + else: + raise TimeoutError("Cloudflare crawl did not complete within the timeout") + + assert result is not None + full_response = self.session.get( + f"{endpoint}/{job_id}", + headers={"Authorization": f"Bearer {self.api_token}"}, + params=None, + timeout=30.0, + ) + full_response.raise_for_status() + result = full_response.json()["result"] + assert result is not None + + records = list(result.get("records") or []) + cursor = result.get("cursor") + while cursor is not None: + page_response = self.session.get( + f"{endpoint}/{job_id}", + headers={"Authorization": f"Bearer {self.api_token}"}, + params={"cursor": cursor, "status": "completed"}, + timeout=30.0, + ) + page_response.raise_for_status() + page_result = page_response.json()["result"] + records.extend(page_result.get("records") or []) + cursor = page_result.get("cursor") + + scope_origin = _web_origin_key(root) + root_host = urlparse(root).hostname or "" + completed_records = [] + for record in records: + if record.get("status") != "completed": + continue + canonical_url = _canonicalize_web_url(record["url"]) + if canonical_url is None: + continue + if ( + apply_patterns + and not _allow_web_origin( + canonical_url, + scope_origin, + root_host, + include_external_links=include_external_links, + include_subdomains=include_subdomains, + ) + and not _is_cloudflare_seed_redirect_target(root, canonical_url) + ): + continue + if canonical_url != record["url"]: + record = dict(record) + record["url"] = canonical_url + completed_records.append(record) + if apply_patterns: + completed_records = [ + record + for record in completed_records + if _matches_cloudflare_patterns( + record["url"], + include_patterns=resolved_include_patterns, + exclude_patterns=resolved_exclude_patterns, + ) + ] + fetched_at = _utcnow() + 
self._roots[cache_key] = _CloudflareRootCacheEntry( + fetched_at=fetched_at, + records=completed_records, + ) + if apply_patterns: + self._store_root_cache_entry( + cache_key, + records=completed_records, + fetched_at=fetched_at, + ) + for record in completed_records: + self._records[record["url"]] = _CloudflareRecordCacheEntry( + fetched_at=fetched_at, + record=record, + ) + self._store_record_cache_entry( + record["url"], + record=record, + fetched_at=fetched_at, + ) + return completed_records + + def _crawl_payload( + self, + root: str, + *, + depth: int, + limit: int | None, + include_patterns: Sequence[str], + exclude_patterns: Sequence[str], + include_external_links: bool, + include_subdomains: bool, + cache_force_refresh: bool, + apply_patterns: bool, + ) -> dict[str, Any]: + payload: dict[str, Any] = { + "url": root, + "depth": depth, + "formats": ["markdown"], + "render": self.render, + "source": self.source, + "options": { + "includeExternalLinks": include_external_links, + "includeSubdomains": include_subdomains, + }, + } + if limit is not None: + payload["limit"] = limit + if apply_patterns and include_patterns: + payload["options"]["includePatterns"] = list(include_patterns) + if apply_patterns and exclude_patterns: + payload["options"]["excludePatterns"] = list(exclude_patterns) + if self.modified_since is not None: + payload["modifiedSince"] = self.modified_since + if cache_force_refresh: + payload["maxAge"] = 0 + elif self.cache_stale_after is not None: + payload["maxAge"] = int(self.cache_stale_after.total_seconds()) + return payload + + def _record_cache_signature(self) -> dict[str, Any]: + return { + "account_id": self.account_id, + "base_url": self.base_url, + "render": self.render, + "source": self.source, + "modified_since": self.modified_since, + } + + def _root_cache_key(self, cache_key: tuple[Any, ...]) -> str: + payload = { + "cache_key": cache_key, + "signature": self._record_cache_signature(), + } + encoded = json.dumps(payload, 
sort_keys=True, separators=(",", ":")) + return f"cloudflare-root:{encoded}" + + def _load_root_cache_entry( + self, + cache_key: tuple[Any, ...], + ) -> _CloudflareRootCacheEntry | None: + cached_entry = self._cache.fetch(self._root_cache_key(cache_key)) + if cached_entry is None: + return None + _, cached_meta = cached_entry + if cached_meta is None: + return None + if cached_meta.get("signature") != self._record_cache_signature(): + return None + fetched_at = _parse_datetime(cached_meta.get("fetched_at")) + if fetched_at is None or not self._cloudflare_cache_is_fresh(fetched_at): + return None + records = cached_meta["records"] + for record in records: + self._records[record["url"]] = _CloudflareRecordCacheEntry( + fetched_at=fetched_at, + record=record, + ) + return _CloudflareRootCacheEntry( + fetched_at=fetched_at, + records=records, + ) + + def _store_root_cache_entry( + self, + cache_key: tuple[Any, ...], + *, + records: list[dict[str, Any]], + fetched_at: datetime, + ) -> None: + self._cache.upsert( + self._root_cache_key(cache_key), + content=None, + metadata={ + "fetched_at": fetched_at.isoformat(), + "records": records, + "signature": self._record_cache_signature(), + }, + content_ext=None, + ) + + def _load_record_cache_entry( + self, + origin: str, + ) -> _CloudflareRecordCacheEntry | None: + cached_entry = self._cache.fetch(origin) + if cached_entry is None: + return None + _, cached_meta = cached_entry + if cached_meta is None: + return None + if cached_meta.get("signature") != self._record_cache_signature(): + return None + fetched_at = _parse_datetime(cached_meta.get("fetched_at")) + if fetched_at is None or not self._cloudflare_cache_is_fresh(fetched_at): + return None + record = cached_meta["record"] + return _CloudflareRecordCacheEntry( + fetched_at=fetched_at, + record=record, + ) + + def _store_record_cache_entry( + self, + origin: str, + *, + record: dict[str, Any], + fetched_at: datetime, + ) -> CacheValue: + cached_entry = 
self._cache.upsert( + origin, + content=record["markdown"], + metadata={ + "origin": origin, + "fetched_at": fetched_at.isoformat(), + "record": record, + "signature": self._record_cache_signature(), + }, + content_ext=".md", + ) + assert cached_entry is not None + return cached_entry + + def _cloudflare_cache_is_fresh(self, fetched_at: datetime) -> bool: + if self.cache_stale_after is None: + return True + return _utcnow() - fetched_at <= self.cache_stale_after + + +def _coerce_roots(roots: RootsInput) -> list[RootInput]: + if isinstance(roots, (str, Path)): + return [roots] + return list(roots) + + +def _resolve_crawl_scope(scope: CrawlScope) -> _ResolvedCrawlScope: + return _ResolvedCrawlScope( + roots=_coerce_roots(scope.roots), + include_patterns=_coerce_string_sequence(scope.include_patterns), + exclude_patterns=_coerce_string_sequence(scope.exclude_patterns), + depth=_DEFAULT_CRAWL_DEPTH if scope.depth is None else scope.depth, + limit=scope.limit, + include_types=_normalize_types(scope.include_types), + exclude_types=_normalize_types(scope.exclude_types), + include_external_links=scope.include_external_links, + include_subdomains=scope.include_subdomains, + ) + + +def _coerce_string_sequence(values: Sequence[str] | str | None) -> list[str]: + if values is None: + return [] + if isinstance(values, str): + return [values] + return list(values) + + +def _canonicalize_web_url(target: str, *, base: str | None = None) -> str | None: + url = urljoin(base, target) if base else target + if not url: + return None + url, _ = urldefrag(url) + parsed = urlparse(url) + scheme = parsed.scheme.lower() + if scheme != parsed.scheme: + parsed = parsed._replace(scheme=scheme) + url = urlunparse(parsed) + if parsed.scheme not in {"http", "https"}: + return None + if not parsed.netloc: + return None + try: + parsed.port + except ValueError: + return None + netloc = _canonical_netloc(parsed) + if netloc != parsed.netloc: + parsed = parsed._replace(netloc=netloc) + if parsed.path 
== "/" and not parsed.params: + parsed = parsed._replace(path="") + return urlunparse(parsed) + + +def _canonical_netloc(parsed: Any) -> str: + userinfo = "" + if "@" in parsed.netloc: + userinfo = f"{parsed.netloc.rsplit('@', 1)[0]}@" + host = parsed.hostname or "" + if ":" in host and not host.startswith("["): + host = f"[{host}]" + port = parsed.port + if port is None: + return f"{userinfo}{host}" + if parsed.scheme == "http" and port == 80: + return f"{userinfo}{host}" + if parsed.scheme == "https" and port == 443: + return f"{userinfo}{host}" + return f"{userinfo}{host}:{port}" + + +def _web_origin_key(origin: str) -> WebOriginKey: + parsed = urlparse(origin) + scheme = parsed.scheme.lower() + port = parsed.port + if port is None and scheme == "http": + port = 80 + elif port is None and scheme == "https": + port = 443 + return scheme, parsed.hostname or "", port + + +def _allow_web_origin( + origin: str, + scope_origin: WebOriginKey, + root_host: str, + *, + include_external_links: bool, + include_subdomains: bool, +) -> bool: + parsed = urlparse(origin) + host = parsed.hostname or "" + if not host: + return False + origin_key = _web_origin_key(origin) + if origin_key == scope_origin: + return True + if include_external_links: + return True + if not include_subdomains: + return False + return ( + origin_key[0] == scope_origin[0] + and origin_key[2] == scope_origin[2] + and host.endswith(f".{root_host}") + ) + + +def _is_cloudflare_seed_redirect_target(root: str, target: str) -> bool: + root_parsed = urlparse(root) + target_parsed = urlparse(target) + if root_parsed.scheme not in {"http", "https"}: + return False + if target_parsed.scheme not in {"http", "https"}: + return False + if root_parsed.port is not None or target_parsed.port is not None: + return False + + root_host = _redirect_host_key(root_parsed.hostname or "") + target_host = _redirect_host_key(target_parsed.hostname or "") + return root_host != "" and root_host == target_host + + +def 
_redirect_host_key(host: str) -> str: + host = host.lower() + if host.startswith("www."): + return host[4:] + return host + + +def _resolve_cache_dir( + cache_dir: bool | str | Path | None, + *, + backend_name: str, + default_factory: Callable[[], Path | None], +) -> Path | None: + if cache_dir is None: + return default_factory() + if isinstance(cache_dir, bool): + if cache_dir is True: + return Path.cwd() / ".raghilda" / "cache" / backend_name + raise TypeError("cache_dir must be None, True, or a filesystem path") + return Path(cache_dir).resolve() + + +def _to_directory_path(root: str | Path) -> Path: + if isinstance(root, Path): + return root + value = str(root) + if re.match(r"^[A-Za-z]:(?:[\\/]|$)", value): + return Path(value) + parsed = urlparse(value) + if parsed.scheme == "file": + return _path_from_file_uri(value) + assert parsed.scheme in {"", "file"} + return Path(value) + + +def _iter_directory_files(root: Path, *, max_depth: int) -> Iterator[Path]: + yield from _iter_directory_files_from( + root, + root=root, + resolved_root=root.resolve(), + max_depth=max_depth, + ) + + +def _iter_directory_files_from( + directory: Path, + *, + root: Path, + resolved_root: Path, + max_depth: int, +) -> Iterator[Path]: + for child in sorted(directory.iterdir()): + if not child.resolve().is_relative_to(resolved_root): + continue + if child.is_file(): + yield child + continue + if child.is_symlink(): + continue + if not child.is_dir(): + continue + child_depth = len(child.relative_to(root).parts) - 1 + if child_depth < max_depth: + yield from _iter_directory_files_from( + child, + root=root, + resolved_root=resolved_root, + max_depth=max_depth, + ) + + +def _path_from_file_uri(origin: str) -> Path: + parsed = urlparse(origin) + assert parsed.scheme == "file" + raw_path = parsed.path + if parsed.netloc and parsed.netloc != "localhost": + raw_path = f"//{parsed.netloc}{parsed.path}" + return Path(url2pathname(raw_path)) + + +def _path_from_file_origin(origin: str) -> 
Path: + parsed = urlparse(origin) + if parsed.scheme == "file": + return _path_from_file_uri(origin) + return Path(origin) + + +def _normalize_types(types: Sequence[str] | None) -> set[str]: + if types is None: + return set() + if isinstance(types, str): + types = [types] + return {item.strip().lower() for item in types} + + +def _matches_patterns( + origin: str, + *, + include_patterns: Sequence[str], + exclude_patterns: Sequence[str], +) -> bool: + if _matches_exclude_patterns(origin, exclude_patterns=exclude_patterns): + return False + if not include_patterns: + return True + return any(re.search(pattern, origin) for pattern in include_patterns) + + +def _matches_exclude_patterns( + origin: str, + *, + exclude_patterns: Sequence[str], +) -> bool: + return any(re.search(pattern, origin) for pattern in exclude_patterns) + + +def _matches_cloudflare_patterns( + origin: str, + *, + include_patterns: Sequence[str], + exclude_patterns: Sequence[str], +) -> bool: + for pattern in exclude_patterns: + if _wildcard_matches(origin, pattern): + return False + if not include_patterns: + return True + return any(_wildcard_matches(origin, pattern) for pattern in include_patterns) + + +def _wildcard_matches(origin: str, pattern: str) -> bool: + placeholder = "\0" + regex = re.escape(pattern) + regex = regex.replace(r"/\*\*", "(?:/.*)?") + regex = regex.replace(r"\*\*", placeholder) + regex = regex.replace(r"\*", "[^/]*") + regex = regex.replace(placeholder, ".*") + return re.fullmatch(regex, origin) is not None + + +def _matches_types( + label: str | None, + *, + include_types: set[str], + exclude_types: set[str], +) -> bool: + normalized = label.lower() if label is not None else None + if normalized is not None and normalized in exclude_types: + return False + if not include_types: + return True + return normalized in include_types + + +def _detect_type_label( + *, + path: Path | None, + content_type: str | None, +) -> str | None: + if path is not None: + alias = 
_TYPE_ALIASES.get(path.suffix.lower()) + if alias is not None: + return alias + normalized_content_type = _normalize_content_type(content_type) + if normalized_content_type in _CONTENT_TYPE_LABELS: + return _CONTENT_TYPE_LABELS[normalized_content_type] + if path is not None and path.exists() and _MAGIKA is not None: + result = _MAGIKA.identify_path(path) + return _MAGIKA_LABELS.get(result.output.label, result.output.label) + return None + + +def _normalize_content_type(content_type: str | None) -> str | None: + if content_type is None: + return None + return content_type.split(";", 1)[0].strip().lower() + + +def _sha256_path(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + while True: + chunk = handle.read(8192) + if not chunk: + break + digest.update(chunk) + return digest.hexdigest() + + +def _read_text(path: Path) -> str: + return path.read_text(encoding="utf-8", errors="ignore") + + +def _known_body_suffix(origin: str, *, content_type: str | None) -> str | None: + normalized = _normalize_content_type(content_type) + if normalized == "text/html": + return ".html" + if normalized == "text/markdown": + return ".md" + if normalized == "text/plain": + return ".txt" + if normalized in {"application/xml", "text/xml"}: + return ".xml" + if normalized == "text/x-python": + return ".py" + if normalized == "application/json": + return ".json" + if normalized == "application/pdf": + return ".pdf" + parsed = urlparse(origin) + suffix = Path(parsed.path).suffix + if suffix: + return suffix + return None + + +def _body_suffix(origin: str, *, content_type: str | None) -> str: + suffix = _known_body_suffix(origin, content_type=content_type) + if suffix is not None: + return suffix + return ".bin" + + +def _type_hint_path(origin: str, *, content_type: str | None) -> Path: + suffix = _body_suffix(origin, content_type=content_type) + return Path("source").with_suffix(suffix) + + +def _utcnow() -> datetime: + return datetime.now(timezone.utc) + + 
+def _parse_datetime(value: str | None) -> datetime | None: + if value is None: + return None + return datetime.fromisoformat(value) diff --git a/src/raghilda/scrape.py b/src/raghilda/scrape.py index 15a81cc..bf07c37 100644 --- a/src/raghilda/scrape.py +++ b/src/raghilda/scrape.py @@ -38,7 +38,7 @@ def _extract_links(txt: str) -> set[str]: root = ET.fromstring(txt) for loc in root.findall(".//{*}url/{*}loc"): if loc is not None and loc.text: - links.update(loc.text.strip()) + links.add(loc.text.strip()) except Exception: pass diff --git a/src/raghilda/store.py b/src/raghilda/store.py index d790399..8847339 100644 --- a/src/raghilda/store.py +++ b/src/raghilda/store.py @@ -1,4 +1,4 @@ -from ._store import BaseStore, WriteResult +from ._store import BaseStore, IngestSummary, WriteResult from ._duckdb_store import DuckDBStore from ._openai_store import OpenAIStore from ._chroma_store import ChromaDBStore @@ -6,6 +6,7 @@ __all__ = [ "BaseStore", "WriteResult", + "IngestSummary", "DuckDBStore", "OpenAIStore", "ChromaDBStore", diff --git a/tests/test_api_contract.py b/tests/test_api_contract.py index 79d4095..df1b53c 100644 --- a/tests/test_api_contract.py +++ b/tests/test_api_contract.py @@ -1,11 +1,27 @@ +import inspect from types import SimpleNamespace import pytest +import raghilda.crawl as crawl_module from raghilda.chunk import MarkdownChunk +from raghilda.crawl import ( + BaseCrawler, + CrawlScope, + CloudflareCrawler, + DirectoryCrawler, + FetchedSource, + WebCrawler, +) from raghilda.document import Document, MarkdownDocument import raghilda.store as store_module -from raghilda.store import ChromaDBStore, DuckDBStore, OpenAIStore, WriteResult +from raghilda.store import ( + ChromaDBStore, + DuckDBStore, + IngestSummary, + OpenAIStore, + WriteResult, +) def test_document_uses_origin_field_not_id(): @@ -15,23 +31,81 @@ def test_document_uses_origin_field_not_id(): assert not hasattr(doc, "id") -def test_store_api_uses_upsert_not_insert(): +def 
test_store_api_uses_upsert_and_ingest_not_insert(): assert hasattr(DuckDBStore, "upsert") assert hasattr(ChromaDBStore, "upsert") assert hasattr(OpenAIStore, "upsert") - assert not hasattr(DuckDBStore, "ingest") - assert not hasattr(ChromaDBStore, "ingest") - assert not hasattr(OpenAIStore, "ingest") + assert hasattr(DuckDBStore, "ingest") + assert hasattr(ChromaDBStore, "ingest") + assert hasattr(OpenAIStore, "ingest") assert not hasattr(DuckDBStore, "insert") assert not hasattr(ChromaDBStore, "insert") assert not hasattr(OpenAIStore, "insert") -def test_store_exports_write_result_not_insert_result(): +def test_store_exports_write_and_ingest_results_not_insert_result(): assert WriteResult is store_module.WriteResult + assert IngestSummary is store_module.IngestSummary assert not hasattr(store_module, "InsertResult") +def test_store_exports_postgres_store_when_dependency_is_installed(): + pytest.importorskip("psycopg2") + + assert hasattr(store_module, "PostgreSQLStore") + assert "PostgreSQLStore" in store_module.__all__ + + +def test_crawl_exports_public_crawler_types(): + assert crawl_module.BaseCrawler is BaseCrawler + assert crawl_module.CrawlScope is CrawlScope + assert crawl_module.DirectoryCrawler is DirectoryCrawler + assert crawl_module.WebCrawler is WebCrawler + assert crawl_module.CloudflareCrawler is CloudflareCrawler + assert crawl_module.FetchedSource is FetchedSource + + +def test_crawl_scope_owns_traversal_policy() -> None: + assert tuple(inspect.signature(CrawlScope).parameters) == ( + "roots", + "include_patterns", + "exclude_patterns", + "depth", + "limit", + "include_types", + "exclude_types", + "include_external_links", + "include_subdomains", + ) + + +def test_crawler_constructors_keep_backend_and_cache_configuration_only() -> None: + assert tuple(inspect.signature(DirectoryCrawler).parameters) == ( + "cache_dir", + "max_workers", + ) + assert tuple(inspect.signature(WebCrawler).parameters) == ( + "session", + "cache_dir", + 
"cache_stale_after", + "max_workers", + ) + assert tuple(inspect.signature(CloudflareCrawler).parameters) == ( + "account_id", + "api_token", + "cache_dir", + "session", + "source", + "render", + "cache_stale_after", + "modified_since", + "poll_interval", + "max_poll_attempts", + "max_workers", + "base_url", + ) + + def test_openai_upsert_rejects_chunked_document(): class _SinglePage: def __init__(self): diff --git a/tests/test_crawl.py b/tests/test_crawl.py new file mode 100644 index 0000000..15317a8 --- /dev/null +++ b/tests/test_crawl.py @@ -0,0 +1,3129 @@ +from __future__ import annotations + +from contextlib import contextmanager +from datetime import datetime, timedelta, timezone +import fnmatch +import hashlib +import http.server +import json +import os +from pathlib import Path +import re +import socketserver +import threading +from typing import Any +import unicodedata + +import pytest +import raghilda.crawl as crawl_module +from raghilda.crawl import ( + CrawlScope, + CloudflareCrawler, + DirectoryCrawler, + FetchedSource, + WebCrawler, +) +from raghilda.document import MarkdownDocument + +_WINDOWS_RESERVED = { + "CON", + "PRN", + "AUX", + "NUL", + "COM1", + "COM2", + "COM3", + "COM4", + "COM5", + "COM6", + "COM7", + "COM8", + "COM9", + "LPT1", + "LPT2", + "LPT3", + "LPT4", + "LPT5", + "LPT6", + "LPT7", + "LPT8", + "LPT9", +} + + +def _write(tmp_path: Path, relative: str, contents: str) -> Path: + path = tmp_path / relative + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(contents, encoding="utf-8") + return path + + +def _expected_cache_base(key: str) -> str: + value = unicodedata.normalize("NFC", key) + value = value.replace("://", "__") + value = value.replace("\\", "_") + value = value.replace("/", "_") + value = re.sub(r'[\x00-\x1f<>:"|?*]+', "_", value) + value = re.sub(r"\s+", "_", value) + value = re.sub(r"[^A-Za-z0-9._-]+", "_", value) + value = value.strip(" ._") + + if not value: + value = "entry" + + root = value.split(".", 
1)[0].rstrip(" .").upper() + if root in _WINDOWS_RESERVED: + value = f"_{value}" + + if len(value) > 180: + head = 180 // 2 - 2 + tail = 180 - head - 2 + value = f"{value[:head]}..{value[-tail:]}" + + value = value.rstrip(" .") + stem = value or "entry" + digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:12] + return f"{stem}--{digest}" + + +def test_directory_crawler_discovers_and_converts_markdown_documents( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello\n\nDirectory crawler") + _write(tmp_path, "docs/skip.py", "print('skip')") + notebook = _write( + tmp_path, + "docs/notebook.ipynb", + json.dumps( + { + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5, + } + ), + ) + + crawler = DirectoryCrawler() + scope = CrawlScope( + roots=[tmp_path], + depth=3, + include_patterns=[r".*/docs/.*"], + exclude_patterns=[r".*/skip\.py$"], + include_types=["markdown", "jupyter-notebook"], + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert markdown.resolve().as_uri() in origins + assert notebook.resolve().as_uri() in origins + assert all(not origin.endswith("skip.py") for origin in origins) + + source = crawler.fetch_raw(markdown.resolve().as_uri()) + assert isinstance(source, FetchedSource) + assert source.origin == markdown.resolve().as_uri() + assert source.body_path == markdown.resolve() + assert source.status_code is None + + fetched_markdown = crawler.fetch_markdown(markdown.resolve().as_uri()) + assert fetched_markdown == MarkdownDocument( + origin=markdown.resolve().as_uri(), + content="# Hello\n\nDirectory crawler", + ) + + +def test_directory_crawler_convert_override_receives_fetched_source( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello\n\nDirectory crawler") + seen: list[FetchedSource] = [] + + crawler = DirectoryCrawler() + + converted = crawler.fetch_markdown( + markdown.resolve().as_uri(), + convert=lambda source: 
_record_directory_conversion(source, seen), + ) + + assert [item.origin for item in seen] == [markdown.resolve().as_uri()] + assert converted == MarkdownDocument( + origin=markdown.resolve().as_uri(), + content="# Converted\n", + ) + + +def test_directory_crawler_cache_dir_uses_hashed_file_pair( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello\n") + cache_dir = tmp_path / "cache" + crawler = DirectoryCrawler(cache_dir=cache_dir) + + origin = markdown.resolve().as_uri() + document = crawler.fetch_markdown(origin) + + base = _expected_cache_base(origin) + metadata_path = cache_dir / f"{base}.metadata.json" + content_path = cache_dir / f"{base}.md" + assert document == MarkdownDocument(origin=origin, content="# Hello\n") + assert sorted(path.name for path in cache_dir.iterdir()) == [ + content_path.name, + metadata_path.name, + ] + assert json.loads(metadata_path.read_text(encoding="utf-8")) == { + "content_path": content_path.name, + "key": origin, + "metadata": { + "mtime_ns": markdown.stat().st_mtime_ns, + "origin": origin, + "source_hash": hashlib.sha256(markdown.read_bytes()).hexdigest(), + }, + } + + +def test_directory_crawler_cache_dir_true_uses_default_backend_directory( + tmp_path: Path, + monkeypatch, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello\n") + monkeypatch.chdir(tmp_path) + crawler = DirectoryCrawler(cache_dir=True) + + origin = markdown.resolve().as_uri() + crawler.fetch_markdown(origin) + + cache_dir = tmp_path / ".raghilda" / "cache" / "directory" + base = _expected_cache_base(origin) + assert sorted(path.name for path in cache_dir.iterdir()) == [ + f"{base}.md", + f"{base}.metadata.json", + ] + + +def _record_directory_conversion( + source: FetchedSource, seen: list[FetchedSource] +) -> MarkdownDocument: + seen.append(source) + return MarkdownDocument(origin=source.origin, content="# Converted\n") + + +class _ThreadingHTTPServer(socketserver.ThreadingMixIn, http.server.HTTPServer): + 
daemon_threads = True + + +class _RequestHandler(http.server.BaseHTTPRequestHandler): + def do_GET(self) -> None: + path = self.path.split("?", 1)[0] + routes = self.server.routes # type: ignore[attr-defined] + route = routes[path] + self.server.requests.append( # type: ignore[attr-defined] + {"path": path, "headers": dict(self.headers.items())} + ) + if route["etag"] and self.headers.get("If-None-Match") == route["etag"]: + self.send_response(304) + self.send_header("ETag", route["etag"]) + self.end_headers() + return + + body = route["body"].encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", route["content_type"]) + self.send_header("Content-Length", str(len(body))) + if route["etag"]: + self.send_header("ETag", route["etag"]) + self.end_headers() + self.wfile.write(body) + + def log_message(self, format: str, *args: Any) -> None: + return + + +class _FakeWebResponse: + def __init__( + self, + *, + body: str, + url: str, + content_type: str = "text/html; charset=utf-8", + status_code: int = 200, + headers: dict[str, str] | None = None, + ) -> None: + self.url = url + self.content = body.encode("utf-8") + self.headers = {"Content-Type": content_type, **(headers or {})} + self.status_code = status_code + + def raise_for_status(self) -> None: + assert self.status_code < 400 + + +class _FakeWebSession: + def __init__(self, routes: dict[str, dict[str, Any]]) -> None: + self.routes = routes + self.requests: list[tuple[str, dict[str, str]]] = [] + + def get( + self, + url: str, + *, + headers: dict[str, str], + timeout: float, + ) -> _FakeWebResponse: + del timeout + self.requests.append((url, headers)) + route = self.routes[url] + return _FakeWebResponse( + body=route["body"], + url=route.get("resolved_url", url), + content_type=route.get("content_type", "text/html; charset=utf-8"), + status_code=route.get("status_code", 200), + headers=route.get("headers"), + ) + + +@contextmanager +def _serve(routes: dict[str, dict[str, str | None]]): + 
server = _ThreadingHTTPServer(("127.0.0.1", 0), _RequestHandler) + server.routes = routes # type: ignore[attr-defined] + server.requests = [] # type: ignore[attr-defined] + thread = threading.Thread(target=server.serve_forever) + thread.start() + try: + yield server + finally: + server.shutdown() + thread.join() + server.server_close() + + +def test_web_crawler_discovers_origins_and_revalidates_cache(tmp_path: Path) -> None: + with _serve( + { + "/": { + "body": """ + +
+ Guide + Skip + External +
+ + """, + "content_type": "text/html; charset=utf-8", + "etag": "root-v1", + }, + "/guide": { + "body": "

Guide

Hello

", + "content_type": "text/html; charset=utf-8", + "etag": "guide-v1", + }, + "/skip": { + "body": "

Skip

", + "content_type": "text/html; charset=utf-8", + "etag": "skip-v1", + }, + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + root_origin = root_url.rstrip("/") + crawler = WebCrawler( + cache_dir=tmp_path / "cache", + cache_stale_after=timedelta(seconds=0), + ) + scope = CrawlScope( + roots=[root_url], + depth=1, + include_patterns=[rf"^{re.escape(root_origin)}(?:/.*)?$"], + exclude_patterns=[r".*/skip$"], + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert root_origin in origins + assert f"{root_url}guide" in origins + assert all(not origin.endswith("/skip") for origin in origins) + assert all("external.test" not in origin for origin in origins) + + first = crawler.fetch_raw(root_url) + second = crawler.fetch_raw(root_url) + third = crawler.fetch_raw(root_url, cache_force_refresh=True) + server_requests = getattr(server, "requests") + root_requests = [ + request for request in server_requests if request["path"] == "/" + ] + + assert first.body_path == second.body_path == third.body_path + assert second.revalidated_at is not None + assert root_requests[-2]["headers"]["If-None-Match"] == "root-v1" + assert "If-None-Match" not in root_requests[-1]["headers"] + + guide_doc = crawler.fetch_markdown(f"{root_url}guide") + assert guide_doc.origin == f"{root_url}guide" + assert "Guide" in guide_doc.content + + +def test_web_crawler_resolves_relative_links_from_redirect_target( + tmp_path: Path, +) -> None: + session: Any = _FakeWebSession( + { + "https://example.com/docs": { + "body": 'Page', + "resolved_url": "https://example.com/docs/", + }, + "https://example.com/docs/page": { + "body": "
Page
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "redirect-cache", + session=session, + ) + scope = CrawlScope(roots=["https://example.com/docs"], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert "https://example.com/docs" in origins + assert "https://example.com/docs/page" in origins + assert "https://example.com/page" not in origins + + +def test_web_crawler_follows_links_after_redirect_to_different_host( + tmp_path: Path, +) -> None: + session: Any = _FakeWebSession( + { + "https://example.com": { + "body": 'About', + "resolved_url": "https://www.example.com/landing", + }, + "https://www.example.com/about": { + "body": "
About
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "redirect-host-cache", + session=session, + ) + scope = CrawlScope(roots=["https://example.com"], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert "https://example.com" in origins + assert "https://www.example.com/about" in origins + + +def test_web_crawler_include_subdomains_uses_redirect_scope( + tmp_path: Path, +) -> None: + root = "http://example.com" + page = "https://example.com/page" + session: Any = _FakeWebSession( + { + root: { + "body": 'Page', + "resolved_url": "https://example.com/landing", + }, + page: { + "body": "
Page
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "redirect-subdomain-cache", + session=session, + ) + scope = CrawlScope(roots=[root], depth=1, include_subdomains=True) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root, page] + + +def test_web_crawler_include_subdomains_stays_within_requested_host_tree( + tmp_path: Path, +) -> None: + root = "https://docs.example.co.uk/start" + allowed = "https://api.docs.example.co.uk/page" + disallowed_parent = "https://example.co.uk/root" + disallowed_sibling = "https://other.co.uk/page" + session: Any = _FakeWebSession( + { + root: { + "body": ( + f'Allowed' + f'Parent' + f'Sibling' + ), + }, + allowed: {"body": "
Allowed
"}, + disallowed_parent: { + "body": "
Parent
" + }, + disallowed_sibling: { + "body": "
Sibling
" + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "subdomain-cache", + session=session, + ) + scope = CrawlScope( + roots=[root], + depth=1, + include_subdomains=True, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert root in origins + assert allowed in origins + assert disallowed_parent not in origins + assert disallowed_sibling not in origins + + +def test_web_crawler_include_subdomains_keeps_original_scope_host( + tmp_path: Path, +) -> None: + root = "https://docs.example.com/start" + api = "https://api.docs.example.com/page" + cdn = "https://cdn.docs.example.com/asset" + session: Any = _FakeWebSession( + { + root: { + "body": f'API', + }, + api: { + "body": f'CDN', + }, + cdn: {"body": "
CDN
"}, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "subdomain-root-host-cache", + session=session, + ) + scope = CrawlScope( + roots=[root], + depth=2, + include_subdomains=True, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root, api, cdn] + + +def test_web_crawler_excludes_same_host_different_port_by_default( + tmp_path: Path, +) -> None: + root = "http://127.0.0.1:8000" + other_port = "http://127.0.0.1:9000/page" + session: Any = _FakeWebSession( + { + root: { + "body": f'Other', + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "same-host-port-cache", + session=session, + ) + scope = CrawlScope(roots=[root], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_include_subdomains_excludes_same_host_different_port( + tmp_path: Path, +) -> None: + root = "http://127.0.0.1:8000" + other_port = "http://127.0.0.1:9000/page" + session: Any = _FakeWebSession( + { + root: { + "body": f'Other', + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "subdomain-same-host-port-cache", + session=session, + ) + scope = CrawlScope(roots=[root], depth=1, include_subdomains=True) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_treats_explicit_default_port_as_same_origin( + tmp_path: Path, +) -> None: + root = "http://example.com" + explicit_root = "http://example.com:80" + child = "http://example.com/about" + session: Any = _FakeWebSession( + { + root: { + "body": f'About', + }, + child: { + "body": "
About
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "default-port-cache", + session=session, + ) + scope = CrawlScope(roots=[explicit_root], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root, child] + + +def test_web_crawler_deduplicates_explicit_default_port_variants( + tmp_path: Path, +) -> None: + root = "https://example.com" + session: Any = _FakeWebSession( + { + root: { + "body": "
Root
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "default-port-variant-cache", + session=session, + ) + scope = CrawlScope(roots=[root, "https://example.com:443"], depth=0) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_normalizes_uppercase_url_schemes(tmp_path: Path) -> None: + origin = "http://example.com" + page = "https://example.com/page" + session: Any = _FakeWebSession( + { + origin: { + "body": ( + '' + "Page" + ), + }, + page: { + "body": "
Page
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "uppercase-scheme-cache", + session=session, + ) + scope = CrawlScope( + roots=["HTTP://example.com"], depth=1, include_external_links=True + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [origin, page] + + +def test_web_crawler_preserves_url_credentials(tmp_path: Path) -> None: + origin = "https://user:pass@example.com/private" + session: Any = _FakeWebSession( + { + origin: { + "body": "
Private
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "credential-url-cache", + session=session, + ) + + source = crawler.fetch_raw(origin) + + assert source.origin == origin + assert session.requests == [(origin, {})] + + +def test_web_crawler_discovers_urls_from_xml_sitemap(tmp_path: Path) -> None: + sitemap = "https://example.com/sitemap.xml" + page = "https://example.com/docs/page" + session: Any = _FakeWebSession( + { + sitemap: { + "body": ( + '' + '' + f"{page}" + "" + ), + "content_type": "application/xml", + }, + page: { + "body": "
Page
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "sitemap-cache", + session=session, + ) + scope = CrawlScope(roots=[sitemap], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [sitemap, page] + + +def test_web_crawler_allows_later_in_scope_occurrence_of_same_url( + tmp_path: Path, +) -> None: + first_root = "https://alpha.example.com/start" + second_root = "https://docs.example.com/start" + shared = "https://api.docs.example.com/page" + session: Any = _FakeWebSession( + { + first_root: { + "body": f'Shared', + }, + second_root: { + "body": f'Shared', + }, + shared: {"body": "
Shared
"}, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "multi-root-visited-cache", + session=session, + ) + scope = CrawlScope( + roots=[first_root, second_root], + depth=1, + include_subdomains=True, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [first_root, second_root, shared] + + +def test_web_crawler_revisits_shared_page_for_broader_subdomain_scope( + tmp_path: Path, +) -> None: + narrow_root = "https://api.docs.example.com/start" + broad_root = "https://docs.example.com/start" + shared = "https://api.docs.example.com/shared" + sibling = "https://cdn.docs.example.com/asset" + session: Any = _FakeWebSession( + { + narrow_root: { + "body": f'Shared', + }, + broad_root: { + "body": f'Shared', + }, + shared: { + "body": f'Sibling', + }, + sibling: {"body": "
Sibling
"}, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "multi-root-subdomain-cache", + session=session, + ) + scope = CrawlScope( + roots=[narrow_root, broad_root], + depth=2, + include_subdomains=True, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [narrow_root, broad_root, shared, sibling] + + +def test_web_crawler_discovers_matching_descendants_from_filtered_seed( + tmp_path: Path, +) -> None: + with _serve( + { + "/": { + "body": 'Guide', + "content_type": "text/html; charset=utf-8", + "etag": None, + }, + "/docs/guide": { + "body": "
Guide
", + "content_type": "text/html; charset=utf-8", + "etag": None, + }, + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + crawler = WebCrawler( + cache_dir=tmp_path / "filtered-seed-cache", + ) + scope = CrawlScope( + roots=[root_url], + depth=1, + include_patterns=[rf"^{re.escape(root_url)}docs/.*"], + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert root_url not in origins + assert f"{root_url}docs/guide" in origins + + +def test_web_crawler_does_not_fetch_excluded_origins(tmp_path: Path) -> None: + root = "https://example.com" + admin = "https://example.com/admin" + session: Any = _FakeWebSession( + { + root: { + "body": f'Admin', + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "excluded-origin-cache", + session=session, + ) + scope = CrawlScope( + roots=[root], + depth=1, + exclude_patterns=[r"/admin$"], + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_deduplicates_root_url_with_and_without_slash( + tmp_path: Path, +) -> None: + root = "https://example.com" + session: Any = _FakeWebSession( + { + root: { + "body": 'Root', + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "root-slash-cache", + session=session, + ) + scope = CrawlScope(roots=[root], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_deduplicates_root_scope_variants(tmp_path: Path) -> None: + root = "https://example.com" + session: Any = _FakeWebSession( + { + root: { + "body": "
Root
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "root-variant-cache", + session=session, + ) + scope = CrawlScope(roots=[root, f"{root}/"], depth=0) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_deduplicates_queried_root_scope_variants( + tmp_path: Path, +) -> None: + root = "https://example.com?x=1" + session: Any = _FakeWebSession( + { + root: { + "body": "
Root
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "queried-root-variant-cache", + session=session, + ) + scope = CrawlScope(roots=[root, "https://example.com/?x=1"], depth=0) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_normalizes_root_links_from_non_root_pages( + tmp_path: Path, +) -> None: + root = "https://example.com" + page = "https://example.com/docs" + session: Any = _FakeWebSession( + { + page: { + "body": 'Root', + }, + root: { + "body": "
Root
", + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "non-root-root-link-cache", + session=session, + ) + scope = CrawlScope(roots=[page], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [page, root] + assert session.requests == [(page, {}), (root, {})] + + +def test_web_crawler_skips_links_with_malformed_ports(tmp_path: Path) -> None: + root = "https://example.com" + session: Any = _FakeWebSession( + { + root: { + "body": ( + '' + "Bad" + ), + }, + } + ) + crawler = WebCrawler( + cache_dir=tmp_path / "bad-port-cache", + session=session, + ) + scope = CrawlScope(roots=[root], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root] + assert session.requests == [(root, {})] + + +def test_web_crawler_accepts_crawl_scope_for_roots_and_patterns( + tmp_path: Path, +) -> None: + with _serve( + { + "/": { + "body": 'Guide', + "content_type": "text/html; charset=utf-8", + "etag": None, + }, + "/docs/guide": { + "body": "
Guide
", + "content_type": "text/html; charset=utf-8", + "etag": None, + }, + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + crawler = WebCrawler(cache_dir=tmp_path / "scope-cache") + scope = CrawlScope( + roots=[root_url], + depth=1, + include_patterns=[rf"^{re.escape(root_url)}docs/.*"], + ) + + origins = list(crawler.origins(scope, progress=False)) + documents = list(crawler.markdown_documents(scope, progress=False)) + + assert origins == [f"{root_url}docs/guide"] + assert documents == [ + MarkdownDocument(origin=f"{root_url}docs/guide", content="Guide") + ] + + +def test_web_markdown_documents_reuses_refreshed_sources( + tmp_path: Path, +) -> None: + with _serve( + { + "/": { + "body": "
Root
", + "content_type": "text/html; charset=utf-8", + "etag": None, + } + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + root_origin = root_url.rstrip("/") + crawler = WebCrawler( + cache_dir=tmp_path / "markdown-docs-cache", + ) + scope = CrawlScope(roots=[root_url], depth=0) + + documents = list(crawler.markdown_documents(scope, cache_force_refresh=True)) + root_requests = [ + request for request in getattr(server, "requests") if request["path"] == "/" + ] + + assert documents == [MarkdownDocument(origin=root_origin, content="Root")] + assert len(root_requests) == 1 + + +def test_web_markdown_documents_reuses_immediately_stale_discovery_cache( + tmp_path: Path, +) -> None: + with _serve( + { + "/": { + "body": "
Root
", + "content_type": "text/html; charset=utf-8", + "etag": None, + } + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + root_origin = root_url.rstrip("/") + crawler = WebCrawler( + cache_dir=tmp_path / "stale-markdown-docs-cache", + cache_stale_after=timedelta(seconds=0), + ) + scope = CrawlScope(roots=[root_url], depth=0) + + documents = list(crawler.markdown_documents(scope, progress=False)) + root_requests = [ + request for request in getattr(server, "requests") if request["path"] == "/" + ] + + assert documents == [MarkdownDocument(origin=root_origin, content="Root")] + assert len(root_requests) == 1 + + +def test_web_crawler_fetches_same_depth_frontier_concurrently(tmp_path: Path) -> None: + root = "https://example.com/docs" + first = "https://example.com/docs/one" + second = "https://example.com/docs/two" + + class _ConcurrentWebSession: + def __init__(self) -> None: + self.requests: list[tuple[str, dict[str, str]]] = [] + self._lock = threading.Lock() + self._barrier = threading.Barrier(2) + self.in_flight = 0 + self.max_in_flight = 0 + + def get( + self, url: str, headers: dict[str, str], timeout: float + ) -> _FakeWebResponse: + del timeout + with self._lock: + self.requests.append((url, headers)) + self.in_flight += 1 + self.max_in_flight = max(self.max_in_flight, self.in_flight) + try: + if url == root: + return _FakeWebResponse( + body=( + f'One' + f'Two' + ), + url=url, + ) + if url in {first, second}: + self._barrier.wait(timeout=1.0) + return _FakeWebResponse( + body="
Child
", + url=url, + ) + raise AssertionError(f"Unexpected url: {url}") + finally: + with self._lock: + self.in_flight -= 1 + + session: Any = _ConcurrentWebSession() + crawler = WebCrawler( + cache_dir=tmp_path / "frontier-cache", + session=session, + max_workers=2, + ) + scope = CrawlScope(roots=[root], depth=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [root, first, second] + assert session.max_in_flight == 2 + + +def test_web_crawler_treats_304_revalidation_as_fresh_cache_hit( + tmp_path: Path, + monkeypatch, +) -> None: + with _serve( + { + "/": { + "body": "
Cached
", + "content_type": "text/html; charset=utf-8", + "etag": "root-v1", + } + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + times = iter( + [ + datetime(2026, 1, 1, tzinfo=timezone.utc), + datetime(2026, 1, 1, 0, 0, 2, tzinfo=timezone.utc), + datetime(2026, 1, 1, 0, 0, 2, 500000, tzinfo=timezone.utc), + ] + ) + monkeypatch.setattr(crawl_module, "_utcnow", lambda: next(times)) + crawler = WebCrawler( + cache_dir=tmp_path / "fresh-cache", + cache_stale_after=timedelta(seconds=1), + ) + + first = crawler.fetch_raw(root_url) + second = crawler.fetch_raw(root_url) + third = crawler.fetch_raw(root_url) + root_requests = [ + request for request in getattr(server, "requests") if request["path"] == "/" + ] + + assert first.body_path == second.body_path == third.body_path + assert second.revalidated_at is not None + assert len(root_requests) == 2 + assert root_requests[1]["headers"]["If-None-Match"] == "root-v1" + + +def test_web_crawler_cache_dir_uses_hashed_file_pair( + tmp_path: Path, +) -> None: + with _serve( + { + "/": { + "body": "
Root
", + "content_type": "text/html; charset=utf-8", + "etag": None, + } + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + root_origin = root_url.rstrip("/") + cache_dir = tmp_path / "cache" + crawler = WebCrawler(cache_dir=cache_dir) + + document = crawler.fetch_markdown(root_url) + + base = _expected_cache_base(root_origin) + metadata_path = cache_dir / f"{base}.metadata.json" + content_path = cache_dir / f"{base}.html" + assert document == MarkdownDocument(origin=root_origin, content="Root") + assert sorted(path.name for path in cache_dir.iterdir()) == [ + content_path.name, + metadata_path.name, + ] + record = json.loads(metadata_path.read_text(encoding="utf-8")) + assert record["key"] == root_origin + assert record["content_path"] == content_path.name + assert record["metadata"]["content_type"] == "text/html; charset=utf-8" + assert record["metadata"]["origin"] == root_origin + + +def test_web_crawler_rejects_cache_metadata_content_path_outside_cache( + tmp_path: Path, +) -> None: + origin = "https://example.com/poison" + cache_dir = tmp_path / "cache" + cache_dir.mkdir() + outside = tmp_path / "outside.html" + outside.write_text("Poison", encoding="utf-8") + base = _expected_cache_base(origin) + metadata_path = cache_dir / f"{base}.metadata.json" + metadata_path.write_text( + json.dumps( + { + "key": origin, + "content_path": "../outside.html", + "metadata": { + "origin": origin, + "resolved_origin": origin, + "content_type": "text/html", + "status_code": 200, + "etag": None, + "last_modified": None, + "type_label": "html", + "fetched_at": "2026-01-01T00:00:00+00:00", + "revalidated_at": None, + }, + } + ), + encoding="utf-8", + ) + session: Any = _FakeWebSession( + { + origin: { + "body": "
Fresh
", + } + } + ) + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + source = crawler.fetch_raw(origin) + + assert source.body_path.parent == cache_dir + assert source.body_path != outside + assert session.requests == [(origin, {})] + + +def test_web_crawler_rejects_cache_metadata_with_mismatched_key( + tmp_path: Path, +) -> None: + origin = "https://example.com/requested" + stale_origin = "https://example.com/stale" + cache_dir = tmp_path / "cache" + cache_dir.mkdir() + base = _expected_cache_base(origin) + content_path = cache_dir / f"{base}.html" + content_path.write_text("Stale", encoding="utf-8") + metadata_path = cache_dir / f"{base}.metadata.json" + metadata_path.write_text( + json.dumps( + { + "key": stale_origin, + "content_path": content_path.name, + "metadata": { + "origin": stale_origin, + "resolved_origin": stale_origin, + "content_type": "text/html", + "status_code": 200, + "etag": None, + "last_modified": None, + "type_label": "html", + "fetched_at": "2026-01-01T00:00:00+00:00", + "revalidated_at": None, + }, + } + ), + encoding="utf-8", + ) + session: Any = _FakeWebSession( + { + origin: { + "body": "
Fresh
", + } + } + ) + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + source = crawler.fetch_raw(origin) + + assert source.origin == origin + assert source.body_path.read_text(encoding="utf-8") != ( + "Stale" + ) + assert session.requests == [(origin, {})] + + +def test_web_crawler_cache_dir_true_uses_default_backend_directory( + tmp_path: Path, + monkeypatch, +) -> None: + monkeypatch.chdir(tmp_path) + with _serve( + { + "/": { + "body": "
Root
", + "content_type": "text/html; charset=utf-8", + "etag": None, + } + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + root_origin = root_url.rstrip("/") + crawler = WebCrawler(cache_dir=True) + + crawler.fetch_markdown(root_url) + + cache_dir = tmp_path / ".raghilda" / "cache" / "web" + base = _expected_cache_base(root_origin) + assert sorted(path.name for path in cache_dir.iterdir()) == [ + f"{base}.html", + f"{base}.metadata.json", + ] + + +def test_web_crawler_relative_cache_dir_is_anchored_at_construction( + tmp_path: Path, + monkeypatch, +) -> None: + monkeypatch.chdir(tmp_path) + origin = "https://example.com/page" + session: Any = _FakeWebSession( + { + origin: { + "body": "
Page
", + } + } + ) + crawler = WebCrawler(cache_dir="cache", session=session) + other_cwd = tmp_path / "other" + other_cwd.mkdir() + monkeypatch.chdir(other_cwd) + + source = crawler.fetch_raw(origin) + + assert source.body_path.parent == tmp_path / "cache" + + +def test_web_crawler_scopes_fresh_cache_hits_to_custom_session( + tmp_path: Path, +) -> None: + origin = "https://example.com/private" + cache_dir = tmp_path / "session-cache" + first_session: Any = _FakeWebSession( + { + origin: { + "body": "
First
", + }, + } + ) + second_session: Any = _FakeWebSession( + { + origin: { + "body": "
Second
", + }, + } + ) + first_crawler = WebCrawler(cache_dir=cache_dir, session=first_session) + second_crawler = WebCrawler(cache_dir=cache_dir, session=second_session) + + first = first_crawler.fetch_raw(origin) + first_body = first.body_path.read_text(encoding="utf-8") + second = second_crawler.fetch_raw(origin) + + assert "First" in first_body + assert "Second" in second.body_path.read_text(encoding="utf-8") + assert second_session.requests == [(origin, {})] + + +def test_web_crawler_disambiguates_colliding_sanitized_cache_prefixes( + tmp_path: Path, +) -> None: + first_origin = "https://example.com/docs/page" + second_origin = "https://example.com/docs:page" + third_origin = "https://example.com/docs?page" + session: Any = _FakeWebSession( + { + first_origin: {"body": "
One
"}, + second_origin: {"body": "
Two
"}, + third_origin: {"body": "
Three
"}, + } + ) + cache_dir = tmp_path / "collision-cache" + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + crawler.fetch_raw(first_origin) + crawler.fetch_raw(second_origin) + crawler.fetch_raw(third_origin) + + first_base = _expected_cache_base(first_origin) + second_base = _expected_cache_base(second_origin) + third_base = _expected_cache_base(third_origin) + cached_names = {path.name for path in cache_dir.iterdir()} + assert { + f"{first_base}.html", + f"{first_base}.metadata.json", + f"{second_base}.html", + f"{second_base}.metadata.json", + f"{third_base}.html", + f"{third_base}.metadata.json", + }.issubset(cached_names) + assert len(cached_names) == 6 + + second_session: Any = _FakeWebSession( + { + first_origin: {"body": "
One
"}, + second_origin: {"body": "
Two
"}, + third_origin: {"body": "
Three
"}, + } + ) + second_crawler = WebCrawler(cache_dir=cache_dir, session=second_session) + + assert second_crawler.fetch_raw(first_origin).body_path.exists() + assert second_crawler.fetch_raw(second_origin).body_path.exists() + assert second_crawler.fetch_raw(third_origin).body_path.exists() + assert second_session.requests == [ + (first_origin, {}), + (second_origin, {}), + (third_origin, {}), + ] + + +def test_web_crawler_refresh_deletes_only_exact_cache_base(tmp_path: Path) -> None: + first_origin = "https://example.com" + first_base = _expected_cache_base(first_origin) + second_origin = f"https://example.com--{first_base.rsplit('--', 1)[1]}.child" + session: Any = _FakeWebSession( + { + first_origin: {"body": "
One
"}, + second_origin: {"body": "
Two
"}, + } + ) + crawler = WebCrawler(cache_dir=tmp_path / "exact-delete-cache", session=session) + + crawler.fetch_raw(first_origin) + crawler.fetch_raw(second_origin) + crawler.fetch_raw(first_origin, cache_force_refresh=True) + session.requests.clear() + crawler.fetch_raw(second_origin) + + assert session.requests == [] + + +def test_web_crawler_refresh_replaces_cached_body_atomically( + tmp_path: Path, + monkeypatch, +) -> None: + origin = "https://example.com" + session: Any = _FakeWebSession( + { + origin: { + "body": "
First
", + }, + } + ) + crawler = WebCrawler(cache_dir=tmp_path / "atomic-cache", session=session) + first = crawler.fetch_raw(origin) + session.routes[origin]["body"] = "
Second
" + replacements: list[tuple[Path, Path]] = [] + replace = crawl_module.os.replace + + def track_replace(src: str | Path, dst: str | Path) -> None: + replacements.append((Path(src), Path(dst))) + replace(src, dst) + + monkeypatch.setattr(crawl_module.os, "replace", track_replace) + + second = crawler.fetch_raw(origin, cache_force_refresh=True) + + assert first.body_path == second.body_path + assert second.body_path.read_text(encoding="utf-8") == ( + "
Second
" + ) + assert replacements[-1][1] == second.body_path + + +def test_web_crawler_cache_writes_for_different_keys_do_not_contend( + tmp_path: Path, + monkeypatch, +) -> None: + first_origin = "https://example.com/docs/one" + second_origin = "https://example.com/docs/two" + session: Any = _FakeWebSession( + { + first_origin: {"body": "
One
"}, + second_origin: {"body": "
Two
"}, + } + ) + cache_dir = tmp_path / "concurrency-cache" + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + first_content_path = cache_dir / f"{_expected_cache_base(first_origin)}.html" + second_content_path = cache_dir / f"{_expected_cache_base(second_origin)}.html" + first_started = threading.Event() + release_first = threading.Event() + second_finished = threading.Event() + errors: list[BaseException] = [] + original_write_content = crawl_module._FilesystemCrawlerCache._write_content + + def blocking_write_content( + self, + path: Path, + content: bytes | str | Path, + ) -> None: + if path == first_content_path and not first_started.is_set(): + first_started.set() + assert release_first.wait(timeout=2.0) + original_write_content(self, path, content) + if path == second_content_path: + second_finished.set() + + monkeypatch.setattr( + crawl_module._FilesystemCrawlerCache, + "_write_content", + blocking_write_content, + ) + + def fetch(origin: str) -> None: + try: + crawler.fetch_raw(origin) + except BaseException as exc: + errors.append(exc) + + first_thread = threading.Thread(target=fetch, args=(first_origin,)) + second_thread = threading.Thread(target=fetch, args=(second_origin,)) + first_thread.start() + assert first_started.wait(timeout=1.0) + second_thread.start() + try: + assert second_finished.wait(timeout=1.0) + finally: + release_first.set() + first_thread.join(timeout=1.0) + second_thread.join(timeout=1.0) + + assert errors == [] + + +def test_web_crawler_uses_magika_when_no_explicit_ext_is_available( + tmp_path: Path, +) -> None: + origin = "https://example.com/download" + session: Any = _FakeWebSession( + { + origin: { + "body": "
Download
", + "content_type": "application/octet-stream", + } + } + ) + cache_dir = tmp_path / "magika-cache" + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + source = crawler.fetch_raw(origin) + + base = _expected_cache_base(origin) + assert source.body_path == cache_dir / f"{base}.html" + + +def test_web_crawler_type_filters_use_sniffed_cache_extension( + tmp_path: Path, + monkeypatch, +) -> None: + class _FakeMagikaOutput: + label = "html" + extensions = ["html"] + + class _FakeMagikaResult: + output = _FakeMagikaOutput() + + class _FakeMagika: + def identify_bytes(self, content: bytes) -> _FakeMagikaResult: + assert content.startswith(b"") + return _FakeMagikaResult() + + def identify_path(self, path: Path) -> _FakeMagikaResult: + assert path.suffix == ".html" + return _FakeMagikaResult() + + origin = "https://example.com/download" + session: Any = _FakeWebSession( + { + origin: { + "body": "
Download
", + "content_type": "application/octet-stream", + } + } + ) + monkeypatch.setattr(crawl_module, "_MAGIKA", _FakeMagika()) + crawler = WebCrawler(cache_dir=tmp_path / "sniffed-type-cache", session=session) + scope = CrawlScope(roots=[origin], depth=0, include_types=["html"]) + + origins = list(crawler.origins(scope, progress=False)) + source = crawler.fetch_raw(origin) + + assert origins == [origin] + assert source.metadata == { + "etag": None, + "last_modified": None, + "type_label": "html", + } + + +def test_web_crawler_prefers_content_type_over_misleading_url_suffix( + tmp_path: Path, +) -> None: + origin = "https://example.com/README.md" + session: Any = _FakeWebSession( + { + origin: { + "body": "
Rendered Readme
", + "content_type": "text/html; charset=utf-8", + } + } + ) + cache_dir = tmp_path / "content-type-cache" + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + source = crawler.fetch_raw(origin) + document = crawler.fetch_markdown(origin) + + base = _expected_cache_base(origin) + assert source.body_path == cache_dir / f"{base}.html" + assert document == MarkdownDocument(origin=origin, content="Rendered Readme") + + +def test_web_crawler_prefers_text_content_type_over_url_suffix( + tmp_path: Path, +) -> None: + origin = "https://example.com/plain.html" + session: Any = _FakeWebSession( + { + origin: { + "body": "plain text", + "content_type": "text/plain; charset=utf-8", + } + } + ) + cache_dir = tmp_path / "text-content-type-cache" + crawler = WebCrawler(cache_dir=cache_dir, session=session) + scope = CrawlScope(roots=[origin], depth=0, include_types=["text"]) + + origins = list(crawler.origins(scope, progress=False)) + source = crawler.fetch_raw(origin) + + assert origins == [origin] + assert source.body_path == cache_dir / f"{_expected_cache_base(origin)}.txt" + assert (source.metadata or {})["type_label"] == "text" + + +def test_web_crawler_preserves_reserved_escapes_in_requested_origin( + tmp_path: Path, +) -> None: + origin = "https://example.com/a%2Fb" + session: Any = _FakeWebSession( + { + origin: { + "body": "
Escaped
", + } + } + ) + cache_dir = tmp_path / "escaped-cache" + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + source = crawler.fetch_raw(origin) + + assert session.requests == [(origin, {})] + assert source.origin == origin + assert source.body_path == cache_dir / f"{_expected_cache_base(origin)}.html" + + +def test_web_crawler_falls_back_to_raw_when_magika_is_unavailable( + tmp_path: Path, + monkeypatch, +) -> None: + origin = "https://example.com/download" + session: Any = _FakeWebSession( + { + origin: { + "body": "opaque payload", + "content_type": "application/octet-stream", + } + } + ) + cache_dir = tmp_path / "raw-cache" + monkeypatch.setattr(crawl_module, "_MAGIKA", None) + crawler = WebCrawler(cache_dir=cache_dir, session=session) + + source = crawler.fetch_raw(origin) + + base = _expected_cache_base(origin) + assert source.body_path == cache_dir / f"{base}.raw" + + +class _CloudflareResponse: + def __init__(self, payload: dict[str, Any]): + self.payload = payload + self.status_code = 200 + + def json(self) -> dict[str, Any]: + return self.payload + + def raise_for_status(self) -> None: + return + + +class _CloudflareSession: + def __init__(self) -> None: + self.post_calls: list[tuple[str, dict[str, Any], dict[str, str]]] = [] + self.get_calls: list[tuple[str, dict[str, Any] | None]] = [] + self._poll_count = 0 + + def post( + self, + url: str, + *, + json: dict[str, Any], + headers: dict[str, str], + timeout: float, + ) -> _CloudflareResponse: + self.post_calls.append((url, json, headers)) + return _CloudflareResponse({"success": True, "result": "job-123"}) + + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + self.get_calls.append((url, params)) + self._poll_count += 1 + if self._poll_count == 1: + return _CloudflareResponse( + {"success": True, "result": {"id": "job-123", "status": "running"}} + ) + if params == {"limit": 1}: + return 
_CloudflareResponse( + {"success": True, "result": {"id": "job-123", "status": "completed"}} + ) + return _CloudflareResponse( + { + "success": True, + "result": { + "id": "job-123", + "status": "completed", + "records": [ + { + "url": "https://example.com/docs", + "status": "completed", + "markdown": "# Docs\n", + "metadata": { + "status": 200, + "title": "Docs", + "url": "https://example.com/docs", + }, + }, + { + "url": "https://example.com/docs/page", + "status": "completed", + "markdown": "## Page\n", + "metadata": { + "status": 200, + "title": "Page", + "url": "https://example.com/docs/page", + }, + }, + ], + }, + } + ) + + +class _ParameterizedCloudflareSession: + def __init__(self) -> None: + self.post_calls: list[tuple[str, dict[str, Any], dict[str, str]]] = [] + self.get_calls: list[tuple[str, dict[str, Any] | None]] = [] + self._jobs: dict[str, dict[str, Any]] = {} + + def post( + self, + url: str, + *, + json: dict[str, Any], + headers: dict[str, str], + timeout: float, + ) -> _CloudflareResponse: + del timeout + job_id = f"job-{len(self.post_calls) + 1}" + self.post_calls.append((url, json, headers)) + self._jobs[job_id] = json + return _CloudflareResponse({"success": True, "result": job_id}) + + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + del headers, timeout + self.get_calls.append((url, params)) + job_id = url.rsplit("/", 1)[-1] + payload = self._jobs[job_id] + if params == {"limit": 1}: + return _CloudflareResponse( + {"success": True, "result": {"id": job_id, "status": "completed"}} + ) + records = [ + { + "url": payload["url"], + "status": "completed", + "markdown": "# Docs\n", + "metadata": { + "status": 200, + "title": "Docs", + "url": payload["url"], + }, + } + ] + if payload["depth"] > 0: + records.append( + { + "url": f"{payload['url']}/page", + "status": "completed", + "markdown": "## Page\n", + "metadata": { + "status": 200, + "title": 
"Page", + "url": f"{payload['url']}/page", + }, + } + ) + return _CloudflareResponse( + { + "success": True, + "result": { + "id": job_id, + "status": "completed", + "records": records, + }, + } + ) + + +class _DiscoveryFilteringCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + response = super().get(url, headers=headers, params=params, timeout=timeout) + if params == {"limit": 1}: + return response + + payload = self._jobs[url.rsplit("/", 1)[-1]] + include_patterns = payload["options"].get("includePatterns", []) + exclude_patterns = payload["options"].get("excludePatterns", []) + records = response.json()["result"]["records"] + filtered_records = [ + record + for record in records + if ( + ( + not include_patterns + or any( + fnmatch.fnmatchcase(record["url"], pattern) + for pattern in include_patterns + ) + ) + and not any( + fnmatch.fnmatchcase(record["url"], pattern) + for pattern in exclude_patterns + ) + ) + ] + return _CloudflareResponse( + { + "success": True, + "result": { + "id": url.rsplit("/", 1)[-1], + "status": "completed", + "records": filtered_records, + }, + } + ) + + +class _OverlappingLimitedCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + del headers, timeout + self.get_calls.append((url, params)) + job_id = url.rsplit("/", 1)[-1] + payload = self._jobs[job_id] + if params == {"limit": 1}: + return _CloudflareResponse( + {"success": True, "result": {"id": job_id, "status": "completed"}} + ) + records = [ + { + "url": "https://example.com/shared", + "status": "completed", + "markdown": "# Shared\n", + "metadata": { + "status": 200, + "title": "Shared", + "url": "https://example.com/shared", + }, + } + ] + if payload["url"] == 
"https://example.com/root-b": + records.append( + { + "url": "https://example.com/root-b/unique", + "status": "completed", + "markdown": "# Unique\n", + "metadata": { + "status": 200, + "title": "Unique", + "url": "https://example.com/root-b/unique", + }, + } + ) + if "limit" in payload: + records = records[: payload["limit"]] + return _CloudflareResponse( + { + "success": True, + "result": { + "id": job_id, + "status": "completed", + "records": records, + }, + } + ) + + +class _TrailingSlashCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + response = super().get(url, headers=headers, params=params, timeout=timeout) + if params == {"limit": 1}: + return response + payload = self._jobs[url.rsplit("/", 1)[-1]] + records = response.json()["result"]["records"] + records[0]["url"] = f"{payload['url'].rstrip('/')}/" + records[0]["metadata"]["url"] = records[0]["url"] + return response + + +class _OutOfScopeCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + response = super().get(url, headers=headers, params=params, timeout=timeout) + if params == {"limit": 1}: + return response + job_id = url.rsplit("/", 1)[-1] + payload = self._jobs[job_id] + root = payload["url"] + records = [ + { + "url": root, + "status": "completed", + "markdown": "# Root\n", + "metadata": { + "status": 200, + "title": "Root", + "url": root, + }, + }, + { + "url": "https://example.com/page", + "status": "completed", + "markdown": "# Page\n", + "metadata": { + "status": 200, + "title": "Page", + "url": "https://example.com/page", + }, + }, + { + "url": "https://docs.example.com/page", + "status": "completed", + "markdown": "# Subdomain\n", + "metadata": { + "status": 200, + "title": "Subdomain", + 
"url": "https://docs.example.com/page", + }, + }, + { + "url": "https://external.test/page", + "status": "completed", + "markdown": "# External\n", + "metadata": { + "status": 200, + "title": "External", + "url": "https://external.test/page", + }, + }, + ] + return _CloudflareResponse( + { + "success": True, + "result": { + "id": job_id, + "status": "completed", + "records": records, + }, + } + ) + + +class _ExternalFirstCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + response = super().get(url, headers=headers, params=params, timeout=timeout) + if params == {"limit": 1}: + return response + job_id = url.rsplit("/", 1)[-1] + root = self._jobs[job_id]["url"] + return _CloudflareResponse( + { + "success": True, + "result": { + "id": job_id, + "status": "completed", + "records": [ + { + "url": "https://external.test/page", + "status": "completed", + "markdown": "# External\n", + "metadata": { + "status": 200, + "title": "External", + "url": "https://external.test/page", + }, + }, + { + "url": root, + "status": "completed", + "markdown": "# Root\n", + "metadata": { + "status": 200, + "title": "Root", + "url": root, + }, + }, + ], + }, + } + ) + + +class _RedirectCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + response = super().get(url, headers=headers, params=params, timeout=timeout) + if params == {"limit": 1}: + return response + job_id = url.rsplit("/", 1)[-1] + root = self._jobs[job_id]["url"] + final_url = f"{root.rstrip('/')}/landing" + return _CloudflareResponse( + { + "success": True, + "result": { + "id": job_id, + "status": "completed", + "records": [ + { + "url": final_url, + "status": "completed", + "markdown": "# Landing\n", + "metadata": { + 
"status": 200, + "title": "Landing", + "url": final_url, + }, + } + ], + }, + } + ) + + +class _CrossOriginRedirectCloudflareSession(_ParameterizedCloudflareSession): + def get( + self, + url: str, + *, + headers: dict[str, str], + params: dict[str, Any] | None = None, + timeout: float, + ) -> _CloudflareResponse: + response = super().get(url, headers=headers, params=params, timeout=timeout) + if params == {"limit": 1}: + return response + job_id = url.rsplit("/", 1)[-1] + final_url = "https://example.com/landing" + return _CloudflareResponse( + { + "success": True, + "result": { + "id": job_id, + "status": "completed", + "records": [ + { + "url": final_url, + "status": "completed", + "markdown": "# Landing\n", + "metadata": { + "status": 200, + "title": "Landing", + "url": final_url, + }, + } + ], + }, + } + ) + + +def test_cloudflare_crawler_polls_job_and_uses_markdown_records( + tmp_path: Path, +) -> None: + session = _CloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cache", + render=False, + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["https://example.com/docs"], + depth=2, + limit=25, + include_patterns=["https://example.com/docs/**"], + exclude_patterns=["https://example.com/docs/archive/**"], + include_external_links=True, + include_subdomains=True, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [ + "https://example.com/docs", + "https://example.com/docs/page", + ] + assert len(session.post_calls) == 1 + post_url, payload, headers = session.post_calls[0] + assert post_url.endswith("/accounts/account-123/browser-rendering/crawl") + assert headers["Authorization"] == "Bearer token-123" + assert payload["formats"] == ["markdown"] + assert payload["depth"] == 2 + assert payload["limit"] == 25 + assert payload["render"] is False + assert payload["options"]["includePatterns"] == ["https://example.com/docs/**"] + 
assert payload["options"]["excludePatterns"] == [ + "https://example.com/docs/archive/**" + ] + assert payload["options"]["includeExternalLinks"] is True + assert payload["options"]["includeSubdomains"] is True + + page_source = crawler.fetch_raw("https://example.com/docs/page") + assert page_source.status_code == 200 + assert page_source.markdown_path is not None + assert page_source.markdown_path.read_text(encoding="utf-8") == "## Page\n" + + page_doc = crawler.fetch_markdown("https://example.com/docs/page") + assert page_doc == MarkdownDocument( + origin="https://example.com/docs/page", + content="## Page\n", + ) + assert len(session.post_calls) == 1 + + +def test_cloudflare_markdown_documents_reuses_immediately_stale_discovery_cache( + tmp_path: Path, +) -> None: + session = _ParameterizedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-stale-cache", + session=session, + cache_stale_after=timedelta(seconds=0), + poll_interval=0, + ) + scope = CrawlScope(roots=["https://example.com"], depth=0) + + documents = list(crawler.markdown_documents(scope, progress=False)) + + assert documents == [ + MarkdownDocument(origin="https://example.com", content="# Docs\n") + ] + assert len(session.post_calls) == 1 + + +def test_cloudflare_crawler_accepts_crawl_scope_for_roots_and_patterns( + tmp_path: Path, +) -> None: + session = _ParameterizedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-scope-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["https://example.com/docs"], + depth=1, + include_patterns=["https://example.com/docs/**"], + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [ + "https://example.com/docs", + "https://example.com/docs/page", + ] + assert session.post_calls[0][1]["depth"] == 1 + assert 
session.post_calls[0][1]["options"]["includePatterns"] == [ + "https://example.com/docs/**" + ] + + +def test_cloudflare_crawler_filters_returned_records_to_web_scope( + tmp_path: Path, +) -> None: + session = _OutOfScopeCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-returned-scope-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["https://example.com/root"], + depth=1, + include_external_links=False, + include_subdomains=False, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [ + "https://example.com/root", + "https://example.com/page", + ] + + +def test_cloudflare_crawler_does_not_treat_external_first_record_as_seed( + tmp_path: Path, +) -> None: + session = _ExternalFirstCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-external-first-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["https://example.com/root"], + depth=1, + limit=1, + include_external_links=False, + include_subdomains=False, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == ["https://example.com/root"] + + +def test_cloudflare_markdown_documents_keeps_cross_origin_redirected_seed( + tmp_path: Path, +) -> None: + session = _CrossOriginRedirectCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cross-origin-seed-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["http://example.com"], + depth=0, + include_external_links=False, + include_subdomains=False, + ) + + documents = list(crawler.markdown_documents(scope, progress=False)) + + assert documents == [ + MarkdownDocument( + origin="https://example.com/landing", + content="# Landing\n", + ) + ] + + +def 
test_cloudflare_crawler_cache_key_includes_crawl_parameters( + tmp_path: Path, +) -> None: + session = _ParameterizedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["https://example.com/docs"], + depth=2, + limit=25, + ) + + source = crawler.fetch_raw("https://example.com/docs") + origins = list(crawler.origins(scope, progress=False)) + + assert source.origin == "https://example.com/docs" + assert origins == [ + "https://example.com/docs", + "https://example.com/docs/page", + ] + assert len(session.post_calls) == 2 + assert session.post_calls[0][1]["depth"] == 0 + assert session.post_calls[0][1]["limit"] == 1 + assert session.post_calls[1][1]["depth"] == 2 + assert session.post_calls[1][1]["limit"] == 25 + + +def test_cloudflare_crawler_rechecks_stale_in_memory_records( + tmp_path: Path, + monkeypatch, +) -> None: + session = _ParameterizedCloudflareSession() + times = iter( + [ + datetime(2026, 1, 1, tzinfo=timezone.utc), + datetime(2026, 1, 1, 0, 0, 2, tzinfo=timezone.utc), + datetime(2026, 1, 1, 0, 0, 2, tzinfo=timezone.utc), + datetime(2026, 1, 1, 0, 0, 2, tzinfo=timezone.utc), + ] + ) + monkeypatch.setattr(crawl_module, "_utcnow", lambda: next(times)) + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cache", + cache_stale_after=timedelta(seconds=1), + session=session, + poll_interval=0, + ) + scope = CrawlScope(roots=["https://example.com/docs"], depth=2) + + origins = list(crawler.origins(scope, progress=False)) + page_source = crawler.fetch_raw("https://example.com/docs/page") + + assert origins == [ + "https://example.com/docs", + "https://example.com/docs/page", + ] + assert page_source.origin == "https://example.com/docs/page" + assert len(session.post_calls) == 2 + assert session.post_calls[1][1]["url"] == 
"https://example.com/docs/page" + assert session.post_calls[1][1]["depth"] == 0 + assert session.post_calls[1][1]["limit"] == 1 + + +def test_cloudflare_fetch_raw_ignores_discovery_patterns_for_explicit_origin( + tmp_path: Path, +) -> None: + session = _DiscoveryFilteringCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cache", + session=session, + poll_interval=0, + ) + + source = crawler.fetch_raw("https://example.com/docs") + + assert source.origin == "https://example.com/docs" + assert source.status_code == 200 + assert "includePatterns" not in session.post_calls[0][1]["options"] + + +def test_cloudflare_fetch_raw_accepts_redirected_record( + tmp_path: Path, +) -> None: + cache = tmp_path / "cloudflare-redirect-cache" + session = _RedirectCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=session, + poll_interval=0, + ) + + source = crawler.fetch_raw("https://example.com") + + assert source.origin == "https://example.com" + assert source.resolved_origin == "https://example.com/landing" + assert source.body_path.read_text(encoding="utf-8") == "# Landing\n" + assert len(session.post_calls) == 1 + + cached_session = _RedirectCloudflareSession() + cached_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=cached_session, + poll_interval=0, + ) + cached_source = cached_crawler.fetch_raw("https://example.com") + + assert cached_source.resolved_origin == "https://example.com/landing" + assert cached_session.post_calls == [] + + +def test_cloudflare_fetch_raw_accepts_cross_origin_redirected_record( + tmp_path: Path, +) -> None: + session = _CrossOriginRedirectCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cross-origin-redirect-cache", + session=session, 
+ poll_interval=0, + ) + + source = crawler.fetch_raw("http://example.com") + + assert source.origin == "http://example.com" + assert source.resolved_origin == "https://example.com/landing" + assert source.body_path.read_text(encoding="utf-8") == "# Landing\n" + + +def test_cloudflare_fetch_raw_reuses_cache_directory_across_instances( + tmp_path: Path, +) -> None: + cache = tmp_path / "cloudflare-cache" + first_session = _ParameterizedCloudflareSession() + first_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=first_session, + poll_interval=0, + ) + + first = first_crawler.fetch_raw("https://example.com/docs") + + second_session = _ParameterizedCloudflareSession() + second_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=second_session, + poll_interval=0, + ) + + second = second_crawler.fetch_raw("https://example.com/docs") + + assert first.body_path.read_text(encoding="utf-8") == "# Docs\n" + assert second.body_path.read_text(encoding="utf-8") == "# Docs\n" + assert second.status_code == 200 + assert len(first_session.post_calls) == 1 + assert second_session.post_calls == [] + + +def test_cloudflare_origins_reuses_root_cache_directory_across_instances( + tmp_path: Path, +) -> None: + cache = tmp_path / "cloudflare-cache" + scope = CrawlScope(roots=["https://example.com/docs"], depth=1) + first_session = _ParameterizedCloudflareSession() + first_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=first_session, + poll_interval=0, + ) + + first_origins = list(first_crawler.origins(scope, progress=False)) + + second_session = _ParameterizedCloudflareSession() + second_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=second_session, + poll_interval=0, + ) + + second_origins = list(second_crawler.origins(scope, progress=False)) 
+ page_source = second_crawler.fetch_raw("https://example.com/docs/page") + + assert first_origins == [ + "https://example.com/docs", + "https://example.com/docs/page", + ] + assert second_origins == first_origins + assert page_source.origin == "https://example.com/docs/page" + assert len(first_session.post_calls) == 1 + assert second_session.post_calls == [] + + +def test_cloudflare_markdown_documents_canonicalizes_record_urls( + tmp_path: Path, +) -> None: + session = _TrailingSlashCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-record-url-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope(roots=["https://example.com"], depth=0) + + documents = list(crawler.markdown_documents(scope, progress=False)) + + assert documents == [ + MarkdownDocument(origin="https://example.com", content="# Docs\n") + ] + + +def test_cloudflare_crawler_cache_dir_uses_hashed_file_pair( + tmp_path: Path, +) -> None: + session = _ParameterizedCloudflareSession() + cache_dir = tmp_path / "cloudflare-cache" + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache_dir, + session=session, + poll_interval=0, + ) + + source = crawler.fetch_raw("https://example.com/docs") + + base = _expected_cache_base(source.origin) + metadata_path = cache_dir / f"{base}.metadata.json" + content_path = cache_dir / f"{base}.md" + assert source.body_path.read_text(encoding="utf-8") == "# Docs\n" + assert sorted(path.name for path in cache_dir.iterdir()) == [ + content_path.name, + metadata_path.name, + ] + record = json.loads(metadata_path.read_text(encoding="utf-8")) + assert record["key"] == source.origin + assert record["content_path"] == content_path.name + assert record["metadata"]["record"]["url"] == source.origin + + +def test_cloudflare_crawler_cache_dir_true_uses_default_backend_directory( + tmp_path: Path, + monkeypatch, +) -> None: + 
monkeypatch.chdir(tmp_path) + session = _ParameterizedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=True, + session=session, + poll_interval=0, + ) + + source = crawler.fetch_raw("https://example.com/docs") + + cache_dir = tmp_path / ".raghilda" / "cache" / "cloudflare" + base = _expected_cache_base(source.origin) + assert sorted(path.name for path in cache_dir.iterdir()) == [ + f"{base}.md", + f"{base}.metadata.json", + ] + + +def test_cloudflare_fetch_raw_scopes_cache_to_account_id( + tmp_path: Path, +) -> None: + cache = tmp_path / "cloudflare-cache" + first_session = _ParameterizedCloudflareSession() + first_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=first_session, + poll_interval=0, + ) + first_crawler.fetch_raw("https://example.com/docs") + + second_session = _ParameterizedCloudflareSession() + second_crawler = CloudflareCrawler( + account_id="account-456", + api_token="token-123", + cache_dir=cache, + session=second_session, + poll_interval=0, + ) + + second_crawler.fetch_raw("https://example.com/docs") + + assert len(first_session.post_calls) == 1 + assert len(second_session.post_calls) == 1 + assert "/accounts/account-456/" in second_session.post_calls[0][0] + + +def test_cloudflare_fetch_raw_scopes_cache_to_api_base( + tmp_path: Path, +) -> None: + cache = tmp_path / "cloudflare-cache" + first_session = _ParameterizedCloudflareSession() + first_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=first_session, + poll_interval=0, + base_url="https://prod.example/api", + ) + first_crawler.fetch_raw("https://example.com/docs") + + second_session = _ParameterizedCloudflareSession() + second_crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=cache, + session=second_session, + poll_interval=0, + 
base_url="https://staging.example/api", + ) + + second_crawler.fetch_raw("https://example.com/docs") + + assert len(first_session.post_calls) == 1 + assert len(second_session.post_calls) == 1 + assert second_session.post_calls[0][0].startswith( + "https://staging.example/api/accounts/account-123/" + ) + + +def test_cloudflare_crawler_applies_limit_across_all_roots( + tmp_path: Path, +) -> None: + session = _ParameterizedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=["https://example.com/docs-a", "https://example.com/docs-b"], + limit=1, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == ["https://example.com/docs-a"] + assert len(session.post_calls) == 1 + + +def test_cloudflare_crawler_deduplicates_roots_before_counting_limit( + tmp_path: Path, +) -> None: + session = _ParameterizedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-dedupe-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=[ + "https://example.com/docs-a", + "https://example.com/docs-a", + "https://example.com/docs-b", + ], + depth=0, + limit=2, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [ + "https://example.com/docs-a", + "https://example.com/docs-b", + ] + assert [call[1]["url"] for call in session.post_calls] == [ + "https://example.com/docs-a", + "https://example.com/docs-b", + ] + + +def test_cloudflare_crawler_applies_limit_after_deduplication( + tmp_path: Path, +) -> None: + session = _OverlappingLimitedCloudflareSession() + crawler = CloudflareCrawler( + account_id="account-123", + api_token="token-123", + cache_dir=tmp_path / "cloudflare-overlap-cache", + session=session, + poll_interval=0, + ) + scope = CrawlScope( + roots=[ + 
"https://example.com/root-a", + "https://example.com/root-b", + ], + limit=2, + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [ + "https://example.com/shared", + "https://example.com/root-b/unique", + ] + assert "limit" not in session.post_calls[1][1] + + +def test_directory_crawler_counts_file_roots_toward_limit(tmp_path: Path) -> None: + first = _write(tmp_path, "a.md", "# First") + second = _write(tmp_path, "b.md", "# Second") + crawler = DirectoryCrawler() + scope = CrawlScope(roots=[first, second], limit=1) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [first.resolve().as_uri()] + + +def test_directory_crawler_deduplicates_roots_before_counting_limit( + tmp_path: Path, +) -> None: + docs = tmp_path / "docs" + first = _write(docs, "a.md", "# First") + second = _write(docs, "b.md", "# Second") + crawler = DirectoryCrawler() + scope = CrawlScope(roots=[first, docs], limit=2) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [ + first.resolve().as_uri(), + second.resolve().as_uri(), + ] + + +def test_directory_crawler_applies_limit_without_prewalking_tree( + tmp_path: Path, + monkeypatch, +) -> None: + first = _write(tmp_path, "a.md", "# First") + _write(tmp_path, "z/b.md", "# Second") + crawler = DirectoryCrawler() + + def fail_rglob(self: Path, pattern: str): + del self, pattern + raise AssertionError("DirectoryCrawler should not prewalk with rglob") + + monkeypatch.setattr(Path, "rglob", fail_rglob) + + origins = list( + crawler.origins(CrawlScope(roots=[tmp_path], depth=0, limit=1), progress=False) + ) + + assert origins == [first.resolve().as_uri()] + + +def test_directory_crawler_does_not_follow_symlinked_directories_outside_root( + tmp_path: Path, +) -> None: + root = tmp_path / "root" + inside = _write(root, "inside.md", "# Inside") + external_dir = tmp_path / "external" + outside = _write(external_dir, "outside.md", "# Outside") + link = root / "linked" 
+ try: + link.symlink_to(external_dir, target_is_directory=True) + except OSError as exc: + pytest.skip(f"Symlink creation failed: {exc}") + crawler = DirectoryCrawler() + + origins = list(crawler.origins(CrawlScope(roots=[root], depth=2), progress=False)) + + assert origins == [inside.resolve().as_uri()] + assert outside.resolve().as_uri() not in origins + + +def test_directory_crawler_skips_type_sniffing_without_type_filters( + tmp_path: Path, + monkeypatch, +) -> None: + document = _write(tmp_path, "extensionless", "# Document") + + class _FailingMagika: + def identify_path(self, path: Path): + raise AssertionError(f"Unexpected type sniff for {path}") + + monkeypatch.setattr(crawl_module, "_MAGIKA", _FailingMagika()) + crawler = DirectoryCrawler() + + origins = list(crawler.origins(CrawlScope(roots=[tmp_path]), progress=False)) + + assert origins == [document.resolve().as_uri()] + + +def test_directory_crawler_coerces_scalar_patterns_and_types( + tmp_path: Path, +) -> None: + docs = tmp_path / "docs" + readme = _write(docs, "readme.md", "# Readme") + _write(docs, "skip.py", "print('skip')") + _write(tmp_path, "notes.md", "# Notes") + crawler = DirectoryCrawler() + scope = CrawlScope( + roots=[tmp_path], + include_patterns=r".*/docs/.*", + include_types="markdown", + exclude_types="python", + ) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [readme.resolve().as_uri()] + + +def test_directory_crawler_accepts_crawl_scope_for_roots_and_patterns( + tmp_path: Path, +) -> None: + docs = _write(tmp_path, "docs/readme.md", "# Hello") + _write(tmp_path, "notes/todo.md", "# Skip") + crawler = DirectoryCrawler() + scope = CrawlScope( + roots=[tmp_path], + depth=1, + include_patterns=[r".*/docs/.*"], + ) + + origins = list(crawler.origins(scope, progress=False)) + documents = list(crawler.markdown_documents(scope, progress=False)) + + assert origins == [docs.resolve().as_uri()] + assert documents == [ + 
MarkdownDocument(origin=docs.resolve().as_uri(), content="# Hello") + ] + + +def test_directory_crawler_returns_no_origins_when_limit_is_zero( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "a.md", "# First") + crawler = DirectoryCrawler() + scope = CrawlScope(roots=[markdown], limit=0) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [] + + +def test_directory_crawler_fetch_markdown_refreshes_when_file_changes( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello") + cache = tmp_path / "cache" + crawler = DirectoryCrawler(cache_dir=cache) + + origin = markdown.resolve().as_uri() + first = crawler.fetch_markdown(origin) + markdown.write_text("# Updated\n", encoding="utf-8") + + refreshed = crawler.fetch_markdown(origin) + + assert first == MarkdownDocument(origin=origin, content="# Hello") + assert refreshed == MarkdownDocument(origin=origin, content="# Updated\n") + + +def test_directory_crawler_excludes_own_cache_files_from_directory_walk( + tmp_path: Path, + monkeypatch, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello") + monkeypatch.chdir(tmp_path) + crawler = DirectoryCrawler(cache_dir=True) + scope = CrawlScope(roots=[tmp_path]) + + documents = list(crawler.markdown_documents(scope, progress=False)) + origins = list(crawler.origins(scope, progress=False)) + + assert documents == [ + MarkdownDocument(origin=markdown.resolve().as_uri(), content="# Hello") + ] + assert origins == [markdown.resolve().as_uri()] + + +def test_directory_crawler_fetch_markdown_force_refresh_rebuilds_cached_markdown( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello") + cache = tmp_path / "cache" + crawler = DirectoryCrawler(cache_dir=cache) + + origin = markdown.resolve().as_uri() + first = crawler.fetch_markdown(origin) + cached_markdown = next( + path for path in cache.iterdir() if not path.name.endswith(".metadata.json") + ) + 
cached_markdown.write_text("# Stale\n", encoding="utf-8") + + refreshed = crawler.fetch_markdown(origin, cache_force_refresh=True) + + assert first.content == "# Hello" + assert refreshed.content == "# Hello" + + +def test_directory_crawler_markdown_documents_force_refresh_rebuilds_cache( + tmp_path: Path, +) -> None: + markdown = _write(tmp_path, "docs/readme.md", "# Hello") + cache = tmp_path / "cache" + crawler = DirectoryCrawler(cache_dir=cache) + root = tmp_path / "docs" + scope = CrawlScope(roots=[root]) + + documents = list(crawler.markdown_documents(scope, progress=False)) + cached_markdown = next( + path for path in cache.iterdir() if not path.name.endswith(".metadata.json") + ) + cached_markdown.write_text("# Stale\n", encoding="utf-8") + + refreshed = list( + crawler.markdown_documents( + scope, + progress=False, + cache_force_refresh=True, + ) + ) + + assert documents == [ + MarkdownDocument(origin=markdown.resolve().as_uri(), content="# Hello") + ] + assert refreshed == [ + MarkdownDocument(origin=markdown.resolve().as_uri(), content="# Hello") + ] + + +def test_directory_crawler_markdown_documents_converts_in_parallel( + tmp_path: Path, +) -> None: + first = _write(tmp_path, "docs/a.md", "# First") + second = _write(tmp_path, "docs/b.md", "# Second") + crawler = DirectoryCrawler(max_workers=2) + scope = CrawlScope(roots=[tmp_path / "docs"]) + barrier = threading.Barrier(2) + lock = threading.Lock() + in_flight = 0 + max_in_flight = 0 + + def convert(source: FetchedSource) -> MarkdownDocument: + nonlocal in_flight, max_in_flight + with lock: + in_flight += 1 + max_in_flight = max(max_in_flight, in_flight) + try: + barrier.wait(timeout=1.0) + return MarkdownDocument( + origin=source.origin, + content=source.body_path.read_text(encoding="utf-8"), + ) + finally: + with lock: + in_flight -= 1 + + documents = list(crawler.markdown_documents(scope, progress=False, convert=convert)) + + assert documents == [ + MarkdownDocument(origin=first.resolve().as_uri(), 
content="# First"), + MarkdownDocument(origin=second.resolve().as_uri(), content="# Second"), + ] + assert max_in_flight == 2 + + +def test_directory_crawler_reopens_origins_with_uri_escaped_characters( + tmp_path: Path, +) -> None: + root = tmp_path / "My Docs" + markdown = _write(root, "read me.md", "# Hello") + crawler = DirectoryCrawler() + + origin = next(crawler.origins(CrawlScope(roots=[root]), progress=False)) + document = crawler.fetch_markdown(origin) + + assert "%20" in origin + assert document == MarkdownDocument( + origin=markdown.resolve().as_uri(), + content="# Hello", + ) + + +def test_directory_crawler_accepts_percent_escaped_file_uri_roots( + tmp_path: Path, +) -> None: + root = tmp_path / "My Docs" + markdown = _write(root, "read me.md", "# Hello") + crawler = DirectoryCrawler() + + origins = list( + crawler.origins(CrawlScope(roots=[root.resolve().as_uri()]), progress=False) + ) + + assert origins == [markdown.resolve().as_uri()] + + +def test_directory_crawler_accepts_windows_drive_letter_string_roots( + tmp_path: Path, + monkeypatch, +) -> None: + root = tmp_path / "C:\\docs" + markdown = _write(root, "readme.md", "# Hello") + monkeypatch.chdir(tmp_path) + crawler = DirectoryCrawler() + + origins = list(crawler.origins(CrawlScope(roots=["C:\\docs"]), progress=False)) + + assert origins == [markdown.resolve().as_uri()] + + +@pytest.mark.skipif(os.name != "nt", reason="Windows-specific file URI handling") +def test_directory_crawler_round_trips_windows_file_uris( + tmp_path: Path, +) -> None: + root = tmp_path / "My Docs" + markdown = _write(root, "read me.md", "# Hello") + crawler = DirectoryCrawler() + + root_uri = root.resolve().as_uri() + origin = markdown.resolve().as_uri() + + origins = list(crawler.origins(CrawlScope(roots=[root_uri]), progress=False)) + source = crawler.fetch_raw(origin) + + assert origins == [origin] + assert source.origin == origin + assert source.body_path == markdown.resolve() + + +def 
test_web_crawler_returns_no_origins_or_requests_when_limit_is_zero( + tmp_path: Path, +) -> None: + with _serve( + { + "/": { + "body": "
Root
", + "content_type": "text/html; charset=utf-8", + "etag": None, + } + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}/" + crawler = WebCrawler( + cache_dir=tmp_path / "zero-limit-cache", + ) + scope = CrawlScope(roots=[root_url], depth=0, limit=0) + + origins = list(crawler.origins(scope, progress=False)) + + assert origins == [] + assert getattr(server, "requests") == [] + + +def test_web_crawler_does_not_fetch_extra_root_once_limit_is_reached( + tmp_path: Path, +) -> None: + with _serve( + { + "/first": { + "body": "
First
", + "content_type": "text/html; charset=utf-8", + "etag": None, + }, + "/second": { + "body": "
Second
", + "content_type": "text/html; charset=utf-8", + "etag": None, + }, + } + ) as server: + root_url = f"http://127.0.0.1:{server.server_port}" + crawler = WebCrawler( + cache_dir=tmp_path / "limit-cache", + max_workers=2, + ) + scope = CrawlScope( + roots=[f"{root_url}/first", f"{root_url}/second"], + depth=0, + limit=1, + ) + + origins = list(crawler.origins(scope, progress=False)) + requests = [request["path"] for request in getattr(server, "requests")] + + assert origins == [f"{root_url}/first"] + assert requests == ["/first"] diff --git a/tests/test_store_ingest.py b/tests/test_store_ingest.py new file mode 100644 index 0000000..1b2ad61 --- /dev/null +++ b/tests/test_store_ingest.py @@ -0,0 +1,495 @@ +from __future__ import annotations + +from concurrent.futures import CancelledError +from dataclasses import replace +from pathlib import Path +import threading +import time +from types import SimpleNamespace +from typing import Any + +import pytest + +import raghilda._store as store_module +from raghilda.chunker import MarkdownChunker +from raghilda.document import Document, MarkdownDocument +from raghilda.store import ( + BaseStore, + ChromaDBStore, + DuckDBStore, + IngestSummary, + OpenAIStore, + WriteResult, +) + + +class _RecordingStore(BaseStore): + def __init__(self) -> None: + self.lock = threading.Lock() + self.prepare_thread_ids: dict[str, int] = {} + self.upsert_thread_ids: dict[str, int] = {} + self.started_origins: list[str] = [] + self.max_in_flight = 0 + self.in_flight = 0 + + @staticmethod + def connect(*args, **kwargs) -> "_RecordingStore": + return _RecordingStore() + + @staticmethod + def create(*args, **kwargs) -> "_RecordingStore": + return _RecordingStore() + + def upsert( + self, + document: Document, + *, + skip_if_unchanged: bool = True, + ) -> WriteResult[Document]: + origin = document.origin + assert isinstance(origin, str) + with self.lock: + self.started_origins.append(origin) + self.in_flight += 1 + self.max_in_flight = 
max(self.max_in_flight, self.in_flight) + try: + time.sleep(0.02 if origin == "doc-1" else 0) + if origin == "doc-2": + raise RuntimeError("boom") + self.upsert_thread_ids[origin] = threading.get_ident() + action = ( + document.attributes["action"] if document.attributes else "inserted" + ) + return WriteResult(action=action, document=document) + finally: + with self.lock: + self.in_flight -= 1 + + def retrieve(self, text: str, top_k: int, *args, **kwargs): + return [] + + def size(self) -> int: + return len(self.started_origins) + + +class _BlockingFailureStore(BaseStore): + def __init__(self) -> None: + self.blocking_started = threading.Event() + self.release_blocked = threading.Event() + self.blocking_finished = threading.Event() + + @staticmethod + def connect(*args, **kwargs) -> "_BlockingFailureStore": + return _BlockingFailureStore() + + @staticmethod + def create(*args, **kwargs) -> "_BlockingFailureStore": + return _BlockingFailureStore() + + def upsert( + self, + document: Document, + *, + skip_if_unchanged: bool = True, + ) -> WriteResult[Document]: + del skip_if_unchanged + origin = document.origin + assert isinstance(origin, str) + if origin == "doc-1": + self.blocking_started.set() + self.release_blocked.wait(timeout=1.0) + self.blocking_finished.set() + return WriteResult(action="inserted", document=document) + if origin == "doc-2": + assert self.blocking_started.wait(timeout=1.0) + raise RuntimeError("boom") + return WriteResult(action="inserted", document=document) + + def retrieve(self, text: str, top_k: int, *args, **kwargs): + return [] + + def size(self) -> int: + return 0 + + +class _CancelledSiblingStore(BaseStore): + def __init__(self) -> None: + self.allow_failure = threading.Event() + self.release_cancelled = threading.Event() + + @staticmethod + def connect(*args, **kwargs) -> "_CancelledSiblingStore": + return _CancelledSiblingStore() + + @staticmethod + def create(*args, **kwargs) -> "_CancelledSiblingStore": + return 
_CancelledSiblingStore() + + def upsert( + self, + document: Document, + *, + skip_if_unchanged: bool = True, + ) -> WriteResult[Document]: + del skip_if_unchanged + assert isinstance(document.origin, str) + if document.origin == "doc-1": + self.allow_failure.set() + raise RuntimeError("boom") + return WriteResult(action="inserted", document=document) + + def retrieve(self, text: str, top_k: int, *args, **kwargs): + return [] + + def size(self) -> int: + return 0 + + +def test_base_store_ingest_returns_summary_and_applies_prepare_before_upsert() -> None: + store = _RecordingStore() + main_thread_id = threading.get_ident() + documents = [ + MarkdownDocument( + origin="doc-1", content="# One", attributes={"action": "inserted"} + ), + MarkdownDocument( + origin="doc-3", content="# Three", attributes={"action": "skipped"} + ), + ] + + def prepare(document: MarkdownDocument) -> MarkdownDocument: + assert document.origin is not None + store.prepare_thread_ids[document.origin] = threading.get_ident() + return replace(document, content=document.content + "\nprepared") + + summary = store.ingest(documents, prepare=prepare, max_workers=2) + + assert summary == IngestSummary(inserted=1, replaced=0, skipped=1) + assert set(store.prepare_thread_ids) == {"doc-1", "doc-3"} + assert set(store.upsert_thread_ids) == {"doc-1", "doc-3"} + assert set(store.prepare_thread_ids.values()).isdisjoint({main_thread_id}) + + +def test_base_store_ingest_runs_prepare_in_worker_pool_concurrently() -> None: + store = _RecordingStore() + barrier = threading.Barrier(2) + lock = threading.Lock() + in_prepare = 0 + max_in_prepare = 0 + + documents = [ + MarkdownDocument(origin="doc-1", content="# One"), + MarkdownDocument(origin="doc-3", content="# Three"), + ] + + def prepare(document: MarkdownDocument) -> MarkdownDocument: + nonlocal in_prepare, max_in_prepare + with lock: + in_prepare += 1 + max_in_prepare = max(max_in_prepare, in_prepare) + try: + barrier.wait(timeout=1.0) + return document + 
finally: + with lock: + in_prepare -= 1 + + summary = store.ingest(documents, prepare=prepare, max_workers=2) + + assert summary == IngestSummary(inserted=2, replaced=0, skipped=0) + assert max_in_prepare == 2 + + +def test_base_store_ingest_starts_writes_before_input_is_exhausted() -> None: + class _StreamingStore(BaseStore): + def __init__(self) -> None: + self.started = threading.Event() + self.started_origins: list[str] = [] + + @staticmethod + def connect(*args, **kwargs) -> "_StreamingStore": + return _StreamingStore() + + @staticmethod + def create(*args, **kwargs) -> "_StreamingStore": + return _StreamingStore() + + def upsert( + self, + document: Document, + *, + skip_if_unchanged: bool = True, + ) -> WriteResult[Document]: + del skip_if_unchanged + assert isinstance(document.origin, str) + self.started_origins.append(document.origin) + self.started.set() + return WriteResult(action="inserted", document=document) + + def retrieve(self, text: str, top_k: int, *args, **kwargs): + return [] + + def size(self) -> int: + return len(self.started_origins) + + store = _StreamingStore() + + def documents(): + yield MarkdownDocument(origin="doc-1", content="# One") + assert store.started.wait(timeout=1.0) + yield MarkdownDocument(origin="doc-2", content="# Two") + + summary = store.ingest(documents(), max_workers=1) + + assert summary == IngestSummary(inserted=2, replaced=0, skipped=0) + assert store.started_origins == ["doc-1", "doc-2"] + + +def test_base_store_ingest_raises_on_duplicate_after_streaming_started() -> None: + store = _RecordingStore() + documents = [ + MarkdownDocument(origin="dup", content="# One"), + MarkdownDocument(origin="dup", content="# Two"), + MarkdownDocument(origin="doc-3", content="# Three"), + ] + + with pytest.raises(ValueError, match="Duplicate origin during ingest: dup"): + store.ingest(documents, max_workers=1) + + assert store.started_origins == ["dup"] + + +def test_base_store_ingest_fails_fast_and_bounds_worker_count() -> None: + 
store = _RecordingStore() + documents = [ + MarkdownDocument(origin="doc-1", content="# One"), + MarkdownDocument(origin="doc-2", content="# Two"), + MarkdownDocument(origin="doc-3", content="# Three"), + ] + + with pytest.raises(RuntimeError, match="boom"): + store.ingest(documents, max_workers=2) + + assert "doc-3" not in store.started_origins + assert store.max_in_flight <= 2 + + +def test_base_store_ingest_waits_for_running_workers_before_raising() -> None: + store = _BlockingFailureStore() + documents = [ + MarkdownDocument(origin="doc-1", content="# One"), + MarkdownDocument(origin="doc-2", content="# Two"), + ] + + def release_blocked() -> None: + assert store.blocking_started.wait(timeout=1.0) + time.sleep(0.2) + store.release_blocked.set() + + releaser = threading.Thread(target=release_blocked) + releaser.start() + + try: + with pytest.raises(RuntimeError, match="boom"): + store.ingest(documents, max_workers=2) + + assert store.release_blocked.is_set() + assert store.blocking_finished.is_set() + finally: + releaser.join() + + +def test_base_store_ingest_ignores_cancelled_sibling_when_worker_failed( + monkeypatch, +) -> None: + class _FakeFuture: + def __init__( + self, + *, + result: WriteResult[Document] | None = None, + error: BaseException | None = None, + ) -> None: + self._result = result + self._error = error + + def result(self) -> WriteResult[Document]: + if self._error is not None: + raise self._error + assert self._result is not None + return self._result + + def cancel(self) -> None: + return None + + class _FakeExecutor: + def __init__(self, *, max_workers: int) -> None: + del max_workers + self._submissions = [ + _FakeFuture(error=CancelledError()), + _FakeFuture(error=RuntimeError("boom")), + ] + + def submit(self, fn, arg): + del fn, arg + return self._submissions.pop(0) + + def shutdown(self, *, wait: bool, cancel_futures: bool) -> None: + del wait, cancel_futures + return None + + def fake_wait(pending, return_when): + del pending, 
return_when + return ( + [ + _FakeFuture(error=CancelledError()), + _FakeFuture(error=RuntimeError("boom")), + ], + set(), + ) + + monkeypatch.setattr(store_module, "ThreadPoolExecutor", _FakeExecutor) + monkeypatch.setattr(store_module, "wait", fake_wait) + + store = _RecordingStore() + documents = [ + MarkdownDocument(origin="doc-1", content="# One"), + MarkdownDocument(origin="doc-2", content="# Two"), + ] + + with pytest.raises(RuntimeError, match="boom"): + store.ingest(documents, max_workers=2) + + +def test_base_store_ingest_propagates_worker_cancelled_error() -> None: + store = _RecordingStore() + documents = [MarkdownDocument(origin="doc-1", content="# One")] + + def prepare(document: MarkdownDocument) -> MarkdownDocument: + del document + raise CancelledError("prepare cancelled") + + with pytest.raises(CancelledError, match="prepare cancelled"): + store.ingest(documents, prepare=prepare, max_workers=1) + + +def test_postgresql_store_ingest_serializes_upsert_calls() -> None: + pytest.importorskip("psycopg2") + from raghilda._postgres_store import PostgreSQLStore + + store = PostgreSQLStore.__new__(PostgreSQLStore) + store._ingest_upsert_lock = threading.Lock() + lock = threading.Lock() + in_flight = 0 + max_in_flight = 0 + + def upsert( + document: Document, + *, + skip_if_unchanged: bool = True, + ) -> WriteResult[Document]: + del skip_if_unchanged + nonlocal in_flight, max_in_flight + with lock: + in_flight += 1 + max_in_flight = max(max_in_flight, in_flight) + try: + time.sleep(0.02) + return WriteResult(action="inserted", document=document) + finally: + with lock: + in_flight -= 1 + + store.upsert = upsert # type: ignore[method-assign] + documents = [ + MarkdownDocument(origin="doc-1", content="# One"), + MarkdownDocument(origin="doc-2", content="# Two"), + ] + + summary = store.ingest(documents, max_workers=2) + + assert summary == IngestSummary(inserted=2, replaced=0, skipped=0) + assert max_in_flight == 1 + + +def 
test_duckdb_store_ingest_prepares_chunked_documents() -> None: + store = DuckDBStore.create( + location=":memory:", + embed=None, + overwrite=True, + name="duckdb_ingest", + ) + documents = [ + MarkdownDocument(origin="doc-1", content="# One\n\nHello"), + MarkdownDocument(origin="doc-2", content="# Two\n\nWorld"), + ] + + summary = store.ingest( + documents, + prepare=MarkdownChunker(chunk_size=32, target_overlap=0).chunk, + max_workers=2, + ) + + assert summary == IngestSummary(inserted=2, replaced=0, skipped=0) + assert store.size() == 2 + + +def test_chromadb_store_ingest_prepares_chunked_documents(tmp_path: Path) -> None: + store = ChromaDBStore.create( + location=tmp_path / "chroma", + overwrite=True, + name="chroma_ingest", + embed=None, + ) + documents = [ + MarkdownDocument(origin="doc-1", content="# One\n\nHello"), + MarkdownDocument(origin="doc-2", content="# Two\n\nWorld"), + ] + + summary = store.ingest( + documents, + prepare=MarkdownChunker(chunk_size=32, target_overlap=0).chunk, + max_workers=2, + ) + + assert summary == IngestSummary(inserted=2, replaced=0, skipped=0) + assert store.size() == 2 + + +class _SinglePage: + def __init__(self, data: list[Any]): + self.data = data + + def has_next_page(self) -> bool: + return False + + +class _FakeVectorStoreFiles: + def __init__(self) -> None: + self.uploads: list[dict[str, Any]] = [] + + def list(self, **kwargs): + return _SinglePage([]) + + def upload_and_poll(self, **kwargs): + self.uploads.append(kwargs) + return SimpleNamespace(id=f"file-{len(self.uploads)}") + + def delete(self, **kwargs): + raise AssertionError("delete should not be called") + + +def test_openai_store_ingest_accepts_markdown_documents_without_prepare() -> None: + vector_store_files = _FakeVectorStoreFiles() + fake_client = SimpleNamespace( + vector_stores=SimpleNamespace(files=vector_store_files), + ) + store = OpenAIStore(client=fake_client, store_id="vs_test") + documents = [ + MarkdownDocument(origin="doc-1", content="# One"), 
+ MarkdownDocument(origin="doc-2", content="# Two"), + ] + + summary = store.ingest(documents, max_workers=2) + + assert summary == IngestSummary(inserted=2, replaced=0, skipped=0) + assert len(vector_store_files.uploads) == 2 diff --git a/user_guide/04-crawling-and-ingestion.qmd b/user_guide/04-crawling-and-ingestion.qmd new file mode 100644 index 0000000..d877d16 --- /dev/null +++ b/user_guide/04-crawling-and-ingestion.qmd @@ -0,0 +1,268 @@ +--- +title: "Crawling and Ingestion" +guide-section: "Getting Started" +--- + +raghilda's core workflow is intentionally sequential: find a source, read it, +chunk it, and upsert it. That is the recommended first path for building a store +because every step is visible, easy to inspect, and easy to change. + +As your source collection grows, store creation can become mostly waiting on +network requests, file conversion, chunking, and writes. The crawling and +ingestion API is the next step when you want that work to run concurrently. It +can make store creation substantially faster while still letting you inspect the +origins, fetched sources, converted Markdown documents, and final ingest +summary. + +The tradeoff is a few extra concepts. Use this API when the simple sequential +workflow is too slow, or when you need a repeatable refresh job for a larger +site, document collection, or codebase. The API has three parts: + +- `CrawlScope` describes what to crawl. +- A crawler discovers sources and returns `MarkdownDocument` objects. +- `store.ingest()` prepares and upserts the stream. + +## Crawl a website + +Use `WebCrawler` when you want raghilda to fetch pages directly with +`requests`. The crawler starts from one or more roots, follows links up to +`depth`, and yields matching pages as Markdown documents. 
+ +```{python} +#| eval: false +from datetime import timedelta + +from raghilda.chunker import MarkdownChunker +from raghilda.crawl import CrawlScope, WebCrawler +from raghilda.embedding import EmbeddingOpenAI +from raghilda.store import DuckDBStore + +store = DuckDBStore.create( + location="docs.db", + embed=EmbeddingOpenAI(), + name="docs", + overwrite=True, +) + +crawler = WebCrawler( + cache_dir=True, + cache_stale_after=timedelta(days=1), + max_workers=4, +) +scope = CrawlScope( + roots=["https://quarto.org/docs/guide/"], + depth=2, + include_patterns=[r"^https://quarto\.org/docs/guide/"], + exclude_patterns=[r"/reference/"], + include_types=["html"], +) + +chunker = MarkdownChunker(chunk_size=1600, target_overlap=0.5) + +summary = store.ingest( + crawler.markdown_documents(scope), + prepare=chunker.chunk, + max_workers=4, +) +store.build_index() + +print(summary) +``` + +`CrawlScope` owns traversal policy: + +| Field | Description | +|-------|-------------| +| `roots` | Starting files, directories, or URLs. | +| `depth` | Number of link or directory levels to follow. `0` means only the roots. | +| `limit` | Maximum number of origins to yield. | +| `include_patterns` | Regular expressions that origins must match. `CloudflareCrawler` takes wildcard patterns instead. | +| `exclude_patterns` | Regular expressions that remove origins from the crawl. `CloudflareCrawler` takes wildcard patterns instead. | +| `include_types` | Type labels to include, such as `html`, `markdown`, `pdf`, `python`, or `text`. | +| `exclude_types` | Type labels to skip. | +| `include_external_links` | Allow links outside the root origin. Defaults to `False`. | +| `include_subdomains` | Allow subdomains under the root host. Defaults to `False`. | + +`WebCrawler(cache_dir=True)` stores fetched response bodies under +`.raghilda/cache/web`. With `cache_stale_after`, fresh cached responses are +reused, and stale responses are revalidated with `ETag` or `Last-Modified` +headers when the server provides them.
Pass `cache_force_refresh=True` to +`origins()`, `fetch_raw()`, `fetch_markdown()`, or `markdown_documents()` when a +run must bypass the cache. + +## Crawl local files + +Use `DirectoryCrawler` for local Markdown, notebooks, PDFs, text files, and +other files supported by `read_as_markdown()`. + +```{python} +#| eval: false +from raghilda.chunker import MarkdownChunker +from raghilda.crawl import CrawlScope, DirectoryCrawler +from raghilda.store import DuckDBStore + +store = DuckDBStore.create( + location="local-docs.db", + embed=None, + name="local_docs", + overwrite=True, +) + +crawler = DirectoryCrawler(cache_dir=True, max_workers=4) +scope = CrawlScope( + roots=["docs"], + depth=3, + include_patterns=[r".*\.(md|qmd|ipynb|pdf)$"], + exclude_patterns=[r".*/_site/.*", r".*/\.venv/.*"], +) + +chunker = MarkdownChunker() +summary = store.ingest( + crawler.markdown_documents(scope), + prepare=chunker.chunk, + max_workers=4, +) + +print(summary) +``` + +Directory crawling always reads the current filesystem tree. If you enable +`cache_dir`, converted Markdown is reused only when the source file hash and +modification time still match the cached metadata. The crawler also skips its +own cache directory when the cache is inside a crawled root. + +## Inspect before ingesting + +The crawler interface is useful even when you are not ready to write to a +store. Use `origins()` to inspect what the scope discovers, or use +`fetch_markdown()` to convert one source. 
+ +```{python} +#| eval: false +from raghilda.crawl import CrawlScope, WebCrawler + +crawler = WebCrawler(cache_dir=True) +scope = CrawlScope( + roots=["https://example.com/docs/"], + depth=1, + limit=10, +) + +for origin in crawler.origins(scope): + print(origin) + +doc = crawler.fetch_markdown("https://example.com/docs/") +print(doc.origin) +print(doc.content[:500]) +``` + +All crawler classes implement the same public methods: + +| Method | Returns | +|--------|---------| +| `origins(scope)` | A lazy iterator of source origins. | +| `fetch_raw(origin)` | A `FetchedSource` with the cached body path and metadata. | +| `fetch_markdown(origin)` | One `MarkdownDocument`. | +| `markdown_documents(scope)` | A lazy iterator of `MarkdownDocument` objects. | + +## Customize conversion + +By default, crawlers convert fetched sources with raghilda's Markdown reader. +Pass a `convert` function when a site or file collection needs custom cleanup. +The function receives a `FetchedSource` and returns a `MarkdownDocument`. + +```{python} +#| eval: false +from raghilda.crawl import CrawlScope, FetchedSource, WebCrawler +from raghilda.document import MarkdownDocument +from raghilda.read import read_as_markdown + + +def convert_reference_page(source: FetchedSource) -> MarkdownDocument: + doc = read_as_markdown(str(source.body_path)) + markdown = doc.content + markdown = markdown.replace("Edit this page", "") + return MarkdownDocument(origin=source.origin, content=markdown) + + +crawler = WebCrawler(cache_dir=True) +scope = CrawlScope(roots=["https://example.com/reference/"], depth=1) +documents = crawler.markdown_documents(scope, convert=convert_reference_page) +``` + +Keep chunking in `store.ingest(prepare=...)`, not in the converter. The +converter should return one unchunked Markdown document per origin; `prepare` +can then apply the same chunking policy to every document. 
+ +## Use Cloudflare crawling + +Use `CloudflareCrawler` when you want Cloudflare to perform the browser-rendered +crawl and return Markdown records. This is useful for sites that need rendering +or where you want Cloudflare's crawl service to manage discovery. + +```{python} +#| eval: false +import os +from datetime import timedelta + +from raghilda.chunker import MarkdownChunker +from raghilda.crawl import CloudflareCrawler, CrawlScope +from raghilda.store import DuckDBStore + +store = DuckDBStore.create( + location="rendered-docs.db", + embed=None, + name="rendered_docs", + overwrite=True, +) + +crawler = CloudflareCrawler( + account_id=os.environ["CLOUDFLARE_ACCOUNT_ID"], + api_token=os.environ["CLOUDFLARE_API_TOKEN"], + cache_dir=True, + cache_stale_after=timedelta(days=1), + render=True, +) +scope = CrawlScope( + roots=["https://example.com/docs/"], + depth=2, + include_patterns=["https://example.com/docs/**"], + exclude_patterns=["https://example.com/docs/archive/**"], + limit=250, +) + +summary = store.ingest( + crawler.markdown_documents(scope), + prepare=MarkdownChunker().chunk, + max_workers=4, +) + +print(summary) +``` + +For Cloudflare crawls, `include_patterns` and `exclude_patterns` use +Cloudflare-style wildcard patterns, such as `https://example.com/docs/**`. +`include_external_links` and `include_subdomains` are passed through to the +Cloudflare crawl request. + +## Refresh a store + +`store.ingest()` upserts each prepared document and returns an `IngestSummary` +with counts for inserted, replaced, and skipped documents. The input stream is +consumed lazily, and `prepare` runs in the worker pool. 
+ +```{python} +#| eval: false +summary = store.ingest( + crawler.markdown_documents(scope, cache_force_refresh=True), + prepare=chunker.chunk, + max_workers=4, +) + +print(f"Inserted: {summary.inserted}") +print(f"Replaced: {summary.replaced}") +print(f"Skipped: {summary.skipped}") +``` + +Use `upsert()` directly when you need per-document `WriteResult` objects. +Use `ingest()` when you want one aggregate summary for a crawl or refresh job.