From 081bd295cfda58f220f42b2148b1b5f1f50634d2 Mon Sep 17 00:00:00 2001 From: AIMLPM Date: Mon, 11 May 2026 21:44:31 -0700 Subject: [PATCH] =?UTF-8?q?v0.11.1=20=E2=80=94=20default=20aggregator-page?= =?UTF-8?q?=20URL=20filter=20(mdBook=20/print.html,=20Hugo=20/=5Fprint/)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reject single-render-of-whole-tree aggregator pages during crawl-time URL filtering. These pages contain the entire docs tree on one URL, so embedding- based retrieval ranks them above the dedicated chapter pages a user actually wants. Patterns rejected pre-fetch (saves crawl budget): */print.html, */_print, */_print/, */_print/*, */print/index.html Opt out via include_aggregator_pages=True engine kwarg or --include-aggregators CLI flag for offline-archive use cases. Motivation from llm-crawler-benchmarks v1.4 cycle: markcrawl was returning /print.html in 49% of rust-book top-5 retrieval slots and /_print/ in 39% of kubernetes-docs slots, while four of the five well-functioning competitors returned 0% /_print/ on kubernetes-docs. Predicted MRR lift on the 9-site bench pool: +0.02 to +0.04, concentrated on rust-book and kubernetes-docs. 36 new tests in tests/test_v011_1_aggregator_filter.py covering default rejection, substring-match safety (/blueprint.html, /preprint.html, /imprint/ all pass through), opt-out flag, composition with user-supplied exclude_paths and include_paths, sync + async engine parity. 647 tests total (was 611), no regressions. 
--- CHANGELOG.md | 43 ++++++ markcrawl/cli.py | 10 ++ markcrawl/core.py | 43 ++++++ pyproject.toml | 2 +- tests/test_v011_1_aggregator_filter.py | 185 +++++++++++++++++++++++++ 5 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 tests/test_v011_1_aggregator_filter.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 68339a8..35b411b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,49 @@ All notable changes to MarkCrawl are documented in this file. The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and this project follows [SemVer](https://semver.org/) once it reaches 1.0. +## [0.11.1] - 2026-05-11 + +### Added — default aggregator-page URL filter +Markcrawl now rejects mdBook `/print.html` and Hugo `/_print/` pages +during crawl-time URL filtering. These single-render-of-whole-tree +pages have artificially high keyword density (they contain the entire +docs tree on one URL), which causes embedding-based retrieval to rank +them above the dedicated chapter pages a user actually wants. + +- New default patterns rejected pre-fetch (saves crawl budget): + `*/print.html`, `*/_print`, `*/_print/`, `*/_print/*`, + `*/print/index.html`. +- New kwarg `include_aggregator_pages: bool = False` on + `crawl(...)` and both engine classes for offline-archive use cases. +- CLI flag `--include-aggregators` mirrors the kwarg. +- User-supplied `exclude_paths` and `include_paths` still apply + independently — the aggregator filter composes with both, doesn't + replace either. + +### Why now +The public `llm-crawler-benchmarks` v1.4 cycle surfaced this as a +markcrawl-specific issue: markcrawl was returning `/print.html` in +49% of rust-book top-5 retrieval slots and `/_print/` in 39% of +kubernetes-docs slots, while four of the five well-functioning competitors +returned 0% `/_print/` on kubernetes-docs. The retrieval-ranking +damage is structural — these pages will always beat real chapter +pages on cosine similarity because they contain everything. 
+ +### Expected impact +Per the v1.4 retrieval-bucket audit, ~9-12 of markcrawl's 43 +retrieval-bucket misses concentrate on this issue. Predicted MRR +lift on the 9-site bench pool: **+0.02 to +0.04**, concentrated on +rust-book and kubernetes-docs. Measurement waits for the bench's +v1.5 methodology refresh (helpful-pages-universe approach replaces +the v1.4 single-tool-anchor query corpus). + +### Tests +36 new tests covering: default rejection of observed bench failures, +substring-match safety (`/blueprint.html`, `/preprint.html`, +`/imprint/` all pass through), opt-out flag, composition with user +exclude_paths and include_paths, both `CrawlEngine` and +`AsyncCrawlEngine` parity. Total test count: 647 (was 611). + ## [0.11.0] - 2026-05-06 Two new modules expand markcrawl from "HTML to Markdown converter" to diff --git a/markcrawl/cli.py b/markcrawl/cli.py index 141a4d6..e982b53 100644 --- a/markcrawl/cli.py +++ b/markcrawl/cli.py @@ -141,6 +141,14 @@ def build_parser() -> argparse.ArgumentParser: metavar="PATTERN", help="Glob pattern to include URL paths (e.g. '/blog/*'). Only matching paths are crawled. Can be repeated.", ) + parser.add_argument( + "--include-aggregators", + action="store_true", + help="Include aggregator/print-view pages (e.g. mdBook /print.html, Hugo /_print/) " + "that bundle many sub-pages into a single render. Off by default — these pages have " + "artificially high keyword density and pollute retrieval rankings without adding new " + "content. 
Enable only for offline-archive use cases.", + ) parser.add_argument( "--download-images", action="store_true", @@ -370,6 +378,7 @@ def main() -> None: i18n_filter=args.i18n_filter, title_at_top=args.title_at_top, screenshot_config=screenshot_config, + include_aggregator_pages=args.include_aggregators, ) total_pages += site_result.pages_saved except Exception as exc: @@ -430,6 +439,7 @@ def main() -> None: i18n_filter=args.i18n_filter, title_at_top=args.title_at_top, screenshot_config=screenshot_config, + include_aggregator_pages=args.include_aggregators, ) if not args.dry_run: diff --git a/markcrawl/core.py b/markcrawl/core.py index 0004465..89f0a0e 100644 --- a/markcrawl/core.py +++ b/markcrawl/core.py @@ -158,6 +158,22 @@ def _resolve_idle_timeout(arg: Optional[float]) -> float: }) +# Default URL-path patterns for aggregator pages that bundle many sub-pages +# into a single render (mdBook `/print.html`, Hugo `/_print/`). These pages +# have high keyword density on almost any query because they contain the +# entire docs tree on one URL, so embedding-based retrieval ranks them above +# the dedicated chapter pages a user actually wants. Pre-fetch URL filter +# instead of post-index dedup because we'd otherwise burn crawl budget on +# them. Opt out via ``include_aggregator_pages=True``. (v0.11.1) +_DEFAULT_AGGREGATOR_PATH_PATTERNS: Tuple[str, ...] = ( + "*/print.html", # mdBook (rust-book), VuePress + "*/_print", # Hugo bare path + "*/_print/", # Hugo trailing slash + "*/_print/*", # Hugo subpaths (kubernetes-docs) + "*/print/index.html", # Alternate single-page generators +) + + def _compute_broader_scope(current_paths: List[str]) -> Optional[List[str]]: """One-level scope broadening for adaptive crawl expansion. 
@@ -452,6 +468,7 @@ def __init__( download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES, download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB, download_filter: Optional[Callable[[DownloadCandidate], bool]] = None, + include_aggregator_pages: bool = False, ): self.out_dir = out_dir self.fmt = fmt @@ -469,6 +486,7 @@ def __init__( self.show_progress = show_progress self.exclude_paths = exclude_paths or [] self.include_paths = include_paths or [] + self._exclude_aggregator_paths = not include_aggregator_pages self.download_images = download_images self.min_image_size = min_image_size self.screenshot_config = screenshot_config @@ -748,12 +766,20 @@ def path_excluded(self, url: str) -> bool: or when ``include_paths`` is set and it matches none of them. Seed URLs (base URL fallback) bypass include filtering so we can still discover links from the entry point. + + Default aggregator-page patterns (mdBook /print.html, Hugo /_print/) + are applied unless ``include_aggregator_pages=True`` was passed at + construction. See ``_DEFAULT_AGGREGATOR_PATH_PATTERNS``. 
(v0.11.1) """ if self.i18n_filter: from .analyzer import i18n_path_excluded if i18n_path_excluded(url): return True path = up.urlsplit(url).path + if self._exclude_aggregator_paths and any( + fnmatch.fnmatch(path, pat) for pat in _DEFAULT_AGGREGATOR_PATH_PATTERNS + ): + return True if self.exclude_paths and any(fnmatch.fnmatch(path, pat) for pat in self.exclude_paths): return True if self.include_paths and not any(fnmatch.fnmatch(path, pat) for pat in self.include_paths): @@ -1480,6 +1506,7 @@ def __init__( download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES, download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB, download_filter: Optional[Callable[[DownloadCandidate], bool]] = None, + include_aggregator_pages: bool = False, ): self.out_dir = out_dir self.fmt = fmt @@ -1497,6 +1524,7 @@ def __init__( self.show_progress = show_progress self.exclude_paths = exclude_paths or [] self.include_paths = include_paths or [] + self._exclude_aggregator_paths = not include_aggregator_pages self.i18n_filter = i18n_filter self.title_at_top = title_at_top self.auto_path_priority = auto_path_priority @@ -1694,12 +1722,20 @@ def path_excluded(self, url: str) -> bool: or when ``include_paths`` is set and it matches none of them. Seed URLs (base URL fallback) bypass include filtering so we can still discover links from the entry point. + + Default aggregator-page patterns (mdBook /print.html, Hugo /_print/) + are applied unless ``include_aggregator_pages=True`` was passed at + construction. See ``_DEFAULT_AGGREGATOR_PATH_PATTERNS``. 
(v0.11.1) """ if self.i18n_filter: from .analyzer import i18n_path_excluded if i18n_path_excluded(url): return True path = up.urlsplit(url).path + if self._exclude_aggregator_paths and any( + fnmatch.fnmatch(path, pat) for pat in _DEFAULT_AGGREGATOR_PATH_PATTERNS + ): + return True if self.exclude_paths and any(fnmatch.fnmatch(path, pat) for pat in self.exclude_paths): return True if self.include_paths and not any(fnmatch.fnmatch(path, pat) for pat in self.include_paths): @@ -2426,6 +2462,7 @@ def crawl( download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES, download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB, download_filter: Optional[Callable[[DownloadCandidate], bool]] = None, + include_aggregator_pages: bool = False, ) -> CrawlResult: """Crawl a website and save cleaned content to disk. @@ -2575,6 +2612,7 @@ def crawl( download_max_files=download_max_files, download_max_size_mb=download_max_size_mb, download_filter=download_filter, + include_aggregator_pages=include_aggregator_pages, ) return _crawl_sync( @@ -2602,6 +2640,7 @@ def crawl( download_max_files=download_max_files, download_max_size_mb=download_max_size_mb, download_filter=download_filter, + include_aggregator_pages=include_aggregator_pages, ) @@ -2646,6 +2685,7 @@ def _crawl_sync( download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES, download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB, download_filter: Optional[Callable[[DownloadCandidate], bool]] = None, + include_aggregator_pages: bool = False, ) -> CrawlResult: """Synchronous crawl path using ThreadPoolExecutor.""" engine = CrawlEngine( @@ -2678,6 +2718,7 @@ def _crawl_sync( download_max_files=download_max_files, download_max_size_mb=download_max_size_mb, download_filter=download_filter, + include_aggregator_pages=include_aggregator_pages, ) base_url = norm_url(base_url) @@ -2930,6 +2971,7 @@ def _crawl_async( download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES, download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB, download_filter: 
Optional[Callable[[DownloadCandidate], bool]] = None, + include_aggregator_pages: bool = False, ) -> CrawlResult: """Async crawl path using native asyncio event loop.""" @@ -2962,6 +3004,7 @@ async def _run() -> CrawlResult: download_max_files=download_max_files, download_max_size_mb=download_max_size_mb, download_filter=download_filter, + include_aggregator_pages=include_aggregator_pages, ) nonlocal base_url diff --git a/pyproject.toml b/pyproject.toml index a19ffe2..5fd36f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "markcrawl" -version = "0.11.0" +version = "0.11.1" description = "Fast Python web crawler for AI & RAG ingestion — crawl, extract, and embed website content with one tool." readme = "README.md" requires-python = ">=3.10" diff --git a/tests/test_v011_1_aggregator_filter.py b/tests/test_v011_1_aggregator_filter.py new file mode 100644 index 0000000..f8a65e3 --- /dev/null +++ b/tests/test_v011_1_aggregator_filter.py @@ -0,0 +1,185 @@ +"""Tests for the default aggregator-page URL filter (v0.11.1). + +The filter rejects mdBook /print.html, Hugo /_print/, and similar +single-render-of-whole-tree pages during crawl-time URL filtering. These +pages have artificially high keyword density on almost any retrieval +query because they contain the entire docs tree on a single URL, so the +embedder ranks them above the dedicated chapter pages a user actually +wants. + +Bench evidence motivating the filter: markcrawl returned /print.html in +49% of rust-book top-5 retrieval slots and /_print/ in 39% of +kubernetes-docs slots; four of the five well-functioning competitors +return 0% on kubernetes-docs. 
+""" + +from __future__ import annotations + +import pytest + +from markcrawl.core import ( + AsyncCrawlEngine, + CrawlEngine, + _DEFAULT_AGGREGATOR_PATH_PATTERNS, +) + + +# ---------- engine fixtures ---------------------------------------------- + +def _make_sync_engine(tmp_path, **kwargs): + out = tmp_path / "out" + out.mkdir(exist_ok=True) + defaults = dict( + out_dir=str(out), fmt="markdown", min_words=20, delay=0, timeout=10, + concurrency=1, include_subdomains=False, user_agent="test", + render_js=False, proxy=None, show_progress=False, + ) + defaults.update(kwargs) + return CrawlEngine(**defaults) + + +def _make_async_engine(tmp_path, **kwargs): + out = tmp_path / "out" + out.mkdir(exist_ok=True) + defaults = dict( + out_dir=str(out), fmt="markdown", min_words=20, delay=0, timeout=10, + concurrency=1, include_subdomains=False, user_agent="test", + proxy=None, show_progress=False, + ) + defaults.update(kwargs) + return AsyncCrawlEngine(**defaults) + + +# ---------- default patterns rejected ------------------------------------ + +# (URL, reason) pairs covering the observed bench failure modes. 
+_REJECTED_URLS = [ + ("https://doc.rust-lang.org/book/print.html", "mdBook print view"), + ("https://example.com/print.html", "root-level print.html"), + ("https://kubernetes.io/docs/concepts/_print/", "Hugo _print trailing slash"), + ("https://kubernetes.io/docs/concepts/_print/index.html", "Hugo _print explicit index"), + ("https://example.com/foo/_print", "Hugo _print bare path"), + ("https://example.com/foo/print/index.html", "alternate single-page generator"), +] + + +@pytest.mark.parametrize("url, reason", _REJECTED_URLS) +def test_sync_engine_rejects_aggregator_by_default(tmp_path, url, reason): + engine = _make_sync_engine(tmp_path) + engine._seed_urls = set() + assert engine.path_excluded(url), f"should reject {url} ({reason})" + + +@pytest.mark.parametrize("url, reason", _REJECTED_URLS) +def test_async_engine_rejects_aggregator_by_default(tmp_path, url, reason): + engine = _make_async_engine(tmp_path) + engine._seed_urls = set() + assert engine.path_excluded(url), f"should reject {url} ({reason})" + + +# ---------- non-aggregator URLs pass through ----------------------------- + +_ACCEPTED_URLS = [ + "https://doc.rust-lang.org/book/ch12-02-reading-a-file.html", + "https://kubernetes.io/docs/concepts/architecture/control-plane-node-communication/", + "https://example.com/index.html", # plain index.html is content, not aggregator + "https://example.com/blueprint.html", # 'print' substring inside a word — must not over-match + "https://example.com/imprint/", # legal page; not aggregator + "https://example.com/preprint.html", # academic preprint; not aggregator + "https://example.com/_printer-friendly/css.css", # asset path containing _print as prefix-only +] + + +@pytest.mark.parametrize("url", _ACCEPTED_URLS) +def test_sync_engine_accepts_non_aggregator(tmp_path, url): + engine = _make_sync_engine(tmp_path) + engine._seed_urls = set() + assert not engine.path_excluded(url), f"should NOT reject {url}" + + +# ---------- opt-out flag preserves aggregators 
--------------------------- + +@pytest.mark.parametrize("url, reason", _REJECTED_URLS) +def test_sync_engine_include_aggregator_pages_allows_through(tmp_path, url, reason): + engine = _make_sync_engine(tmp_path, include_aggregator_pages=True) + engine._seed_urls = set() + assert not engine.path_excluded(url), ( + f"with include_aggregator_pages=True, should allow {url} ({reason})" + ) + + +@pytest.mark.parametrize("url, reason", _REJECTED_URLS) +def test_async_engine_include_aggregator_pages_allows_through(tmp_path, url, reason): + engine = _make_async_engine(tmp_path, include_aggregator_pages=True) + engine._seed_urls = set() + assert not engine.path_excluded(url), ( + f"with include_aggregator_pages=True, should allow {url} ({reason})" + ) + + +# ---------- composition with user-supplied filters ----------------------- + +def test_user_exclude_paths_still_applied(tmp_path): + """User-supplied exclude_paths must still reject matching URLs even + when aggregator filter is the primary default. Both sets compose.""" + engine = _make_sync_engine(tmp_path, exclude_paths=["/job/*"]) + engine._seed_urls = set() + # User pattern rejects /job/* + assert engine.path_excluded("https://example.com/job/listings") + # Aggregator default still rejects /print.html + assert engine.path_excluded("https://example.com/print.html") + # Non-matching URL passes + assert not engine.path_excluded("https://example.com/articles/foo") + + +def test_include_paths_with_aggregator_filter(tmp_path): + """When include_paths is set, aggregator default applies first. 
+ A URL under include_paths but matching aggregator pattern is still + rejected; a URL outside include_paths but not aggregator is + rejected by the include filter, not the aggregator filter.""" + engine = _make_sync_engine(tmp_path, include_paths=["/docs/*"]) + engine._seed_urls = set() + # Inside scope but is aggregator → rejected + assert engine.path_excluded("https://example.com/docs/_print/index.html") + # Inside scope, real content → allowed + assert not engine.path_excluded("https://example.com/docs/getting-started") + # Outside scope → rejected by include filter + assert engine.path_excluded("https://example.com/blog/post-1") + + +def test_opt_out_with_user_exclude_paths(tmp_path): + """include_aggregator_pages=True disables ONLY the aggregator defaults. + User exclude_paths still apply.""" + engine = _make_sync_engine( + tmp_path, + include_aggregator_pages=True, + exclude_paths=["/internal/*"], + ) + engine._seed_urls = set() + # Aggregator now allowed + assert not engine.path_excluded("https://example.com/print.html") + # User pattern still rejects + assert engine.path_excluded("https://example.com/internal/secret") + + +# ---------- patterns constant invariants --------------------------------- + +def test_default_patterns_are_a_tuple(): + """Constant must be immutable — patterns are part of the public-ish + invariant surface and shouldn't be mutated at runtime.""" + assert isinstance(_DEFAULT_AGGREGATOR_PATH_PATTERNS, tuple) + assert len(_DEFAULT_AGGREGATOR_PATH_PATTERNS) >= 4 + + +def test_default_patterns_cover_observed_bench_failures(): + """Sanity check: the slam-dunk bench-observed cases must be covered.""" + import fnmatch + for url_path, reason in [ + ("/book/print.html", "rust-book"), + ("/docs/concepts/_print/", "kubernetes-docs trailing slash"), + ("/docs/concepts/_print/index.html", "kubernetes-docs explicit index"), + ]: + assert any( + fnmatch.fnmatch(url_path, pat) + for pat in _DEFAULT_AGGREGATOR_PATH_PATTERNS + ), f"no pattern matches 
{url_path} ({reason})"