From 081bd295cfda58f220f42b2148b1b5f1f50634d2 Mon Sep 17 00:00:00 2001
From: AIMLPM <paul@datascienceinstitute.ai>
Date: Mon, 11 May 2026 21:44:31 -0700
Subject: [PATCH] =?UTF-8?q?v0.11.1=20=E2=80=94=20default=20aggregator-page?=
 =?UTF-8?q?=20URL=20filter=20(mdBook=20/print.html,=20Hugo=20/=5Fprint/)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reject single-render-of-whole-tree aggregator pages during crawl-time URL
filtering. These pages contain the entire docs tree on one URL, so embedding-
based retrieval ranks them above the dedicated chapter pages a user actually
wants.

Patterns rejected pre-fetch (saves crawl budget):
  */print.html, */_print, */_print/, */_print/*, */print/index.html

Opt out via include_aggregator_pages=True engine kwarg or
--include-aggregators CLI flag for offline-archive use cases.

Motivation from llm-crawler-benchmarks v1.4 cycle: markcrawl was returning
/print.html in 49% of rust-book top-5 retrieval slots and /_print/ in 39%
of kubernetes-docs slots, while four of the five well-functioning competitors
returned 0% /_print/ on kubernetes-docs. Predicted MRR lift on the 9-site
bench pool: +0.02 to +0.04, concentrated on rust-book and kubernetes-docs.

36 new tests in tests/test_v011_1_aggregator_filter.py covering default
rejection, substring-match safety (/blueprint.html, /preprint.html,
/imprint/ all pass through), opt-out flag, composition with user-supplied
exclude_paths and include_paths, sync + async engine parity. 647 tests
total (was 611), no regressions.
---
 CHANGELOG.md                           |  43 ++++++
 markcrawl/cli.py                       |  10 ++
 markcrawl/core.py                      |  43 ++++++
 pyproject.toml                         |   2 +-
 tests/test_v011_1_aggregator_filter.py | 185 +++++++++++++++++++++++++
 5 files changed, 282 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_v011_1_aggregator_filter.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 68339a8..35b411b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,49 @@ All notable changes to MarkCrawl are documented in this file. The format
 follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and this
 project follows [SemVer](https://semver.org/) once it reaches 1.0.
 
+## [0.11.1] - 2026-05-11
+
+### Added — default aggregator-page URL filter
+Markcrawl now rejects mdBook `/print.html` and Hugo `/_print/` pages
+during crawl-time URL filtering. These single-render-of-whole-tree
+pages have artificially high keyword density (they contain the entire
+docs tree on one URL), which causes embedding-based retrieval to rank
+them above the dedicated chapter pages a user actually wants.
+
+- New default patterns rejected pre-fetch (saves crawl budget):
+  `*/print.html`, `*/_print`, `*/_print/`, `*/_print/*`,
+  `*/print/index.html`.
+- New kwarg `include_aggregator_pages: bool = False` on
+  `crawl(...)` and both engine classes for offline-archive use cases.
+- CLI flag `--include-aggregators` mirrors the kwarg.
+- User-supplied `exclude_paths` and `include_paths` still apply
+  independently — the aggregator filter composes with both and
+  replaces neither.
+
+### Why now
+The public `llm-crawler-benchmarks` v1.4 cycle surfaced this as a
+markcrawl-specific issue: markcrawl was returning `/print.html` in
+49% of rust-book top-5 retrieval slots and `/_print/` in 39% of
+kubernetes-docs slots, while four of the five well-functioning competitors
+returned 0% `/_print/` on kubernetes-docs. The retrieval-ranking
+damage is structural — these pages will always beat real chapter
+pages on cosine similarity because they contain everything.
+
+### Expected impact
+Per the v1.4 retrieval-bucket audit, ~9-12 of markcrawl's 43
+retrieval-bucket misses concentrate on this issue. Predicted MRR
+lift on the 9-site bench pool: **+0.02 to +0.04**, concentrated on
+rust-book and kubernetes-docs. Measurement waits for the bench's
+v1.5 methodology refresh (helpful-pages-universe approach replaces
+the v1.4 single-tool-anchor query corpus).
+
+### Tests
+36 new tests covering: default rejection of observed bench failures,
+substring-match safety (`/blueprint.html`, `/preprint.html`,
+`/imprint/` all pass through), opt-out flag, composition with user
+exclude_paths and include_paths, both `CrawlEngine` and
+`AsyncCrawlEngine` parity. Total test count: 647 (was 611).
+
 ## [0.11.0] - 2026-05-06
 
 Two new modules expand markcrawl from "HTML to Markdown converter" to
diff --git a/markcrawl/cli.py b/markcrawl/cli.py
index 141a4d6..e982b53 100644
--- a/markcrawl/cli.py
+++ b/markcrawl/cli.py
@@ -141,6 +141,14 @@ def build_parser() -> argparse.ArgumentParser:
         metavar="PATTERN",
         help="Glob pattern to include URL paths (e.g. '/blog/*'). Only matching paths are crawled. Can be repeated.",
     )
+    parser.add_argument(
+        "--include-aggregators",
+        action="store_true",
+        help="Include aggregator/print-view pages (e.g. mdBook /print.html, Hugo /_print/) "
+        "that bundle many sub-pages into a single render. Off by default — these pages have "
+        "artificially high keyword density and pollute retrieval rankings without adding new "
+        "content. Enable only for offline-archive use cases.",
+    )
     parser.add_argument(
         "--download-images",
         action="store_true",
@@ -370,6 +378,7 @@ def main() -> None:
                     i18n_filter=args.i18n_filter,
                     title_at_top=args.title_at_top,
                     screenshot_config=screenshot_config,
+                    include_aggregator_pages=args.include_aggregators,
                 )
                 total_pages += site_result.pages_saved
             except Exception as exc:
@@ -430,6 +439,7 @@ def main() -> None:
         i18n_filter=args.i18n_filter,
         title_at_top=args.title_at_top,
         screenshot_config=screenshot_config,
+        include_aggregator_pages=args.include_aggregators,
     )
 
     if not args.dry_run:
diff --git a/markcrawl/core.py b/markcrawl/core.py
index 0004465..89f0a0e 100644
--- a/markcrawl/core.py
+++ b/markcrawl/core.py
@@ -158,6 +158,22 @@ def _resolve_idle_timeout(arg: Optional[float]) -> float:
 })
 
 
+# Default URL-path patterns for aggregator pages that bundle many sub-pages
+# into a single render (mdBook `/print.html`, Hugo `/_print/`). These pages
+# have high keyword density on almost any query because they contain the
+# entire docs tree on one URL, so embedding-based retrieval ranks them above
+# the dedicated chapter pages a user actually wants. Filtered by URL before
+# fetching (not deduplicated after indexing) so no crawl budget is spent on
+# them. Opt out via ``include_aggregator_pages=True``. (v0.11.1)
+_DEFAULT_AGGREGATOR_PATH_PATTERNS: Tuple[str, ...] = (
+    "*/print.html",          # mdBook (rust-book), VuePress
+    "*/_print",              # Hugo bare path
+    "*/_print/",             # Hugo trailing slash
+    "*/_print/*",            # Hugo subpaths (kubernetes-docs)
+    "*/print/index.html",    # Alternate single-page generators
+)
+
+
 def _compute_broader_scope(current_paths: List[str]) -> Optional[List[str]]:
     """One-level scope broadening for adaptive crawl expansion.
 
@@ -452,6 +468,7 @@ def __init__(
         download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
         download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
         download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
+        include_aggregator_pages: bool = False,
     ):
         self.out_dir = out_dir
         self.fmt = fmt
@@ -469,6 +486,7 @@ def __init__(
         self.show_progress = show_progress
         self.exclude_paths = exclude_paths or []
         self.include_paths = include_paths or []
+        self._exclude_aggregator_paths = not include_aggregator_pages
         self.download_images = download_images
         self.min_image_size = min_image_size
         self.screenshot_config = screenshot_config
@@ -748,12 +766,20 @@ def path_excluded(self, url: str) -> bool:
         or when ``include_paths`` is set and it matches none of them.
         Seed URLs (base URL fallback) bypass include filtering so we can
         still discover links from the entry point.
+
+        Default aggregator-page patterns (mdBook /print.html, Hugo /_print/)
+        are applied unless ``include_aggregator_pages=True`` was passed at
+        construction. See ``_DEFAULT_AGGREGATOR_PATH_PATTERNS``.  (v0.11.1)
         """
         if self.i18n_filter:
             from .analyzer import i18n_path_excluded
             if i18n_path_excluded(url):
                 return True
         path = up.urlsplit(url).path
+        if self._exclude_aggregator_paths and any(
+            fnmatch.fnmatch(path, pat) for pat in _DEFAULT_AGGREGATOR_PATH_PATTERNS
+        ):
+            return True
         if self.exclude_paths and any(fnmatch.fnmatch(path, pat) for pat in self.exclude_paths):
             return True
         if self.include_paths and not any(fnmatch.fnmatch(path, pat) for pat in self.include_paths):
@@ -1480,6 +1506,7 @@ def __init__(
         download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
         download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
         download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
+        include_aggregator_pages: bool = False,
     ):
         self.out_dir = out_dir
         self.fmt = fmt
@@ -1497,6 +1524,7 @@ def __init__(
         self.show_progress = show_progress
         self.exclude_paths = exclude_paths or []
         self.include_paths = include_paths or []
+        self._exclude_aggregator_paths = not include_aggregator_pages
         self.i18n_filter = i18n_filter
         self.title_at_top = title_at_top
         self.auto_path_priority = auto_path_priority
@@ -1694,12 +1722,20 @@ def path_excluded(self, url: str) -> bool:
         or when ``include_paths`` is set and it matches none of them.
         Seed URLs (base URL fallback) bypass include filtering so we can
         still discover links from the entry point.
+
+        Default aggregator-page patterns (mdBook /print.html, Hugo /_print/)
+        are applied unless ``include_aggregator_pages=True`` was passed at
+        construction. See ``_DEFAULT_AGGREGATOR_PATH_PATTERNS``.  (v0.11.1)
         """
         if self.i18n_filter:
             from .analyzer import i18n_path_excluded
             if i18n_path_excluded(url):
                 return True
         path = up.urlsplit(url).path
+        if self._exclude_aggregator_paths and any(
+            fnmatch.fnmatch(path, pat) for pat in _DEFAULT_AGGREGATOR_PATH_PATTERNS
+        ):
+            return True
         if self.exclude_paths and any(fnmatch.fnmatch(path, pat) for pat in self.exclude_paths):
             return True
         if self.include_paths and not any(fnmatch.fnmatch(path, pat) for pat in self.include_paths):
@@ -2426,6 +2462,7 @@ def crawl(
     download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
     download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
     download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
+    include_aggregator_pages: bool = False,
 ) -> CrawlResult:
     """Crawl a website and save cleaned content to disk.
 
@@ -2575,6 +2612,7 @@ def crawl(
             download_max_files=download_max_files,
             download_max_size_mb=download_max_size_mb,
             download_filter=download_filter,
+            include_aggregator_pages=include_aggregator_pages,
         )
 
     return _crawl_sync(
@@ -2602,6 +2640,7 @@ def crawl(
         download_max_files=download_max_files,
         download_max_size_mb=download_max_size_mb,
         download_filter=download_filter,
+        include_aggregator_pages=include_aggregator_pages,
     )
 
 
@@ -2646,6 +2685,7 @@ def _crawl_sync(
     download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
     download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
     download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
+    include_aggregator_pages: bool = False,
 ) -> CrawlResult:
     """Synchronous crawl path using ThreadPoolExecutor."""
     engine = CrawlEngine(
@@ -2678,6 +2718,7 @@ def _crawl_sync(
         download_max_files=download_max_files,
         download_max_size_mb=download_max_size_mb,
         download_filter=download_filter,
+        include_aggregator_pages=include_aggregator_pages,
     )
 
     base_url = norm_url(base_url)
@@ -2930,6 +2971,7 @@ def _crawl_async(
     download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
     download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
     download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
+    include_aggregator_pages: bool = False,
 ) -> CrawlResult:
     """Async crawl path using native asyncio event loop."""
 
@@ -2962,6 +3004,7 @@ async def _run() -> CrawlResult:
             download_max_files=download_max_files,
             download_max_size_mb=download_max_size_mb,
             download_filter=download_filter,
+            include_aggregator_pages=include_aggregator_pages,
         )
 
         nonlocal base_url
diff --git a/pyproject.toml b/pyproject.toml
index a19ffe2..5fd36f4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "markcrawl"
-version = "0.11.0"
+version = "0.11.1"
 description = "Fast Python web crawler for AI & RAG ingestion — crawl, extract, and embed website content with one tool."
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/tests/test_v011_1_aggregator_filter.py b/tests/test_v011_1_aggregator_filter.py
new file mode 100644
index 0000000..f8a65e3
--- /dev/null
+++ b/tests/test_v011_1_aggregator_filter.py
@@ -0,0 +1,185 @@
+"""Tests for the default aggregator-page URL filter (v0.11.1).
+
+The filter rejects mdBook /print.html, Hugo /_print/, and similar
+single-render-of-whole-tree pages during crawl-time URL filtering. These
+pages have artificially high keyword density on almost any retrieval
+query because they contain the entire docs tree on a single URL, so the
+embedder ranks them above the dedicated chapter pages a user actually
+wants.
+
+Bench evidence motivating the filter: markcrawl returned /print.html in
+49% of rust-book top-5 retrieval slots and /_print/ in 39% of
+kubernetes-docs slots; four of the five well-functioning competitors
+returned 0% /_print/ on kubernetes-docs.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from markcrawl.core import (
+    AsyncCrawlEngine,
+    CrawlEngine,
+    _DEFAULT_AGGREGATOR_PATH_PATTERNS,
+)
+
+
+# ---------- engine fixtures ----------------------------------------------
+
+def _make_sync_engine(tmp_path, **kwargs):
+    out = tmp_path / "out"
+    out.mkdir(exist_ok=True)
+    defaults = dict(
+        out_dir=str(out), fmt="markdown", min_words=20, delay=0, timeout=10,
+        concurrency=1, include_subdomains=False, user_agent="test",
+        render_js=False, proxy=None, show_progress=False,
+    )
+    defaults.update(kwargs)
+    return CrawlEngine(**defaults)
+
+
+def _make_async_engine(tmp_path, **kwargs):
+    out = tmp_path / "out"
+    out.mkdir(exist_ok=True)
+    defaults = dict(
+        out_dir=str(out), fmt="markdown", min_words=20, delay=0, timeout=10,
+        concurrency=1, include_subdomains=False, user_agent="test",
+        proxy=None, show_progress=False,
+    )
+    defaults.update(kwargs)
+    return AsyncCrawlEngine(**defaults)
+
+
+# ---------- default patterns rejected ------------------------------------
+
+# (URL, reason) pairs covering the observed bench failure modes.
+_REJECTED_URLS = [
+    ("https://doc.rust-lang.org/book/print.html", "mdBook print view"),
+    ("https://example.com/print.html", "root-level print.html"),
+    ("https://kubernetes.io/docs/concepts/_print/", "Hugo _print trailing slash"),
+    ("https://kubernetes.io/docs/concepts/_print/index.html", "Hugo _print explicit index"),
+    ("https://example.com/foo/_print", "Hugo _print bare path"),
+    ("https://example.com/foo/print/index.html", "alternate single-page generator"),
+]
+
+
+@pytest.mark.parametrize("url, reason", _REJECTED_URLS)
+def test_sync_engine_rejects_aggregator_by_default(tmp_path, url, reason):
+    engine = _make_sync_engine(tmp_path)
+    engine._seed_urls = set()
+    assert engine.path_excluded(url), f"should reject {url} ({reason})"
+
+
+@pytest.mark.parametrize("url, reason", _REJECTED_URLS)
+def test_async_engine_rejects_aggregator_by_default(tmp_path, url, reason):
+    engine = _make_async_engine(tmp_path)
+    engine._seed_urls = set()
+    assert engine.path_excluded(url), f"should reject {url} ({reason})"
+
+
+# ---------- non-aggregator URLs pass through -----------------------------
+
+_ACCEPTED_URLS = [
+    "https://doc.rust-lang.org/book/ch12-02-reading-a-file.html",
+    "https://kubernetes.io/docs/concepts/architecture/control-plane-node-communication/",
+    "https://example.com/index.html",                # plain index.html is content, not aggregator
+    "https://example.com/blueprint.html",             # 'print' substring inside a word — must not over-match
+    "https://example.com/imprint/",                   # legal page; not aggregator
+    "https://example.com/preprint.html",              # academic preprint; not aggregator
+    "https://example.com/_printer-friendly/css.css",  # asset path containing _print as prefix-only
+]
+
+
+@pytest.mark.parametrize("url", _ACCEPTED_URLS)
+def test_sync_engine_accepts_non_aggregator(tmp_path, url):
+    engine = _make_sync_engine(tmp_path)
+    engine._seed_urls = set()
+    assert not engine.path_excluded(url), f"should NOT reject {url}"
+
+
+# ---------- opt-out flag preserves aggregators ---------------------------
+
+@pytest.mark.parametrize("url, reason", _REJECTED_URLS)
+def test_sync_engine_include_aggregator_pages_allows_through(tmp_path, url, reason):
+    engine = _make_sync_engine(tmp_path, include_aggregator_pages=True)
+    engine._seed_urls = set()
+    assert not engine.path_excluded(url), (
+        f"with include_aggregator_pages=True, should allow {url} ({reason})"
+    )
+
+
+@pytest.mark.parametrize("url, reason", _REJECTED_URLS)
+def test_async_engine_include_aggregator_pages_allows_through(tmp_path, url, reason):
+    engine = _make_async_engine(tmp_path, include_aggregator_pages=True)
+    engine._seed_urls = set()
+    assert not engine.path_excluded(url), (
+        f"with include_aggregator_pages=True, should allow {url} ({reason})"
+    )
+
+
+# ---------- composition with user-supplied filters -----------------------
+
+def test_user_exclude_paths_still_applied(tmp_path):
+    """User-supplied exclude_paths must still reject matching URLs while
+    the default aggregator filter is active. Both filters compose."""
+    engine = _make_sync_engine(tmp_path, exclude_paths=["/job/*"])
+    engine._seed_urls = set()
+    # User pattern rejects /job/*
+    assert engine.path_excluded("https://example.com/job/listings")
+    # Aggregator default still rejects /print.html
+    assert engine.path_excluded("https://example.com/print.html")
+    # Non-matching URL passes
+    assert not engine.path_excluded("https://example.com/articles/foo")
+
+
+def test_include_paths_with_aggregator_filter(tmp_path):
+    """When include_paths is set, aggregator default applies first.
+    A URL under include_paths but matching aggregator pattern is still
+    rejected; a URL outside include_paths but not aggregator is
+    rejected by the include filter, not the aggregator filter."""
+    engine = _make_sync_engine(tmp_path, include_paths=["/docs/*"])
+    engine._seed_urls = set()
+    # Inside scope but is aggregator → rejected
+    assert engine.path_excluded("https://example.com/docs/_print/index.html")
+    # Inside scope, real content → allowed
+    assert not engine.path_excluded("https://example.com/docs/getting-started")
+    # Outside scope → rejected by include filter
+    assert engine.path_excluded("https://example.com/blog/post-1")
+
+
+def test_opt_out_with_user_exclude_paths(tmp_path):
+    """include_aggregator_pages=True disables ONLY the aggregator defaults.
+    User exclude_paths still apply."""
+    engine = _make_sync_engine(
+        tmp_path,
+        include_aggregator_pages=True,
+        exclude_paths=["/internal/*"],
+    )
+    engine._seed_urls = set()
+    # Aggregator now allowed
+    assert not engine.path_excluded("https://example.com/print.html")
+    # User pattern still rejects
+    assert engine.path_excluded("https://example.com/internal/secret")
+
+
+# ---------- patterns constant invariants ---------------------------------
+
+def test_default_patterns_are_a_tuple():
+    """Constant must be immutable — patterns are part of the public-ish
+    invariant surface and shouldn't be mutated at runtime."""
+    assert isinstance(_DEFAULT_AGGREGATOR_PATH_PATTERNS, tuple)
+    assert len(_DEFAULT_AGGREGATOR_PATH_PATTERNS) >= 4
+
+
+def test_default_patterns_cover_observed_bench_failures():
+    """Sanity check: the slam-dunk bench-observed cases must be covered."""
+    import fnmatch
+    for url_path, reason in [
+        ("/book/print.html", "rust-book"),
+        ("/docs/concepts/_print/", "kubernetes-docs trailing slash"),
+        ("/docs/concepts/_print/index.html", "kubernetes-docs explicit index"),
+    ]:
+        assert any(
+            fnmatch.fnmatch(url_path, pat)
+            for pat in _DEFAULT_AGGREGATOR_PATH_PATTERNS
+        ), f"no pattern matches {url_path} ({reason})"