AIMLPM · AIMLPM · May 12, 2026 · May 12, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,49 @@ All notable changes to MarkCrawl are documented in this file. The format
 follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and this
 project follows [SemVer](https://semver.org/) once it reaches 1.0.
 
+## [0.11.1] - 2026-05-11
+
+### Added — default aggregator-page URL filter
+MarkCrawl now rejects mdBook `/print.html` and Hugo `/_print/` pages
+during crawl-time URL filtering. These single-render-of-whole-tree
+pages have artificially high keyword density (they contain the entire
+docs tree on one URL), which causes embedding-based retrieval to rank
+them above the dedicated chapter pages a user actually wants.
+
+- New default patterns rejected pre-fetch (saves crawl budget):
+  `*/print.html`, `*/_print`, `*/_print/`, `*/_print/*`,
+  `*/print/index.html`.
+- New kwarg `include_aggregator_pages: bool = False` on
+  `crawl(...)` and both engine classes for offline-archive use cases.
+- CLI flag `--include-aggregators` mirrors the kwarg.
+- User-supplied `exclude_paths` and `include_paths` still apply
+  independently — the aggregator filter composes with both, doesn't
+  replace either.
+
+### Why now
+The public `llm-crawler-benchmarks` v1.4 cycle surfaced this as a
+markcrawl-specific issue: markcrawl was returning `/print.html` in
+49% of rust-book top-5 retrieval slots and `/_print/` in 39% of
+kubernetes-docs slots, while all four well-functioning competitors
+returned 0% `/_print/` on kubernetes-docs. The retrieval-ranking
+damage is structural — these pages will always beat real chapter
+pages on cosine similarity because they contain everything.
+
+### Expected impact
+Per the v1.4 retrieval-bucket audit, ~9-12 of markcrawl's 43
+retrieval-bucket misses concentrate on this issue. Predicted MRR
+lift on the 9-site bench pool: **+0.02 to +0.04**, concentrated on
+rust-book and kubernetes-docs. Measurement waits for the bench's
+v1.5 methodology refresh (helpful-pages-universe approach replaces
+the v1.4 single-tool-anchor query corpus).
+
+### Tests
+36 new tests covering: default rejection of observed bench failures,
+substring-match safety (`/blueprint.html`, `/preprint.html`,
+`/imprint/` all pass through), opt-out flag, composition with user
+exclude_paths and include_paths, and parity between `CrawlEngine` and
+`AsyncCrawlEngine`. Total test count: 647 (was 611).
+
 ## [0.11.0] - 2026-05-06
 
 Two new modules expand markcrawl from "HTML to Markdown converter" to

diff --git a/markcrawl/cli.py b/markcrawl/cli.py
@@ -141,6 +141,14 @@ def build_parser() -> argparse.ArgumentParser:
         metavar="PATTERN",
         help="Glob pattern to include URL paths (e.g. '/blog/*'). Only matching paths are crawled. Can be repeated.",
     )
+    parser.add_argument(
+        "--include-aggregators",
+        action="store_true",
+        help="Include aggregator/print-view pages (e.g. mdBook /print.html, Hugo /_print/) "
+        "that bundle many sub-pages into a single render. Off by default — these pages have "
+        "artificially high keyword density and pollute retrieval rankings without adding new "
+        "content. Enable only for offline-archive use cases.",
+    )
     parser.add_argument(
         "--download-images",
         action="store_true",
@@ -370,6 +378,7 @@ def main() -> None:
                     i18n_filter=args.i18n_filter,
                     title_at_top=args.title_at_top,
                     screenshot_config=screenshot_config,
+                    include_aggregator_pages=args.include_aggregators,
                 )
                 total_pages += site_result.pages_saved
             except Exception as exc:
@@ -430,6 +439,7 @@ def main() -> None:
         i18n_filter=args.i18n_filter,
         title_at_top=args.title_at_top,
         screenshot_config=screenshot_config,
+        include_aggregator_pages=args.include_aggregators,
     )
 
     if not args.dry_run:

diff --git a/markcrawl/core.py b/markcrawl/core.py
@@ -158,6 +158,22 @@ def _resolve_idle_timeout(arg: Optional[float]) -> float:
 })
 
 
+# Default URL-path patterns for aggregator pages that bundle many sub-pages
+# into a single render (mdBook `/print.html`, Hugo `/_print/`). These pages
+# have high keyword density on almost any query because they contain the
+# entire docs tree on one URL, so embedding-based retrieval ranks them above
+# the dedicated chapter pages a user actually wants. They are filtered by URL
+# before fetching (rather than deduplicated after indexing) so no crawl budget
+# is spent on them. Opt out via ``include_aggregator_pages=True``. (v0.11.1)
+_DEFAULT_AGGREGATOR_PATH_PATTERNS: Tuple[str, ...] = (
+    "*/print.html",          # mdBook (rust-book), VuePress
+    "*/_print",              # Hugo bare path
+    "*/_print/",             # Hugo trailing slash
+    "*/_print/*",            # Hugo subpaths (kubernetes-docs)
+    "*/print/index.html",    # Alternate single-page generators
+)
+
+
 def _compute_broader_scope(current_paths: List[str]) -> Optional[List[str]]:
     """One-level scope broadening for adaptive crawl expansion.
 
@@ -452,6 +468,7 @@ def __init__(
         download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
         download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
         download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
+        include_aggregator_pages: bool = False,
     ):
         self.out_dir = out_dir
         self.fmt = fmt
@@ -469,6 +486,7 @@ def __init__(
         self.show_progress = show_progress
         self.exclude_paths = exclude_paths or []
         self.include_paths = include_paths or []
+        self._exclude_aggregator_paths = not include_aggregator_pages
         self.download_images = download_images
         self.min_image_size = min_image_size
         self.screenshot_config = screenshot_config
@@ -748,12 +766,20 @@ def path_excluded(self, url: str) -> bool:
         or when ``include_paths`` is set and it matches none of them.
         Seed URLs (base URL fallback) bypass include filtering so we can
         still discover links from the entry point.
+
+        Default aggregator-page patterns (mdBook /print.html, Hugo /_print/)
+        are applied unless ``include_aggregator_pages=True`` was passed at
+        construction. See ``_DEFAULT_AGGREGATOR_PATH_PATTERNS``.  (v0.11.1)
         """
         if self.i18n_filter:
             from .analyzer import i18n_path_excluded
             if i18n_path_excluded(url):
                 return True
         path = up.urlsplit(url).path
+        if self._exclude_aggregator_paths and any(
+            fnmatch.fnmatch(path, pat) for pat in _DEFAULT_AGGREGATOR_PATH_PATTERNS
+        ):
+            return True
         if self.exclude_paths and any(fnmatch.fnmatch(path, pat) for pat in self.exclude_paths):
             return True
         if self.include_paths and not any(fnmatch.fnmatch(path, pat) for pat in self.include_paths):
@@ -1480,6 +1506,7 @@ def __init__(
         download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
         download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
         download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
+        include_aggregator_pages: bool = False,
     ):
         self.out_dir = out_dir
         self.fmt = fmt
@@ -1497,6 +1524,7 @@ def __init__(
         self.show_progress = show_progress
         self.exclude_paths = exclude_paths or []
         self.include_paths = include_paths or []
+        self._exclude_aggregator_paths = not include_aggregator_pages
         self.i18n_filter = i18n_filter
         self.title_at_top = title_at_top
         self.auto_path_priority = auto_path_priority
@@ -1694,12 +1722,20 @@ def path_excluded(self, url: str) -> bool:
         or when ``include_paths`` is set and it matches none of them.
         Seed URLs (base URL fallback) bypass include filtering so we can
         still discover links from the entry point.
+
+        Default aggregator-page patterns (mdBook /print.html, Hugo /_print/)
+        are applied unless ``include_aggregator_pages=True`` was passed at
+        construction. See ``_DEFAULT_AGGREGATOR_PATH_PATTERNS``.  (v0.11.1)
         """
         if self.i18n_filter:
             from .analyzer import i18n_path_excluded
             if i18n_path_excluded(url):
                 return True
         path = up.urlsplit(url).path
+        if self._exclude_aggregator_paths and any(
+            fnmatch.fnmatch(path, pat) for pat in _DEFAULT_AGGREGATOR_PATH_PATTERNS
+        ):
+            return True
         if self.exclude_paths and any(fnmatch.fnmatch(path, pat) for pat in self.exclude_paths):
             return True
         if self.include_paths and not any(fnmatch.fnmatch(path, pat) for pat in self.include_paths):
@@ -2426,6 +2462,7 @@ def crawl(
     download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
     download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
     download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
+    include_aggregator_pages: bool = False,
 ) -> CrawlResult:
     """Crawl a website and save cleaned content to disk.
 
@@ -2575,6 +2612,7 @@ def crawl(
             download_max_files=download_max_files,
             download_max_size_mb=download_max_size_mb,
             download_filter=download_filter,
+            include_aggregator_pages=include_aggregator_pages,
         )
 
     return _crawl_sync(
@@ -2602,6 +2640,7 @@ def crawl(
         download_max_files=download_max_files,
         download_max_size_mb=download_max_size_mb,
         download_filter=download_filter,
+        include_aggregator_pages=include_aggregator_pages,
     )
 
 
@@ -2646,6 +2685,7 @@ def _crawl_sync(
     download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
     download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
     download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
+    include_aggregator_pages: bool = False,
 ) -> CrawlResult:
     """Synchronous crawl path using ThreadPoolExecutor."""
     engine = CrawlEngine(
@@ -2678,6 +2718,7 @@ def _crawl_sync(
         download_max_files=download_max_files,
         download_max_size_mb=download_max_size_mb,
         download_filter=download_filter,
+        include_aggregator_pages=include_aggregator_pages,
     )
 
     base_url = norm_url(base_url)
@@ -2930,6 +2971,7 @@ def _crawl_async(
     download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
     download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
     download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
+    include_aggregator_pages: bool = False,
 ) -> CrawlResult:
     """Async crawl path using native asyncio event loop."""
 
@@ -2962,6 +3004,7 @@ async def _run() -> CrawlResult:
             download_max_files=download_max_files,
             download_max_size_mb=download_max_size_mb,
             download_filter=download_filter,
+            include_aggregator_pages=include_aggregator_pages,
         )
 
         nonlocal base_url

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "markcrawl"
-version = "0.11.0"
+version = "0.11.1"
 description = "Fast Python web crawler for AI & RAG ingestion — crawl, extract, and embed website content with one tool."
 readme = "README.md"
 requires-python = ">=3.10"