Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,49 @@ All notable changes to MarkCrawl are documented in this file. The format
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and this
project follows [SemVer](https://semver.org/) once it reaches 1.0.

## [0.11.1] - 2026-05-11

### Added — default aggregator-page URL filter
MarkCrawl now rejects mdBook `/print.html` and Hugo `/_print/` pages
during crawl-time URL filtering. These pages render the entire docs
tree on a single URL, which gives them artificially high keyword
density and causes embedding-based retrieval to rank them above the
dedicated chapter pages a user actually wants.

- New default patterns rejected pre-fetch (saves crawl budget):
`*/print.html`, `*/_print`, `*/_print/`, `*/_print/*`,
`*/print/index.html`.
- New kwarg `include_aggregator_pages: bool = False` on
`crawl(...)` and both engine classes for offline-archive use cases.
- CLI flag `--include-aggregators` mirrors the `include_aggregator_pages` kwarg.
- User-supplied `exclude_paths` and `include_paths` still apply
independently — the aggregator filter composes with both and
replaces neither.

### Why now
The public `llm-crawler-benchmarks` v1.4 cycle surfaced this as a
markcrawl-specific issue: markcrawl was returning `/print.html` in
49% of rust-book top-5 retrieval slots and `/_print/` in 39% of
kubernetes-docs slots, while all four well-functioning competitors
returned 0% `/_print/` on kubernetes-docs. The retrieval-ranking
damage is structural — these pages will always beat real chapter
pages on cosine similarity because they contain everything.

### Expected impact
Per the v1.4 retrieval-bucket audit, ~9-12 of markcrawl's 43
retrieval-bucket misses concentrate on this issue. Predicted MRR
lift on the 9-site bench pool: **+0.02 to +0.04**, concentrated on
rust-book and kubernetes-docs. Measurement waits for the bench's
v1.5 methodology refresh (helpful-pages-universe approach replaces
the v1.4 single-tool-anchor query corpus).

### Tests
36 new tests covering: default rejection of the aggregator URLs observed
in the bench failures, substring-match safety (`/blueprint.html`,
`/preprint.html`, and `/imprint/` all pass through), the opt-out flag,
composition with user `exclude_paths` and `include_paths`, and parity
between `CrawlEngine` and `AsyncCrawlEngine`. Total test count: 647 (was 611).

## [0.11.0] - 2026-05-06

Two new modules expand markcrawl from "HTML to Markdown converter" to
Expand Down
10 changes: 10 additions & 0 deletions markcrawl/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,14 @@ def build_parser() -> argparse.ArgumentParser:
metavar="PATTERN",
help="Glob pattern to include URL paths (e.g. '/blog/*'). Only matching paths are crawled. Can be repeated.",
)
parser.add_argument(
"--include-aggregators",
action="store_true",
help="Include aggregator/print-view pages (e.g. mdBook /print.html, Hugo /_print/) "
"that bundle many sub-pages into a single render. Off by default — these pages have "
"artificially high keyword density and pollute retrieval rankings without adding new "
"content. Enable only for offline-archive use cases.",
)
parser.add_argument(
"--download-images",
action="store_true",
Expand Down Expand Up @@ -370,6 +378,7 @@ def main() -> None:
i18n_filter=args.i18n_filter,
title_at_top=args.title_at_top,
screenshot_config=screenshot_config,
include_aggregator_pages=args.include_aggregators,
)
total_pages += site_result.pages_saved
except Exception as exc:
Expand Down Expand Up @@ -430,6 +439,7 @@ def main() -> None:
i18n_filter=args.i18n_filter,
title_at_top=args.title_at_top,
screenshot_config=screenshot_config,
include_aggregator_pages=args.include_aggregators,
)

if not args.dry_run:
Expand Down
43 changes: 43 additions & 0 deletions markcrawl/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,22 @@ def _resolve_idle_timeout(arg: Optional[float]) -> float:
})


# Default URL-path glob patterns for aggregator pages that bundle many
# sub-pages into a single render (mdBook `/print.html`, Hugo `/_print/`).
# These pages have high keyword density on almost any query because they
# contain the entire docs tree on one URL, so embedding-based retrieval ranks
# them above the dedicated chapter pages a user actually wants. They are
# filtered by URL before fetching, rather than de-duplicated after indexing,
# because fetching them would otherwise burn crawl budget.
#
# Each pattern is matched with ``fnmatch.fnmatch`` against the URL's path
# component in ``path_excluded``. fnmatch's ``*`` also matches ``/``, so the
# leading ``*/`` matches the page at any depth, while the literal ``/`` before
# the name keeps look-alikes such as `/blueprint.html` or `/imprint/` from
# matching.
# NOTE(review): ``fnmatch.fnmatch`` case-normalizes via ``os.path.normcase``,
# so matching is case-insensitive on Windows only — confirm that is acceptable.
# Opt out via ``include_aggregator_pages=True``. (v0.11.1)
_DEFAULT_AGGREGATOR_PATH_PATTERNS: Tuple[str, ...] = (
"*/print.html", # mdBook (rust-book), VuePress
"*/_print", # Hugo bare path
"*/_print/", # Hugo trailing slash
"*/_print/*", # Hugo subpaths (kubernetes-docs)
"*/print/index.html", # Alternate single-page generators
)


def _compute_broader_scope(current_paths: List[str]) -> Optional[List[str]]:
"""One-level scope broadening for adaptive crawl expansion.

Expand Down Expand Up @@ -452,6 +468,7 @@ def __init__(
download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
include_aggregator_pages: bool = False,
):
self.out_dir = out_dir
self.fmt = fmt
Expand All @@ -469,6 +486,7 @@ def __init__(
self.show_progress = show_progress
self.exclude_paths = exclude_paths or []
self.include_paths = include_paths or []
self._exclude_aggregator_paths = not include_aggregator_pages
self.download_images = download_images
self.min_image_size = min_image_size
self.screenshot_config = screenshot_config
Expand Down Expand Up @@ -748,12 +766,20 @@ def path_excluded(self, url: str) -> bool:
or when ``include_paths`` is set and it matches none of them.
Seed URLs (base URL fallback) bypass include filtering so we can
still discover links from the entry point.

Default aggregator-page patterns (mdBook /print.html, Hugo /_print/)
are applied unless ``include_aggregator_pages=True`` was passed at
construction. See ``_DEFAULT_AGGREGATOR_PATH_PATTERNS``. (v0.11.1)
"""
if self.i18n_filter:
from .analyzer import i18n_path_excluded
if i18n_path_excluded(url):
return True
path = up.urlsplit(url).path
if self._exclude_aggregator_paths and any(
fnmatch.fnmatch(path, pat) for pat in _DEFAULT_AGGREGATOR_PATH_PATTERNS
):
return True
if self.exclude_paths and any(fnmatch.fnmatch(path, pat) for pat in self.exclude_paths):
return True
if self.include_paths and not any(fnmatch.fnmatch(path, pat) for pat in self.include_paths):
Expand Down Expand Up @@ -1480,6 +1506,7 @@ def __init__(
download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
include_aggregator_pages: bool = False,
):
self.out_dir = out_dir
self.fmt = fmt
Expand All @@ -1497,6 +1524,7 @@ def __init__(
self.show_progress = show_progress
self.exclude_paths = exclude_paths or []
self.include_paths = include_paths or []
self._exclude_aggregator_paths = not include_aggregator_pages
self.i18n_filter = i18n_filter
self.title_at_top = title_at_top
self.auto_path_priority = auto_path_priority
Expand Down Expand Up @@ -1694,12 +1722,20 @@ def path_excluded(self, url: str) -> bool:
or when ``include_paths`` is set and it matches none of them.
Seed URLs (base URL fallback) bypass include filtering so we can
still discover links from the entry point.

Default aggregator-page patterns (mdBook /print.html, Hugo /_print/)
are applied unless ``include_aggregator_pages=True`` was passed at
construction. See ``_DEFAULT_AGGREGATOR_PATH_PATTERNS``. (v0.11.1)
"""
if self.i18n_filter:
from .analyzer import i18n_path_excluded
if i18n_path_excluded(url):
return True
path = up.urlsplit(url).path
if self._exclude_aggregator_paths and any(
fnmatch.fnmatch(path, pat) for pat in _DEFAULT_AGGREGATOR_PATH_PATTERNS
):
return True
if self.exclude_paths and any(fnmatch.fnmatch(path, pat) for pat in self.exclude_paths):
return True
if self.include_paths and not any(fnmatch.fnmatch(path, pat) for pat in self.include_paths):
Expand Down Expand Up @@ -2426,6 +2462,7 @@ def crawl(
download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
include_aggregator_pages: bool = False,
) -> CrawlResult:
"""Crawl a website and save cleaned content to disk.

Expand Down Expand Up @@ -2575,6 +2612,7 @@ def crawl(
download_max_files=download_max_files,
download_max_size_mb=download_max_size_mb,
download_filter=download_filter,
include_aggregator_pages=include_aggregator_pages,
)

return _crawl_sync(
Expand Down Expand Up @@ -2602,6 +2640,7 @@ def crawl(
download_max_files=download_max_files,
download_max_size_mb=download_max_size_mb,
download_filter=download_filter,
include_aggregator_pages=include_aggregator_pages,
)


Expand Down Expand Up @@ -2646,6 +2685,7 @@ def _crawl_sync(
download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
include_aggregator_pages: bool = False,
) -> CrawlResult:
"""Synchronous crawl path using ThreadPoolExecutor."""
engine = CrawlEngine(
Expand Down Expand Up @@ -2678,6 +2718,7 @@ def _crawl_sync(
download_max_files=download_max_files,
download_max_size_mb=download_max_size_mb,
download_filter=download_filter,
include_aggregator_pages=include_aggregator_pages,
)

base_url = norm_url(base_url)
Expand Down Expand Up @@ -2930,6 +2971,7 @@ def _crawl_async(
download_max_files: int = DEFAULT_DOWNLOAD_MAX_FILES,
download_max_size_mb: int = DEFAULT_DOWNLOAD_MAX_SIZE_MB,
download_filter: Optional[Callable[[DownloadCandidate], bool]] = None,
include_aggregator_pages: bool = False,
) -> CrawlResult:
"""Async crawl path using native asyncio event loop."""

Expand Down Expand Up @@ -2962,6 +3004,7 @@ async def _run() -> CrawlResult:
download_max_files=download_max_files,
download_max_size_mb=download_max_size_mb,
download_filter=download_filter,
include_aggregator_pages=include_aggregator_pages,
)

nonlocal base_url
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "markcrawl"
version = "0.11.0"
version = "0.11.1"
description = "Fast Python web crawler for AI & RAG ingestion — crawl, extract, and embed website content with one tool."
readme = "README.md"
requires-python = ">=3.10"
Expand Down
Loading
Loading