From 54d6d222d0f65e162c6a6aa530013e7cd40ecde7 Mon Sep 17 00:00:00 2001 From: E Nelson Date: Fri, 10 Apr 2026 12:02:00 -0400 Subject: [PATCH] feat(open-source): add llms-info-from-quartodoc skill Adds a skill for generating llms-full.txt for Python package documentation sites built with Quarto and quartodoc. Quarto 1.9 handles llms.txt natively; this skill covers the llms-full.txt complement with cleaned page content. Co-Authored-By: Claude Sonnet 4.6 --- .claude-plugin/marketplace.json | 3 +- README.md | 1 + open-source/README.md | 6 + open-source/llms-info-from-quartodoc/SKILL.md | 127 +++++++ .../generate-llms-full-txt-example.py | 358 ++++++++++++++++++ 5 files changed, 494 insertions(+), 1 deletion(-) create mode 100644 open-source/llms-info-from-quartodoc/SKILL.md create mode 100644 open-source/llms-info-from-quartodoc/references/generate-llms-full-txt-example.py diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 4edc581..3aa4091 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -37,7 +37,8 @@ "strict": false, "skills": [ "./open-source/release-post", - "./open-source/create-release-checklist" + "./open-source/create-release-checklist", + "./open-source/llms-info-from-quartodoc" ] }, { diff --git a/README.md b/README.md index f882fac..c07355e 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Skills for open-source R and Python package developers, streamlining common work - **[create-release-checklist](./open-source/create-release-checklist/)** - Create a release checklist and GitHub issue for an R package, with automatic version calculation and customizable checklist generation - **[release-post](./open-source/release-post/)** - Create professional package release blog posts following Tidyverse or Shiny blog conventions, with support for both R and Python packages +- **[llms-info-from-quartodoc](./open-source/llms-info-from-quartodoc/)** - Generate `llms-full.txt` for Python package 
documentation sites built with Quarto and quartodoc, making docs fully consumable by LLMs and coding assistants ### R Package Development diff --git a/open-source/README.md b/open-source/README.md index 0e03341..fe88d43 100644 --- a/open-source/README.md +++ b/open-source/README.md @@ -4,6 +4,12 @@ Skills for open-source R and Python package developers. These skills streamline For general-purpose developer skills (code review, architecture docs, etc.), see [posit-dev](../posit-dev/). +## Skills + +- **[create-release-checklist](./create-release-checklist/)** - Create a release checklist and GitHub issue for an R package +- **[release-post](./release-post/)** - Create professional package release blog posts following Tidyverse or Shiny blog conventions +- **[llms-info-from-quartodoc](./llms-info-from-quartodoc/)** - Generate `llms-full.txt` for Python package documentation sites built with Quarto and quartodoc, making docs fully consumable by LLMs + ## Contributing See the main [CONTRIBUTING.md](../CONTRIBUTING.md) for guidelines on adding new skills to this category. diff --git a/open-source/llms-info-from-quartodoc/SKILL.md b/open-source/llms-info-from-quartodoc/SKILL.md new file mode 100644 index 0000000..4419589 --- /dev/null +++ b/open-source/llms-info-from-quartodoc/SKILL.md @@ -0,0 +1,127 @@ +--- +name: llms-info-from-quartodoc +description: > + Use when adding llms-full.txt to a Python package documentation site built + with Quarto and quartodoc. Triggers when the user wants to make their package + docs fully consumable by LLMs, mentions the llmstxt.org spec, or asks about + generating rich LLM context files for quartodoc sites. +metadata: + author: Elizabeth Nelson (@enelson) + version: "1.0" +license: MIT +--- + +# LLM-Friendly Docs from quartodoc + +Generate `llms-full.txt` for Python package documentation sites built with Quarto and quartodoc, following the [llmstxt.org](https://llmstxt.org/) spec. 
+ +Quarto 1.9 generates `llms.txt` (a concise link index) natively — you don't need to write that. This skill covers `llms-full.txt` only: the same structure as `llms.txt`, but with cleaned page content under each entry, giving LLMs and coding assistants the full documentation context. + +## The llms-full.txt Format + +Per the llmstxt.org spec: + +- **H1 heading**: Project name +- **Blockquote**: Short project summary (from `website.description` or package metadata) +- **H2 sections**: Content grouped by topic, matching the sidebar structure +- **Per entry**: markdown link followed by cleaned page content + +--- + +## Workflow + +### Step 1: Understand the site structure + +Read `_quarto.yml` to understand: +- Sidebar structure (IDs, contents, sections) — this determines section grouping +- quartodoc configuration (package name, API sections) — determines API reference pages +- Base URL (`website.site-url`) — needed for absolute URLs in the output +- Site description (`website.description`) — used in the blockquote + +### Step 2: Write `scripts/generate_llms_full_txt.py` + +Model the script after `references/generate-llms-full-txt-example.py` and adapt it to the project's sidebar layout, quartodoc sections, and content structure. + +**Script responsibilities:** + +1. Parse `_quarto.yml` sidebars to discover pages in sidebar order +2. Read quartodoc-generated `_sidebar.yml` files for API reference pages +3. For each page: + - Extract title from YAML frontmatter (`pagetitle` > `title` > filename) + - Build the page URL (`.qmd` → `.html`, `index.qmd` → trailing slash) + - Clean the QMD content (see below) +4. Write `llms-full.txt` with section H2 headers, markdown links, and cleaned content + +**QMD content cleaning pipeline** (apply in order): +1. Strip YAML frontmatter (`---` blocks at top) +2. Remove Quarto div fences (`:::`, `::::`, etc.) +3. Remove raw HTML blocks (` ```{=html} ... ``` `) +4. Remove shinylive/quartodoc metadata comments (`#| key: value`) +5. 
Convert Quarto code fences (`{python}`, `{shinylive-python}`, etc.) to plain ` ```python ` +6. Strip inline HTML tags (`<...>`) +7. Collapse 3+ consecutive blank lines to 2 + +**Use `@dataclass` for site structure types:** +```python +@dataclass +class Page: + title: str + url: str + source: Path + +@dataclass +class Section: + title: str + pages: list[Page] +``` + +**Edge cases to handle:** +- Fragment-only hrefs (e.g. `#section`) — resolve to parent page, deduplicate +- Missing `.qmd` files — skip with a warning +- Duplicate entries across sections — deduplicate by URL + +### Step 3: Write tests + +Unit tests covering: +- QMD content cleaning (each cleaning step independently) +- Sidebar walking (all 4 entry formats: string, section dict, href dict, file dict) +- URL generation (`index.qmd` → trailing slash, `.qmd` → `.html`, nested paths) +- Title extraction (`pagetitle` precedence, missing frontmatter) +- Integration test: build site structure from a minimal `_quarto.yml` fixture + +### Step 4: Integrate into build + +Add a Makefile target. `llms-full.txt` is committed to the repo — not a build-only artifact — so changes are reviewable in PRs. + +```makefile +llms-full-txt: $(PYBIN) quartodoc + $(PYBIN) scripts/generate_llms_full_txt.py + +all: quartodoc llms-full-txt site +``` + +Add `llms-full.txt` to `_quarto.yml` project resources so Quarto copies it to the output: +```yaml +project: + resources: + - llms-full.txt +``` + +### Step 5: Add CI freshness check + +After the site build step, add: +```yaml +- name: Check llms-full.txt is up to date + run: | + make llms-full-txt + git diff --exit-code llms-full.txt || \ + (echo "Run 'make llms-full-txt' locally and commit the result." && exit 1) +``` + +**After all steps:** run `make llms-full-txt` and commit `llms-full.txt`. + +--- + +## Additional Reference + +See `references/generate-llms-full-txt-example.py` for a complete working implementation to study and adapt. 
#!/usr/bin/env python3
# Path of this file in the patch it was taken from:
#   open-source/llms-info-from-quartodoc/references/generate-llms-full-txt-example.py
"""
Generate llms-full.txt for a quartodoc-based Python package documentation site.

Note: llms.txt (link index) is generated natively by Quarto 1.9.
This script generates llms-full.txt, which adds cleaned page content
under each entry so LLMs get the full documentation context.

Usage:
    python scripts/generate_llms_full_txt.py
    python scripts/generate_llms_full_txt.py --site-dir .
    python scripts/generate_llms_full_txt.py --output llms-full.txt

Requires: pyyaml (pip install pyyaml)
"""

from __future__ import annotations

import argparse
import re
import sys
from dataclasses import dataclass, field
from pathlib import Path

try:
    import yaml
except ImportError:
    # Do not exit at import time: that would kill any process that merely
    # imports this module (tests, tooling).  The error is raised on first use
    # instead, and main() still prints the message and exits with status 1.
    yaml = None

_YAML_MISSING_MESSAGE = "Error: pyyaml is required. Install with: pip install pyyaml"

# Section titles that are treated as the API reference even when they do not
# match a title listed under `quartodoc.sections`.
_API_SECTION_NAMES = frozenset({"reference", "api reference", "api"})

# "https:", "mailto:", ... -- anything with a URL scheme is not a local page.
_URL_SCHEME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9+.-]*:")

# Leading YAML frontmatter; group 1 is the raw YAML (None when empty).
_FRONTMATTER_RE = re.compile(
    r"\A---[ \t]*\r?\n(.*?\n)?---[ \t]*\r?(?:\n|\Z)", re.DOTALL
)

# ```{=html} ... ``` raw blocks.
_RAW_HTML_BLOCK_RE = re.compile(r"```\{=html\}.*?```", re.DOTALL)

# Cell option comments such as `#| echo: false`.
_CELL_OPTION_RE = re.compile(r"^[ \t]*#\|.*\n?", re.MULTILINE)

# Opening fence with a Quarto/Pandoc attribute block: ```{python}, ```{.r .x},
# ```{{python}}.  Group 1 is the fence, group 2 the language (may be None).
_QUARTO_FENCE_RE = re.compile(
    r"^([ \t]*(?:`{3,}|~{3,}))[ \t]*\{+\.?([A-Za-z][\w+-]*)?[^}\n]*\}+",
    re.MULTILINE,
)
_FENCE_LANGUAGE_ALIASES = {"shell": "bash"}

# A complete fenced code block (closing fence at least as long as the opener).
_CODE_BLOCK_RE = re.compile(
    r"^[ \t]*(`{3,}|~{3,})[^\n`]*\n.*?^[ \t]*\1[`~]*[ \t]*$",
    re.MULTILINE | re.DOTALL,
)

# Pandoc/Quarto div fences: ":::", "::: {.callout-note}", "::: warning".
_DIV_FENCE_RE = re.compile(
    r"^[ \t]*:{3,}[ \t]*(?:\{[^}\n]*\}|[\w-]+)?[ \t]*:*[ \t]*$", re.MULTILINE
)

# Group 1 protects inline code spans; the other alternatives are HTML comments
# and things that look like real tags.  Autolinks (<https://...>) and bare
# comparisons ("a < b") are deliberately not matched.
_PROSE_HTML_RE = re.compile(
    r"(`[^`\n]+`)|<!--.*?-->|</?[A-Za-z][A-Za-z0-9-]*(?:\s[^<>]*)?/?>",
    re.DOTALL,
)

_WHITESPACE_ONLY_LINE_RE = re.compile(r"^[ \t]+$", re.MULTILINE)
_BLANK_RUN_RE = re.compile(r"\n{3,}")


# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------


@dataclass
class Page:
    """One documentation page: display title, absolute URL, source file."""

    title: str
    url: str
    source: Path


@dataclass
class Section:
    """A titled group of pages (one H2 in the output)."""

    title: str
    pages: list[Page] = field(default_factory=list)


@dataclass
class Site:
    """The whole site: name (H1), description (blockquote) and sections."""

    name: str
    description: str
    base_url: str
    sections: list[Section] = field(default_factory=list)


# ---------------------------------------------------------------------------
# Small helpers
# ---------------------------------------------------------------------------


def _require_yaml():
    """Return the yaml module, raising ImportError if pyyaml is missing."""
    if yaml is None:
        raise ImportError(_YAML_MISSING_MESSAGE)
    return yaml


def _as_entry_list(contents: object) -> list:
    """Normalize a sidebar `contents` value (None, scalar or list) to a list."""
    if contents is None:
        return []
    if isinstance(contents, (list, tuple)):
        return list(contents)
    return [contents]


# ---------------------------------------------------------------------------
# Sidebar / _quarto.yml parsing
# ---------------------------------------------------------------------------


def load_quarto_config(site_dir: Path) -> dict:
    """
    Load the Quarto project config from *site_dir*.

    Looks for `_quarto.yml`, then `_quarto.yaml`.  An empty file yields `{}`.

    Raises:
        FileNotFoundError: if neither config file exists.
        ValueError: if the file does not contain a YAML mapping.
        ImportError: if pyyaml is not installed.
    """
    yaml_mod = _require_yaml()
    for filename in ("_quarto.yml", "_quarto.yaml"):
        quarto_yml = site_dir / filename
        if quarto_yml.is_file():
            break
    else:
        raise FileNotFoundError(f"_quarto.yml not found in {site_dir}")

    with open(quarto_yml, encoding="utf-8") as f:
        config = yaml_mod.safe_load(f)

    if config is None:
        return {}
    if not isinstance(config, dict):
        raise ValueError(f"{quarto_yml} must contain a YAML mapping at the top level")
    return config


def qmd_path_to_url(base_url: str, qmd_path: str) -> str:
    """
    Convert a .qmd path relative to the site root to an absolute URL.

    `index.qmd` maps to its directory URL (trailing slash); any other `.qmd`
    maps to `.html`; other paths are appended unchanged.
    """
    base_url = base_url.rstrip("/")
    path = qmd_path.strip("/")
    while path.startswith("./"):
        path = path[2:]

    # index.qmd files map to the directory URL (trailing slash)
    if path.endswith("/index.qmd") or path == "index.qmd":
        url_path = path[: -len("index.qmd")].rstrip("/")
        return f"{base_url}/{url_path}/" if url_path else f"{base_url}/"

    # Regular .qmd files map to .html
    if path.endswith(".qmd"):
        path = path[: -len(".qmd")] + ".html"

    return f"{base_url}/{path}"


def extract_title(qmd_path: Path) -> str:
    """
    Extract a page title from QMD YAML frontmatter.

    Prefers pagetitle > title > filename stem.  A missing file, missing
    frontmatter or unparsable frontmatter all fall back to the filename stem.
    """
    if not qmd_path.is_file():
        return qmd_path.stem

    frontmatter = _extract_frontmatter(qmd_path.read_text(encoding="utf-8"))
    if not frontmatter:
        return qmd_path.stem

    yaml_mod = _require_yaml()
    try:
        parsed = yaml_mod.safe_load(frontmatter)
    except yaml_mod.YAMLError as exc:
        # One broken page must not abort the whole run.
        print(f"  Warning: could not parse frontmatter of {qmd_path}: {exc}", file=sys.stderr)
        return qmd_path.stem

    if isinstance(parsed, dict):
        title = parsed.get("pagetitle") or parsed.get("title")
        if title:
            return str(title)
    return qmd_path.stem


def _extract_frontmatter(content: str) -> str | None:
    """Return the raw YAML between the opening and closing `---`, or None."""
    match = _FRONTMATTER_RE.match(content)
    if match is None:
        return None
    return (match.group(1) or "").strip()


def walk_sidebar_contents(
    contents: list,
    site_dir: Path,
    base_url: str,
    seen_urls: set[str],
) -> list[Page]:
    """
    Recursively walk a sidebar contents list and return Page objects.

    Handles all four quartodoc/Quarto entry formats:
      - "path/to/file.qmd"              (string)
      - {section: "Title", contents:}   (section dict -- flattened; an `href`
                                         or `file` on the section itself is
                                         added as its landing page)
      - {href: "path.qmd", text: ...}   (href dict)
      - {file: "path.qmd"}              (file dict)

    `contents` may also be None or a single entry; both are tolerated.
    `seen_urls` is updated in place and is used to deduplicate pages.
    """
    pages: list[Page] = []

    for entry in _as_entry_list(contents):
        if isinstance(entry, str):
            _add_page(entry, site_dir, base_url, seen_urls, pages)
            continue
        if not isinstance(entry, dict):
            continue

        if "section" in entry:
            landing = entry.get("href") or entry.get("file")
            if landing:
                _add_page(landing, site_dir, base_url, seen_urls, pages)
            pages.extend(
                walk_sidebar_contents(entry.get("contents"), site_dir, base_url, seen_urls)
            )
        elif "href" in entry:
            # Fragments and external links are filtered inside _add_page.
            _add_page(entry["href"], site_dir, base_url, seen_urls, pages)
        elif "file" in entry:
            _add_page(entry["file"], site_dir, base_url, seen_urls, pages)
        elif "contents" in entry:
            # Bare contents dict without section title
            pages.extend(
                walk_sidebar_contents(entry["contents"], site_dir, base_url, seen_urls)
            )

    return pages


def _add_page(
    rel_path: str,
    site_dir: Path,
    base_url: str,
    seen_urls: set[str],
    pages: list[Page],
) -> None:
    """
    Append the page at *rel_path* to *pages* unless it must be skipped.

    Skipped silently: non-string values, fragment-only hrefs, external URLs
    and URLs already in *seen_urls*.  Skipped with a warning: missing files.
    """
    if not isinstance(rel_path, str):
        return

    target = rel_path.split("#", 1)[0].strip()
    if not target or _URL_SCHEME_RE.match(target):
        return

    # A leading "/" means "relative to the site root"; without stripping it,
    # `site_dir / target` would resolve against the filesystem root instead.
    target = target.lstrip("/") or "index.qmd"
    if (site_dir / target).is_dir():
        target = target.rstrip("/") + "/index.qmd"

    url = qmd_path_to_url(base_url, target)
    if url in seen_urls:
        return
    seen_urls.add(url)

    qmd_path = site_dir / target
    if not qmd_path.is_file():
        print(f"  Warning: {qmd_path} not found, skipping", file=sys.stderr)
        return

    pages.append(Page(title=extract_title(qmd_path), url=url, source=qmd_path))


def _sidebar_content_lists(data: object) -> list:
    """
    Return every `contents` value found in a parsed sidebar YAML document.

    Accepts a top-level `contents` key as well as the nested
    `website.sidebar[].contents` layout.
    NOTE(review): the nested layout is what quartodoc is understood to emit;
    confirm against the quartodoc version in use.
    """
    if not isinstance(data, dict):
        return []
    if "contents" in data:
        return [data["contents"]]

    website = data.get("website")
    sidebars = website.get("sidebar") if isinstance(website, dict) else None
    if isinstance(sidebars, dict):
        sidebars = [sidebars]
    return [
        sidebar["contents"]
        for sidebar in _as_entry_list(sidebars)
        if isinstance(sidebar, dict) and "contents" in sidebar
    ]


def read_quartodoc_sidebar(sidebar_yml: Path, site_dir: Path, base_url: str, seen_urls: set[str]) -> list[Page]:
    """
    Read a quartodoc-generated _sidebar.yml file and return its pages in
    sidebar order.  Returns an empty list when the file does not exist.
    """
    if not sidebar_yml.exists():
        return []

    yaml_mod = _require_yaml()
    with open(sidebar_yml, encoding="utf-8") as f:
        data = yaml_mod.safe_load(f)

    pages: list[Page] = []
    for contents in _sidebar_content_lists(data):
        pages.extend(walk_sidebar_contents(contents, site_dir, base_url, seen_urls))
    return pages


def _quartodoc_sidebar_path(quartodoc_cfg: dict, site_dir: Path) -> Path | None:
    """
    Locate the quartodoc sidebar file, or return None if none exists.

    Tries the path configured under `quartodoc.sidebar` (a string, or a
    mapping with a `file` key) and then `<quartodoc.dir>/_sidebar.yml`.
    NOTE(review): the configured path is resolved against site_dir -- confirm
    this matches how quartodoc resolves it.
    """
    candidates = []
    configured = quartodoc_cfg.get("sidebar")
    if isinstance(configured, dict):
        configured = configured.get("file")
    if isinstance(configured, str) and configured:
        candidates.append(site_dir / configured)
    candidates.append(site_dir / str(quartodoc_cfg.get("dir") or "reference") / "_sidebar.yml")

    for candidate in candidates:
        if candidate.is_file():
            return candidate
    return None


def build_site(config: dict, site_dir: Path) -> Site:
    """
    Build a Site from a parsed _quarto.yml config.

    Every top-level sidebar entry becomes a section: `section` dicts keep
    their title, any other entry becomes a section named after its first
    page.  API sections are filled from the quartodoc sidebar file when one
    exists.  Pages that appear only in that file are appended as a final
    section.
    """
    config = config if isinstance(config, dict) else {}
    website = config.get("website")
    website = website if isinstance(website, dict) else {}
    base_url = str(website.get("site-url") or "").rstrip("/")

    site = Site(
        name=str(website.get("title") or "Package"),
        description=str(website.get("description") or ""),
        base_url=base_url,
    )
    seen_urls: set[str] = set()

    quartodoc_cfg = config.get("quartodoc")
    quartodoc_cfg = quartodoc_cfg if isinstance(quartodoc_cfg, dict) else {}
    api_titles = {
        s.get("title")
        for s in _as_entry_list(quartodoc_cfg.get("sections"))
        if isinstance(s, dict)
    }
    sidebar_yml = _quartodoc_sidebar_path(quartodoc_cfg, site_dir)

    sidebars = website.get("sidebar") or []
    if isinstance(sidebars, dict):
        sidebars = [sidebars]

    for sidebar in _as_entry_list(sidebars):
        if not isinstance(sidebar, dict):
            continue

        for entry in _as_entry_list(sidebar.get("contents")):
            pages: list[Page] = []

            if isinstance(entry, dict) and "section" in entry:
                section_title = str(entry["section"])
                is_api_section = (
                    section_title in api_titles
                    or section_title.lower() in _API_SECTION_NAMES
                )
                if is_api_section and sidebar_yml is not None:
                    pages = read_quartodoc_sidebar(sidebar_yml, site_dir, base_url, seen_urls)
                # Also walk the entry itself: it is the fallback when the
                # sidebar file yields nothing, and seen_urls prevents repeats.
                pages.extend(walk_sidebar_contents([entry], site_dir, base_url, seen_urls))
            else:
                # String, href dict, file dict or bare contents dict at the
                # top level -- its own section, titled after its first page.
                pages = walk_sidebar_contents([entry], site_dir, base_url, seen_urls)
                section_title = pages[0].title if pages else ""

            if pages:
                site.sections.append(Section(title=section_title, pages=pages))

    # Sites that pull the quartodoc sidebar in via `metadata-files` never list
    # the API pages in _quarto.yml itself; pick up whatever is still unseen.
    if quartodoc_cfg and sidebar_yml is not None:
        remaining = read_quartodoc_sidebar(sidebar_yml, site_dir, base_url, seen_urls)
        if remaining:
            api_title = str(quartodoc_cfg.get("title") or "API Reference")
            site.sections.append(Section(title=api_title, pages=remaining))

    return site


# ---------------------------------------------------------------------------
# QMD content cleaning
# ---------------------------------------------------------------------------


def _normalize_fence(match: re.Match) -> str:
    """Rewrite a Quarto fence opener to a plain markdown one (```python)."""
    language = (match.group(2) or "").lower()
    if language.startswith("shinylive-"):
        language = language[len("shinylive-"):]
    return match.group(1) + _FENCE_LANGUAGE_ALIASES.get(language, language)


def _clean_prose(text: str) -> str:
    """Drop div fences, HTML comments and HTML tags from non-code text."""
    text = _DIV_FENCE_RE.sub("", text)
    # Group 1 is an inline code span: keep it verbatim, drop everything else.
    return _PROSE_HTML_RE.sub(lambda m: m.group(1) or "", text)


def _apply_outside_code(content: str, transform) -> str:
    """Apply *transform* to every stretch of text outside fenced code blocks."""
    parts: list[str] = []
    cursor = 0
    for block in _CODE_BLOCK_RE.finditer(content):
        parts.append(transform(content[cursor:block.start()]))
        parts.append(block.group(0))
        cursor = block.end()
    parts.append(transform(content[cursor:]))
    return "".join(parts)


def clean_qmd_content(content: str) -> str:
    """
    Clean QMD source for inclusion in llms-full.txt.

    Removes Quarto-specific markup, leaving clean markdown.  Text inside
    fenced code blocks and inline code spans is never HTML-stripped, so
    comparisons and generics in code samples survive.
    """
    # 1. Strip YAML frontmatter
    content = _FRONTMATTER_RE.sub("", content, count=1)

    # 2. Remove raw HTML blocks
    content = _RAW_HTML_BLOCK_RE.sub("", content)

    # 3. Remove shinylive/quartodoc cell metadata comments (#| key: value)
    content = _CELL_OPTION_RE.sub("", content)

    # 4. Convert Quarto code fences to plain markdown, keeping the language:
    #    ```{python} -> ```python, ```{shinylive-r} -> ```r, ```{.bash} -> ```bash
    content = _QUARTO_FENCE_RE.sub(_normalize_fence, content)

    # 5 + 6. Remove div fences and inline HTML, but only outside code blocks
    content = _apply_outside_code(content, _clean_prose)

    # 7. Collapse 3+ consecutive blank lines to 2
    content = _WHITESPACE_ONLY_LINE_RE.sub("", content)
    content = _BLANK_RUN_RE.sub("\n\n", content)

    return content.strip()


def load_page_content(page: Page) -> str:
    """Load and clean a page's QMD source; empty string if the file is gone."""
    if not page.source.is_file():
        return ""
    raw = page.source.read_text(encoding="utf-8")
    return clean_qmd_content(raw)


# ---------------------------------------------------------------------------
# Output generation
# ---------------------------------------------------------------------------


def render_llms_full_txt(site: Site) -> str:
    """
    Render *site* as llms-full.txt text: H1 name, optional blockquote
    description, then one H2 per section with a link line and the cleaned
    content of each page.
    """
    lines: list[str] = [f"# {site.name}", ""]

    description = str(site.description or "").strip()
    if description:
        # Prefix every line so a multi-line description stays one blockquote.
        lines.extend(f"> {line}".rstrip() for line in description.splitlines())
        lines.append("")

    for section in site.sections:
        lines.append(f"## {section.title}")
        lines.append("")

        for page in section.pages:
            lines.append(f"- [{page.title}]({page.url})")
            lines.append("")
            content = load_page_content(page)
            if content:
                lines.append(content)
                lines.append("")

    return "\n".join(lines)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def main() -> None:
    """Command-line entry point: read the site config, write llms-full.txt."""
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--site-dir", type=Path, default=Path("."), help="Root directory containing _quarto.yml (default: .)")
    parser.add_argument("--output", type=Path, default=Path("llms-full.txt"), help="Output file path (default: llms-full.txt)")
    args = parser.parse_args()

    if yaml is None:
        print(_YAML_MISSING_MESSAGE, file=sys.stderr)
        sys.exit(1)

    site_dir = args.site_dir.resolve()
    print(f"Reading site config from {site_dir}/_quarto.yml", file=sys.stderr)

    try:
        config = load_quarto_config(site_dir)
    except (FileNotFoundError, ValueError) as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)

    site = build_site(config, site_dir)

    content = render_llms_full_txt(site)
    args.output.write_text(content, encoding="utf-8")

    page_count = sum(len(s.pages) for s in site.sections)
    print(f"Wrote {args.output} ({len(site.sections)} sections, {page_count} pages)", file=sys.stderr)


if __name__ == "__main__":
    main()