From 54d6d222d0f65e162c6a6aa530013e7cd40ecde7 Mon Sep 17 00:00:00 2001
From: E Nelson <liz.nelson@posit.co>
Date: Fri, 10 Apr 2026 12:02:00 -0400
Subject: [PATCH] feat(open-source): add llms-info-from-quartodoc skill

Adds a skill for generating llms-full.txt for Python package documentation
sites built with Quarto and quartodoc. Quarto 1.9 handles llms.txt natively;
this skill covers the llms-full.txt complement with cleaned page content.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .claude-plugin/marketplace.json               |   3 +-
 README.md                                     |   1 +
 open-source/README.md                         |   6 +
 open-source/llms-info-from-quartodoc/SKILL.md | 127 +++++++
 .../generate-llms-full-txt-example.py         | 358 ++++++++++++++++++
 5 files changed, 494 insertions(+), 1 deletion(-)
 create mode 100644 open-source/llms-info-from-quartodoc/SKILL.md
 create mode 100644 open-source/llms-info-from-quartodoc/references/generate-llms-full-txt-example.py

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 4edc581..3aa4091 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -37,7 +37,8 @@
       "strict": false,
       "skills": [
         "./open-source/release-post",
-        "./open-source/create-release-checklist"
+        "./open-source/create-release-checklist",
+        "./open-source/llms-info-from-quartodoc"
       ]
     },
     {
diff --git a/README.md b/README.md
index f882fac..c07355e 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ Skills for open-source R and Python package developers, streamlining common work
 
 - **[create-release-checklist](./open-source/create-release-checklist/)** - Create a release checklist and GitHub issue for an R package, with automatic version calculation and customizable checklist generation
 - **[release-post](./open-source/release-post/)** - Create professional package release blog posts following Tidyverse or Shiny blog conventions, with support for both R and Python packages
+- **[llms-info-from-quartodoc](./open-source/llms-info-from-quartodoc/)** - Generate `llms-full.txt` for Python package documentation sites built with Quarto and quartodoc, making docs fully consumable by LLMs and coding assistants
 
 ### R Package Development
 
diff --git a/open-source/README.md b/open-source/README.md
index 0e03341..fe88d43 100644
--- a/open-source/README.md
+++ b/open-source/README.md
@@ -4,6 +4,12 @@ Skills for open-source R and Python package developers. These skills streamline
 
 For general-purpose developer skills (code review, architecture docs, etc.), see [posit-dev](../posit-dev/).
 
+## Skills
+
+- **[create-release-checklist](./create-release-checklist/)** - Create a release checklist and GitHub issue for an R package
+- **[release-post](./release-post/)** - Create professional package release blog posts following Tidyverse or Shiny blog conventions
+- **[llms-info-from-quartodoc](./llms-info-from-quartodoc/)** - Generate `llms-full.txt` for Python package documentation sites built with Quarto and quartodoc, making docs fully consumable by LLMs
+
 ## Contributing
 
 See the main [CONTRIBUTING.md](../CONTRIBUTING.md) for guidelines on adding new skills to this category.
diff --git a/open-source/llms-info-from-quartodoc/SKILL.md b/open-source/llms-info-from-quartodoc/SKILL.md
new file mode 100644
index 0000000..4419589
--- /dev/null
+++ b/open-source/llms-info-from-quartodoc/SKILL.md
@@ -0,0 +1,127 @@
+---
+name: llms-info-from-quartodoc
+description: >
+  Use when adding llms-full.txt to a Python package documentation site built
+  with Quarto and quartodoc. Triggers when the user wants to make their package
+  docs fully consumable by LLMs, mentions the llmstxt.org spec, or asks about
+  generating rich LLM context files for quartodoc sites.
+metadata:
+  author: Elizabeth Nelson (@enelson)
+  version: "1.0"
+license: MIT
+---
+
+# LLM-Friendly Docs from quartodoc
+
+Generate `llms-full.txt` for Python package documentation sites built with Quarto and quartodoc, following the [llmstxt.org](https://llmstxt.org/) spec.
+
+Quarto 1.9 generates `llms.txt` (a concise link index) natively — you don't need to write that. This skill covers `llms-full.txt` only: the same structure as `llms.txt`, but with cleaned page content under each entry, giving LLMs and coding assistants the full documentation context.
+
+## The llms-full.txt Format
+
+The file mirrors the `llms.txt` layout defined by the llmstxt.org spec, with cleaned page content added under each link:
+
+- **H1 heading**: Project name
+- **Blockquote**: Short project summary (from `website.description` or package metadata)
+- **H2 sections**: Content grouped by topic, matching the sidebar structure
+- **Per entry**: markdown link followed by cleaned page content
+
+---
+
+## Workflow
+
+### Step 1: Understand the site structure
+
+Read `_quarto.yml` to understand:
+- Sidebar structure (IDs, contents, sections) — this determines section grouping
+- quartodoc configuration (package name, API sections) — determines API reference pages
+- Base URL (`website.site-url`) — needed for absolute URLs in the output
+- Site description (`website.description`) — used in the blockquote
+
+### Step 2: Write `scripts/generate_llms_full_txt.py`
+
+Model the script after `references/generate-llms-full-txt-example.py` and adapt it to the project's sidebar layout, quartodoc sections, and content structure.
+
+**Script responsibilities:**
+
+1. Parse `_quarto.yml` sidebars to discover pages in sidebar order
+2. Read quartodoc-generated `_sidebar.yml` files for API reference pages
+3. For each page:
+   - Extract title from YAML frontmatter (`pagetitle` > `title` > filename)
+   - Build the page URL (`.qmd` → `.html`, `index.qmd` → trailing slash)
+   - Clean the QMD content (see below)
+4. Write `llms-full.txt` with section H2 headers, markdown links, and cleaned content
+
+**QMD content cleaning pipeline** (apply in order):
+1. Strip YAML frontmatter (`---` blocks at top)
+2. Remove Quarto div fences (`:::`, `::::`, etc.)
+3. Remove raw HTML blocks (` ```{=html} ... ``` `)
+4. Remove Quarto/shinylive cell option comments (lines starting with `#|`, e.g. `#| echo: false`)
+5. Convert Quarto executable code fences to plain Markdown fences, keeping the language (e.g. ` ```{python} ` and ` ```{shinylive-python} ` → ` ```python `, ` ```{r} ` → ` ```r `)
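+6. Strip inline HTML tags (e.g. `<span>`, `<br/>`) — match only real tags so that `<` / `>` inside code samples (e.g. `a < b`) are left intact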
+7. Collapse 3+ consecutive blank lines to 2
+
+**Use `@dataclass` for site structure types:**
+```python
+@dataclass
+class Page:
+    title: str
+    url: str
+    source: Path
+
+@dataclass
+class Section:
+    title: str
+    pages: list[Page]
+```
+
+**Edge cases to handle:**
+- Hrefs with fragments — skip fragment-only hrefs (e.g. `#section`); for `page.qmd#section`, strip the fragment and deduplicate against the parent page
+- Missing `.qmd` files — skip with a warning
+- Duplicate entries across sections — deduplicate by URL, keeping the first occurrence in sidebar order
+
+### Step 3: Write tests
+
+Unit tests covering:
+- QMD content cleaning (each cleaning step independently)
+- Sidebar walking (all 4 entry formats: string, section dict, href dict, file dict)
+- URL generation (`index.qmd` → trailing slash, `.qmd` → `.html`, nested paths)
+- Title extraction (`pagetitle` precedence, missing frontmatter)
+- Integration test: build site structure from a minimal `_quarto.yml` fixture
+
+### Step 4: Integrate into build
+
+Add a Makefile target. `llms-full.txt` is committed to the repo — not a build-only artifact — so changes are reviewable in PRs.
+
+```makefile
+llms-full-txt: $(PYBIN) quartodoc
+	$(PYBIN) scripts/generate_llms_full_txt.py
+
+all: quartodoc llms-full-txt site
+```
+
+Add `llms-full.txt` to `_quarto.yml` project resources so Quarto copies it to the output:
+```yaml
+project:
+  resources:
+    - llms-full.txt
+```
+
+### Step 5: Add CI freshness check
+
+In the CI workflow (e.g. GitHub Actions), add this step after the site build step:
+```yaml
+- name: Check llms-full.txt is up to date
+  run: |
+    make llms-full-txt
+    git diff --exit-code llms-full.txt || \
+      (echo "Run 'make llms-full-txt' locally and commit the result." && exit 1)
+```
+
+**After all steps:** run `make llms-full-txt` and commit `llms-full.txt`.
+
+---
+
+## Additional Reference
+
+See `references/generate-llms-full-txt-example.py` for a complete working implementation to study and adapt.
diff --git a/open-source/llms-info-from-quartodoc/references/generate-llms-full-txt-example.py b/open-source/llms-info-from-quartodoc/references/generate-llms-full-txt-example.py
new file mode 100644
index 0000000..f3c582e
--- /dev/null
+++ b/open-source/llms-info-from-quartodoc/references/generate-llms-full-txt-example.py
@@ -0,0 +1,358 @@
+#!/usr/bin/env python3
+"""
+Generate llms-full.txt for a quartodoc-based Python package documentation site.
+
+Note: llms.txt (link index) is generated natively by Quarto 1.9.
+This script generates llms-full.txt, which adds cleaned page content
+under each entry so LLMs get the full documentation context.
+
+Usage:
+    python scripts/generate_llms_full_txt.py
+    python scripts/generate_llms_full_txt.py --site-dir .
+    python scripts/generate_llms_full_txt.py --output llms-full.txt
+
+Requires: pyyaml (pip install pyyaml)
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+
+try:
+    import yaml
+except ImportError:
+    print("Error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr)
+    sys.exit(1)
+
+
+# ---------------------------------------------------------------------------
+# Data structures
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class Page:
+    title: str
+    url: str
+    source: Path
+
+
+@dataclass
+class Section:
+    title: str
+    pages: list[Page] = field(default_factory=list)
+
+
+@dataclass
+class Site:
+    name: str
+    description: str
+    base_url: str
+    sections: list[Section] = field(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Sidebar / _quarto.yml parsing
+# ---------------------------------------------------------------------------
+
+
+def load_quarto_config(site_dir: Path) -> dict:
+    quarto_yml = site_dir / "_quarto.yml"
+    if not quarto_yml.exists():
+        raise FileNotFoundError(f"_quarto.yml not found in {site_dir}")
+    with open(quarto_yml) as f:
+        return yaml.safe_load(f)
+
+
+def qmd_path_to_url(base_url: str, qmd_path: str) -> str:
+    """Convert a .qmd path relative to site root to an absolute URL."""
+    base_url = base_url.rstrip("/")
+    path = qmd_path.strip("/")
+
+    # index.qmd files map to the directory URL (trailing slash)
+    if path.endswith("/index.qmd") or path == "index.qmd":
+        url_path = path[: -len("index.qmd")].rstrip("/")
+        return f"{base_url}/{url_path}/" if url_path else f"{base_url}/"
+
+    # Regular .qmd files map to .html
+    if path.endswith(".qmd"):
+        path = path[: -len(".qmd")] + ".html"
+
+    return f"{base_url}/{path}"
+
+
+def extract_title(qmd_path: Path) -> str:
+    """Extract title from QMD YAML frontmatter. Prefers pagetitle > title > filename."""
+    if not qmd_path.exists():
+        return qmd_path.stem
+
+    content = qmd_path.read_text(encoding="utf-8")
+    frontmatter = _extract_frontmatter(content)
+    if frontmatter:
+        parsed = yaml.safe_load(frontmatter)
+        if isinstance(parsed, dict):
+            return parsed.get("pagetitle") or parsed.get("title") or qmd_path.stem
+
+    return qmd_path.stem
+
+
+def _extract_frontmatter(content: str) -> str | None:
+    """Return the raw YAML between opening --- and closing ---."""
+    if not content.startswith("---"):
+        return None
+    end = content.find("\n---", 3)
+    if end == -1:
+        return None
+    return content[3:end].strip()
+
+
+def walk_sidebar_contents(
+    contents: list,
+    site_dir: Path,
+    base_url: str,
+    seen_urls: set[str],
+) -> list[Page]:
+    """
+    Recursively walk a sidebar contents list and return Page objects.
+
+    Handles all four quartodoc/Quarto entry formats:
+    - "path/to/file.qmd"            (string)
+    - {section: "Title", contents:} (section dict — flattened)
+    - {href: "path.qmd", text: ...} (href dict)
+    - {file: "path.qmd"}            (file dict)
+    """
+    pages: list[Page] = []
+
+    for entry in contents:
+        if isinstance(entry, str):
+            # Plain string path
+            _add_page(entry, site_dir, base_url, seen_urls, pages)
+
+        elif isinstance(entry, dict):
+            if "section" in entry:
+                # Nested section — flatten into the current page list
+                sub_contents = entry.get("contents", [])
+                pages.extend(
+                    walk_sidebar_contents(sub_contents, site_dir, base_url, seen_urls)
+                )
+            elif "href" in entry:
+                href = entry["href"]
+                # Skip fragment-only hrefs
+                if href.startswith("#"):
+                    continue
+                # Strip fragment from href if present
+                href = href.split("#")[0]
+                _add_page(href, site_dir, base_url, seen_urls, pages)
+            elif "file" in entry:
+                _add_page(entry["file"], site_dir, base_url, seen_urls, pages)
+            elif "contents" in entry:
+                # Bare contents dict without section title
+                pages.extend(
+                    walk_sidebar_contents(
+                        entry["contents"], site_dir, base_url, seen_urls
+                    )
+                )
+
+    return pages
+
+
+def _add_page(
+    rel_path: str,
+    site_dir: Path,
+    base_url: str,
+    seen_urls: set[str],
+    pages: list[Page],
+) -> None:
+    url = qmd_path_to_url(base_url, rel_path)
+    if url in seen_urls:
+        return
+    seen_urls.add(url)
+
+    qmd_path = site_dir / rel_path
+    if not qmd_path.exists():
+        print(f"  Warning: {qmd_path} not found, skipping", file=sys.stderr)
+        return
+
+    pages.append(Page(title=extract_title(qmd_path), url=url, source=qmd_path))
+
+
+def read_quartodoc_sidebar(sidebar_yml: Path, site_dir: Path, base_url: str, seen_urls: set[str]) -> list[Page]:
+    """
+    Read a quartodoc-generated _sidebar.yml file (lives next to the API reference index).
+    Returns pages in sidebar order.
+    """
+    if not sidebar_yml.exists():
+        return []
+
+    with open(sidebar_yml) as f:
+        data = yaml.safe_load(f)
+
+    contents = data.get("contents", []) if isinstance(data, dict) else []
+    return walk_sidebar_contents(contents, site_dir, base_url, seen_urls)
+
+
+def build_site(config: dict, site_dir: Path) -> Site:
+    """Build a Site from a parsed _quarto.yml config."""
+    website = config.get("website", {})
+    base_url = website.get("site-url", "").rstrip("/")
+    name = website.get("title", "Package")
+    description = website.get("description", "")
+
+    site = Site(name=name, description=description, base_url=base_url)
+    seen_urls: set[str] = set()
+
+    # Walk each top-level sidebar
+    sidebars = website.get("sidebar", [])
+    if isinstance(sidebars, dict):
+        sidebars = [sidebars]
+
+    for sidebar in sidebars:
+        sidebar_id = sidebar.get("id", "")
+        contents = sidebar.get("contents", [])
+
+        for entry in contents:
+            if isinstance(entry, str):
+                # A bare file entry at the top level — add as a one-page section
+                pages = walk_sidebar_contents([entry], site_dir, base_url, seen_urls)
+                if pages:
+                    title = pages[0].title
+                    site.sections.append(Section(title=title, pages=pages))
+
+            elif isinstance(entry, dict) and "section" in entry:
+                section_title = entry["section"]
+                sub_contents = entry.get("contents", [])
+
+                # Check if this section points to a quartodoc-generated _sidebar.yml
+                quartodoc_cfg = config.get("quartodoc", {})
+                api_sections = quartodoc_cfg.get("sections", [])
+                api_dir = quartodoc_cfg.get("dir", "reference")
+
+                sidebar_yml = site_dir / api_dir / "_sidebar.yml"
+                is_api_section = any(
+                    s.get("title") == section_title for s in api_sections
+                ) or section_title.lower() in ("reference", "api reference", "api")
+
+                if is_api_section and sidebar_yml.exists():
+                    pages = read_quartodoc_sidebar(sidebar_yml, site_dir, base_url, seen_urls)
+                else:
+                    pages = walk_sidebar_contents(sub_contents, site_dir, base_url, seen_urls)
+
+                if pages:
+                    site.sections.append(Section(title=section_title, pages=pages))
+
+    return site
+
+
+# ---------------------------------------------------------------------------
+# QMD content cleaning
+# ---------------------------------------------------------------------------
+
+
+def clean_qmd_content(content: str) -> str:
+    """
+    Clean QMD source for inclusion in llms-full.txt.
+    Removes Quarto-specific markup, leaving clean markdown.
+    """
+    # 1. Strip YAML frontmatter
+    content = re.sub(r"^---\s*\n.*?\n---\s*\n", "", content, flags=re.DOTALL)
+
+    # 2. Remove raw HTML blocks
+    content = re.sub(r"```\{=html\}.*?```", "", content, flags=re.DOTALL)
+
+    # 3. Remove shinylive/quartodoc cell metadata comments (#| key: value)
+    content = re.sub(r"^#\|.*$", "", content, flags=re.MULTILINE)
+
+    # 4. Convert Quarto code fences to plain markdown
+    #    ```{python} -> ```python, ```{shinylive-python} -> ```python, etc.
+    content = re.sub(
+        r"```\{(shinylive-python|shinylive-r|python|r|bash|shell)\}",
+        lambda m: "```python"
+        if "python" in m.group(1)
+        else "```r"
+        if m.group(1) == "r"
+        else "```bash",
+        content,
+    )
+    # Remove any remaining ```{...} fences (e.g. ```{.python})
+    content = re.sub(r"```\{[^}]*\}", "```", content)
+
+    # 5. Remove Quarto div fences (lines that are only colons)
+    content = re.sub(r"^:{3,}\s*(\{[^}]*\})?\s*$", "", content, flags=re.MULTILINE)
+
+    # 6. Strip inline HTML tags
+    content = re.sub(r"<[^>]+>", "", content)
+
+    # 7. Collapse 3+ consecutive blank lines to 2
+    content = re.sub(r"\n{3,}", "\n\n", content)
+
+    return content.strip()
+
+
+def load_page_content(page: Page) -> str:
+    """Load and clean a page's QMD source."""
+    if not page.source.exists():
+        return ""
+    raw = page.source.read_text(encoding="utf-8")
+    return clean_qmd_content(raw)
+
+
+# ---------------------------------------------------------------------------
+# Output generation
+# ---------------------------------------------------------------------------
+
+
+def render_llms_full_txt(site: Site) -> str:
+    lines: list[str] = []
+
+    lines.append(f"# {site.name}")
+    lines.append("")
+
+    if site.description:
+        lines.append(f"> {site.description}")
+        lines.append("")
+
+    for section in site.sections:
+        lines.append(f"## {section.title}")
+        lines.append("")
+
+        for page in section.pages:
+            lines.append(f"- [{page.title}]({page.url})")
+            lines.append("")
+            content = load_page_content(page)
+            if content:
+                lines.append(content)
+                lines.append("")
+
+    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("--site-dir", type=Path, default=Path("."), help="Root directory containing _quarto.yml (default: .)")
+    parser.add_argument("--output", type=Path, default=Path("llms-full.txt"), help="Output file path (default: llms-full.txt)")
+    args = parser.parse_args()
+
+    site_dir = args.site_dir.resolve()
+    print(f"Reading site config from {site_dir}/_quarto.yml", file=sys.stderr)
+
+    config = load_quarto_config(site_dir)
+    site = build_site(config, site_dir)
+
+    content = render_llms_full_txt(site)
+    args.output.write_text(content, encoding="utf-8")
+
+    page_count = sum(len(s.pages) for s in site.sections)
+    print(f"Wrote {args.output} ({len(site.sections)} sections, {page_count} pages)", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()