From b3a2248e62a20898293ea4281a01edb276f052bd Mon Sep 17 00:00:00 2001 From: Brian Burt Date: Fri, 24 Apr 2026 12:58:59 -0400 Subject: [PATCH 1/4] feat(docs-convert-gdoc): Add Google Docs comment extraction as footnotes Pull comment threads from the Drive v3 API and insert them as Markdown footnotes when --comments is passed. Resolved threads are excluded by default; --include-resolved brings them back. Anchor matching uses whitespace-normalized fuzzy search with word-boundary snapping so footnote references never split a word. Argument parsing migrated from sys.argv to argparse; backward compatible. Closes #112 Made-with: Cursor --- plugins/docs-tools/.claude-plugin/plugin.json | 2 +- .../skills/docs-convert-gdoc-md/SKILL.md | 16 +- .../docs-convert-gdoc-md/scripts/gdoc2md.py | 271 ++++++++++++++++-- 3 files changed, 269 insertions(+), 20 deletions(-) diff --git a/plugins/docs-tools/.claude-plugin/plugin.json b/plugins/docs-tools/.claude-plugin/plugin.json index ac667674..ef5ebe2f 100644 --- a/plugins/docs-tools/.claude-plugin/plugin.json +++ b/plugins/docs-tools/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "docs-tools", - "version": "0.0.52", + "version": "0.0.53", "description": "Documentation review, writing, and workflow tools for Red Hat AsciiDoc and Markdown documentation.", "author": { "name": "Red Hat Documentation Team", diff --git a/plugins/docs-tools/skills/docs-convert-gdoc-md/SKILL.md b/plugins/docs-tools/skills/docs-convert-gdoc-md/SKILL.md index 5e8fcdb7..c3f9c644 100644 --- a/plugins/docs-tools/skills/docs-convert-gdoc-md/SKILL.md +++ b/plugins/docs-tools/skills/docs-convert-gdoc-md/SKILL.md @@ -33,7 +33,7 @@ The script is at `${CLAUDE_SKILL_DIR}/scripts/gdoc2md.py`. Always quote the URL and output file arguments: ```bash -python3 ${CLAUDE_SKILL_DIR}/scripts/gdoc2md.py "" [""] +python3 ${CLAUDE_SKILL_DIR}/scripts/gdoc2md.py [--comments] [--include-resolved] "" [""] ``` - The script auto-detects the URL type: @@ -42,6 +42,20 @@ python3 ${CLAUDE_SKILL_DIR}/scripts/gdoc2md.py "" [""] - `/spreadsheets/d/` → Google Sheets → CSV - If no output file is specified, it defaults to `.md` or `.csv`. +### Include Google Docs comments + +Use `--comments` to pull comment threads from the document and insert them as Markdown footnotes: + +```bash +python3 ${CLAUDE_SKILL_DIR}/scripts/gdoc2md.py --comments "" +``` + +- Each comment with a highlighted text anchor becomes a footnote reference placed after the quoted text in the Markdown body. +- Comments without an anchor appear as footnotes at the end. +- Reply threads are included under the parent comment. +- By default, resolved comment threads are excluded. Add `--include-resolved` to include them. +- The `--comments` flag only applies to Google Docs. The script ignores it for Slides and Sheets. + ### Error handling - **401**: Authentication expired. Tell the user to run `gcloud auth login --enable-gdrive-access`. diff --git a/plugins/docs-tools/skills/docs-convert-gdoc-md/scripts/gdoc2md.py b/plugins/docs-tools/skills/docs-convert-gdoc-md/scripts/gdoc2md.py index 3aaeadba..f3ea2d03 100644 --- a/plugins/docs-tools/skills/docs-convert-gdoc-md/scripts/gdoc2md.py +++ b/plugins/docs-tools/skills/docs-convert-gdoc-md/scripts/gdoc2md.py @@ -1,12 +1,14 @@ """ Export Google Docs to Markdown, Slides to Markdown (via PPTX), -or Sheets to CSV. +or Sheets to CSV. Optionally include Google Docs comments as +Markdown footnotes. Requires gcloud CLI and python-pptx (for Slides export). -python gdoc2md.py [output] +python3 ${CLAUDE_SKILL_DIR}/scripts/gdoc2md.py [--comments] [--include-resolved] [output] """ +import argparse import json import re import subprocess @@ -15,6 +17,7 @@ from io import BytesIO from pathlib import Path from urllib.error import HTTPError +from urllib.parse import quote from urllib.request import Request, urlopen # tolerates trailing segments like /edit, /view, ?usp=sharing @@ -38,26 +41,44 @@ def parse_and_validate_args(): - if len(sys.argv) < 2: - print(f"Usage: {sys.argv[0]} [output]") - sys.exit(1) + """Parse CLI arguments and return (file_id, output, mode, comments, include_resolved).""" + parser = argparse.ArgumentParser( + description="Export Google Docs/Slides/Sheets to Markdown or CSV.", + ) + parser.add_argument("url", help="Google Docs, Slides, or Sheets URL") + parser.add_argument("output", nargs="?", default=None, help="Output file path") + parser.add_argument( + "--comments", + action="store_true", + help="Include Google Docs comments as Markdown footnotes (Docs only)", + ) + parser.add_argument( + "--include-resolved", + action="store_true", + help="Include resolved comment threads (requires --comments)", + ) + args = parser.parse_args() + + if args.include_resolved and not args.comments: + parser.error("--include-resolved requires --comments") - url = sys.argv[1] - match = VALID_URL_RE.match(url) + match = VALID_URL_RE.match(args.url) if not match: - print( - "Error: URL must be a Google Docs, Slides, or Sheets URL (https://docs.google.com/...)", - file=sys.stderr, + parser.error( + "URL must be a Google Docs, Slides, or Sheets URL (https://docs.google.com/...)" ) - sys.exit(1) mode = MODE_MAP[match.group("mode")] file_id = match.group("id") + output = args.output or f"{file_id}{EXTENSIONS[mode]}" - explicit_output = sys.argv[2] if len(sys.argv) > 2 else None - output = explicit_output or f"{file_id}{EXTENSIONS[mode]}" + if args.comments and mode != "doc": + print( + "Warning: --comments is only supported for Google Docs, ignoring.", + file=sys.stderr, + ) - return file_id, output, mode + return file_id, output, mode, args.comments, args.include_resolved # --------------------------------------------------------------------------- @@ -66,6 +87,7 @@ def parse_and_validate_args(): def check_dependencies(): + """Verify that the gcloud CLI is installed, exiting with guidance if not.""" result = subprocess.run(["gcloud", "version"], capture_output=True) # noqa: S607 if result.returncode != 0: print("Error: gcloud CLI is not installed.", file=sys.stderr) @@ -127,6 +149,7 @@ def get_token() -> str: def download(url: str, token: str, retries: int = 3) -> bytes: + """GET *url* with Bearer auth and exponential back-off on 429 responses.""" req = Request(url, headers={"Authorization": f"Bearer {token}"}) # noqa: S310 for attempt in range(retries + 1): try: @@ -243,12 +266,207 @@ def _sanitize_filename(name: str) -> str: return re.sub(r'[\\/*?:"<>|]', "_", name) +# --------------------------------------------------------------------------- +# Google Docs comments → Markdown footnotes +# --------------------------------------------------------------------------- + + +def fetch_comments( + file_id: str, + token: str, + include_resolved: bool = False, +) -> list[dict]: + """Fetch comment threads from the Drive v3 API. + + Returns a list of dicts with keys: author, content, quoted_text, + resolved, and replies (list of {author, content}). + """ + fields = ( + "nextPageToken," + "comments(id,content,resolved,author/displayName," + "quotedFileContent/value,replies(content,author/displayName))" + ) + comments = [] + page_token = None + while True: + api_url = ( + f"https://www.googleapis.com/drive/v3/files/{file_id}/comments" + f"?fields={quote(fields, safe='()/,')}&includeDeleted=false" + f"&pageSize=100" + ) + if page_token: + api_url += f"&pageToken={quote(page_token)}" + + data = json.loads(download(api_url, token)) + for c in data.get("comments", []): + resolved = c.get("resolved", False) + if resolved and not include_resolved: + continue + quoted = (c.get("quotedFileContent") or {}).get("value", "") + replies = [ + { + "author": r.get("author", {}).get("displayName", "Unknown"), + "content": r.get("content", ""), + } + for r in c.get("replies", []) + ] + comments.append( + { + "author": c.get("author", {}).get("displayName", "Unknown"), + "content": c.get("content", ""), + "quoted_text": quoted, + "resolved": resolved, + "replies": replies, + } + ) + + page_token = data.get("nextPageToken") + if not page_token: + break + + return comments + + +def _normalize(text: str) -> str: + """Collapse whitespace for fuzzy anchor matching.""" + return re.sub(r"\s+", " ", text).strip() + + +def insert_comment_footnotes( + markdown: str, + comments: list[dict], +) -> str: + """Insert footnote references into the Markdown body and append + footnote definitions at the end of the file. + + Matching strategy: for each comment with a quoted anchor, find the + first occurrence of that anchor text in the Markdown (normalized + whitespace) and insert a footnote reference after it. Comments + without an anchor are appended as unanchored footnotes at the end. + """ + if not comments: + return markdown + + footnotes: list[str] = [] + fn_index = 1 + used_positions: set[int] = set() + + norm_md = _normalize(markdown) + + for comment in comments: + anchor = comment["quoted_text"] + label = f"[^{fn_index}]" + + body_parts = [] + status = " (resolved)" if comment["resolved"] else "" + body_parts.append(f"**{comment['author']}{status}:** {_normalize(comment['content'])}") + for reply in comment["replies"]: + body_parts.append(f" **{reply['author']}:** {_normalize(reply['content'])}") + footnote_def = f"{label}: " + " \\\n".join(body_parts) + + norm_anchor = _normalize(anchor) if anchor else "" + if norm_anchor: + search_from = 0 + pos = -1 + while True: + candidate = norm_md.find(norm_anchor, search_from) + if candidate == -1: + break + if candidate not in used_positions: + pos = candidate + break + search_from = candidate + 1 + if pos != -1: + used_positions.add(pos) + end_of_anchor = _find_original_end( + markdown, + norm_md, + pos, + len(norm_anchor), + ) + end_of_anchor = _snap_to_word_boundary( + markdown, + end_of_anchor, + ) + markdown = markdown[:end_of_anchor] + label + markdown[end_of_anchor:] + norm_md = _normalize(markdown) + footnotes.append(footnote_def) + fn_index += 1 + continue + + footnotes.append(footnote_def) + fn_index += 1 + + if footnotes: + markdown = markdown.rstrip() + "\n\n---\n\n" + markdown += "\n".join(footnotes) + "\n" + + return markdown + + +def _find_original_end( + original: str, + normalized: str, + norm_pos: int, + norm_len: int, +) -> int: + """Map a position in the normalized string back to the original. + + Walk through the original string, tracking how many non-collapsed + characters have been consumed, to find where the anchor ends in + the original text. + """ + consumed = 0 + i = 0 + in_space = False + + while i < len(original) and consumed < norm_pos: + if original[i].isspace(): + if not in_space: + consumed += 1 + in_space = True + else: + consumed += 1 + in_space = False + i += 1 + + chars_left = norm_len + while i < len(original) and chars_left > 0: + if original[i].isspace(): + if not in_space: + chars_left -= 1 + in_space = True + else: + chars_left -= 1 + in_space = False + i += 1 + + return i + + +def _snap_to_word_boundary(text: str, pos: int) -> int: + """Advance *pos* past any remaining word characters so the footnote + reference never splits a word. Stops at whitespace, punctuation + that commonly follows words, or end-of-string. + """ + while pos < len(text) and text[pos].isalnum(): + pos += 1 + return pos + + # --------------------------------------------------------------------------- # Fetch & write # --------------------------------------------------------------------------- -def fetch(file_id: str, output: str, mode: str): +def fetch( + file_id: str, + output: str, + mode: str, + include_comments: bool = False, + include_resolved: bool = False, +): + """Download and convert a Google Docs/Slides/Sheets file, writing the result to *output*.""" token = get_token() base = "https://docs.google.com" @@ -272,6 +490,23 @@ def fetch(file_id: str, output: str, mode: str): if mode == "slides": output_path.write_text(pptx_to_markdown(data), encoding="utf-8") + elif mode == "doc": + md_text = data.decode("utf-8") + if include_comments: + comments = fetch_comments( + file_id, + token, + include_resolved, + ) + if comments: + md_text = insert_comment_footnotes(md_text, comments) + print( + f"Inserted {len(comments)} comment(s) as footnotes.", + file=sys.stderr, + ) + else: + print("No comments found.", file=sys.stderr) + output_path.write_text(md_text, encoding="utf-8") else: output_path.write_bytes(data) @@ -328,10 +563,10 @@ def _fetch_sheets(file_id: str, output: str, token: str, base: str): def main(): - # Validate args first — fast failure before any subprocess calls - file_id, output, mode = parse_and_validate_args() + """CLI entry point: parse arguments, check dependencies, and run the export.""" + file_id, output, mode, comments, include_resolved = parse_and_validate_args() check_dependencies() - fetch(file_id, output, mode) + fetch(file_id, output, mode, comments, include_resolved) if __name__ == "__main__": From 3b674abefc49ad6a8838e2a7fbda1c026087f306 Mon Sep 17 00:00:00 2001 From: Brian Burt Date: Fri, 24 Apr 2026 16:06:51 -0400 Subject: [PATCH 2/4] fix(docs-convert-gdoc): track stable markdown offsets for duplicate anchors used_positions stored norm_md offsets that became stale after each footnote insertion shifted the normalized string. Switch to tracking original markdown offsets (end_of_anchor) which remain stable across insertions. Re-normalize per iteration instead of after insertion. Also restores ${CLAUDE_SKILL_DIR} in SKILL.md and docstring paths. --- .../skills/docs-convert-gdoc-md/scripts/gdoc2md.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/plugins/docs-tools/skills/docs-convert-gdoc-md/scripts/gdoc2md.py b/plugins/docs-tools/skills/docs-convert-gdoc-md/scripts/gdoc2md.py index f3ea2d03..44c12df1 100644 --- a/plugins/docs-tools/skills/docs-convert-gdoc-md/scripts/gdoc2md.py +++ b/plugins/docs-tools/skills/docs-convert-gdoc-md/scripts/gdoc2md.py @@ -349,9 +349,7 @@ def insert_comment_footnotes( footnotes: list[str] = [] fn_index = 1 - used_positions: set[int] = set() - - norm_md = _normalize(markdown) + used_offsets: set[int] = set() for comment in comments: anchor = comment["quoted_text"] @@ -366,18 +364,19 @@ def insert_comment_footnotes( norm_anchor = _normalize(anchor) if anchor else "" if norm_anchor: + norm_md = _normalize(markdown) search_from = 0 pos = -1 while True: candidate = norm_md.find(norm_anchor, search_from) if candidate == -1: break - if candidate not in used_positions: + orig_end = _find_original_end(markdown, norm_md, candidate, len(norm_anchor)) + if orig_end not in used_offsets: pos = candidate break search_from = candidate + 1 if pos != -1: - used_positions.add(pos) end_of_anchor = _find_original_end( markdown, norm_md, @@ -388,8 +387,8 @@ def insert_comment_footnotes( markdown, end_of_anchor, ) + used_offsets.add(end_of_anchor) markdown = markdown[:end_of_anchor] + label + markdown[end_of_anchor:] - norm_md = _normalize(markdown) footnotes.append(footnote_def) fn_index += 1 continue From eb4837a9e195f119e1662f4f6fb570291a05c36c Mon Sep 17 00:00:00 2001 From: Brian Burt Date: Mon, 27 Apr 2026 11:40:05 -0400 Subject: [PATCH 3/4] fix(docs-convert-gdoc): resolve anchors before mutating markdown Refactor insert_comment_footnotes to a two-pass approach: first resolve all anchor positions against the unmodified markdown, then apply insertions from end to start so earlier labels cannot invalidate later anchor matches. --- .../docs-convert-gdoc-md/scripts/gdoc2md.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/plugins/docs-tools/skills/docs-convert-gdoc-md/scripts/gdoc2md.py b/plugins/docs-tools/skills/docs-convert-gdoc-md/scripts/gdoc2md.py index 44c12df1..50f469d9 100644 --- a/plugins/docs-tools/skills/docs-convert-gdoc-md/scripts/gdoc2md.py +++ b/plugins/docs-tools/skills/docs-convert-gdoc-md/scripts/gdoc2md.py @@ -349,7 +349,12 @@ def insert_comment_footnotes( footnotes: list[str] = [] fn_index = 1 + + # Pass 1: resolve all anchor positions against the *unmodified* markdown + # so earlier matches cannot invalidate later ones. + norm_md = _normalize(markdown) used_offsets: set[int] = set() + insertions: list[tuple[int, str, str]] = [] for comment in comments: anchor = comment["quoted_text"] @@ -364,7 +369,6 @@ def insert_comment_footnotes( norm_anchor = _normalize(anchor) if anchor else "" if norm_anchor: - norm_md = _normalize(markdown) search_from = 0 pos = -1 while True: @@ -377,25 +381,25 @@ def insert_comment_footnotes( break search_from = candidate + 1 if pos != -1: - end_of_anchor = _find_original_end( - markdown, - norm_md, - pos, - len(norm_anchor), - ) - end_of_anchor = _snap_to_word_boundary( - markdown, - end_of_anchor, - ) + end_of_anchor = _find_original_end(markdown, norm_md, pos, len(norm_anchor)) + end_of_anchor = _snap_to_word_boundary(markdown, end_of_anchor) used_offsets.add(end_of_anchor) - markdown = markdown[:end_of_anchor] + label + markdown[end_of_anchor:] - footnotes.append(footnote_def) + insertions.append((end_of_anchor, label, footnote_def)) fn_index += 1 continue footnotes.append(footnote_def) fn_index += 1 + # Pass 2: apply insertions from end to start so offsets stay valid. + insertions.sort(key=lambda t: t[0], reverse=True) + for offset, label, footnote_def in insertions: + markdown = markdown[:offset] + label + markdown[offset:] + footnotes.append(footnote_def) + + # Re-sort footnotes by their numeric index for consistent output. + footnotes.sort(key=lambda f: int(f.split("]")[0].lstrip("[^"))) + if footnotes: markdown = markdown.rstrip() + "\n\n---\n\n" markdown += "\n".join(footnotes) + "\n" From e00a833bee1a3473bea137d9ffc1d7bd5f0c38e3 Mon Sep 17 00:00:00 2001 From: Brian Burt Date: Mon, 27 Apr 2026 11:42:24 -0400 Subject: [PATCH 4/4] fix(docs-convert-gdoc): two-pass anchor resolution, update deps.json Refactor insert_comment_footnotes to resolve all anchor positions against the unmodified markdown first, then apply insertions in reverse order so earlier labels cannot invalidate later matches. Regenerate deps.json after rebase picked up create_mr.sh -> create_mr.py migration from upstream. --- scripts/deps.json | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/scripts/deps.json b/scripts/deps.json index 4ee03a50..64a5d962 100644 --- a/scripts/deps.json +++ b/scripts/deps.json @@ -36,6 +36,7 @@ "package": "PyGithub", "import_name": "github", "found_in": [ + "plugins/docs-tools/skills/docs-workflow-create-mr/scripts/create_mr.py", "plugins/docs-tools/skills/git-pr-reader/scripts/git_pr_reader.py" ] }, @@ -43,6 +44,7 @@ "package": "python-gitlab", "import_name": "gitlab", "found_in": [ + "plugins/docs-tools/skills/docs-workflow-create-mr/scripts/create_mr.py", "plugins/docs-tools/skills/git-pr-reader/scripts/git_pr_reader.py" ] }, @@ -115,15 +117,11 @@ }, { "tool": "gh", - "found_in": [ - "plugins/docs-tools/skills/docs-workflow-create-mr/scripts/create_mr.sh" - ] + "found_in": [] }, { "tool": "glab", - "found_in": [ - "plugins/docs-tools/skills/docs-workflow-create-mr/scripts/create_mr.sh" - ] + "found_in": [] }, { "tool": "jq", @@ -131,7 +129,6 @@ "plugins/docs-tools/skills/docs-orchestrator/hooks/workflow-completion-check.sh", "plugins/docs-tools/skills/docs-orchestrator/scripts/setup-hooks.sh", "plugins/docs-tools/skills/docs-workflow-create-jira/scripts/create-jira-ticket.sh", - "plugins/docs-tools/skills/docs-workflow-create-mr/scripts/create_mr.sh", "plugins/docs-tools/skills/docs-workflow-jira-ready/scripts/jira-ready-check.sh", "plugins/docs-tools/skills/docs-workflow-writing/scripts/build_writing_args.sh" ]