name: Enhance content

on: push

jobs:
  enhance:
    # Runs only on forks when a contributor pushes to their fork.
    # Branch deletions are skipped: their "after" SHA is all zeros, so there is nothing to diff.
    if: github.event.repository.fork == true && github.event.deleted == false
    runs-on: ubuntu-24.04
    steps:
      - name: Checkout
        # Using checkout v5, as v4 was warning that it will soon be deprecated (Node 20)
        uses: actions/checkout@v5
        with:
          # Full history for all branches, so both ends of the pushed range
          # (and the default branch used as a fallback base) are available locally.
          fetch-depth: 0

      - name: Setup Python
        # Using setup-python v6, as v5 has same warning as above
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: pip install --no-warn-script-location --user -r requirements.txt -c constraints.txt

      - name: Enhance topics
        # NOTE(review): scripts/enhance_topics.py builds an OpenAI client; if it reads
        # OPENAI_API_KEY from the environment, the key must be passed here from a fork
        # secret (the .env file is git-ignored) - confirm.
        env:
          BEFORE_SHA: ${{ github.event.before }}
          HEAD_SHA: ${{ github.event.after }}
          DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
        run: |
          # "before" is all zeros for a newly created branch and may be unreachable
          # after a force-push; in both cases diff against the fork point instead.
          BASE_SHA="$BEFORE_SHA"
          if ! git cat-file -e "${BASE_SHA}^{commit}" 2>/dev/null; then
            BASE_SHA="$(git merge-base "origin/${DEFAULT_BRANCH}" "$HEAD_SHA")"
          fi
          export BASE_SHA
          make enhance-topics
# Pagefind static search index (requires Node.js / npx). Run after html or multiversion.
PAGEFIND_VERSION ?= 1.5.2
pagefind:
	npx -y pagefind@$(PAGEFIND_VERSION) --site "$(OUT)/html"


# Convenience: Sphinx build + Pagefind index (does not replace plain html / multiversion).
html-search:
	$(MAKE) html
	$(MAKE) pagefind

multiversion-search: multiversion
	$(MAKE) pagefind

%: Makefile
	@$(BUILD) -M $@ "$(SOURCE)" "$(OUT)" $(OPTS)

# Run the topic enhancer on every file changed (and not deleted) between BASE_SHA and
# HEAD_SHA. NUL-separated names (-z / -0) keep paths containing spaces intact; -r skips
# the script entirely when nothing changed.
enhance-topics:
	git diff --name-only -z --diff-filter=d $(BASE_SHA) $(HEAD_SHA) | xargs -0 -r $(PYTHON) scripts/enhance_topics.py

# These targets never produce a file of the same name; declaring them phony keeps a stray
# file or directory with one of these names from silently disabling the target.
.PHONY: html-search multiversion-search enhance-topics
+ +To preview search locally, serve the site over HTTP (Pagefind may not load from `file://`), for example from the repo root: + +`python -m http.server 8000 --directory build/html` + +Then open `http://localhost:8000/` in a browser. + +#### Search results page verification + +After `make html` and `make pagefind`, serve `build/html` over HTTP and check: + +1. **Direct URL** — Open `http://localhost:8000/search.html?q=tutorial` (or the same path under a distro prefix for multiversion builds). The input should show the query and results should load (not stay empty or skeleton-only). +2. **Modal redirect** — From a nested page (e.g. a tutorial), open the sidebar search modal (Ctrl/Cmd+K), type a term, press Enter. You should land on the search page with `?q=` set and matching results visible. +3. **Empty query** — Open `search.html` with no `q` parameter. The page should load without errors; no search is run until you type in the input. +4. **Result metadata** — Search for `Ubuntu deb` and open a result card. Metadata labels (e.g. Area, Content Type, Experience) should match that page’s `<head>` `<meta>` tags from its `.. meta::` block (e.g. `area: installation` on the Ubuntu deb install page), not URL-path guesses. + +In DevTools Network, confirm `pagefind/` bundle requests return 200 (not 404). + +The production [Jenkins doc job](https://build.ros.org/job/doc_ros2doc) should run the same `pagefind` step on `build/html` after Sphinx so deployed pages include the search bundle. + + ### Note for Windows (WSL) Users When building the documentation on Windows using WSL, it is recommended to clone and work with this repository inside the Linux filesystem (for example, under `/home/<user>/`) rather than under `/mnt/c`. 
diff --git a/conf.py b/conf.py index 2a9def973fd..f172f9d4674 100644 --- a/conf.py +++ b/conf.py @@ -89,6 +89,29 @@ 'sphinx_adopters', 'sphinxcontrib.googleanalytics', 'sphinxcontrib.mermaid', + 'pagefind_meta', + 'showmeta', +] + +pagefind_merge_enabled = False +pagefind_merge_package_pkgs = [] +pagefind_merge_index_base = 'https://docs.ros.org' +pagefind_merge_index_overrides = {} +pagefind_merge_filter_per_pkg = None +pagefind_merge_index_weight_per_pkg = None + +pagefind_filter_labels = { + 'contentType': 'Content type', +} + +pagefind_result_meta_order = [ + 'product', + 'distro', + 'area', + 'capability', + 'contentType', + 'experience', + ] # Intersphinx mapping @@ -168,6 +191,7 @@ 'DISTRO_TITLE': 'Rolling', 'DISTRO_TITLE_FULL': 'Rolling Ridley', 'REPOS_FILE_BRANCH': 'rolling', + 'PRODUCT': 'ROS 2', } html_favicon = 'favicon.ico' @@ -181,7 +205,7 @@ html_sourcelink_suffix = '' # Relative to html_static_path -html_css_files = ['custom.css', 'adopters.css'] +html_css_files = ['custom.css', 'adopters.css', 'pagefind-docsearch.css'] html_js_files = ['adopters.js'] # -- Options for HTMLHelp output ------------------------------------------ diff --git a/constraints.txt b/constraints.txt index 56ae59259be..2ba36b535b7 100644 --- a/constraints.txt +++ b/constraints.txt @@ -11,11 +11,13 @@ imagesize==1.4.1 iniconfig==2.1.0 Jinja2==3.1.6 MarkupSafe==3.0.3 +openai==2.33.0 packaging==25.0 pluggy==1.6.0 polib==1.2.0 Pygments==2.19.2 pytest==8.4.2 +python-dotenv==1.1.0 PyYAML==6.0.3 regex==2025.9.18 requests==2.32.5 @@ -39,4 +41,6 @@ sphinxcontrib-mermaid==1.0.0 sphinxcontrib-qthelp==2.0.0 sphinxcontrib-serializinghtml==2.0.0 stevedore==5.5.0 +tenacity==9.1.4 +timeout-decorator==0.5.0 urllib3==2.5.0 diff --git a/plugins/meta_util.py b/plugins/meta_util.py new file mode 100644 index 00000000000..32aef4d2f3b --- /dev/null +++ b/plugins/meta_util.py @@ -0,0 +1,70 @@ +# Copyright 2026 Open Robotics — shared helpers for ``.. meta::`` / Pagefind +""" +Collect every ``.. 
meta::`` field from the doctree, sanitize keys, and expand +``{MACRO}`` placeholders using the Sphinx ``macros`` config (longest keys first). + +Sphinx / the HTML theme may also emit plain ```` tags for the same fields. +The Pagefind extension emits additional tags with ``data-pagefind-filter`` and may +split comma-separated values into multiple tags for faceted search. +""" + +from __future__ import annotations + +import re +from typing import Dict, List, Optional + +from docutils import nodes + +# HTML ```` names should be conservative; allow common patterns. +_META_NAME_RE = re.compile(r'^[A-Za-z0-9_.:-]+$') + + +def sanitize_meta_key(raw: str) -> Optional[str]: + s = str(raw).strip() + if not s or not _META_NAME_RE.match(s): + return None + return s + + +def all_doctree_meta(doctree: Optional[nodes.document]) -> Dict[str, str]: + """Return last-wins mapping of every ``nodes.meta`` ``name``/``property`` → ``content``.""" + if doctree is None: + return {} + + out: Dict[str, str] = {} + for meta in doctree.findall(nodes.meta): + if meta.get('http-equiv'): + continue + content = meta.get('content') + if not content: + continue + key: Optional[str] = None + name = meta.get('name') + if name: + key = sanitize_meta_key(str(name)) + else: + prop = meta.get('property') + if prop: + key = sanitize_meta_key(str(prop)) + if not key: + continue + out[key] = str(content).strip() + return out + + +def expand_meta_macros(text: str, macros: Dict[str, str]) -> str: + """Expand ``{KEY}`` placeholders; longer macro names first to avoid partial matches.""" + result = text + for key, value in sorted(macros.items(), key=lambda kv: len(kv[0]), reverse=True): + result = result.replace(f'{{{key}}}', value) + return result + + +def expand_all_meta_values(meta: Dict[str, str], macros: Dict[str, str]) -> Dict[str, str]: + """Apply ``expand_meta_macros`` to every meta value.""" + return {k: expand_meta_macros(v, macros) for k, v in meta.items()} + + +def split_meta_values(value: str) -> 
List[str]: + """Return comma-separated metadata values as individual Pagefind values.""" + return [part.strip() for part in value.split(',') if part.strip()] diff --git a/plugins/pagefind_meta.py b/plugins/pagefind_meta.py new file mode 100644 index 00000000000..e5fc1188949 --- /dev/null +++ b/plugins/pagefind_meta.py @@ -0,0 +1,261 @@ +# Copyright 2026 Open Robotics — Pagefind metadata for ROS 2 documentation +""" +Emit SEO tags, Pagefind ``data-pagefind-meta``, and ``data-pagefind-filter`` +from every ``.. meta::`` field on the page (passthrough, no whitelist). + +Sphinx / the HTML theme typically also emits plain ```` tags for the same +``.. meta::`` fields. We intentionally emit an additional block with +``data-pagefind-filter`` (and split comma-separated values) so Pagefind faceting +works; crawlers may see duplicate name/content pairs for non-split fields. +""" + +from __future__ import annotations + +import html +import re +from pathlib import PurePosixPath +from typing import Any, Dict, List, Optional, Tuple + +from docutils import nodes + +from meta_util import all_doctree_meta, expand_all_meta_values, split_meta_values + + +def _macros_flat(app) -> Dict[str, str]: + macros = getattr(app.config, 'macros', {}) or {} + return {str(k): str(v) for k, v in macros.items()} + + +def _resolved_page_meta(app, doctree: Optional[nodes.document]) -> Dict[str, str]: + raw = all_doctree_meta(doctree) + return expand_all_meta_values(raw, _macros_flat(app)) + + +def _default_filter_label(key: str) -> str: + spaced = re.sub(r'([a-z])([A-Z])', r'\1 \2', key) + return spaced.replace('_', ' ').replace('-', ' ').strip().title() + + +def _metadata_fields_for_keys(app, sorted_keys: List[str]) -> List[List[str]]: + labels = getattr(app.config, 'pagefind_filter_labels', None) or {} + out: List[List[str]] = [] + for k in sorted_keys: + if isinstance(labels, dict) and labels.get(k): + lbl = str(labels[k]) + else: + lbl = _default_filter_label(k) + out.append([k, lbl]) + return out + 
def _seo_and_filter_metas(values: Dict[str, str]) -> str:
    """Render one ``<meta>`` tag per metadata value for SEO and Pagefind filtering.

    Each comma-separated value of each key gets its own tag carrying the
    ``name``/``content`` pair plus a ``data-pagefind-filter`` attribute that
    tells Pagefind to read the filter value from ``content``.

    Args:
        values: Resolved ``.. meta::`` fields (key -> possibly comma-separated value).

    Returns:
        The tags joined by newlines, or an empty string when there is nothing to emit.
    """
    lines: List[str] = []
    for key in sorted(values.keys()):
        esc_name = html.escape(key, quote=True)
        for value in split_meta_values(values.get(key, '')):
            esc_val = html.escape(value, quote=True)
            # NOTE(review): the tag text was missing from the reviewed source (only an
            # empty f-string remained, leaving esc_name/esc_val unused). The shape below
            # follows this function's contract and Pagefind's ``name[attribute]``
            # capture syntax - confirm against the page template.
            lines.append(
                f'<meta name="{esc_name}" content="{esc_val}" '
                f'data-pagefind-filter="{esc_name}[content]">'
            )
    return '\n '.join(lines)
_pagefind_bundle_prefix(app, pagename: str) -> str: + """Relative URL prefix from current HTML page to the site root ``pagefind/`` directory. + + Must start with ``./`` or ``../`` so the browser resolves dynamic imports (e.g. + ``import(bundlePath + 'pagefind.js')``) as URLs, not bare module specifiers. + + For ``sphinx-multiversion``, each distro is built with ``pagename`` relative to that + distro tree (e.g. ``index``), but HTML is served under ``/{smv_current_version}/``. + The Pagefind bundle lives at the site root (``build/html/pagefind/``), so add one + ``../`` when ``smv_current_version`` is set. + """ + builder = getattr(app, 'builder', None) + if builder is not None: + target_uri = builder.get_target_uri(pagename, typ='html') + depth = len(PurePosixPath(target_uri).parent.parts) + else: + depth = pagename.count('/') + + version = getattr(app.config, 'smv_current_version', '') or '' + if version: + depth += 1 + + if depth == 0: + return './pagefind/' + return ('../' * depth) + 'pagefind/' + + +def _pagefind_component_urls(app, pagename: str) -> Tuple[str, str]: + """(css_href, js_href) relative to current page.""" + prefix = _pagefind_bundle_prefix(app, pagename) + return prefix + 'pagefind-component-ui.css', prefix + 'pagefind-component-ui.js' + + +def _search_results_href(app, pagename: str) -> str: + """Relative URL from the current page to Sphinx's ``search.html``. + + Uses the HTML builder's relative URI helper so multiversion pages under + ``/{distro}/`` link to ``/{distro}/search.html``, not site-root + ``/search.html`` (which may be wrong after ``make multiversion``). 
+ """ + builder = getattr(app, 'builder', None) + if builder is None: + return 'search.html' + try: + current = builder.get_target_uri(pagename, typ='html') + target = builder.get_target_uri('search', typ='html') + rel = builder.get_relative_uri(current, target) + if rel: + return rel + except (AttributeError, KeyError, ValueError): + pass + return 'search.html' + + +def _merge_index_entries(app, distro: str) -> List[Dict[str, Any]]: + """Build mergeIndex list from conf (pinned docs.ros.org template).""" + pkgs: List[str] = list(getattr(app.config, 'pagefind_merge_package_pkgs', []) or []) + if not pkgs or not getattr(app.config, 'pagefind_merge_enabled', False): + return [] + base = getattr(app.config, 'pagefind_merge_index_base', 'https://docs.ros.org').rstrip('/') + overrides = getattr(app.config, 'pagefind_merge_index_overrides', {}) or {} + out: List[Dict[str, Any]] = [] + for pkg in pkgs: + key = f'{distro}/{pkg}' + if key in overrides: + bundle = overrides[key] + else: + bundle = f'{base}/en/{distro}/p/{pkg}/pagefind' + entry: Dict[str, Any] = {'bundlePath': bundle} + mf = getattr(app.config, 'pagefind_merge_filter_per_pkg', None) + if isinstance(mf, dict) and pkg in mf: + entry['mergeFilter'] = mf[pkg] + iw = getattr(app.config, 'pagefind_merge_index_weight_per_pkg', None) + if isinstance(iw, dict) and pkg in iw: + entry['indexWeight'] = iw[pkg] + out.append(entry) + return out + + +def _html_page_context( + app, + pagename: str, + templatename: str, + context: Dict[str, Any], + doctree, +) -> None: + sorted_keys = _union_meta_keys(app.env) + metadata_fields = _metadata_fields_for_keys(app, sorted_keys) + filter_csv = ','.join(sorted_keys) + + empty = { + 'pagefind_seo_filter_metas': '', + 'pagefind_data_meta_attr': '', + 'pagefind_bundle_prefix': './pagefind/', + 'pagefind_component_css': './pagefind/pagefind-component-ui.css', + 'pagefind_component_js': './pagefind/pagefind-component-ui.js', + 'pagefind_merge_index': [], + 'pagefind_filter_keys_csv': 
def _collect_meta_keys_on_read(app, doctree: nodes.document) -> None:
    """``doctree-read`` adapter: record the current document's meta keys while it is read."""
    _collect_meta_keys(app, doctree, app.env.docname)


def setup(app) -> Dict[str, Any]:
    """Register the Pagefind config values and event handlers with Sphinx.

    Meta keys are collected during the read phase (``doctree-read``), so the
    union of keys exposed to templates is complete before the first page is
    written. Collecting only on ``doctree-resolved`` happens while pages are
    being written, which gave early pages a partial key list. The
    ``doctree-resolved`` hook is kept as a fallback for documents whose
    doctree was read by an earlier non-HTML build.

    Returns:
        Sphinx extension metadata (version and parallel-safety flags).
    """
    app.add_config_value('pagefind_merge_enabled', default=False, rebuild='html')
    app.add_config_value('pagefind_merge_package_pkgs', default=[], rebuild='html')
    app.add_config_value('pagefind_merge_index_base', default='https://docs.ros.org', rebuild='html')
    app.add_config_value('pagefind_merge_index_overrides', default={}, rebuild='html')
    app.add_config_value('pagefind_merge_filter_per_pkg', default=None, rebuild='html')
    app.add_config_value('pagefind_merge_index_weight_per_pkg', default=None, rebuild='html')
    app.add_config_value('pagefind_filter_labels', default={}, rebuild='html')
    app.add_config_value('pagefind_result_meta_order', default=[], rebuild='html')

    app.connect('html-page-context', _html_page_context)
    app.connect('doctree-read', _collect_meta_keys_on_read)
    app.connect('doctree-resolved', _collect_meta_keys)
    app.connect('env-purge-doc', _purge_meta_keys)
    app.connect('env-merge-info', _merge_meta_keys)

    return {
        'parallel_read_safe': True,
        'parallel_write_safe': True,
        'version': '1.0.0',
    }
meta::`` on this page.""" + + has_content = False + option_spec = { + 'order': directives.unchanged, + 'labels': directives.unchanged, + } + + def run(self) -> List[nodes.Node]: + node = showmeta_node() + node['order'] = self.options.get('order', '') + node['labels'] = self.options.get('labels', '') + self.set_source_info(node) + return [node] + + +def visit_skip_showmeta(self, node: showmeta_node) -> None: + raise nodes.SkipNode + + +def depart_showmeta_noop(self, node: showmeta_node) -> None: + pass + + +def _parse_labels(raw: str) -> dict[str, str]: + out: dict[str, str] = {} + for part in [p.strip() for p in raw.split(',') if p.strip() and '=' in p]: + key, _, value = part.partition('=') + key, value = key.strip(), value.strip() + if key: + out[key] = value + return out + + +def replace_showmeta_nodes(app, doctree: nodes.document, docname: str) -> None: + if app.builder.format != 'html': + for node in list(doctree.findall(showmeta_node)): + node.parent.remove(node) + return + + macros = _macros_flat(app) + meta = expand_all_meta_values(all_doctree_meta(doctree), macros) + + for node in list(doctree.findall(showmeta_node)): + order = [x.strip() for x in node.get('order', '').split(',') if x.strip()] + labels_map = _parse_labels(node.get('labels', '')) + if not order: + node.parent.remove(node) + continue + + parts: List[str] = [] + for key in order: + val = meta.get(key, '').strip() + if not val: + continue + label_base = labels_map.get(key) or _default_showmeta_label(key) + label_display = label_base if label_base.rstrip().endswith(':') else f'{label_base}:' + parts.append( + f'{html_module.escape(label_display)} ' + f'{html_module.escape(val)}' + ) + + if not parts: + node.parent.remove(node) + else: + inner = ' | '.join(parts) + raw = nodes.raw( + '', + f'

{inner}

class EnhanceMetrics(NamedTuple):
    """
    Immutable data structure representing analysis metrics derived from enhancement results.

    Attributes:
        counts_by_analysis: Dictionary mapping analysis types to their value counts.
            Example: {"content-type": {"task": 5, "concept": 3, "reference": 2}}
        files_with_results_count: Number of files that had analysis results.
        updated_files_count: Number of files that had metadata successfully updated.
    """
    counts_by_analysis: Dict[str, Dict[str, int]]
    files_with_results_count: int
    updated_files_count: int


class EnhanceData(NamedTuple):
    """
    Immutable data structure representing enhancement results.

    Attributes:
        results: Dictionary mapping file paths to analysis results.
            Format: {file_path: {analysis_type: result_value}}
        updated_files: Set of file paths that had metadata successfully updated.
    """
    results: Dict[str, Dict[str, str]]
    updated_files: Set[str]


def get_total_analysis_count(metrics: EnhanceMetrics) -> int:
    """
    Calculate the total number of analysis results across all analysis types.

    Note: Files with multiple analysis types contribute multiple counts.
    For unique file count, use metrics.files_with_results_count instead.

    Args:
        metrics: The metrics structure to analyse.

    Returns:
        Total count of all analysis results across all analysis types.
    """
    return sum(sum(counts.values()) for counts in metrics.counts_by_analysis.values())


def create_enhance_data() -> EnhanceData:
    """
    Initialise an empty EnhanceData structure.

    Returns:
        Empty EnhanceData with no results or updated files.
    """
    return EnhanceData(results={}, updated_files=set())


def add_analysis_result(data: EnhanceData, filename: str, analysis_type: str, result: str) -> EnhanceData:
    """
    Add an analysis result to the enhancement data.

    Returns a new EnhanceData instance with the added result; ``data`` is not modified.

    Args:
        data: Current enhancement data.
        filename: Path to the file (relative to repository root).
        analysis_type: Type of analysis (e.g., "content-type").
        result: Analysis result value.

    Returns:
        New EnhanceData with the result added.
    """
    # Copy the outer mapping and this file's inner mapping so the previous
    # EnhanceData instance keeps its own view of the results.
    new_results = {**data.results}
    file_results = {**new_results.get(filename, {})}
    file_results[analysis_type] = result
    new_results[filename] = file_results
    return EnhanceData(results=new_results, updated_files=data.updated_files)


def mark_file_updated(data: EnhanceData, filename: str) -> EnhanceData:
    """
    Mark a file as having been successfully updated with metadata.

    Returns a new EnhanceData instance with the file added to updated_files.

    Args:
        data: Current enhancement data.
        filename: Path to the file that was updated (relative to repository root).

    Returns:
        New EnhanceData with the file marked as updated.
    """
    return EnhanceData(results=data.results, updated_files=data.updated_files | {filename})


def calculate_metrics(data: EnhanceData) -> EnhanceMetrics:
    """
    Derive metrics from enhancement data.

    Result values are stripped and lower-cased before counting, so ``Task`` and
    ``task`` aggregate together.

    Args:
        data: Current enhancement data.

    Returns:
        EnhanceMetrics containing counts, file counts, and update counts.
    """
    counts_by_analysis: Dict[str, Dict[str, int]] = {}
    files_with_results_count = 0

    for file_results in data.results.values():
        if not file_results:
            continue
        files_with_results_count += 1
        for analysis_type, result_value in file_results.items():
            clean_value = result_value.strip().lower()
            value_counts = counts_by_analysis.setdefault(analysis_type, {})
            value_counts[clean_value] = value_counts.get(clean_value, 0) + 1

    return EnhanceMetrics(
        counts_by_analysis=counts_by_analysis,
        files_with_results_count=files_with_results_count,
        updated_files_count=len(data.updated_files)
    )


def get_files_with_results(data: EnhanceData) -> List[str]:
    """
    Get list of file paths that had analysis results.

    Args:
        data: Current enhancement data.

    Returns:
        List of file paths with at least one analysis result, in insertion order.
    """
    return [filename for filename, file_results in data.results.items() if file_results]


def get_updated_files(data: EnhanceData) -> List[str]:
    """
    Get list of file paths that had metadata successfully updated.

    Args:
        data: Current enhancement data.

    Returns:
        Sorted list of file paths that were updated with metadata. Sorting keeps
        the order stable across runs (plain set iteration order is not).
    """
    return sorted(data.updated_files)


def is_file_updated(data: EnhanceData, filename: str) -> bool:
    """
    Check if a file was successfully updated with metadata.

    Args:
        data: Current enhancement data.
        filename: Path to the file to check (relative to repository root).

    Returns:
        True if the file was updated, False otherwise.
    """
    return filename in data.updated_files


def get_analysis_types(data: EnhanceData) -> List[str]:
    """
    Get list of all analysis types performed.

    Args:
        data: Current enhancement data.

    Returns:
        Sorted list of unique analysis types found in results.
    """
    analysis_types: Set[str] = set()
    for file_results in data.results.values():
        analysis_types.update(file_results.keys())
    return sorted(analysis_types)


def get_result_for_file(data: EnhanceData, filename: str, analysis_type: str) -> Optional[str]:
    """
    Get analysis result for a specific file and analysis type.

    Args:
        data: Current enhancement data.
        filename: Path to the file (relative to repository root).
        analysis_type: Type of analysis (e.g., "content-type").

    Returns:
        Analysis result or None if not found.
    """
    return data.results.get(filename, {}).get(analysis_type)


def get_results_for_file(data: EnhanceData, filename: str) -> Dict[str, str]:
    """
    Get all analysis results for a specific file.

    Args:
        data: Current enhancement data.
        filename: Path to the file (relative to repository root).

    Returns:
        A copy of the analysis results for the file, or an empty dict if not found.
        Returning a copy keeps callers from mutating ``data`` through the result.
    """
    return dict(data.results.get(filename, {}))
meta::`` field names diff --git a/scripts/enhance_topics.py b/scripts/enhance_topics.py new file mode 100644 index 00000000000..0a3b5745672 --- /dev/null +++ b/scripts/enhance_topics.py @@ -0,0 +1,464 @@ +import logging +import re +import sys +import os +from typing import Optional + +from dotenv import load_dotenv +from openai import OpenAI, RateLimitError, APIConnectionError, OpenAIError +from tenacity import retry, stop_after_attempt, wait_random_exponential, retry_if_exception_type +from concurrent.futures import ThreadPoolExecutor + +from enhance_data import EnhanceData, add_analysis_result, calculate_metrics, create_enhance_data, get_results_for_file, mark_file_updated +from rst_utils import get_meta_names_from_content, inject_metadata_to_content + +logger = logging.getLogger(__name__) + +# Define constants +GPT_MODEL = "gpt-5.4-nano" # GPT model to use for the API calls +# Maximum content length in characters, approximately 300k tokens (leaving 100k for instructions/output) +MAX_CONTENT_LENGTH = 1200000 +RST_EXTENSION = '.rst' # File extension for RST files + +# Define timeout and retry parameters for API calls +# - Individual API calls timeout after DEFAULT_TIMEOUT seconds +# - On rate limits/connection errors, retry up to MAX_RETRIES times +# - Wait between retries, increasing exponentially: MIN_WAIT → MAX_WAIT (capped) +DEFAULT_TIMEOUT = 30 # Default timeout in seconds for an individual API call +MAX_RETRIES = 10 # Maximum number of retry attempts for exponential backoff +MIN_WAIT = 10 # Minimum wait time between retries in seconds +MAX_WAIT = 120 # Maximum wait time between retries in seconds + +KEYWORDS_PROMPT = """You are a content analyst, and your role is to analyze text content within supplied documents. + +Your role is to extract 3 to 5 keywords from the content for use in metadata. The keywords should be single words that are the most important and relevant words to the content topic. 
def _run_with_timeout(task, timeout: int, timeout_log_message: str):
    """
    Run ``task`` on a worker thread and wait at most ``timeout`` seconds for its result.

    The executor is shut down with ``wait=False`` so that a timed-out call does
    not keep the caller blocked; a ``with ThreadPoolExecutor()`` block would
    join the worker on exit and silently defeat the timeout.

    Args:
        task: Zero-argument callable to execute.
        timeout (int): Maximum time to wait for the result in seconds.
        timeout_log_message (str): ``%s``-style message logged (and used as the
            exception text) on timeout; receives ``timeout`` as its argument.

    Returns:
        Whatever ``task`` returns.

    Raises:
        TimeoutError: If ``task`` does not finish within ``timeout`` seconds.
        Exception: Any exception raised by ``task`` is propagated unchanged.
    """
    # Local import: before Python 3.11 this class is distinct from the builtin TimeoutError
    from concurrent.futures import TimeoutError as FuturesTimeoutError

    executor = ThreadPoolExecutor(max_workers=1)
    try:
        return executor.submit(task).result(timeout=timeout)
    except FuturesTimeoutError as exc:
        logger.error(timeout_log_message, timeout)
        # Always surface the builtin type, which is what callers catch
        raise TimeoutError(timeout_log_message % timeout) from exc
    finally:
        # Never join a worker that may still be stuck in a network call
        executor.shutdown(wait=False, cancel_futures=True)


@retry(
    retry=retry_if_exception_type((RateLimitError, APIConnectionError)),
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_random_exponential(multiplier=MIN_WAIT, max=MAX_WAIT),
    reraise=True
)
def analyze_content(client: OpenAI, content: str, prompt: str, timeout: int = DEFAULT_TIMEOUT) -> str:
    """
    Analyse content using OpenAI's API with retry and timeout logic.

    The request runs on a worker thread so the overall wait can be bounded on
    any platform; rate-limit and connection errors are retried by the
    decorator.

    Args:
        client (OpenAI): OpenAI client instance.
        content (str): Preprocessed content; truncated to MAX_CONTENT_LENGTH characters.
        prompt (str): Prompt for the AI model (sent as the system message).
        timeout (int): Maximum time to wait for response in seconds.

    Returns:
        str: Analysis result from the AI model, or an empty string when the
        model returns no message content.

    Raises:
        TimeoutError: If the API call exceeds the specified timeout.
        RateLimitError: If API rate limits are exceeded (will trigger retry).
        APIConnectionError: If connection fails (will trigger retry).
    """
    # Log the content length before potential truncation
    logger.debug(f"Processing content of length: {len(content)} characters")

    # Truncate content if it is too long
    if len(content) > MAX_CONTENT_LENGTH:
        logger.warning(f"Content truncated to {MAX_CONTENT_LENGTH} characters for analysis.")
        content = content[:MAX_CONTENT_LENGTH]

    def _make_api_call() -> str:
        """
        Perform the chat-completion request.

        Returns:
            str: The model's response content ("" when the content is None).

        Raises:
            RateLimitError, APIConnectionError: Propagated for retry handling.
        """
        try:
            logger.debug("Sending request to OpenAI API...")
            completion = client.chat.completions.create(
                model=GPT_MODEL,
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": f"Content:\n\n{content}"}
                ],
                # Bound the request itself so an abandoned worker thread also terminates
                timeout=timeout,
            )
            result = completion.choices[0].message.content
            logger.debug("Successfully received response from OpenAI API")
            return result if result is not None else ""
        except (RateLimitError, APIConnectionError) as e:
            logger.warning(f"Retryable error occurred: {str(e)}")
            raise  # Re-raise for retry decorator to handle

    return _run_with_timeout(_make_api_call, timeout, "API call timed out after %s seconds")


@retry(
    retry=retry_if_exception_type((RateLimitError, APIConnectionError)),
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_random_exponential(multiplier=MIN_WAIT, max=MAX_WAIT),
    reraise=True
)
def validate_content(client: OpenAI, generated: str, timeout: int = DEFAULT_TIMEOUT) -> bool:
    """
    Validate generated content using the moderation API and a separate English-language check.

    Intended for any model-generated text before it is persisted. Both checks
    run sequentially on one worker thread so the combined wait can be bounded;
    rate-limit and connection errors are retried by the decorator.

    Args:
        client (OpenAI): OpenAI client instance.
        generated (str): Model-generated text to validate.
        timeout (int): Maximum time to wait for the combined validation calls in seconds.

    Returns:
        bool: True if content passes moderation and the language check; False
        otherwise (including empty or whitespace-only input).

    Raises:
        TimeoutError: If the validation calls exceed the specified timeout.
        RateLimitError: If API rate limits are exceeded (will trigger retry).
        APIConnectionError: If connection fails (will trigger retry).
    """
    if not generated.strip():
        logger.debug("Validation skipped: empty generated content")
        return False

    text = generated
    if len(text) > MAX_CONTENT_LENGTH:
        logger.warning(
            "Generated text truncated to %s characters for validation.",
            MAX_CONTENT_LENGTH,
        )
        text = text[:MAX_CONTENT_LENGTH]

    def _run_validation() -> bool:
        """
        Run moderation and English checks sequentially.

        Returns:
            bool: True if both checks pass.

        Raises:
            RateLimitError, APIConnectionError: Propagated for retry handling.
        """
        try:
            logger.debug("Sending generated text to moderation API...")
            moderation = client.moderations.create(input=text, timeout=timeout)
        except (RateLimitError, APIConnectionError) as e:
            logger.warning("Retryable error during moderation: %s", e)
            raise

        if not moderation.results:
            logger.warning("Moderation API returned no results; treating as validation failure")
            return False

        result0 = moderation.results[0]
        if result0.flagged:
            categories = [
                name
                for name, flagged in result0.categories.model_dump().items()
                if flagged
            ]
            logger.warning(
                "Content failed moderation (flagged). Categories: %s",
                ", ".join(categories) if categories else "unknown",
            )
            return False

        try:
            logger.debug("Sending generated text for English-language validation...")
            completion = client.chat.completions.create(
                model=GPT_MODEL,
                messages=[
                    {"role": "system", "content": ENGLISH_LANGUAGE_CHECK_PROMPT},
                    {"role": "user", "content": f"Text:\n\n{text}"},
                ],
                timeout=timeout,
            )
        except (RateLimitError, APIConnectionError) as e:
            logger.warning("Retryable error during language validation: %s", e)
            raise

        answer = completion.choices[0].message.content
        raw = (answer or "").strip().lower()
        # Only a leading "yes" token counts as a pass; anything else fails closed
        match = re.match(r"^(yes|no)\b", raw)
        if not match or match.group(1) != "yes":
            logger.warning(
                "Content failed English-language validation (model answer: %r)",
                answer,
            )
            return False

        logger.debug("Generated content passed moderation and English-language validation")
        return True

    return _run_with_timeout(_run_validation, timeout, "Validation timed out after %s seconds")
+ """ + data = create_enhance_data() + + logger.debug("============================") + logger.debug("Performing content analysis:") + logger.debug("============================") + + for file_path in files: # Iterate through each file in the list + logger.debug(f"Analysing file: {file_path}") + + # Read the content of the file + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + except (OSError, PermissionError) as e: + logger.error("Error reading file %s: %s", file_path, e) + continue + except UnicodeDecodeError as e: + logger.error("Unicode decode error reading file %s: %s", file_path, e) + continue + + # Check if the content is not empty + if content.strip(): + # Check if the content has any meta fields already + existing_meta_names = get_meta_names_from_content(content) + for prompt_name, prompt in prompts.items(): # Iterate through each prompt in the dictionary + if prompt_name in existing_meta_names: + logger.warning( + "Skipping analysis for %s: meta field %r already present in .. 
def get_openai_client() -> OpenAI:
    """
    Create an OpenAI client authenticated with ``OPENAI_API_KEY``.

    ``load_dotenv()`` is called first, so the key may come from the process
    environment or from a ``.env`` file.
    NOTE(review): this relies on python-dotenv's default of not overriding
    variables that are already set, so the environment wins — confirm.

    Returns:
        OpenAI: Client instance constructed with the resolved API key.

    Raises:
        OpenAIError: If ``OPENAI_API_KEY`` is missing or empty.
    """
    # Load environment variables from .env file if present
    load_dotenv()

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise OpenAIError("OpenAI API key not found. Set OPENAI_API_KEY environment variable.")

    return OpenAI(api_key=api_key)


def enhance_metadata(files: list[str], client: Optional[OpenAI] = None) -> EnhanceData:
    """
    Enhance files with metadata based on content analysis.

    Args:
        files (list[str]): Paths to files to enhance.
        client (OpenAI, optional): OpenAI client instance. If None, creates new instance.

    Returns:
        EnhanceData: Enhancement data structure containing analysis results and
        update tracking. If a client cannot be created, the failure is logged
        and a fresh, empty structure is returned instead of raising.
    """
    if client is None:
        try:
            client = get_openai_client()
        except OpenAIError as e:
            # Best-effort tool: report the problem and hand back an empty result
            logger.error(f"Failed to initialise OpenAI client: {e}")
            return create_enhance_data()

    # TODO: Make this config-driven, so that we can easily add more prompts and analysis types
    prompts: dict[str, str] = {"description": DESCRIPTION_PROMPT, "keywords": KEYWORDS_PROMPT}

    data = analyze_files(files, client, prompts)  # Populate and validate ``EnhanceData.results`` from the model
    data = update_meta_files(files, data)  # Persist results as metadata fields and set ``updated_files``

    return data
+ """ + + logger.debug("===========================") + logger.debug("Updating metadata in files:") + logger.debug("===========================") + + current_data = data # Thread results through ``mark_file_updated`` immutably + + for file_path in files: + logger.debug("Updating metadata in file: %s", file_path) + metadata = get_results_for_file(current_data, file_path) + + # Confirm the metadata is not empty for the file, else skip + if not metadata: + logger.info("Skipping %s as it has no results for enhancement", file_path) + continue + + logger.debug("Metadata found for %s, proceeding with updates.", file_path) + + try: + with open(file_path, encoding="utf-8") as file: + content = file.read() # Full document; helpers locate or synthesise ``.. meta::`` + except (OSError, PermissionError) as exc: + logger.error("Error reading file %s: %s", file_path, exc) + continue + except UnicodeDecodeError as exc: + logger.error("Unicode decode error reading file %s: %s", file_path, exc) + continue + + new_content, changed = inject_metadata_to_content(content, metadata) + + # Confirm that at least one metadata has been changed for the file, else skip + if not changed: + logger.debug("No metadata changes applied for %s", file_path) + continue # All keys already present or no additions—do not touch the file + + try: + with open(file_path, "w", encoding="utf-8") as file: + file.write(new_content) # Full-document rewrite (same path as read) + except (OSError, PermissionError) as exc: + logger.error("Error writing file %s: %s", file_path, exc) + continue + except UnicodeEncodeError as exc: + logger.error("Unicode encode error while writing file %s: %s", file_path, exc) + continue + + current_data = mark_file_updated(current_data, file_path) # Record success for metrics only after a clean write + logger.debug("Updated file with supplied metadata: %s", file_path) + logger.debug("-" * 50) + + # ``files_with_results_count`` reflects files with at least one valid analysis result, and 
def main() -> None:
    """
    Command-line entry point.

    Takes file paths from ``sys.argv``, keeps those ending in the RST
    extension (case-insensitive), runs metadata enhancement on them and logs
    the resulting metrics. Exits with status 0 when there is nothing to do.

    Usage:
        python enhance_topics.py FILE [FILE ...]
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s %(name)s: %(message)s",
    )

    cli_paths = sys.argv[1:]

    # Nothing was passed at all
    if not cli_paths:
        logger.error("No input files provided. Pass a list of RST files as arguments.")
        sys.exit(0)

    # Something was passed, but none of it is RST
    rst_files = [path for path in cli_paths if path.lower().endswith(RST_EXTENSION)]
    if not rst_files:
        logger.info("No RST files found among provided arguments. Skipping enhancement.")
        sys.exit(0)

    # Run the enhancement and report what happened
    metrics = calculate_metrics(enhance_metadata(rst_files))
    logger.info(f"Enhanced files: {metrics.files_with_results_count} with at least one valid analysis result, and {metrics.updated_files_count} files updated, out of {len(rst_files)} RST files.")


if __name__ == "__main__":
    main()
def _find_meta_block(content: str) -> tuple[int, int, int, str, str]:
    """
    Find the first ``.. meta::`` directive and describe where its body sits.

    The body is the run of indented, non-blank lines directly after the marker
    line; the first blank line or column-0 line ends it.

    Returns:
        ``(start, marker_end, block_end, inner, indent)`` as character offsets
        into ``content`` plus the body text and its field indent. Without a
        directive the three offsets are ``-1``, ``inner`` is ``''`` and
        ``indent`` is three spaces.
    """
    fallback_indent = "   "
    marker = re.search(r"^\.\.\s+meta::\s*\n", content, re.MULTILINE)
    if marker is None:
        return -1, -1, -1, "", fallback_indent

    start, marker_end = marker.span()

    body_lines: list[str] = []
    for candidate in content[marker_end:].splitlines(keepends=True):
        blank = not candidate.strip()
        indented = candidate[:1] in (" ", "\t")
        if blank or not indented:
            break  # End of the directive body
        body_lines.append(candidate)

    if body_lines:
        # Reuse whatever indent the author gave the first field
        first = body_lines[0]
        indent = first[: len(first) - len(first.lstrip(" \t"))]
    else:
        indent = fallback_indent

    inner = "".join(body_lines)
    block_end = marker_end + len(inner)  # Measured before any newline is synthesised
    if inner and not inner.endswith("\n"):
        inner += "\n"  # Body ran to EOF without a newline; add one so appended fields start cleanly
    return start, marker_end, block_end, inner, indent


def _extract_meta_names_from_block(meta_block_inner: str) -> set[str]:
    """
    Return the field names found in the body of a ``.. meta::`` directive.

    A name is the text between the first two colons of an indented line,
    with surrounding whitespace stripped.
    """
    field_pattern = r"^[ \t]+:([^:\n]+?):"
    return {
        found.group(1).strip()
        for found in re.finditer(field_pattern, meta_block_inner, re.MULTILINE)
    }


def get_meta_names_from_content(content: str) -> set[str]:
    """
    Return the field names present in the first ``.. meta::`` block of ``content``.

    Yields an empty set when the document has no such directive.
    """
    inner = _find_meta_block(content)[3]
    return _extract_meta_names_from_block(inner)


def _normalise_meta_field_value(value: str) -> str:
    """Squash every run of whitespace to one space so the value fits on a single line."""
    words = value.split()
    return " ".join(words)


def inject_metadata_to_content(content: str, metadata: dict[str, str]) -> tuple[str, bool]:
    """
    Add ``.. meta::`` fields for the given name/value pairs.

    New fields are appended to the first existing ``.. meta::`` block. When
    the document has none, a new block is placed at the very top and leading
    whitespace of the document is dropped. Names already in the block are
    skipped with a warning.

    Returns:
        The resulting source and a flag telling whether anything was added.
    """
    start, marker_end, block_end, inner, indent = _find_meta_block(content)
    seen = _extract_meta_names_from_block(inner)

    new_lines: list[str] = []
    for name, raw in metadata.items():
        if name not in seen:
            new_lines.append(f"{indent}:{name}: {_normalise_meta_field_value(raw)}\n")
            seen.add(name)
        else:
            logger.warning(
                "Existing meta field %r in .. meta:: block; skipping",
                name,
            )

    if not new_lines:
        return content, False

    appended = "".join(new_lines)
    if start < 0:
        # No directive yet: build one and follow it with a blank line before the body
        return ".. meta::\n" + appended + "\n" + content.lstrip(), True

    # Splice the extended body back between the marker line and the rest of the file
    return content[:marker_end] + inner + appended + content[block_end:], True
args, kwargs = mock_client.chat.completions.create.call_args + sent_content = kwargs['messages'][1]['content'] + assert len(sent_content) <= MAX_CONTENT_LENGTH + len("Content:\n\n") + +def test_analyze_content_empty_response(mock_client): + """Test handling of empty response from API.""" + mock_completion = MagicMock() + mock_completion.choices = [MagicMock(message=MagicMock(content=None))] + mock_client.chat.completions.create.return_value = mock_completion + + result = analyze_content(mock_client, "Content", "Prompt") + assert result == "" + +# --- Tests for get_openai_client --- + +@patch('enhance_topics.load_dotenv') +@patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}) +def test_get_openai_client_success(mock_load_dotenv): + """Test successful client initialisation.""" + client = get_openai_client() + assert client.api_key == "test-key" + +@patch('enhance_topics.load_dotenv') +@patch.dict(os.environ, {}, clear=True) +def test_get_openai_client_missing_key(mock_load_dotenv): + """Test error when API key is missing.""" + with pytest.raises(OpenAIError, match="OpenAI API key not found"): + get_openai_client() + +# --- Tests for analyze_files --- + +@patch('enhance_topics.get_meta_names_from_content') +@patch('enhance_topics.analyze_content') +@patch('enhance_topics.validate_content') +@patch('enhance_topics.add_analysis_result') +@patch('enhance_topics.create_enhance_data') +def test_analyze_files_basic_flow( + mock_create_data, + mock_add_result, + mock_validate, + mock_analyze, + mock_get_meta, + mock_client +): + """Test the basic flow of analyze_files.""" + mock_create_data.return_value = EnhanceData(results={}, updated_files=set()) + mock_get_meta.return_value = [] # No existing metadata + mock_analyze.return_value = "Generated result" + mock_validate.return_value = True + mock_add_result.return_value = EnhanceData( + results={"file1.rst": {"description": "res"}}, + updated_files=set() + ) + + files = ["file1.rst"] + prompts = {"description": "desc 
prompt"} + + with patch("builtins.open", mock_open(read_data="File content")): + analyze_files(files, mock_client, prompts) + + mock_analyze.assert_called_once() + mock_validate.assert_called_once() + mock_add_result.assert_called_once() + +@patch('enhance_topics.get_meta_names_from_content') +def test_analyze_files_skips_existing_meta(mock_get_meta, mock_client): + """Test that files with existing metadata are skipped.""" + mock_get_meta.return_value = ["description"] # Description already exists + + files = ["file1.rst"] + prompts = {"description": "desc prompt"} + + with patch("builtins.open", mock_open(read_data="File content")): + with patch('enhance_topics.analyze_content') as mock_analyze: + analyze_files(files, mock_client, prompts) + mock_analyze.assert_not_called() + +# --- Tests for update_meta_files --- + +@patch('enhance_topics.get_results_for_file') +@patch('enhance_topics.inject_metadata_to_content') +@patch('enhance_topics.mark_file_updated') +def test_update_meta_files_writes_on_change( + mock_mark_updated, + mock_inject, + mock_get_results, + mock_client +): + """Test that files are written only when metadata changes.""" + mock_get_results.return_value = {"description": "new desc"} + mock_inject.return_value = ("New content", True) # Changed is True + mock_mark_updated.return_value = EnhanceData( + results={}, + updated_files={"file1.rst"} + ) + + data = EnhanceData( + results={"file1.rst": {"description": "new desc"}}, + updated_files=set() + ) + + m_open = mock_open(read_data="Old content") + with patch("builtins.open", m_open): + update_meta_files(["file1.rst"], data) + + # Verify write was called + m_open().write.assert_called_once_with("New content") + mock_mark_updated.assert_called_once() + +@patch('enhance_topics.get_results_for_file') +@patch('enhance_topics.inject_metadata_to_content') +def test_update_meta_files_skips_no_change(mock_inject, mock_get_results): + """Test that files are NOT written when no metadata changes.""" + 
mock_get_results.return_value = {"description": "same desc"} + mock_inject.return_value = ("Old content", False) # Changed is False + + data = EnhanceData( + results={"file1.rst": {"description": "same desc"}}, + updated_files=set() + ) + + m_open = mock_open(read_data="Old content") + with patch("builtins.open", m_open): + update_meta_files(["file1.rst"], data) + + # Verify write was NOT called + m_open().write.assert_not_called() + +# --- Tests for enhance_metadata --- + +@patch('enhance_topics.get_openai_client') +@patch('enhance_topics.analyze_files') +@patch('enhance_topics.update_meta_files') +def test_enhance_metadata_orchestration(mock_update, mock_analyze, mock_get_client): + """Test the orchestration in enhance_metadata.""" + mock_get_client.return_value = MagicMock() + mock_analyze.return_value = EnhanceData(results={"f": {"d": "r"}}, updated_files=set()) + mock_update.return_value = EnhanceData(results={"f": {"d": "r"}}, updated_files={"f"}) + + result = enhance_metadata(["file1.rst"]) + + assert result.updated_files == {"f"} + mock_get_client.assert_called_once() + mock_analyze.assert_called_once() + mock_update.assert_called_once() diff --git a/scripts/test/test_enhance_topics_validation.py b/scripts/test/test_enhance_topics_validation.py new file mode 100644 index 00000000000..54309bbf340 --- /dev/null +++ b/scripts/test/test_enhance_topics_validation.py @@ -0,0 +1,61 @@ +import pytest +from unittest.mock import MagicMock +import sys +import os + +# Add the scripts directory to sys.path to allow importing enhance_topics +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from enhance_topics import validate_content + +@pytest.fixture +def mock_client(): + """Provides a mocked OpenAI client.""" + return MagicMock() + +def test_validate_content_success(mock_client): + """Test that valid English content passes both moderation and language checks.""" + # Mock Moderation: Not flagged + mock_result = MagicMock() + 
mock_result.flagged = False + mock_client.moderations.create.return_value.results = [mock_result] + + # Mock Chat: Returns 'yes' + mock_completion = MagicMock() + mock_completion.choices = [MagicMock(message=MagicMock(content='yes'))] + mock_client.chat.completions.create.return_value = mock_completion + + assert validate_content(mock_client, "This is a valid English sentence.") is True + +def test_validate_content_moderation_fail(mock_client): + """Test that content flagged by moderation returns False.""" + # Mock Moderation: Flagged + mock_result = MagicMock() + mock_result.flagged = True + # Mock categories.model_dump() for the logger + mock_result.categories.model_dump.return_value = {"hate": True, "violence": False} + mock_client.moderations.create.return_value.results = [mock_result] + + assert validate_content(mock_client, "Some offensive content.") is False + # Verify chat.completions was NOT called (short-circuit) + mock_client.chat.completions.create.assert_not_called() + +def test_validate_content_language_fail(mock_client): + """Test that non-English content (as determined by the LLM) returns False.""" + # Mock Moderation: Not flagged + mock_result = MagicMock() + mock_result.flagged = False + mock_client.moderations.create.return_value.results = [mock_result] + + # Mock Chat: Returns 'no' + mock_completion = MagicMock() + mock_completion.choices = [MagicMock(message=MagicMock(content='no'))] + mock_client.chat.completions.create.return_value = mock_completion + + assert validate_content(mock_client, "Ceci n'est pas anglais.") is False + +def test_validate_content_empty_input(mock_client): + """Test that empty or whitespace-only input returns False immediately.""" + assert validate_content(mock_client, "") is False + assert validate_content(mock_client, " ") is False + mock_client.moderations.create.assert_not_called() diff --git a/source/About-ROS.rst b/source/About-ROS.rst index 05fe7db14e9..9f8afece373 100644 --- a/source/About-ROS.rst +++ 
b/source/About-ROS.rst @@ -3,10 +3,21 @@ About ROS ========= +.. meta:: + :contentType: about + :experience: beginner + :area: framework, tools, capabilities + :capability: simulation + :distro: {DISTRO} + :product: {PRODUCT} + + ROS (Robot Operating System) is an open-source ecosystem that provides the framework, tools, and libraries for building, deploying, running, and maintaining robotic applications. This article introduces the main areas of the ecosystem and outlines their intended use. -**Area: ROS-framework, ROS-tools, ROS-capabilities | Content-type: about | Experience: beginner** +.. showmeta:: + :order: product, distro, area, capability, contentType, experience + :labels: product=Product, distro=Distribution, area=Area, capability=Capability, contentType=Content type, experience=Level .. contents:: Table of Contents :depth: 2 diff --git a/source/The-ROS2-Project/Adopters/adopters.yaml b/source/The-ROS2-Project/Adopters/adopters.yaml index 30fd1772086..a064c11f8f1 100644 --- a/source/The-ROS2-Project/Adopters/adopters.yaml +++ b/source/The-ROS2-Project/Adopters/adopters.yaml @@ -95,7 +95,7 @@ adopters: description: "Development platform for autonomous mobile robots across logistics, construction, and retail." 
- organization: "RT Corporation" - organization_url: "https://rt-net.jp" + organization_url: "https://en.rt-net.jp" project: "CRANE-X7" project_url: "https://github.com/rt-net/crane_x7_ros" domain: diff --git a/source/_static/pagefind-docsearch.css b/source/_static/pagefind-docsearch.css new file mode 100644 index 00000000000..1f507fc202d --- /dev/null +++ b/source/_static/pagefind-docsearch.css @@ -0,0 +1,219 @@ +/* DocSearch-like sidebar trigger for Pagefind modal (plan §3) */ +.ros2-pagefind-search { + margin: 0.5rem 0 1rem; +} + +.ros2-pagefind-search pagefind-modal-trigger { + display: block; + width: 100%; +} + +/* Light styling for the trigger button (Pagefind exposes light DOM button) */ +.ros2-pagefind-search pagefind-modal-trigger::part(button), +.ros2-pagefind-search button { + align-items: center; + background: var(--wy-menu-vertical-background-color, #fcfcfc); + border: 1px solid #ccc; + border-radius: 40px; + color: var(--wy-menu-vertical-color, #404040); + cursor: pointer; + display: flex; + font-size: 0.85rem; + gap: 0.35rem; + justify-content: space-between; + min-height: 2.25rem; + padding: 0.35rem 0.6rem 0.35rem 0.75rem; + text-align: left; + width: 100%; +} + +.ros2-pagefind-search pagefind-modal-trigger::part(button):hover, +.ros2-pagefind-search button:hover { + border-color: #999; + box-shadow: 0 1px 2px rgba(0, 0, 0, 0.06); +} + +/* Keyboard hint styling (Algolia DocSearch-like) */ +.ros2-pagefind-search .DocSearch-Button-Keys, +.ros2-pagefind-search pagefind-modal-trigger::part(keys) { + display: flex; + gap: 0.2rem; +} + +.ros2-pagefind-search kbd, +.ros2-pagefind-search pagefind-modal-trigger::part(kbd) { + align-items: center; + background: linear-gradient(-225deg, #d5dbe4, #f8f8f8); + border: 0; + border-radius: 3px; + box-shadow: inset 0 -2px 0 0 #cdcde6, inset 0 0 1px 1px #fff, 0 1px 2px 1px rgba(30, 35, 90, 0.2); + color: #969faf; + display: flex; + font-size: 0.65rem; + font-weight: 600; + line-height: 1; + min-height: 1.25rem; + 
min-width: 1.25rem; + padding: 0 0.3rem; + justify-content: center; +} + +.wy-nav-side-scroll .ros2-pagefind-search { + padding-right: 0.5rem; +} + +.ros-page-meta-summary, +.ros2-pagefind-search .pf-result-meta-block, +#ros-search-page .pf-result-meta-block, +dialog.pf-modal .pf-result-meta-block { + margin: -0.25rem 0 1rem !important; + padding: 0.45rem 0.75rem !important; + border-left: 4px solid #6c757d !important; + background: #f8f9fa !important; + color: #495057 !important; + font-size: 0.85rem !important; +} + +.ros2-pagefind-search dialog.pf-modal { + width: clamp(900px, 60vw, 1200px) !important; + max-width: 92vw !important; + min-width: min(900px, 92vw) !important; +} + +.ros2-pagefind-search .ros-search-two-col, +#ros-search-page .ros-search-two-col { + display: grid; + grid-template-columns: minmax(220px, 260px) minmax(0, 1fr); + gap: 1rem; + min-height: 0; + width: 100%; +} + +.ros2-pagefind-search .ros-search-facets, +.ros2-pagefind-search .ros-search-results { + max-height: 62vh; + overflow: auto; + min-width: 0; +} + +#ros-search-page .ros-search-facets, +#ros-search-page .ros-search-results { + min-width: 0; +} + +.ros2-pagefind-search .ros-search-facets { + border-right: 1px solid #e9ecef; + padding-right: 0.75rem; +} + +#ros-search-page .ros-search-facets { + border-right: 1px solid #e9ecef; + padding-right: 0.75rem; +} + +.ros2-pagefind-search .ros-search-facets pagefind-filter-pane, +.ros2-pagefind-search .ros-search-results pagefind-summary, +.ros2-pagefind-search .ros-search-results pagefind-results, +#ros-search-page .ros-search-facets pagefind-filter-pane, +#ros-search-page .ros-search-results pagefind-summary, +#ros-search-page .ros-search-results pagefind-results { + display: block; +} + +.ros2-pagefind-search .ros-search-results pagefind-summary, +#ros-search-page .ros-search-results pagefind-summary { + margin-bottom: 0.75rem; +} + +.ros2-pagefind-search .pf-result-link, +#ros-search-page .pf-result-link { + font-size: 1rem; + 
font-weight: 700; + line-height: 1.25; +} + +.ros2-pagefind-search .pf-result-excerpt, +.ros2-pagefind-search .pf-result-preview, +#ros-search-page .pf-result-excerpt, +#ros-search-page .pf-result-preview { + font-size: 0.85rem; + line-height: 1.35; +} + +.ros2-pagefind-search .pf-result-meta-block, +#ros-search-page .pf-result-meta-block, +dialog.pf-modal .pf-result-meta-block { + margin-top: 0.35rem !important; + margin-bottom: 0.45rem !important; + border-radius: 0 !important; + display: block !important; + line-height: 1.35 !important; +} + +.ros2-pagefind-search .pf-result-meta-block b, +#ros-search-page .pf-result-meta-block b, +dialog.pf-modal .pf-result-meta-block b { + color: #495057 !important; + font-weight: 600 !important; +} + +/* Full-page search results (search.html) */ +.ros-search-page { + padding: 0 0 2rem; +} + +.ros-search-page-input-row { + margin-bottom: 1.5rem; +} + +.ros-search-page-input-row pagefind-input { + display: block; + width: 100%; +} + +.ros-search-page-two-col .ros-search-facets, +.ros-search-page-two-col .ros-search-results { + max-height: none; + overflow: visible; +} + +/* + Force Pagefind's per-result IntersectionObserver to use the + pagefind-results element as its root. The component walks up the DOM looking for an ancestor + whose computed overflow-y is not "visible" or "hidden"; without this, no + ancestor matches on a dedicated search page (everything renders with default + overflow), the observer never fires, and result cards remain skeletons. + + Setting overflow-y: auto with no max-height gives the observer a valid root + without producing any visible scrollbar - the element grows to fit content + naturally and the page itself remains the scroll context for the user.
+*/ +#ros-search-page pagefind-results { + overflow-y: auto !important; +} + +@media (max-width: 980px) { + .ros2-pagefind-search .ros-search-two-col, + #ros-search-page .ros-search-two-col { + grid-template-columns: 1fr; + } + + .ros2-pagefind-search .ros-search-facets, + .ros2-pagefind-search .ros-search-results { + max-height: none; + } + + .ros2-pagefind-search .ros-search-facets { + border-right: 0; + border-bottom: 1px solid #e9ecef; + margin-bottom: 0.75rem; + padding: 0 0 0.75rem; + } + + #ros-search-page .ros-search-facets { + border-right: 0; + border-bottom: 1px solid #e9ecef; + margin-bottom: 0.75rem; + padding: 0 0 0.75rem; + } +} diff --git a/source/_templates/layout.html b/source/_templates/layout.html new file mode 100644 index 00000000000..94830854a69 --- /dev/null +++ b/source/_templates/layout.html @@ -0,0 +1,9 @@ +{% extends "!layout.html" %} +{% block extrahead %} + {{ super() }} + {% if pagefind_seo_filter_metas %} + + {{ pagefind_seo_filter_metas|safe }} + + {% endif %} +{% endblock %} diff --git a/source/_templates/search.html b/source/_templates/search.html new file mode 100644 index 00000000000..520bfb9b715 --- /dev/null +++ b/source/_templates/search.html @@ -0,0 +1,37 @@ +{# + Override RTD/Sphinx search page: Pagefind full-page UI instead of searchtools.js. +#} +{%- extends "layout.html" %} +{% set title = _('Search') %} +{% set display_vcs_links = False %} +{%- block scripts %} + {{ super() }} +{%- endblock %} +{% block footer %} + {{ super() }} +{% endblock %} +{% block body %} + + +
+
+ +
+
+ +
+ + +
+
+
+{% endblock %} diff --git a/source/_templates/searchbox.html b/source/_templates/searchbox.html new file mode 100644 index 00000000000..4485c542d3c --- /dev/null +++ b/source/_templates/searchbox.html @@ -0,0 +1,573 @@ +{# Pagefind Component UI (plan §3) + DocSearch-inspired styling via pagefind-docsearch.css #} + + + +