Add a tool to check removed HTML anchors

encukou · encukou · commit 3aa8db06993c · 2026-02-18T15:54:48.000+01:00
diff --git a/Doc/.ruff.toml b/Doc/.ruff.toml
@@ -30,6 +30,7 @@ select = [
 ]
 ignore = [
     "E501",  # Ignore line length errors (we use auto-formatting)
+    "I001",  # Import block is un-sorted or un-formatted
 ]
 
 [format]
diff --git a/Doc/Makefile b/Doc/Makefile
@@ -336,3 +336,9 @@ autobuild-stable-html:
 		exit 1;; \
 	esac
 	@$(MAKE) autobuild-dev-html
+
+# Collect the HTML IDs of the built docs (build/html) into a gzipped JSON document
+.PHONY: html-ids
+html-ids:
+	$(PYTHON) tools/check-html-ids.py collect build/html \
+		-o build/html/html-ids.json.gz
diff --git a/Doc/data/removed-ids.txt b/Doc/data/removed-ids.txt
@@ -0,0 +1,93 @@
+# Known removed HTML IDs:
+
+c-api/complex.html: complex-numbers-as-python-objects
+
+c-api/extension-modules.html: initialization-function
+
+c-api/import.html: c.PyImport_ImportModuleNoBlock
+
+c-api/init.html: c.Py_GetExecPrefix
+c-api/init.html: c.Py_GetPath
+c-api/init.html: c.Py_GetPrefix
+c-api/init.html: c.Py_GetProgramFullPath
+c-api/init.html: c.Py_GetProgramName
+c-api/init.html: c.Py_GetPythonHome
+
+c-api/module.html: module-definitions
+c-api/module.html: module-slots
+
+c-api/stable.html: c-api-stability
+
+c-api/sys.html: c.PySys_ResetWarnOptions
+
+c-api/weakref.html: c.PyWeakref_GET_OBJECT
+c-api/weakref.html: c.PyWeakref_GetObject
+
+extending/extending.html: a-simple-example
+extending/extending.html: back-to-the-example
+extending/extending.html: backtoexample
+extending/extending.html: compilation-and-linkage
+extending/extending.html: extending-python-with-c-or-c
+extending/extending.html: extending-simpleexample
+extending/extending.html: intermezzo-errors-and-exceptions
+extending/extending.html: methodtable
+extending/extending.html: the-module-s-method-table-and-initialization-function
+
+extending/index.html: creating-extensions-without-third-party-tools
+
+howto/perf_profiling.html: python-support-for-the-linux-perf-profiler
+
+library/dis.html: opcode-LOAD_CONST_IMMORTAL
+
+library/ftplib.html: ftplib.FTP_TLS.ssl_version
+
+library/http.server.html: cmdoption-http.server-cgi
+library/http.server.html: http.server.CGIHTTPRequestHandler
+library/http.server.html: http.server.CGIHTTPRequestHandler.cgi_directories
+library/http.server.html: http.server.CGIHTTPRequestHandler.do_POST
+
+library/importlib.html: importlib.abc.FileLoader.load_module
+library/importlib.html: importlib.abc.InspectLoader.load_module
+library/importlib.html: importlib.abc.Loader.load_module
+library/importlib.html: importlib.abc.SourceLoader.load_module
+library/importlib.html: importlib.machinery.SourceFileLoader.load_module
+library/importlib.html: importlib.machinery.SourcelessFileLoader.load_module
+
+library/pathlib.html: pathlib.PurePath.is_reserved
+
+library/platform.html: java-platform
+library/platform.html: platform.java_ver
+
+library/profile.html: cmdoption-cProfile-m
+library/profile.html: cmdoption-cProfile-o
+library/profile.html: cmdoption-cProfile-s
+library/profile.html: instant-user-s-manual
+library/profile.html: introduction-to-the-profilers
+library/profile.html: module-cProfile
+library/profile.html: module-pstats
+library/profile.html: profile-and-cprofile-module-reference
+library/profile.html: profile-cli
+library/profile.html: profile-instant
+library/profile.html: profile-stats
+library/profile.html: profiler-introduction
+library/profile.html: pstats.Stats
+library/profile.html: pstats.Stats.add
+library/profile.html: pstats.Stats.dump_stats
+library/profile.html: pstats.Stats.get_stats_profile
+library/profile.html: pstats.Stats.print_callees
+library/profile.html: pstats.Stats.print_callers
+library/profile.html: pstats.Stats.print_stats
+library/profile.html: pstats.Stats.reverse_order
+library/profile.html: pstats.Stats.sort_stats
+library/profile.html: pstats.Stats.strip_dirs
+library/profile.html: the-python-profilers
+library/profile.html: the-stats-class
+
+library/typing.html: typing.no_type_check_decorator
+
+library/wave.html: wave.Wave_read.getmark
+library/wave.html: wave.Wave_read.getmarkers
+
+library/zipimport.html: zipimport.zipimporter.load_module
+
+reference/datamodel.html: module.__cached__
diff --git a/Doc/tools/check-html-ids.py b/Doc/tools/check-html-ids.py
@@ -0,0 +1,171 @@
+from compression import gzip
+import concurrent.futures
+from pathlib import Path
+import html.parser
+import functools
+import argparse
+import json
+import sys
+import re
+
+
+# HTML IDs that are skipped both when collecting and when comparing.
+# The pattern is applied with ``fullmatch``, so an ID is ignored only if the
+# whole ID is one of:
+#   - "index-" followed by digits,
+#   - "id" followed by digits,
+#   - lowercase letters/underscores, then "_" and digits.
+# NOTE(review): these look like auto-numbered IDs whose numbers can shift
+# between builds -- confirm.
+IGNORED_ID_RE = re.compile(r"""
+    index-\d+
+    | id\d+
+    | [_a-z]+_\d+
+""", re.VERBOSE)
+
+
+class IDGatherer(html.parser.HTMLParser):
+    def __init__(self, ids):
+        super().__init__()
+        self.__ids = ids
+
+    def handle_starttag(self, tag, attrs):
+        for name, value in attrs:
+            if name == 'id':
+                if not IGNORED_ID_RE.fullmatch(value):
+                    self.__ids.add(value)
+
+
+def get_ids_from_file(path):
+    ids = set()
+    gatherer = IDGatherer(ids)
+    with path.open() as file:
+        while chunk := file.read(4096):
+            gatherer.feed(chunk)
+    return ids
+
+
+def gather_ids(htmldir, *, verbose_print):
+    if not htmldir.joinpath('objects.inv').exists():
+        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')
+
+    if sys._is_gil_enabled:
+        pool = concurrent.futures.ProcessPoolExecutor()
+    else:
+        pool = concurrent.futures.ThreadPoolExecutor()
+    tasks = {}
+    for path in htmldir.glob('**/*.html'):
+        relative_path = path.relative_to(htmldir)
+        if '_static' in relative_path.parts:
+            continue
+        if 'whatsnew' in relative_path.parts:
+            continue
+        tasks[relative_path] = pool.submit(get_ids_from_file, path=path)
+
+    ids_by_page = {}
+    for relative_path, future in tasks.items():
+        verbose_print(relative_path)
+        ids = future.result()
+        ids_by_page[str(relative_path)] = future.result()
+        verbose_print(f'    - {len(ids)} ids found')
+
+    common = set.intersection(*ids_by_page.values())
+    verbose_print(f'Filtering out {len(common)} common ids')
+    for key, page_ids in ids_by_page.items():
+        ids_by_page[key] = sorted(page_ids - common)
+
+    return ids_by_page
+
+
+def do_check(baseline, checked, excluded, *, verbose_print):
+    successful = True
+    for name, baseline_ids in sorted(baseline.items()):
+        try:
+            checked_ids = checked[name]
+        except KeyError:
+            successful = False
+            print(f'{name}: (page missing)')
+            print()
+        else:
+            missing_ids = set(baseline_ids) - set(checked_ids)
+            if missing_ids:
+                missing_ids = {
+                    a for a in missing_ids
+                    if not IGNORED_ID_RE.fullmatch(a)
+                    and (name, a) not in excluded
+                }
+            if missing_ids:
+                successful = False
+                for missing_id in sorted(missing_ids):
+                    print(f'{name}: {missing_id}')
+                print()
+    return successful
+
+
+def main(argv):
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-v', '--verbose', action='store_true',
+        help='print out more information')
+    subparsers = parser.add_subparsers(dest='command', required=True)
+
+    collect = subparsers.add_parser(
+        'collect',
+        help='collect IDs from a set of HTML files')
+    collect.add_argument(
+        'htmldir', type=Path,
+        help='directory with HTML documentation')
+    collect.add_argument(
+        '-o', '--outfile',
+        help='File to save the result in; default <htmldir>/html-ids.json.gz')
+
+    check = subparsers.add_parser(
+        'check',
+        help='check two archives of IDs')
+    check.add_argument(
+        'baseline_file', type=Path,
+        help='file with baseline IDs')
+    check.add_argument(
+        'checked_file', type=Path,
+        help='file with checked IDs')
+    check.add_argument(
+        '-x', '--exclude-file', type=Path,
+        help='file with IDs to exclude from the check')
+
+    args = parser.parse_args(argv[1:])
+
+    if args.verbose:
+        verbose_print = functools.partial(print, file=sys.stderr)
+    else:
+        def verbose_print(*args, **kwargs):
+            """do nothing"""
+
+    if args.command == 'collect':
+        ids = gather_ids(args.htmldir, verbose_print=verbose_print)
+        if args.outfile is None:
+            args.outfile = args.htmldir / 'html-ids.json.gz'
+        with gzip.open(args.outfile, 'wt') as zfile:
+            json.dump({'ids_by_page': ids}, zfile)
+
+    if args.command == 'check':
+        with gzip.open(args.baseline_file) as zfile:
+            baseline = json.load(zfile)['ids_by_page']
+        with gzip.open(args.checked_file) as zfile:
+            checked = json.load(zfile)['ids_by_page']
+        excluded = set()
+        if args.exclude_file:
+            with open(args.exclude_file) as file:
+                for line in file:
+                    line = line.strip()
+                    if line and not line.startswith('#'):
+                        name, sep, excluded_id = line.partition(':')
+                        if sep:
+                            excluded.add((name.strip(), excluded_id.strip()))
+        if do_check(baseline, checked, excluded, verbose_print=verbose_print):
+            verbose_print('All OK')
+        else:
+            sys.stdout.flush()
+            print(
+                'ERROR: Removed IDs found',
+                'The above HTML IDs were removed from the documentation, '
+                + 'resulting in broken links. Please add them back.',
+                sep='\n',
+                file=sys.stderr)
+            if args.exclude_file:
+                print(f'Alternatively, add them to {args.exclude_file}.')
+
+
+if __name__ == '__main__':
+    main(sys.argv)

Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,7 @@ select = [`
`30`	`30`	`]`
`31`	`31`	`ignore = [`
`32`	`32`	`"E501", # Ignore line length errors (we use auto-formatting)`
	`33`	`+ "I001", # Import block is un-sorted or un-formatted`
`33`	`34`	`]`
`34`	`35`
`35`	`36`	`[format]`