Skip to content

Commit 3aa8db0

Browse files
committed
Add a tool to check removed HTML anchors
1 parent f705486 commit 3aa8db0

File tree

4 files changed

+271
-0
lines changed

4 files changed

+271
-0
lines changed

Doc/.ruff.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ select = [
3030
]
3131
ignore = [
3232
"E501", # Ignore line length errors (we use auto-formatting)
33+
"I001", # Import block is un-sorted or un-formatted
3334
]
3435

3536
[format]

Doc/Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,3 +336,9 @@ autobuild-stable-html:
336336
exit 1;; \
337337
esac
338338
@$(MAKE) autobuild-dev-html
339+
340+
# Collect HTML IDs to a JSON document
#
# Runs tools/check-html-ids.py in "collect" mode over build/html and writes
# the gzip-compressed JSON result to build/html/html-ids.json.gz.
# The target declares no prerequisites, so build/html must already contain
# the HTML output when this is invoked (the script rejects a directory
# without objects.inv).
.PHONY: html-ids
html-ids:
	$(PYTHON) tools/check-html-ids.py collect build/html \
		-o build/html/html-ids.json.gz

Doc/data/removed-ids.txt

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Known removed HTML IDs:
2+
3+
c-api/complex.html: complex-numbers-as-python-objects
4+
5+
c-api/extension-modules.html: initialization-function
6+
7+
c-api/import.html: c.PyImport_ImportModuleNoBlock
8+
9+
c-api/init.html: c.Py_GetExecPrefix
10+
c-api/init.html: c.Py_GetPath
11+
c-api/init.html: c.Py_GetPrefix
12+
c-api/init.html: c.Py_GetProgramFullPath
13+
c-api/init.html: c.Py_GetProgramName
14+
c-api/init.html: c.Py_GetPythonHome
15+
16+
c-api/module.html: module-definitions
17+
c-api/module.html: module-slots
18+
19+
c-api/stable.html: c-api-stability
20+
21+
c-api/sys.html: c.PySys_ResetWarnOptions
22+
23+
c-api/weakref.html: c.PyWeakref_GET_OBJECT
24+
c-api/weakref.html: c.PyWeakref_GetObject
25+
26+
extending/extending.html: a-simple-example
27+
extending/extending.html: back-to-the-example
28+
extending/extending.html: backtoexample
29+
extending/extending.html: compilation-and-linkage
30+
extending/extending.html: extending-python-with-c-or-c
31+
extending/extending.html: extending-simpleexample
32+
extending/extending.html: intermezzo-errors-and-exceptions
33+
extending/extending.html: methodtable
34+
extending/extending.html: the-module-s-method-table-and-initialization-function
35+
36+
extending/index.html: creating-extensions-without-third-party-tools
37+
38+
howto/perf_profiling.html: python-support-for-the-linux-perf-profiler
39+
40+
library/dis.html: opcode-LOAD_CONST_IMMORTAL
41+
42+
library/ftplib.html: ftplib.FTP_TLS.ssl_version
43+
44+
library/http.server.html: cmdoption-http.server-cgi
45+
library/http.server.html: http.server.CGIHTTPRequestHandler
46+
library/http.server.html: http.server.CGIHTTPRequestHandler.cgi_directories
47+
library/http.server.html: http.server.CGIHTTPRequestHandler.do_POST
48+
49+
library/importlib.html: importlib.abc.FileLoader.load_module
50+
library/importlib.html: importlib.abc.InspectLoader.load_module
51+
library/importlib.html: importlib.abc.Loader.load_module
52+
library/importlib.html: importlib.abc.SourceLoader.load_module
53+
library/importlib.html: importlib.machinery.SourceFileLoader.load_module
54+
library/importlib.html: importlib.machinery.SourcelessFileLoader.load_module
55+
56+
library/pathlib.html: pathlib.PurePath.is_reserved
57+
58+
library/platform.html: java-platform
59+
library/platform.html: platform.java_ver
60+
61+
library/profile.html: cmdoption-cProfile-m
62+
library/profile.html: cmdoption-cProfile-o
63+
library/profile.html: cmdoption-cProfile-s
64+
library/profile.html: instant-user-s-manual
65+
library/profile.html: introduction-to-the-profilers
66+
library/profile.html: module-cProfile
67+
library/profile.html: module-pstats
68+
library/profile.html: profile-and-cprofile-module-reference
69+
library/profile.html: profile-cli
70+
library/profile.html: profile-instant
71+
library/profile.html: profile-stats
72+
library/profile.html: profiler-introduction
73+
library/profile.html: pstats.Stats
74+
library/profile.html: pstats.Stats.add
75+
library/profile.html: pstats.Stats.dump_stats
76+
library/profile.html: pstats.Stats.get_stats_profile
77+
library/profile.html: pstats.Stats.print_callees
78+
library/profile.html: pstats.Stats.print_callers
79+
library/profile.html: pstats.Stats.print_stats
80+
library/profile.html: pstats.Stats.reverse_order
81+
library/profile.html: pstats.Stats.sort_stats
82+
library/profile.html: pstats.Stats.strip_dirs
83+
library/profile.html: the-python-profilers
84+
library/profile.html: the-stats-class
85+
86+
library/typing.html: typing.no_type_check_decorator
87+
88+
library/wave.html: wave.Wave_read.getmark
89+
library/wave.html: wave.Wave_read.getmarkers
90+
91+
library/zipimport.html: zipimport.zipimporter.load_module
92+
93+
reference/datamodel.html: module.__cached__

Doc/tools/check-html-ids.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
from compression import gzip
2+
import concurrent.futures
3+
from pathlib import Path
4+
import html.parser
5+
import functools
6+
import argparse
7+
import json
8+
import sys
9+
import re
10+
11+
12+
IGNORED_ID_RE = re.compile(r"""
13+
index-\d+
14+
| id\d+
15+
| [_a-z]+_\d+
16+
""", re.VERBOSE)
17+
18+
19+
class IDGatherer(html.parser.HTMLParser):
    """HTML parser that collects the ``id`` attributes of start tags.

    Every ``id`` value that does not fully match ``IGNORED_ID_RE`` is added
    to the set passed to the constructor; the set is mutated in place, so
    the caller keeps a reference to it to read the result.
    """

    def __init__(self, ids):
        """Store *ids*, the set that found IDs are added to."""
        super().__init__()
        self.__ids = ids

    def handle_starttag(self, tag, attrs):
        """Record the ``id`` attribute (if any) of a start tag."""
        for name, value in attrs:
            if name != 'id':
                continue
            # HTMLParser reports a valueless attribute (``<div id>``) as
            # ``('id', None)``; that, like an empty value, is not a usable
            # link target and would make ``fullmatch`` raise TypeError.
            if not value:
                continue
            if not IGNORED_ID_RE.fullmatch(value):
                self.__ids.add(value)
29+
30+
31+
def get_ids_from_file(path):
    """Return the set of HTML IDs found in the file at *path*.

    *path* is a ``pathlib.Path``-like object with an ``open`` method.  IDs
    matching ``IGNORED_ID_RE`` are left out (see ``IDGatherer``).
    """
    ids = set()
    gatherer = IDGatherer(ids)
    # Read as UTF-8 explicitly rather than relying on the locale's default
    # encoding, which differs between platforms.
    with path.open(encoding='utf-8') as file:
        # Feed the parser in chunks so a large page is never held in
        # memory as a single string.
        while chunk := file.read(4096):
            gatherer.feed(chunk)
    # Make the parser process anything it is still buffering.
    gatherer.close()
    return ids
38+
39+
40+
def gather_ids(htmldir, *, verbose_print):
    """Collect the HTML IDs of the pages below *htmldir*.

    *htmldir* is a ``pathlib.Path`` pointing at a directory that contains
    an ``objects.inv`` file.  Every ``*.html`` file below it is scanned,
    except those with a ``_static`` or ``whatsnew`` path component.
    *verbose_print* is called like ``print`` with progress messages.

    Returns a dict that maps each page's path (relative to *htmldir*, as a
    string) to a sorted list of its IDs.  IDs that occur on every scanned
    page are dropped from all of them.  An empty dict is returned when no
    page was scanned.

    Raises ValueError if *htmldir* has no ``objects.inv``.
    """
    if not htmldir.joinpath('objects.inv').exists():
        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')

    # ``sys._is_gil_enabled`` is a function and has to be *called*; testing
    # the function object itself is always true.  Interpreters without the
    # function are treated as having the GIL enabled.
    gil_enabled = getattr(sys, '_is_gil_enabled', lambda: True)()
    if gil_enabled:
        executor_class = concurrent.futures.ProcessPoolExecutor
    else:
        executor_class = concurrent.futures.ThreadPoolExecutor

    ids_by_page = {}
    # The ``with`` block shuts the pool down once all results are in.
    with executor_class() as pool:
        tasks = {}
        for path in htmldir.glob('**/*.html'):
            relative_path = path.relative_to(htmldir)
            if '_static' in relative_path.parts:
                continue
            if 'whatsnew' in relative_path.parts:
                continue
            tasks[relative_path] = pool.submit(get_ids_from_file, path=path)

        for relative_path, future in tasks.items():
            verbose_print(relative_path)
            ids = future.result()
            ids_by_page[str(relative_path)] = ids
            verbose_print(f' - {len(ids)} ids found')

    if not ids_by_page:
        # ``set.intersection()`` needs at least one set to work on.
        return ids_by_page

    common = set.intersection(*ids_by_page.values())
    verbose_print(f'Filtering out {len(common)} common ids')
    for key, page_ids in ids_by_page.items():
        ids_by_page[key] = sorted(page_ids - common)

    return ids_by_page
70+
71+
72+
def do_check(baseline, checked, excluded, *, verbose_print):
    """Report IDs present in *baseline* but absent from *checked*.

    *baseline* and *checked* map page names to collections of IDs;
    *excluded* is a container of ``(page, id)`` pairs that may be missing
    without being reported.  IDs fully matching ``IGNORED_ID_RE`` are not
    reported either.  Pages are handled in sorted order; each missing page
    or missing ID is printed to standard output, one per line, with a blank
    line after each page's group.

    Returns True if nothing was reported, False otherwise.
    *verbose_print* is accepted but not used here.
    """
    all_present = True
    for page in sorted(baseline):
        if page not in checked:
            all_present = False
            print(f'{page}: (page missing)')
            print()
            continue

        removed = sorted(
            anchor
            for anchor in set(baseline[page]).difference(checked[page])
            if IGNORED_ID_RE.fullmatch(anchor) is None
            if (page, anchor) not in excluded
        )
        if not removed:
            continue

        all_present = False
        for anchor in removed:
            print(f'{page}: {anchor}')
        print()
    return all_present
95+
96+
97+
def _build_parser():
    """Create the argument parser with the ``collect``/``check`` commands."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='print out more information')
    subparsers = parser.add_subparsers(dest='command', required=True)

    collect = subparsers.add_parser(
        'collect',
        help='collect IDs from a set of HTML files')
    collect.add_argument(
        'htmldir', type=Path,
        help='directory with HTML documentation')
    collect.add_argument(
        '-o', '--outfile',
        help='File to save the result in; default <htmldir>/html-ids.json.gz')

    check = subparsers.add_parser(
        'check',
        help='check two archives of IDs')
    check.add_argument(
        'baseline_file', type=Path,
        help='file with baseline IDs')
    check.add_argument(
        'checked_file', type=Path,
        help='file with checked IDs')
    check.add_argument(
        '-x', '--exclude-file', type=Path,
        help='file with IDs to exclude from the check')
    return parser


def _load_ids(archive):
    """Return the ``ids_by_page`` mapping stored in a gzipped JSON file."""
    with gzip.open(archive) as zfile:
        return json.load(zfile)['ids_by_page']


def _read_excluded(exclude_file):
    """Return the set of ``(page, id)`` pairs listed in *exclude_file*.

    Each entry is a ``page: id`` line.  Blank lines, lines starting with
    ``#`` and lines without a colon are skipped.
    """
    excluded = set()
    with open(exclude_file, encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # Split on the first colon only; the ID part may contain more.
            name, sep, excluded_id = line.partition(':')
            if sep:
                excluded.add((name.strip(), excluded_id.strip()))
    return excluded


def main(argv):
    """Run the command-line interface.

    *argv* is the full argument vector including the program name
    (``argv[0]`` is skipped).  Two commands are available:

    * ``collect`` gathers the IDs of an HTML directory and writes them as
      gzip-compressed JSON to ``--outfile`` (default:
      ``<htmldir>/html-ids.json.gz``).
    * ``check`` compares two such files and lists the IDs that are in the
      baseline but not in the checked file, except those listed in
      ``--exclude-file``.

    Exits with status 1 when ``check`` finds removed IDs, so that callers
    such as CI jobs can detect the failure.
    """
    args = _build_parser().parse_args(argv[1:])

    if args.verbose:
        verbose_print = functools.partial(print, file=sys.stderr)
    else:
        def verbose_print(*_args, **_kwargs):
            """do nothing"""

    if args.command == 'collect':
        ids = gather_ids(args.htmldir, verbose_print=verbose_print)
        if args.outfile is None:
            args.outfile = args.htmldir / 'html-ids.json.gz'
        with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile:
            json.dump({'ids_by_page': ids}, zfile)

    if args.command == 'check':
        baseline = _load_ids(args.baseline_file)
        checked = _load_ids(args.checked_file)
        excluded = set()
        if args.exclude_file:
            excluded = _read_excluded(args.exclude_file)
        if do_check(baseline, checked, excluded, verbose_print=verbose_print):
            verbose_print('All OK')
        else:
            # Flush the list of removed IDs on stdout first so that it
            # appears before the summary written to stderr.
            sys.stdout.flush()
            print(
                'ERROR: Removed IDs found',
                'The above HTML IDs were removed from the documentation, '
                + 'resulting in broken links. Please add them back.',
                sep='\n',
                file=sys.stderr)
            if args.exclude_file:
                print(
                    f'Alternatively, add them to {args.exclude_file}.',
                    file=sys.stderr)
            # Report the failure through the exit status as well.
            sys.exit(1)
168+
169+
170+
# Run the command-line interface only when executed as a script.  ``main``
# receives the full ``sys.argv`` and skips the program name itself.
if __name__ == '__main__':
    main(sys.argv)

0 commit comments

Comments
 (0)