From aca231bf10185705ff88591e5b510434710bea5c Mon Sep 17 00:00:00 2001 From: Steve LLamb <38917682+SteveLLamb@users.noreply.github.com> Date: Tue, 28 Apr 2026 11:09:18 -0700 Subject: [PATCH 1/2] Validate illegal characters (#414) Source-based check flags literal occurrences of word-processor substitutes (curly quotes, en/em dashes, ellipsis, NBSP, soft and non-breaking hyphens, copyright and registered signs) in the document source. Numeric and named character references are recognized as deliberate and are not flagged. Browser logger entries link to the offending element via correlated source/DOM walks. Adds reference table in sec-special-characters and a note in sec-formulae recommending the U+2212 minus sign in math expressions. --- doc/main.html | 43 ++++- js/validate.mjs | 153 +++++++++++++++++- scripts/validate.mjs | 5 +- smpte.js | 8 +- .../html/validation/bibliography-valid.html | 2 +- .../illegal-characters-attr-invalid.html | 19 +++ ...legal-characters-curly-quotes-invalid.html | 19 +++ .../illegal-characters-dashes-invalid.html | 19 +++ .../illegal-characters-entities-valid.html | 26 +++ .../validation/illegal-characters-valid.html | 19 +++ .../html/validation/norm-ref-valid.html | 4 +- test/src/testValidation.mjs | 5 +- 12 files changed, 305 insertions(+), 17 deletions(-) create mode 100644 test/resources/html/validation/illegal-characters-attr-invalid.html create mode 100644 test/resources/html/validation/illegal-characters-curly-quotes-invalid.html create mode 100644 test/resources/html/validation/illegal-characters-dashes-invalid.html create mode 100644 test/resources/html/validation/illegal-characters-entities-valid.html create mode 100644 test/resources/html/validation/illegal-characters-valid.html diff --git a/doc/main.html b/doc/main.html index f45670e..bb206a0 100644 --- a/doc/main.html +++ b/doc/main.html @@ -618,12 +618,12 @@
sectionsectiondt element, in which case all the terms, abbreviations and symbols in the immediately
preceding dt elements are synonyms.
dd elements following the dt element(s) shall appear in the following
- order — each group is optional (except the definition), but if present, it shall appear in this sequence:
+ order - each group is optional (except the definition), but if present, it shall appear in this sequence:
dd element, if present, shall have a class
attribute containing deprecated, indicating the term is deprecated. At most one such element is permitted.sectionTables may include footnotes. Footnotes shall appear in a tfoot element at the foot of the table,
- each as a p element with class="footnote" and a unique id attribute. Footnotes are automatically assigned superscript lowercase letters (a, b, c, …) in the order they appear in the tfoot element.
+ each as a p element with class="footnote" and a unique id attribute. Footnotes are automatically assigned superscript lowercase letters (a, b, c, etc) in the order they appear in the tfoot element.
References to footnotes within the table body are marked with an a element whose href
attribute references the footnote's id.
Tools such as https://temml.org/ can be used to convert from LaTeX to MathML.
+The Unicode minus sign (U+2212, −) is used in math expressions instead of the hyphen-minus (U+002D), e.g. −42 renders as −42, and 6x − 103y renders as 6x − 103y.
Characters should be encoded as UTF-8-encoded Unicode codepoints, e.g. あ, instead of HTML entities, e.g. `&#x3042;`, except as needed for usage in pre or examples.
The characters listed in the table below are an exception to this guidance. They are commonly inserted by word processors (e.g. curly quotes substituted for straight quotes, en/em dashes substituted for hyphens) and shall not appear as literal codepoints in the document source. When one of these characters is genuinely needed, it shall be written as a Unicode character reference (e.g. `&#x201C;`) so that author intent is explicit.
| Codepoint | Name | Preview | Reference |
|---|---|---|---|
| U+2018 | Left Single Quotation Mark | ‘ | `&#x2018;` |
| U+2019 | Right Single Quotation Mark | ’ | `&#x2019;` |
| U+201C | Left Double Quotation Mark | “ | `&#x201C;` |
| U+201D | Right Double Quotation Mark | ” | `&#x201D;` |
| U+2013 | En Dash | – | `&#x2013;` |
| U+2014 | Em Dash | — | `&#x2014;` |
| U+2011 | Non-Breaking Hyphen | ‑ | `&#x2011;` |
| U+2026 | Horizontal Ellipsis | … | `&#x2026;` |
| U+00A0 | No-Break Space | [ ] | `&#x00A0;` |
| U+00AD | Soft Hyphen | [­] | `&#x00AD;` |
| U+00A9 | Copyright Sign | © | `&#x00A9;` |
| U+00AE | Registered Sign | ® | `&#x00AE;` |
The validator reports each literal occurrence with line and column information. Numeric or named character references (e.g. `&#x201C;`, `&ldquo;`) are recognized as deliberate and are not flagged.
class attributesp or div elements to create a formatted and automatically numbered example. See .footnotep element inside a tfoot element to create a table footnote. Footnotes are only permitted inside a tfoot element — use outside of tables is not allowed. See .p element inside a tfoot element to create a table footnote. Footnotes are only permitted inside a tfoot element - use outside of tables is not allowed. See .center-cell;right-cell;
- Alignment classes are available for columns 1–8 using col-x-center, col-x-right, col-x-left, and col-x-top. Multiple classes may be combined on the same table to align different columns independently.
+ Alignment classes are available for columns 1-8 using col-x-center, col-x-right, col-x-left, and col-x-top. Multiple classes may be combined on the same table to align different columns independently.
Tables over 8 columns wide should not be used and instead split into multiple smaller tables. diff --git a/js/validate.mjs b/js/validate.mjs index 1145f57..4733d5c 100644 --- a/js/validate.mjs +++ b/js/validate.mjs @@ -123,7 +123,156 @@ export function validateDataIncludes(doc, logger, fileExists = null) { } } -export function smpteValidate(doc, logger, fileExists = null) { +const ILLEGAL_CHARS_RE = /[‘’“”–—‑… ©®]/g; + +function _formatChar(ch) { + return `U+${ch.codePointAt(0).toString(16).toUpperCase().padStart(4, "0")}`; +} + +const ILLEGAL_CHARS_SET = new Set("‘’“”–—‑… ©®"); +const NAMED_ENTITIES = { + ldquo: "“", rdquo: "”", lsquo: "‘", rsquo: "’", + ndash: "–", mdash: "—", hellip: "…", NonBreakingHyphen: "‑", + nbsp: " ", shy: "", copy: "©", reg: "®", +}; +const ID_ATTR_RE = /\bid\s*=\s*["']([^"']+)["']/g; + +function _resolveEntity(body) { + if (body[0] === "#") { + const isHex = body[1] === "x" || body[1] === "X"; + const num = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10); + return Number.isFinite(num) ? String.fromCodePoint(num) : null; + } + return Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, body) ? NAMED_ENTITIES[body] : null; +} + +function _scanSource(source) { + const textTokens = []; + const attrLiterals = []; + let inTag = false; + let inComment = false; + let inRawText = null; + let i = 0; + while (i < source.length) { + if (inComment) { + if (source.startsWith("-->", i)) { inComment = false; i += 3; continue; } + i++; + continue; + } + if (inRawText) { + const closeRe = new RegExp(`^${inRawText}\\s*>`, "i"); + const m = source.slice(i).match(closeRe); + if (m) { inRawText = null; i += m[0].length; continue; } + i++; + continue; + } + if (inTag) { + if (source[i] === ">") { inTag = false; i++; continue; } + if (ILLEGAL_CHARS_SET.has(source[i])) + attrLiterals.push({ char: source[i], srcIdx: i }); + i++; + continue; + } + if (source.startsWith("