From aca231bf10185705ff88591e5b510434710bea5c Mon Sep 17 00:00:00 2001 From: Steve LLamb <38917682+SteveLLamb@users.noreply.github.com> Date: Tue, 28 Apr 2026 11:09:18 -0700 Subject: [PATCH 1/2] Validate illegal characters (#414) Source-based check flags literal occurrences of word-processor substitutes (curly quotes, en/em dashes, ellipsis, NBSP, soft and non-breaking hyphens, copyright and registered signs) in the document source. Numeric and named character references are recognized as deliberate and are not flagged. Browser logger entries link to the offending element via correlated source/DOM walks. Adds reference table in sec-special-characters and a note in sec-formulae recommending the U+2212 minus sign in math expressions. --- doc/main.html | 43 ++++- js/validate.mjs | 153 +++++++++++++++++- scripts/validate.mjs | 5 +- smpte.js | 8 +- .../html/validation/bibliography-valid.html | 2 +- .../illegal-characters-attr-invalid.html | 19 +++ ...legal-characters-curly-quotes-invalid.html | 19 +++ .../illegal-characters-dashes-invalid.html | 19 +++ .../illegal-characters-entities-valid.html | 26 +++ .../validation/illegal-characters-valid.html | 19 +++ .../html/validation/norm-ref-valid.html | 4 +- test/src/testValidation.mjs | 5 +- 12 files changed, 305 insertions(+), 17 deletions(-) create mode 100644 test/resources/html/validation/illegal-characters-attr-invalid.html create mode 100644 test/resources/html/validation/illegal-characters-curly-quotes-invalid.html create mode 100644 test/resources/html/validation/illegal-characters-dashes-invalid.html create mode 100644 test/resources/html/validation/illegal-characters-entities-valid.html create mode 100644 test/resources/html/validation/illegal-characters-valid.html diff --git a/doc/main.html b/doc/main.html index f45670e..bb206a0 100644 --- a/doc/main.html +++ b/doc/main.html @@ -618,12 +618,12 @@

Normative references section

<a>https://html.spec.whatwg.org/multipage/</a> </li> <li> - <cite id="bib-SMPTE-st429-18">SMPTE ST 429-18</cite>, D-Cinema Packaging — Immersive Audio Track + <cite id="bib-SMPTE-st429-18">SMPTE ST 429-18</cite>, D-Cinema Packaging - Immersive Audio Track File <span class="doi">10.5594/SMPTE.ST429-18</span> </li> <li> - <cite id="bib-SMPTE-st429-18-2023">SMPTE ST 429-18:2023</cite>, D-Cinema Packaging — Immersive + <cite id="bib-SMPTE-st429-18-2023">SMPTE ST 429-18:2023</cite>, D-Cinema Packaging - Immersive Audio Track File (2023 Edition) <span class="doi">10.5594/SMPTE.ST429-18.2023</span> </li> @@ -665,7 +665,7 @@

Terms and definitions section

can follow the first dt element, in which case all the terms, abbreviations and symbols in the immediately preceding dt elements are synonyms.
  • The dd elements following the dt element(s) shall appear in the following - order — each group is optional (except the definition), but if present, it shall appear in this sequence: + order - each group is optional (except the definition), but if present, it shall appear in this sequence:
    1. Deprecated: the first dd element, if present, shall have a class attribute containing deprecated, indicating the term is deprecated. At most one such element is permitted.
    2. @@ -700,7 +700,7 @@

      Terms and definitions section

      manufacture at regular intervals, typically one foot</dd> <dd class="example">An example of a key number is 12345.</dd> <dd class="note">Key number shall not be confused with key frame.</dd> - <dd class="source"><a href="#bib-key-number-spec"></a>, modified — definition has been updated.</dd> + <dd class="source"><a href="#bib-key-number-spec"></a>, modified - definition has been updated.</dd> </dl> </section> @@ -1156,7 +1156,7 @@

      Tables

      Tables may include footnotes. Footnotes shall appear in a tfoot element at the foot of the table, - each as a p element with class="footnote" and a unique id attribute. Footnotes are automatically assigned superscript lowercase letters (a, b, c, …) in the order they appear in the tfoot element. + each as a p element with class="footnote" and a unique id attribute. Footnotes are automatically assigned superscript lowercase letters (a, b, c, etc.) in the order they appear in the tfoot element. References to footnotes within the table body are marked with an a element whose href attribute references the footnote's id.

      @@ -1434,13 +1434,40 @@

      Formulae

      Tools such as https://temml.org/ can be used to convert from Latex to MathML.

      +

      The Unicode minus sign (&#x2212;) is used in math expressions instead of the hyphen-minus, e.g. &#x2212;42 renders as −42, and 6x &#x2212; 103y renders as 6x − 103y.

      +

      Special Characters

      Characters should be encoded as UTF-8-encoded Unicode codepoints, e.g. あ, instead of HTML entities, e.g. &#x3042;, except as needed for usage in pre or examples.

      - + +

      The characters listed in the following table are an exception to this guidance. They are commonly inserted by word processors (e.g. curly quotes substituted for straight quotes, en/em dashes substituted for hyphens) and shall not appear as literal codepoints in the document source. When one of these characters is genuinely needed, it shall be written as a Unicode character reference (e.g. &#x201C;) so that author intent is explicit.

      + + + + + + + + + + + + + + + + + + + + +
      Characters not permitted as literals in document source
      CodepointNamePreviewReference
      U+2018Left Single Quotation Mark&#x2018;
      U+2019Right Single Quotation Mark&#x2019;
      U+201CLeft Double Quotation Mark&#x201C;
      U+201DRight Double Quotation Mark&#x201D;
      U+2013En Dash&#x2013;
      U+2014Em Dash&#x2014;
      U+2011Non-Breaking Hyphen&#x2011;
      U+2026Horizontal Ellipsis&#x2026;
      U+00A0No-Break Space[ ]&#xA0;
      U+00ADSoft Hyphen[­]&#xAD;
      U+00A9Copyright Sign©&#xA9;
      U+00AERegistered Sign®&#xAE;
      + +

      The validator reports each literal occurrence with line and column information. Numeric or named character references (e.g. &#x201C;, &ldquo;) are recognized as deliberate and are not flagged.

      +
      @@ -1489,7 +1516,7 @@

      Using class attributes

      Applied to p or div elements to create a formatted and automatically numbered example. See .
      footnote
      -
      Applied to a p element inside a tfoot element to create a table footnote. Footnotes are only permitted inside a tfoot element — use outside of tables is not allowed. See .
      +
      Applied to a p element inside a tfoot element to create a table footnote. Footnotes are only permitted inside a tfoot element - use outside of tables is not allowed. See .
      center-cell;
      right-cell;
      @@ -1953,7 +1980,7 @@

      Column Alignment

      - Alignment classes are available for columns 1–8 using col-x-center, col-x-right, col-x-left, and col-x-top. Multiple classes may be combined on the same table to align different columns independently. + Alignment classes are available for columns 1-8 using col-x-center, col-x-right, col-x-left, and col-x-top. Multiple classes may be combined on the same table to align different columns independently.

      Tables over 8 columns wide should not be used and instead split into multiple smaller tables. diff --git a/js/validate.mjs b/js/validate.mjs index 1145f57..4733d5c 100644 --- a/js/validate.mjs +++ b/js/validate.mjs @@ -123,7 +123,156 @@ export function validateDataIncludes(doc, logger, fileExists = null) { } } -export function smpteValidate(doc, logger, fileExists = null) { +const ILLEGAL_CHARS_RE = /[‘’“”–—‑… ­©®]/g; + +function _formatChar(ch) { + return `U+${ch.codePointAt(0).toString(16).toUpperCase().padStart(4, "0")}`; +} + +const ILLEGAL_CHARS_SET = new Set("‘’“”–—‑… ­©®"); +const NAMED_ENTITIES = { + ldquo: "“", rdquo: "”", lsquo: "‘", rsquo: "’", + ndash: "–", mdash: "—", hellip: "…", NonBreakingHyphen: "‑", + nbsp: " ", shy: "­", copy: "©", reg: "®", +}; +const ID_ATTR_RE = /\bid\s*=\s*["']([^"']+)["']/g; + +function _resolveEntity(body) { + if (body[0] === "#") { + const isHex = body[1] === "x" || body[1] === "X"; + const num = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10); + return Number.isFinite(num) ? String.fromCodePoint(num) : null; + } + return Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, body) ? NAMED_ENTITIES[body] : null; +} + +function _scanSource(source) { + const textTokens = []; + const attrLiterals = []; + let inTag = false; + let inComment = false; + let inRawText = null; + let i = 0; + while (i < source.length) { + if (inComment) { + if (source.startsWith("-->", i)) { inComment = false; i += 3; continue; } + i++; + continue; + } + if (inRawText) { + const closeRe = new RegExp(`^`, "i"); + const m = source.slice(i).match(closeRe); + if (m) { inRawText = null; i += m[0].length; continue; } + i++; + continue; + } + if (inTag) { + if (source[i] === ">") { inTag = false; i++; continue; } + if (ILLEGAL_CHARS_SET.has(source[i])) + attrLiterals.push({ char: source[i], srcIdx: i }); + i++; + continue; + } + if (source.startsWith("