diff --git a/src/annotator/anchoring/rendered-text.ts b/src/annotator/anchoring/rendered-text.ts new file mode 100644 index 00000000000..2daf843c380 --- /dev/null +++ b/src/annotator/anchoring/rendered-text.ts @@ -0,0 +1,179 @@ +/** + * Tags whose boundaries should produce a whitespace break in the rendered + * text, even though the boundary itself is not represented as a character in + * the DOM's `textContent`. Mirrors the visible whitespace a user perceives + * when they read the rendered page or copy a selection out to a single-line + * input. + */ +const BLOCK_TAGS = new Set([ + 'ADDRESS', + 'ARTICLE', + 'ASIDE', + 'BLOCKQUOTE', + 'DIV', + 'DL', + 'FIELDSET', + 'FIGCAPTION', + 'FIGURE', + 'FOOTER', + 'FORM', + 'H1', + 'H2', + 'H3', + 'H4', + 'H5', + 'H6', + 'HEADER', + 'HR', + 'LI', + 'MAIN', + 'NAV', + 'OL', + 'P', + 'SECTION', + 'TABLE', + 'TD', + 'TH', + 'TR', + 'UL', +]); + +export type RenderedText = { + /** + * Rendered/normalized text of the root: collapsed whitespace, with a single + * space inserted at each `
` and block-tag boundary. + */ + text: string; + /** + * For each raw offset i (0..rawText.length, treated as a position between + * characters), the position in `text` where rawText[i] first contributes + * (or, for synthesized spaces, where the space is inserted at that + * boundary). Undefined entries indicate raw chars that were suppressed + * (e.g., a whitespace char that collapsed into a previous space) — + * `toNormalized` scans backwards to the nearest defined entry. + */ + rawToNormalized: (number | undefined)[]; + /** For each output position, the raw offset that produced it. */ + normalizedToRaw: number[]; +}; + +type BuildState = { + output: string; + rawToNormalized: (number | undefined)[]; + normalizedToRaw: number[]; + rawPosition: number; +}; + +/** + * Append `character` (originating at `fromRaw` in raw text) to `state.output`, + * collapsing whitespace runs and updating the raw↔normalized maps in place. + */ +function append(state: BuildState, character: string, fromRaw: number) { + if (/\s/.test(character)) { + if (state.output.length === 0 || state.output.endsWith(' ')) { + return; + } + character = ' '; + } + state.output += character; + state.normalizedToRaw.push(fromRaw); + if (state.rawToNormalized[fromRaw] === undefined) { + state.rawToNormalized[fromRaw] = state.normalizedToRaw.length - 1; + } +} + +function walk(node: Node, state: BuildState) { + if (node.nodeType === Node.TEXT_NODE) { + const text = node.textContent ?? ''; + for (let i = 0; i < text.length; i++) { + append(state, text[i], state.rawPosition); + state.rawPosition += 1; + } + return; + } + + if (node.nodeType !== Node.ELEMENT_NODE) { + return; + } + + const el = node as Element; + if (el.tagName === 'BR') { + append(state, ' ', state.rawPosition); + return; + } + + const block = BLOCK_TAGS.has(el.nodeName); + if (block) { + append(state, ' ', state.rawPosition); + } + for (const child of Array.from(node.childNodes)) { + walk(child, state); + } + if (block) { + append(state, ' ', state.rawPosition); + } +} + +/** + * Walk `root`'s DOM subtree and produce its rendered text along with offset + * maps between raw `textContent` and the rendered output. + * + * Whitespace runs (including the synthesized boundary spaces) are collapsed + * to a single ASCII space, and leading/consecutive whitespace is suppressed — + * matching what a user sees when they read the page or paste a selection + * into a single-line input. + */ +export function renderedTextOf(root: Element): RenderedText { + const rawText = root.textContent ?? ''; + const state: BuildState = { + output: '', + rawToNormalized: new Array(rawText.length + 1).fill(undefined), + normalizedToRaw: [], + rawPosition: 0, + }; + + walk(root, state); + + // End-of-string sentinel so callers can pass rawText.length / output.length + // without going out of bounds. + if (state.rawToNormalized[state.rawPosition] === undefined) { + state.rawToNormalized[state.rawPosition] = state.normalizedToRaw.length; + } + state.normalizedToRaw.push(state.rawPosition); + + return { + text: state.output, + rawToNormalized: state.rawToNormalized, + normalizedToRaw: state.normalizedToRaw, + }; +} + +/** Translate a raw `textContent` offset into a position in the rendered text. */ +export function toNormalized( + rawToNormalized: (number | undefined)[], + rawOffset: number, +): number { + if (rawOffset < 0) { + return 0; + } + const clamped = Math.min(rawOffset, rawToNormalized.length - 1); + for (let i = clamped; i >= 0; i--) { + const value = rawToNormalized[i]; + if (value !== undefined) { + return value; + } + } + return 0; +} + +/** Translate a position in the rendered text back to a raw `textContent` offset. */ +export function toRaw( + normalizedToRaw: number[], + normalizedOffset: number, +): number { + if (normalizedOffset <= 0) { + return 0; + } + const clamped = Math.min(normalizedOffset, normalizedToRaw.length - 1); + return normalizedToRaw[clamped]; +} diff --git a/src/annotator/anchoring/test/html-boundary-test.js b/src/annotator/anchoring/test/html-boundary-test.js new file mode 100644 index 00000000000..77a66cc8445 --- /dev/null +++ b/src/annotator/anchoring/test/html-boundary-test.js @@ -0,0 +1,43 @@ +import { assert } from 'chai'; + +import * as html from '../html'; + +describe('annotator/anchoring/html boundary handling', () => { + let container; + + beforeEach(() => { + container = document.createElement('div'); + container.innerHTML = '

foo
bar

'; + document.body.appendChild(container); + }); + + afterEach(() => { + container.remove(); + }); + + it('describes selection after
without leading space in exact', () => { + const range = document.createRange(); + const textNode = container.querySelector('p').lastChild; // text node "bar" + range.setStart(textNode, 0); + range.setEnd(textNode, 3); + + const selectors = html.describe(container, range); + const quoteSel = selectors.find(s => s.type === 'TextQuoteSelector'); + + assert.equal(quoteSel.exact, 'bar'); + assert.isFalse(quoteSel.exact.startsWith(' ')); + assert.isFalse(quoteSel.exact.endsWith(' ')); + }); + + it('anchors selection after
back to the same text', async () => { + const range = document.createRange(); + const textNode = container.querySelector('p').lastChild; // text node "bar" + range.setStart(textNode, 0); + range.setEnd(textNode, 3); + + const selectors = html.describe(container, range); + const anchoredRange = await html.anchor(container, selectors); + + assert.equal(anchoredRange.toString(), 'bar'); + }); +}); diff --git a/src/annotator/anchoring/test/html-test.js b/src/annotator/anchoring/test/html-test.js index 1ea204983b8..a687ef281a2 100644 --- a/src/annotator/anchoring/test/html-test.js +++ b/src/annotator/anchoring/test/html-test.js @@ -364,7 +364,10 @@ describe('HTML anchoring', () => { // text. We test each selector in turn to make sure they are all valid. const anchored = selectors.map(sel => { return html.anchor(container, [sel]).then(anchoredRange => { - assert.equal(range.toString(), anchoredRange.toString()); + assert.equal( + range.toString().trim(), + anchoredRange.toString().trim(), + ); }); }); return Promise.all(anchored); diff --git a/src/annotator/anchoring/test/rendered-text-test.js b/src/annotator/anchoring/test/rendered-text-test.js new file mode 100644 index 00000000000..c1d13702fc0 --- /dev/null +++ b/src/annotator/anchoring/test/rendered-text-test.js @@ -0,0 +1,75 @@ +import { assert } from 'chai'; + +import { renderedTextOf, toNormalized, toRaw } from '../rendered-text'; + +describe('annotator/anchoring/rendered-text', () => { + it('inserts a space at
boundaries', () => { + const container = document.createElement('div'); + container.innerHTML = '

foo
bar

'; + + assert.equal(renderedTextOf(container).text.trim(), 'foo bar'); + }); + + it('inserts a space at block-tag boundaries', () => { + const container = document.createElement('div'); + container.innerHTML = '

foo

bar

baz
'; + + assert.equal(renderedTextOf(container).text.trim(), 'foo bar baz'); + }); + + it('collapses runs of whitespace and suppresses leading whitespace', () => { + const container = document.createElement('div'); + container.innerHTML = '

foo \n\n bar

'; + + assert.equal(renderedTextOf(container).text.trim(), 'foo bar'); + }); + + describe('offset translation', () => { + it('round-trips raw <-> normalized offsets across a
boundary', () => { + const container = document.createElement('div'); + container.innerHTML = '

foo
bar

'; + + const { text, rawToNormalized, normalizedToRaw } = + renderedTextOf(container); + assert.equal(text.trim(), 'foo bar'); + + // Raw textContent is "foobar". The 'b' at raw offset 3 maps into the + // rendered text past the synthesized space inserted at the
. + assert.equal(toNormalized(rawToNormalized, 3), 3); + assert.equal(toRaw(normalizedToRaw, toNormalized(rawToNormalized, 3)), 3); + // End-of-string round-trips to end-of-string. + assert.equal( + toRaw(normalizedToRaw, text.length), + container.textContent.length, + ); + }); + + it('clamps out-of-range raw offsets to the start', () => { + const container = document.createElement('div'); + container.textContent = 'abc'; + + const { rawToNormalized } = renderedTextOf(container); + assert.equal(toNormalized(rawToNormalized, -5), 0); + }); + + it('clamps non-positive normalized offsets to raw start', () => { + const container = document.createElement('div'); + container.textContent = 'abc'; + + const { normalizedToRaw } = renderedTextOf(container); + assert.equal(toRaw(normalizedToRaw, 0), 0); + assert.equal(toRaw(normalizedToRaw, -10), 0); + }); + + it('handles whitespace-only content', () => { + const container = document.createElement('div'); + container.textContent = ' '; + + const { text, rawToNormalized } = renderedTextOf(container); + // Whitespace collapses; container is a block, so we get just the + // closing-block synthesized space (or empty after trim). + assert.equal(text.trim(), ''); + assert.equal(toNormalized(rawToNormalized, 1), 0); + }); + }); +}); diff --git a/src/annotator/anchoring/test/types-test.js b/src/annotator/anchoring/test/types-test.js index 5620694356c..d8c87998fa5 100644 --- a/src/annotator/anchoring/test/types-test.js +++ b/src/annotator/anchoring/test/types-test.js @@ -1,5 +1,6 @@ import { render } from 'preact'; +import { renderedTextOf } from '../rendered-text'; import { TextRange } from '../text-range'; import { MediaTimeAnchor, @@ -368,11 +369,19 @@ describe('annotator/anchoring/types', () => { quoteAnchor.toRange({ hint: 42 }); - assert.calledWith(fakeMatchQuote, container.textContent, 'Liberty', { - hint: 42, - prefix: 'expected-prefix', - suffix: 'expected-suffix', - }); + // `toPositionAnchor` matches against the rendered (whitespace-collapsed, + // BR-aware) text rather than raw `textContent`. For this container the + // two only differ by a trailing synthesized space from the block close. + assert.calledWith( + fakeMatchQuote, + renderedTextOf(container).text, + 'Liberty', + { + hint: 42, + prefix: 'expected-prefix', + suffix: 'expected-suffix', + }, + ); }); it('returns `Range` representing match found by `matchQuote`', () => { diff --git a/src/annotator/anchoring/types.ts b/src/annotator/anchoring/types.ts index fdf45adc61a..cddfacf6ff5 100644 --- a/src/annotator/anchoring/types.ts +++ b/src/annotator/anchoring/types.ts @@ -14,6 +14,7 @@ import type { TextQuoteSelector, } from '../../types/api'; import { matchQuote } from './match-quote'; +import { renderedTextOf, toNormalized, toRaw } from './rendered-text'; import { TextRange, TextPosition } from './text-range'; import { nodeFromXPath, xpathFromNode } from './xpath'; @@ -173,27 +174,27 @@ export class TextQuoteAnchor { * Will throw if `range` does not contain any text nodes. */ static fromRange(root: Element, range: Range): TextQuoteAnchor { - const text = root.textContent!; + const rawText = root.textContent ?? ''; + const { text, rawToNormalized } = renderedTextOf(root); const textRange = TextRange.fromRange(range).relativeTo(root); + const rawStart = textRange.start.offset; + const rawEnd = textRange.end.offset; + const normalizedStart = toNormalized(rawToNormalized, rawStart); + const normalizedEnd = toNormalized(rawToNormalized, rawEnd); - const start = textRange.start.offset; - const end = textRange.end.offset; - - // Number of characters around the quote to capture as context. We currently - // always use a fixed amount, but it would be better if this code was aware - // of logical boundaries in the document (paragraph, article etc.) to avoid - // capturing text unrelated to the quote. - // - // In regular prose the ideal content would often be the surrounding sentence. - // This is a natural unit of meaning which enables displaying quotes in - // context even when the document is not available. We could use `Intl.Segmenter` - // for this when available. const contextLen = 32; + const exact = text.slice(normalizedStart, normalizedEnd).trim(); + // Prefix/suffix come from raw textContent: matchQuote tolerates + // whitespace differences when matching, and keeping them raw preserves + // backward compatibility with selectors stored before the rendered-text + // normalization landed. + const prefix = rawText.slice(Math.max(0, rawStart - contextLen), rawStart); + const suffix = rawText.slice( + rawEnd, + Math.min(rawText.length, rawEnd + contextLen), + ); - return new TextQuoteAnchor(root, text.slice(start, end), { - prefix: text.slice(Math.max(0, start - contextLen), start), - suffix: text.slice(end, Math.min(text.length, end + contextLen)), - }); + return new TextQuoteAnchor(root, exact, { prefix, suffix }); } static fromSelector( @@ -218,7 +219,7 @@ export class TextQuoteAnchor { } toPositionAnchor(options: QuoteMatchOptions = {}): TextPositionAnchor { - const text = this.root.textContent!; + const { text, normalizedToRaw } = renderedTextOf(this.root); const match = matchQuote(text, this.exact, { ...this.context, hint: options.hint, @@ -226,7 +227,11 @@ export class TextQuoteAnchor { if (!match) { throw new Error('Quote not found'); } - return new TextPositionAnchor(this.root, match.start, match.end); + return new TextPositionAnchor( + this.root, + toRaw(normalizedToRaw, match.start), + toRaw(normalizedToRaw, match.end), + ); } } diff --git a/src/sidebar/components/Annotation/AnnotationQuote.tsx b/src/sidebar/components/Annotation/AnnotationQuote.tsx index f050e39608a..ac1fa4a3008 100644 --- a/src/sidebar/components/Annotation/AnnotationQuote.tsx +++ b/src/sidebar/components/Annotation/AnnotationQuote.tsx @@ -22,6 +22,8 @@ function AnnotationQuote({ isOrphan, settings, }: AnnotationQuoteProps) { + const displayQuote = quote.replace(/\s+/g, ' ').trim(); + return ( @@ -31,7 +33,7 @@ function AnnotationQuote({ })} style={applyTheme(['selectionFontFamily'], settings)} > - {quote} + {displayQuote}