diff --git a/src/annotator/anchoring/rendered-text.ts b/src/annotator/anchoring/rendered-text.ts
new file mode 100644
index 00000000000..2daf843c380
--- /dev/null
+++ b/src/annotator/anchoring/rendered-text.ts
@@ -0,0 +1,179 @@
+/**
+ * Tags whose boundaries should produce a whitespace break in the rendered
+ * text, even though the boundary itself is not represented as a character in
+ * the DOM's `textContent`. Mirrors the visible whitespace a user perceives
+ * when they read the rendered page or copy a selection out to a single-line
+ * input.
+ */
+const BLOCK_TAGS = new Set([
+ 'ADDRESS',
+ 'ARTICLE',
+ 'ASIDE',
+ 'BLOCKQUOTE',
+ 'DIV',
+ 'DL',
+ 'FIELDSET',
+ 'FIGCAPTION',
+ 'FIGURE',
+ 'FOOTER',
+ 'FORM',
+ 'H1',
+ 'H2',
+ 'H3',
+ 'H4',
+ 'H5',
+ 'H6',
+ 'HEADER',
+ 'HR',
+ 'LI',
+ 'MAIN',
+ 'NAV',
+ 'OL',
+ 'P',
+ 'SECTION',
+ 'TABLE',
+ 'TD',
+ 'TH',
+ 'TR',
+ 'UL',
+]);
+
+export type RenderedText = {
+ /**
+ * Rendered/normalized text of the root: collapsed whitespace, with a single
+ * space inserted at each `
` and block-tag boundary.
+ */
+ text: string;
+ /**
+ * For each raw offset i (0..rawText.length, treated as a position between
+ * characters), the position in `text` where rawText[i] first contributes
+ * (or, for synthesized spaces, where the space is inserted at that
+ * boundary). Undefined entries indicate raw chars that were suppressed
+ * (e.g., a whitespace char that collapsed into a previous space) —
+ * `toNormalized` scans backwards to the nearest defined entry.
+ */
+ rawToNormalized: (number | undefined)[];
+ /** For each output position, the raw offset that produced it. */
+ normalizedToRaw: number[];
+};
+
+type BuildState = {
+ output: string;
+ rawToNormalized: (number | undefined)[];
+ normalizedToRaw: number[];
+ rawPosition: number;
+};
+
+/**
+ * Append `character` (originating at `fromRaw` in raw text) to `state.output`,
+ * collapsing whitespace runs and updating the raw↔normalized maps in place.
+ */
+function append(state: BuildState, character: string, fromRaw: number) {
+ if (/\s/.test(character)) {
+ if (state.output.length === 0 || state.output.endsWith(' ')) {
+ return;
+ }
+ character = ' ';
+ }
+ state.output += character;
+ state.normalizedToRaw.push(fromRaw);
+ if (state.rawToNormalized[fromRaw] === undefined) {
+ state.rawToNormalized[fromRaw] = state.normalizedToRaw.length - 1;
+ }
+}
+
+function walk(node: Node, state: BuildState) {
+ if (node.nodeType === Node.TEXT_NODE) {
+ const text = node.textContent ?? '';
+ for (let i = 0; i < text.length; i++) {
+ append(state, text[i], state.rawPosition);
+ state.rawPosition += 1;
+ }
+ return;
+ }
+
+ if (node.nodeType !== Node.ELEMENT_NODE) {
+ return;
+ }
+
+ const el = node as Element;
+ if (el.tagName === 'BR') {
+ append(state, ' ', state.rawPosition);
+ return;
+ }
+
+ const block = BLOCK_TAGS.has(el.nodeName);
+ if (block) {
+ append(state, ' ', state.rawPosition);
+ }
+ for (const child of Array.from(node.childNodes)) {
+ walk(child, state);
+ }
+ if (block) {
+ append(state, ' ', state.rawPosition);
+ }
+}
+
+/**
+ * Walk `root`'s DOM subtree and produce its rendered text along with offset
+ * maps between raw `textContent` and the rendered output.
+ *
+ * Whitespace runs (including the synthesized boundary spaces) are collapsed
+ * to a single ASCII space, and leading/consecutive whitespace is suppressed —
+ * matching what a user sees when they read the page or paste a selection
+ * into a single-line input.
+ */
+export function renderedTextOf(root: Element): RenderedText {
+ const rawText = root.textContent ?? '';
+ const state: BuildState = {
+ output: '',
+ rawToNormalized: new Array(rawText.length + 1).fill(undefined),
+ normalizedToRaw: [],
+ rawPosition: 0,
+ };
+
+ walk(root, state);
+
+ // End-of-string sentinel so callers can pass rawText.length / output.length
+ // without going out of bounds.
+ if (state.rawToNormalized[state.rawPosition] === undefined) {
+ state.rawToNormalized[state.rawPosition] = state.normalizedToRaw.length;
+ }
+ state.normalizedToRaw.push(state.rawPosition);
+
+ return {
+ text: state.output,
+ rawToNormalized: state.rawToNormalized,
+ normalizedToRaw: state.normalizedToRaw,
+ };
+}
+
+/** Translate a raw `textContent` offset into a position in the rendered text. */
+export function toNormalized(
+ rawToNormalized: (number | undefined)[],
+ rawOffset: number,
+): number {
+ if (rawOffset < 0) {
+ return 0;
+ }
+ const clamped = Math.min(rawOffset, rawToNormalized.length - 1);
+ for (let i = clamped; i >= 0; i--) {
+ const value = rawToNormalized[i];
+ if (value !== undefined) {
+ return value;
+ }
+ }
+ return 0;
+}
+
+/** Translate a position in the rendered text back to a raw `textContent` offset. */
+export function toRaw(
+ normalizedToRaw: number[],
+ normalizedOffset: number,
+): number {
+ if (normalizedOffset <= 0) {
+ return 0;
+ }
+ const clamped = Math.min(normalizedOffset, normalizedToRaw.length - 1);
+ return normalizedToRaw[clamped];
+}
diff --git a/src/annotator/anchoring/test/html-boundary-test.js b/src/annotator/anchoring/test/html-boundary-test.js
new file mode 100644
index 00000000000..77a66cc8445
--- /dev/null
+++ b/src/annotator/anchoring/test/html-boundary-test.js
@@ -0,0 +1,43 @@
+import { assert } from 'chai';
+
+import * as html from '../html';
+
+describe('annotator/anchoring/html boundary handling', () => {
+ let container;
+
+ beforeEach(() => {
+ container = document.createElement('div');
+ container.innerHTML = '
foo
bar
';
+ document.body.appendChild(container);
+ });
+
+ afterEach(() => {
+ container.remove();
+ });
+
+ it('describes selection after
without leading space in exact', () => {
+ const range = document.createRange();
+ const textNode = container.querySelector('p').lastChild; // text node "bar"
+ range.setStart(textNode, 0);
+ range.setEnd(textNode, 3);
+
+ const selectors = html.describe(container, range);
+ const quoteSel = selectors.find(s => s.type === 'TextQuoteSelector');
+
+ assert.equal(quoteSel.exact, 'bar');
+ assert.isFalse(quoteSel.exact.startsWith(' '));
+ assert.isFalse(quoteSel.exact.endsWith(' '));
+ });
+
+ it('anchors selection after
back to the same text', async () => {
+ const range = document.createRange();
+ const textNode = container.querySelector('p').lastChild; // text node "bar"
+ range.setStart(textNode, 0);
+ range.setEnd(textNode, 3);
+
+ const selectors = html.describe(container, range);
+ const anchoredRange = await html.anchor(container, selectors);
+
+ assert.equal(anchoredRange.toString(), 'bar');
+ });
+});
diff --git a/src/annotator/anchoring/test/html-test.js b/src/annotator/anchoring/test/html-test.js
index 1ea204983b8..a687ef281a2 100644
--- a/src/annotator/anchoring/test/html-test.js
+++ b/src/annotator/anchoring/test/html-test.js
@@ -364,7 +364,10 @@ describe('HTML anchoring', () => {
// text. We test each selector in turn to make sure they are all valid.
const anchored = selectors.map(sel => {
return html.anchor(container, [sel]).then(anchoredRange => {
- assert.equal(range.toString(), anchoredRange.toString());
+ assert.equal(
+ range.toString().trim(),
+ anchoredRange.toString().trim(),
+ );
});
});
return Promise.all(anchored);
diff --git a/src/annotator/anchoring/test/rendered-text-test.js b/src/annotator/anchoring/test/rendered-text-test.js
new file mode 100644
index 00000000000..c1d13702fc0
--- /dev/null
+++ b/src/annotator/anchoring/test/rendered-text-test.js
@@ -0,0 +1,75 @@
+import { assert } from 'chai';
+
+import { renderedTextOf, toNormalized, toRaw } from '../rendered-text';
+
+describe('annotator/anchoring/rendered-text', () => {
+ it('inserts a space at
boundaries', () => {
+ const container = document.createElement('div');
+ container.innerHTML = 'foo
bar
';
+
+ assert.equal(renderedTextOf(container).text.trim(), 'foo bar');
+ });
+
+ it('inserts a space at block-tag boundaries', () => {
+ const container = document.createElement('div');
+ container.innerHTML = 'foo
bar
baz
';
+
+ assert.equal(renderedTextOf(container).text.trim(), 'foo bar baz');
+ });
+
+ it('collapses runs of whitespace and suppresses leading whitespace', () => {
+ const container = document.createElement('div');
+ container.innerHTML = ' foo \n\n bar
';
+
+ assert.equal(renderedTextOf(container).text.trim(), 'foo bar');
+ });
+
+ describe('offset translation', () => {
+ it('round-trips raw <-> normalized offsets across a
boundary', () => {
+ const container = document.createElement('div');
+ container.innerHTML = 'foo
bar
';
+
+ const { text, rawToNormalized, normalizedToRaw } =
+ renderedTextOf(container);
+ assert.equal(text.trim(), 'foo bar');
+
+ // Raw textContent is "foobar". The 'b' at raw offset 3 maps into the
+ // rendered text past the synthesized space inserted at the
.
+ assert.equal(toNormalized(rawToNormalized, 3), 3);
+ assert.equal(toRaw(normalizedToRaw, toNormalized(rawToNormalized, 3)), 3);
+ // End-of-string round-trips to end-of-string.
+ assert.equal(
+ toRaw(normalizedToRaw, text.length),
+ container.textContent.length,
+ );
+ });
+
+ it('clamps out-of-range raw offsets to the start', () => {
+ const container = document.createElement('div');
+ container.textContent = 'abc';
+
+ const { rawToNormalized } = renderedTextOf(container);
+ assert.equal(toNormalized(rawToNormalized, -5), 0);
+ });
+
+ it('clamps non-positive normalized offsets to raw start', () => {
+ const container = document.createElement('div');
+ container.textContent = 'abc';
+
+ const { normalizedToRaw } = renderedTextOf(container);
+ assert.equal(toRaw(normalizedToRaw, 0), 0);
+ assert.equal(toRaw(normalizedToRaw, -10), 0);
+ });
+
+ it('handles whitespace-only content', () => {
+ const container = document.createElement('div');
+ container.textContent = ' ';
+
+ const { text, rawToNormalized } = renderedTextOf(container);
+ // Whitespace collapses; container is a block, so we get just the
+ // closing-block synthesized space (or empty after trim).
+ assert.equal(text.trim(), '');
+ assert.equal(toNormalized(rawToNormalized, 1), 0);
+ });
+ });
+});
diff --git a/src/annotator/anchoring/test/types-test.js b/src/annotator/anchoring/test/types-test.js
index 5620694356c..d8c87998fa5 100644
--- a/src/annotator/anchoring/test/types-test.js
+++ b/src/annotator/anchoring/test/types-test.js
@@ -1,5 +1,6 @@
import { render } from 'preact';
+import { renderedTextOf } from '../rendered-text';
import { TextRange } from '../text-range';
import {
MediaTimeAnchor,
@@ -368,11 +369,19 @@ describe('annotator/anchoring/types', () => {
quoteAnchor.toRange({ hint: 42 });
- assert.calledWith(fakeMatchQuote, container.textContent, 'Liberty', {
- hint: 42,
- prefix: 'expected-prefix',
- suffix: 'expected-suffix',
- });
+ // `toPositionAnchor` matches against the rendered (whitespace-collapsed,
+ // BR-aware) text rather than raw `textContent`. For this container the
+ // two only differ by a trailing synthesized space from the block close.
+ assert.calledWith(
+ fakeMatchQuote,
+ renderedTextOf(container).text,
+ 'Liberty',
+ {
+ hint: 42,
+ prefix: 'expected-prefix',
+ suffix: 'expected-suffix',
+ },
+ );
});
it('returns `Range` representing match found by `matchQuote`', () => {
diff --git a/src/annotator/anchoring/types.ts b/src/annotator/anchoring/types.ts
index fdf45adc61a..cddfacf6ff5 100644
--- a/src/annotator/anchoring/types.ts
+++ b/src/annotator/anchoring/types.ts
@@ -14,6 +14,7 @@ import type {
TextQuoteSelector,
} from '../../types/api';
import { matchQuote } from './match-quote';
+import { renderedTextOf, toNormalized, toRaw } from './rendered-text';
import { TextRange, TextPosition } from './text-range';
import { nodeFromXPath, xpathFromNode } from './xpath';
@@ -173,27 +174,27 @@ export class TextQuoteAnchor {
* Will throw if `range` does not contain any text nodes.
*/
static fromRange(root: Element, range: Range): TextQuoteAnchor {
- const text = root.textContent!;
+ const rawText = root.textContent ?? '';
+ const { text, rawToNormalized } = renderedTextOf(root);
const textRange = TextRange.fromRange(range).relativeTo(root);
+ const rawStart = textRange.start.offset;
+ const rawEnd = textRange.end.offset;
+ const normalizedStart = toNormalized(rawToNormalized, rawStart);
+ const normalizedEnd = toNormalized(rawToNormalized, rawEnd);
- const start = textRange.start.offset;
- const end = textRange.end.offset;
-
- // Number of characters around the quote to capture as context. We currently
- // always use a fixed amount, but it would be better if this code was aware
- // of logical boundaries in the document (paragraph, article etc.) to avoid
- // capturing text unrelated to the quote.
- //
- // In regular prose the ideal content would often be the surrounding sentence.
- // This is a natural unit of meaning which enables displaying quotes in
- // context even when the document is not available. We could use `Intl.Segmenter`
- // for this when available.
const contextLen = 32;
+ const exact = text.slice(normalizedStart, normalizedEnd).trim();
+ // Prefix/suffix come from raw textContent: matchQuote tolerates
+ // whitespace differences when matching, and keeping them raw preserves
+ // backward compatibility with selectors stored before the rendered-text
+ // normalization landed.
+ const prefix = rawText.slice(Math.max(0, rawStart - contextLen), rawStart);
+ const suffix = rawText.slice(
+ rawEnd,
+ Math.min(rawText.length, rawEnd + contextLen),
+ );
- return new TextQuoteAnchor(root, text.slice(start, end), {
- prefix: text.slice(Math.max(0, start - contextLen), start),
- suffix: text.slice(end, Math.min(text.length, end + contextLen)),
- });
+ return new TextQuoteAnchor(root, exact, { prefix, suffix });
}
static fromSelector(
@@ -218,7 +219,7 @@ export class TextQuoteAnchor {
}
toPositionAnchor(options: QuoteMatchOptions = {}): TextPositionAnchor {
- const text = this.root.textContent!;
+ const { text, normalizedToRaw } = renderedTextOf(this.root);
const match = matchQuote(text, this.exact, {
...this.context,
hint: options.hint,
@@ -226,7 +227,11 @@ export class TextQuoteAnchor {
if (!match) {
throw new Error('Quote not found');
}
- return new TextPositionAnchor(this.root, match.start, match.end);
+ return new TextPositionAnchor(
+ this.root,
+ toRaw(normalizedToRaw, match.start),
+ toRaw(normalizedToRaw, match.end),
+ );
}
}
diff --git a/src/sidebar/components/Annotation/AnnotationQuote.tsx b/src/sidebar/components/Annotation/AnnotationQuote.tsx
index f050e39608a..ac1fa4a3008 100644
--- a/src/sidebar/components/Annotation/AnnotationQuote.tsx
+++ b/src/sidebar/components/Annotation/AnnotationQuote.tsx
@@ -22,6 +22,8 @@ function AnnotationQuote({
isOrphan,
settings,
}: AnnotationQuoteProps) {
+ const displayQuote = quote.replace(/\s+/g, ' ').trim();
+
return (
@@ -31,7 +33,7 @@ function AnnotationQuote({
})}
style={applyTheme(['selectionFontFamily'], settings)}
>
- {quote}
+ {displayQuote}