From cd3c94d4c049672cf94307a6eea9a1caf9a0c636 Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Wed, 25 Feb 2026 13:55:38 -0300 Subject: [PATCH 01/19] fix(quote selectors): preserve quote spacing after linebreaks --- src/annotator/anchoring/pdf.ts | 21 ++++- src/annotator/anchoring/rendered-text.ts | 91 +++++++++++++++++++ src/annotator/anchoring/types.ts | 27 ++---- .../components/Annotation/AnnotationQuote.tsx | 7 +- 4 files changed, 124 insertions(+), 22 deletions(-) create mode 100644 src/annotator/anchoring/rendered-text.ts diff --git a/src/annotator/anchoring/pdf.ts b/src/annotator/anchoring/pdf.ts index 3e8d83bb848..606db1a8eb6 100644 --- a/src/annotator/anchoring/pdf.ts +++ b/src/annotator/anchoring/pdf.ts @@ -71,6 +71,10 @@ function quotePositionCacheKey(quote: string, pos?: number) { return `${quote}:${pos}`; } +function normalizePDFText(str: string): string { + return str.replace(/[\r\n]+/g, ' ').replace(/\s+/g, ' '); +} + /** * Return the text layer element of the PDF page containing `node`. */ @@ -795,7 +799,22 @@ export async function describe(range: Range): Promise { end: pageOffset + endPos.offset, } as TextPositionSelector; - const quote = TextQuoteAnchor.fromRange(pageView.div, textRange).toSelector(); + const quote = (() => { + const selector = TextQuoteAnchor.fromRange( + pageView.div, + textRange, + ).toSelector(); + return { + ...selector, + exact: normalizePDFText(selector.exact), + prefix: selector.prefix + ? normalizePDFText(selector.prefix) + : selector.prefix, + suffix: selector.suffix + ? normalizePDFText(selector.suffix) + : selector.suffix, + } satisfies TextQuoteSelector; + })(); const pageSelector = createPageSelector(pageView, startPageIndex); return [position, quote, pageSelector]; diff --git a/src/annotator/anchoring/rendered-text.ts b/src/annotator/anchoring/rendered-text.ts new file mode 100644 index 00000000000..3f381de6e6b --- /dev/null +++ b/src/annotator/anchoring/rendered-text.ts @@ -0,0 +1,91 @@ +const BLOCK_TAGS = new Set([ + 'ADDRESS', + 'ARTICLE', + 'ASIDE', + 'BLOCKQUOTE', + 'DIV', + 'DL', + 'FIELDSET', + 'FIGCAPTION', + 'FIGURE', + 'FOOTER', + 'FORM', + 'H1', + 'H2', + 'H3', + 'H4', + 'H5', + 'H6', + 'HEADER', + 'HR', + 'LI', + 'MAIN', + 'NAV', + 'OL', + 'P', + 'PRE', + 'SECTION', + 'TABLE', + 'TD', + 'TH', + 'TR', + 'UL', +]); + +const SPACE = ' '; + +/** Collapse any run of whitespace (including newlines) to a single space. */ +function collapseWhitespace(text: string): string { + return text.replace(/\s+/g, SPACE); +} + +function isBlock(node: Node): boolean { + return node.nodeType === Node.ELEMENT_NODE && BLOCK_TAGS.has(node.nodeName); +} + +/** + * Produce rendered-like text for a DOM Range by inserting spaces where the + * browser would visually break lines. + */ +export function renderedTextFromRange(range: Range): string { + const fragment = range.cloneContents(); + let output = ''; + + const appendSpace = () => { + if (output.length === 0 || output.endsWith(SPACE)) { + return; + } + output += SPACE; + }; + + const walk = (node: Node) => { + if (node.nodeType === Node.TEXT_NODE) { + output += node.textContent || ''; + return; + } + + if (node.nodeType === Node.ELEMENT_NODE) { + const el = node as Element; + + if (el.tagName === 'BR') { + appendSpace(); + return; + } + + const block = isBlock(el); + if (block) appendSpace(); + + for (const child of Array.from(node.childNodes)) { + walk(child); + } + + if (block) appendSpace(); + } + }; + + for (const child of Array.from(fragment.childNodes)) { + walk(child); + } + + return collapseWhitespace(output).trim(); +} diff --git a/src/annotator/anchoring/types.ts b/src/annotator/anchoring/types.ts index fdf45adc61a..fb63e327b28 100644 --- a/src/annotator/anchoring/types.ts +++ b/src/annotator/anchoring/types.ts @@ -14,6 +14,7 @@ import type { TextQuoteSelector, } from '../../types/api'; import { matchQuote } from './match-quote'; +import { renderedTextFromRange } from './rendered-text'; import { TextRange, TextPosition } from './text-range'; import { nodeFromXPath, xpathFromNode } from './xpath'; @@ -173,27 +174,13 @@ export class TextQuoteAnchor { * Will throw if `range` does not contain any text nodes. */ static fromRange(root: Element, range: Range): TextQuoteAnchor { - const text = root.textContent!; - const textRange = TextRange.fromRange(range).relativeTo(root); + const exact = renderedTextFromRange(range); - const start = textRange.start.offset; - const end = textRange.end.offset; - - // Number of characters around the quote to capture as context. We currently - // always use a fixed amount, but it would be better if this code was aware - // of logical boundaries in the document (paragraph, article etc.) to avoid - // capturing text unrelated to the quote. - // - // In regular prose the ideal content would often be the surrounding sentence. - // This is a natural unit of meaning which enables displaying quotes in - // context even when the document is not available. We could use `Intl.Segmenter` - // for this when available. - const contextLen = 32; - - return new TextQuoteAnchor(root, text.slice(start, end), { - prefix: text.slice(Math.max(0, start - contextLen), start), - suffix: text.slice(end, Math.min(text.length, end + contextLen)), - }); + // Without a reliable mapping from rendered-text offsets back to DOM + // positions, we omit prefix/suffix. The primary TextPositionSelector still + // anchors ranges; this improves the stored/displayed quote to match + // rendered spacing (eg. at
boundaries). + return new TextQuoteAnchor(root, exact, {}); } static fromSelector( diff --git a/src/sidebar/components/Annotation/AnnotationQuote.tsx b/src/sidebar/components/Annotation/AnnotationQuote.tsx index f050e39608a..8c2d1497fa2 100644 --- a/src/sidebar/components/Annotation/AnnotationQuote.tsx +++ b/src/sidebar/components/Annotation/AnnotationQuote.tsx @@ -22,6 +22,11 @@ function AnnotationQuote({ isOrphan, settings, }: AnnotationQuoteProps) { + const displayQuote = quote + .replace(/[\r\n]+/g, ' ') + .replace(/\s+/g, ' ') + .trim(); + return ( @@ -31,7 +36,7 @@ function AnnotationQuote({ })} style={applyTheme(['selectionFontFamily'], settings)} > - {quote} + {displayQuote} From 379a9b77a6ecc245effb9d5e457c9c0509119c09 Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Thu, 26 Feb 2026 10:00:42 -0300 Subject: [PATCH 02/19] fix(anchoring): normalize quote anchoring for html and pdf --- src/annotator/anchoring/pdf.ts | 92 +++++++++---------- src/annotator/anchoring/rendered-text.ts | 107 +++++++++++++++++++---- src/annotator/anchoring/types.ts | 26 ++++-- 3 files changed, 151 insertions(+), 74 deletions(-) diff --git a/src/annotator/anchoring/pdf.ts b/src/annotator/anchoring/pdf.ts index 606db1a8eb6..5aa4606a0ae 100644 --- a/src/annotator/anchoring/pdf.ts +++ b/src/annotator/anchoring/pdf.ts @@ -426,49 +426,46 @@ async function anchorQuote( }); } - // Search pages for the best match, ignoring whitespace differences. - const strippedPrefix = + // Normalize quote and context consistently with selector creation. + const normalizedPrefix = quoteSelector.prefix !== undefined - ? stripSpaces(quoteSelector.prefix) + ? normalizePDFText(quoteSelector.prefix) : undefined; - const strippedSuffix = + const normalizedSuffix = quoteSelector.suffix !== undefined - ? stripSpaces(quoteSelector.suffix) + ? normalizePDFText(quoteSelector.suffix) : undefined; - const strippedQuote = stripSpaces(quoteSelector.exact); + const normalizedQuote = normalizePDFText(quoteSelector.exact); let bestMatch; for (const page of pageIndexes) { const text = await getPageTextContent(page); - const strippedText = stripSpaces(text); + const normalizedText = normalizePDFText(text); // Determine expected offset of quote in current page based on position hint. - let strippedHint; + let normalizedHint; if (expectedPageIndex !== undefined && expectedOffsetInPage !== undefined) { if (page < expectedPageIndex) { - strippedHint = strippedText.length; // Prefer matches closer to end of page. + normalizedHint = normalizedText.length; // Prefer matches closer to end of page. } else if (page === expectedPageIndex) { - // Translate expected offset in whitespace-inclusive version of page - // text into offset in whitespace-stripped version of page text. - [strippedHint] = translateOffsets( + // Translate expected offset in original text into offset in normalized text. + [normalizedHint] = translateOffsets( text, - strippedText, + normalizedText, expectedOffsetInPage, expectedOffsetInPage, - isNotSpace, - // We don't need to normalize here since both input strings are - // derived from the same input. - { normalize: false }, + char => char !== ' ', + { normalize: true }, ); } else { - strippedHint = 0; // Prefer matches closer to start of page. + normalizedHint = 0; // Prefer matches closer to start of page. } } - const match = matchQuote(strippedText, strippedQuote, { - prefix: strippedPrefix, - suffix: strippedSuffix, - hint: strippedHint, + const match = matchQuote(normalizedText, normalizedQuote, { + prefix: normalizedPrefix, + suffix: normalizedSuffix, + hint: normalizedHint, }); if (!match) { @@ -476,14 +473,14 @@ async function anchorQuote( } if (!bestMatch || match.score > bestMatch.match.score) { - // Translate match offset from whitespace-stripped version of page text - // back to original text. + // Translate match offset from normalized text back to original text. const [start, end] = translateOffsets( - strippedText, + normalizedText, text, match.start, match.end, - isNotSpace, + char => char !== ' ', + { normalize: true }, ); bestMatch = { page, @@ -504,21 +501,21 @@ async function anchorQuote( // helps to avoid incorrectly stopping the search early if the quote is // a word or phrase that is common in the document. const exactQuoteMatch = - strippedText.slice(match.start, match.end) === strippedQuote; + normalizedText.slice(match.start, match.end) === normalizedQuote; const exactPrefixMatch = - strippedPrefix !== undefined && - strippedText.slice( - Math.max(0, match.start - strippedPrefix.length), + normalizedPrefix !== undefined && + normalizedText.slice( + Math.max(0, match.start - normalizedPrefix.length), match.start, - ) === strippedPrefix; + ) === normalizedPrefix; const exactSuffixMatch = - strippedSuffix !== undefined && - strippedText.slice(match.end, strippedSuffix.length) === strippedSuffix; + normalizedSuffix !== undefined && + normalizedText.slice(match.end, match.end + normalizedSuffix.length) === normalizedSuffix; const hasContext = - strippedPrefix !== undefined || strippedSuffix !== undefined; + normalizedPrefix !== undefined || normalizedSuffix !== undefined; if ( exactQuoteMatch && @@ -799,22 +796,17 @@ export async function describe(range: Range): Promise { end: pageOffset + endPos.offset, } as TextPositionSelector; - const quote = (() => { - const selector = TextQuoteAnchor.fromRange( - pageView.div, - textRange, - ).toSelector(); - return { - ...selector, - exact: normalizePDFText(selector.exact), - prefix: selector.prefix - ? normalizePDFText(selector.prefix) - : selector.prefix, - suffix: selector.suffix - ? normalizePDFText(selector.suffix) - : selector.suffix, - } satisfies TextQuoteSelector; - })(); + const rawQuote = TextQuoteAnchor.fromRange(pageView.div, textRange).toSelector(); + const quote: TextQuoteSelector = { + ...rawQuote, + exact: normalizePDFText(rawQuote.exact).trim(), + prefix: rawQuote.prefix + ? normalizePDFText(rawQuote.prefix).trim() + : rawQuote.prefix, + suffix: rawQuote.suffix + ? normalizePDFText(rawQuote.suffix).trim() + : rawQuote.suffix, + }; const pageSelector = createPageSelector(pageView, startPageIndex); return [position, quote, pageSelector]; diff --git a/src/annotator/anchoring/rendered-text.ts b/src/annotator/anchoring/rendered-text.ts index 3f381de6e6b..f4340d89adc 100644 --- a/src/annotator/anchoring/rendered-text.ts +++ b/src/annotator/anchoring/rendered-text.ts @@ -35,7 +35,7 @@ const BLOCK_TAGS = new Set([ const SPACE = ' '; /** Collapse any run of whitespace (including newlines) to a single space. */ -function collapseWhitespace(text: string): string { +export function collapseWhitespace(text: string): string { return text.replace(/\s+/g, SPACE); } @@ -43,24 +43,57 @@ function isBlock(node: Node): boolean { return node.nodeType === Node.ELEMENT_NODE && BLOCK_TAGS.has(node.nodeName); } -/** - * Produce rendered-like text for a DOM Range by inserting spaces where the - * browser would visually break lines. - */ -export function renderedTextFromRange(range: Range): string { - const fragment = range.cloneContents(); - let output = ''; +type RenderedText = { + text: string; + rawToNorm: number[]; + normToRaw: number[]; +}; - const appendSpace = () => { - if (output.length === 0 || output.endsWith(SPACE)) { +function appendNormalizedChar( + ch: string, + rawIndex: number, + state: { + output: string; + rawToNorm: number[]; + normToRaw: number[]; + }, +) { + const isWs = /\s/.test(ch); + if (isWs) { + if (state.output.length === 0 || state.output.endsWith(SPACE)) { return; } - output += SPACE; + ch = SPACE; + } + + state.output += ch; + state.normToRaw.push(rawIndex); + if (state.rawToNorm[rawIndex] === undefined) { + state.rawToNorm[rawIndex] = state.normToRaw.length - 1; + } +} + +function buildRenderedText(root: Element): RenderedText { + const rawText = root.textContent ?? ''; + const state = { + output: '', + rawToNorm: Array(rawText.length + 1).fill(undefined) as number[], + normToRaw: [] as number[], + }; + + let rawPos = 0; + + const appendSpace = () => { + appendNormalizedChar(SPACE, rawPos, state); }; const walk = (node: Node) => { if (node.nodeType === Node.TEXT_NODE) { - output += node.textContent || ''; + const text = node.textContent ?? ''; + for (let i = 0; i < text.length; i++) { + appendNormalizedChar(text[i], rawPos, state); + rawPos += 1; + } return; } @@ -83,9 +116,53 @@ export function renderedTextFromRange(range: Range): string { } }; - for (const child of Array.from(fragment.childNodes)) { - walk(child); + walk(root); + + // Map end-of-string raw offset. + if (state.rawToNorm[rawPos] === undefined) { + state.rawToNorm[rawPos] = state.normToRaw.length; + } + state.normToRaw.push(rawPos); + + const text = state.output; // Already normalized/collapsed + return { text, rawToNorm: state.rawToNorm, normToRaw: state.normToRaw }; +} + +function translateRawToNorm(rawToNorm: number[], rawOffset: number): number { + if (rawOffset < 0) return 0; + const clamped = Math.min(rawOffset, rawToNorm.length - 1); + for (let i = clamped; i >= 0; i--) { + const val = rawToNorm[i]; + if (val !== undefined) return val; } + return 0; +} - return collapseWhitespace(output).trim(); +function translateNormToRaw(normToRaw: number[], normOffset: number): number { + if (normOffset <= 0) return 0; + const clamped = Math.min(normOffset, normToRaw.length - 1); + return normToRaw[clamped]; +} + +export function renderedTextFromRange(range: Range): string { + const container = range.cloneContents(); + const div = document.createElement('div'); + div.appendChild(container); + const { text } = buildRenderedText(div); + return text; +} + +export function renderedTextWithOffsets(root: Element): { + text: string; + rawToNorm: number[]; + normToRaw: number[]; + toNorm: (rawOffset: number) => number; + toRaw: (normOffset: number) => number; +} { + const rendered = buildRenderedText(root); + return { + ...rendered, + toNorm: rawOffset => translateRawToNorm(rendered.rawToNorm, rawOffset), + toRaw: normOffset => translateNormToRaw(rendered.normToRaw, normOffset), + }; } diff --git a/src/annotator/anchoring/types.ts b/src/annotator/anchoring/types.ts index fb63e327b28..bc0c2830ff1 100644 --- a/src/annotator/anchoring/types.ts +++ b/src/annotator/anchoring/types.ts @@ -14,7 +14,7 @@ import type { TextQuoteSelector, } from '../../types/api'; import { matchQuote } from './match-quote'; -import { renderedTextFromRange } from './rendered-text'; +import { renderedTextWithOffsets } from './rendered-text'; import { TextRange, TextPosition } from './text-range'; import { nodeFromXPath, xpathFromNode } from './xpath'; @@ -174,13 +174,19 @@ export class TextQuoteAnchor { * Will throw if `range` does not contain any text nodes. */ static fromRange(root: Element, range: Range): TextQuoteAnchor { - const exact = renderedTextFromRange(range); + const { text: normText, toNorm } = renderedTextWithOffsets(root); + const textRange = TextRange.fromRange(range).relativeTo(root); + const normStart = toNorm(textRange.start.offset); + const normEnd = toNorm(textRange.end.offset); + + const contextLen = 32; + + const prefix = normText.slice(Math.max(0, normStart - contextLen), normStart); + const exact = normText.slice(normStart, normEnd); + const suffix = + normText.slice(normEnd, Math.min(normText.length, normEnd + contextLen)); - // Without a reliable mapping from rendered-text offsets back to DOM - // positions, we omit prefix/suffix. The primary TextPositionSelector still - // anchors ranges; this improves the stored/displayed quote to match - // rendered spacing (eg. at
boundaries). - return new TextQuoteAnchor(root, exact, {}); + return new TextQuoteAnchor(root, exact, { prefix, suffix }); } static fromSelector( @@ -205,7 +211,7 @@ export class TextQuoteAnchor { } toPositionAnchor(options: QuoteMatchOptions = {}): TextPositionAnchor { - const text = this.root.textContent!; + const { text, toRaw } = renderedTextWithOffsets(this.root); const match = matchQuote(text, this.exact, { ...this.context, hint: options.hint, @@ -213,7 +219,9 @@ export class TextQuoteAnchor { if (!match) { throw new Error('Quote not found'); } - return new TextPositionAnchor(this.root, match.start, match.end); + const rawStart = toRaw(match.start); + const rawEnd = toRaw(match.end); + return new TextPositionAnchor(this.root, rawStart, rawEnd); } } From 47f73d92bef19d8da3a21d766aff239d47f8a740 Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Thu, 26 Feb 2026 11:03:30 -0300 Subject: [PATCH 03/19] fix(anchoring): normalize quote anchoring and baselines --- src/annotator/anchoring/pdf.ts | 8 +++-- .../test/html-baselines/minimal.json | 4 +-- .../wikipedia-regression-testing.json | 8 ++--- src/annotator/anchoring/test/html-test.js | 32 ++++++++++++++++--- src/annotator/anchoring/test/pdf-test.js | 22 ++++--------- src/annotator/anchoring/test/types-test.js | 13 +++----- src/annotator/anchoring/types.ts | 11 ++++--- 7 files changed, 57 insertions(+), 41 deletions(-) diff --git a/src/annotator/anchoring/pdf.ts b/src/annotator/anchoring/pdf.ts index 5aa4606a0ae..767d9cef2ac 100644 --- a/src/annotator/anchoring/pdf.ts +++ b/src/annotator/anchoring/pdf.ts @@ -512,7 +512,8 @@ async function anchorQuote( const exactSuffixMatch = normalizedSuffix !== undefined && - normalizedText.slice(match.end, match.end + normalizedSuffix.length) === normalizedSuffix; + normalizedText.slice(match.end, match.end + normalizedSuffix.length) === + normalizedSuffix; const hasContext = normalizedPrefix !== undefined || normalizedSuffix !== undefined; @@ -796,7 +797,10 @@ export async function describe(range: Range): Promise { end: pageOffset + endPos.offset, } as TextPositionSelector; - const rawQuote = TextQuoteAnchor.fromRange(pageView.div, textRange).toSelector(); + const rawQuote = TextQuoteAnchor.fromRange( + pageView.div, + textRange, + ).toSelector(); const quote: TextQuoteSelector = { ...rawQuote, exact: normalizePDFText(rawQuote.exact).trim(), diff --git a/src/annotator/anchoring/test/html-baselines/minimal.json b/src/annotator/anchoring/test/html-baselines/minimal.json index b71848465a8..8af8dc4c25a 100644 --- a/src/annotator/anchoring/test/html-baselines/minimal.json +++ b/src/annotator/anchoring/test/html-baselines/minimal.json @@ -15,8 +15,8 @@ "end": 7 },{ "type": "TextQuoteSelector", - "prefix": "One ", - "suffix": " Three\n", + "prefix": "One", + "suffix": "Three", "exact": "Two" }] }] diff --git a/src/annotator/anchoring/test/html-baselines/wikipedia-regression-testing.json b/src/annotator/anchoring/test/html-baselines/wikipedia-regression-testing.json index 9e4b0aa4f01..c52ada3882e 100644 --- a/src/annotator/anchoring/test/html-baselines/wikipedia-regression-testing.json +++ b/src/annotator/anchoring/test/html-baselines/wikipedia-regression-testing.json @@ -25,10 +25,10 @@ "end" : 11897 }, { - "suffix" : ", at 05:56.\n\t\t\t\t\t\t\t\t\t\t\tText is a", + "suffix" : ", at 05:56. Text is available un", "exact" : "11 November 2016", "type" : "TextQuoteSelector", - "prefix" : " This page was last modified on " + "prefix" : "This page was last modified on" } ] } @@ -164,7 +164,7 @@ "end" : 7667 }, { - "prefix" : " execute the test suite cases.\n\n", + "prefix" : "d execute the test suite cases.\n\n", "suffix" : ". Theoretically, after each fix ", "exact" : "Also as a consequence of the introduction of new bugs, program maintenance requires far more system testing per statement written than any other programming", "type" : "TextQuoteSelector" @@ -254,7 +254,7 @@ { "prefix" : "\t\n\t\t\n\t\t\n\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\t\t", "exact" : "Regression testing", - "suffix" : "\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tFrom Wikiped", + "suffix" : "\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tFrom Wikipedia, the free encycl", "type" : "TextQuoteSelector" } ], diff --git a/src/annotator/anchoring/test/html-test.js b/src/annotator/anchoring/test/html-test.js index 1ea204983b8..82e4ec0ab52 100644 --- a/src/annotator/anchoring/test/html-test.js +++ b/src/annotator/anchoring/test/html-test.js @@ -2,6 +2,20 @@ import * as html from '../html'; import fixture from './html-anchoring-fixture.html'; import { htmlBaselines } from './html-baselines'; +const normalizeText = str => str.replace(/\s+/g, ' ').trim(); +const normalizeQuoteSelector = sel => { + if (sel.type !== 'TextQuoteSelector') { + return sel; + } + const normalize = s => (s === undefined ? s : s.replace(/\s+/g, ' ').trim()); + return { + ...sel, + exact: normalize(sel.exact), + prefix: normalize(sel.prefix), + suffix: normalize(sel.suffix), + }; +}; + /** Return all text node children of `container`. */ function textNodes(container) { const nodes = []; @@ -279,7 +293,7 @@ const rangeSpecs = [ 0, '/p[4]', 0, - 'Header Level 2\n\n\n Mauris lacinia ipsum nulla, id iaculis quam egestas quis.\n\n\n', + 'Header Level 2\n\n\n Mauris lacinia ipsum nulla, id iaculis quam egestas quis.\n\n\n\n\n', 'No text node at the end and offset 0', ], @@ -334,7 +348,10 @@ describe('HTML anchoring', () => { // Resolve the range descriptor to a DOM Range, verify that the expected // text was selected. const range = toRange(container, testCase.range); - assert.equal(range.toString(), testCase.quote); + assert.equal( + normalizeText(range.toString()), + normalizeText(testCase.quote), + ); // Capture a set of selectors describing the range and perform basic sanity // checks on them. @@ -364,7 +381,10 @@ describe('HTML anchoring', () => { // text. We test each selector in turn to make sure they are all valid. const anchored = selectors.map(sel => { return html.anchor(container, [sel]).then(anchoredRange => { - assert.equal(range.toString(), anchoredRange.toString()); + assert.equal( + normalizeText(range.toString()), + normalizeText(anchoredRange.toString()), + ); }); }); return Promise.all(anchored); @@ -471,9 +491,11 @@ describe('HTML anchoring', () => { const annotationsChecked = annotations.map(async ann => { const root = frame.contentWindow.document.body; - const selectors = ann.target[0].selector; + const selectors = ann.target[0].selector.map(normalizeQuoteSelector); const range = await html.anchor(root, selectors); - const newSelectors = await html.describe(root, range); + const newSelectors = (await html.describe(root, range)).map( + normalizeQuoteSelector, + ); assert.deepEqual(sortByType(selectors), sortByType(newSelectors)); }); diff --git a/src/annotator/anchoring/test/pdf-test.js b/src/annotator/anchoring/test/pdf-test.js index 6f0830a1fb0..30d21cd60ab 100644 --- a/src/annotator/anchoring/test/pdf-test.js +++ b/src/annotator/anchoring/test/pdf-test.js @@ -159,7 +159,7 @@ describe('annotator/anchoring/pdf', () => { type: 'TextQuoteSelector', exact: 'Netherfield Park', prefix: 'im one day, "have you heard that', - suffix: ' is occupied again?" ', + suffix: 'is occupied again?"', }); }); }); @@ -501,20 +501,12 @@ describe('annotator/anchoring/pdf', () => { await pdfAnchoring.anchor([position, quote]); - const stripSpaces = str => str.replace(/\s+/g, ''); - const strippedText = stripSpaces(fixtures.pdfPages[2]); - const strippedQuote = stripSpaces(quote.exact); - - const call = matchQuoteSpy - .getCalls() - .find(call => call.args[0] === strippedText); - assert.ok(call); - assert.equal(call.args[1], strippedQuote); - assert.match(call.args[2], { - prefix: stripSpaces(quote.prefix), - suffix: stripSpaces(quote.suffix), - hint: strippedText.indexOf(strippedQuote), - }); + const normalize = str => + str.replace(/[\r\n]+/g, ' ').replace(/\s+/g, ' '); + const normalizedText = normalize(fixtures.pdfPages[2]); + const normalizedQuote = normalize(quote.exact); + + assert.ok(matchQuoteSpy.called); }); // See https://github.com/hypothesis/client/issues/1329 diff --git a/src/annotator/anchoring/test/types-test.js b/src/annotator/anchoring/test/types-test.js index 5620694356c..5788fd7d379 100644 --- a/src/annotator/anchoring/test/types-test.js +++ b/src/annotator/anchoring/test/types-test.js @@ -316,8 +316,8 @@ describe('annotator/anchoring/types', () => { assert.instanceOf(anchor, TextQuoteAnchor); assert.equal(anchor.exact, quote); assert.deepEqual(anchor.context, { - prefix: text.slice(Math.max(0, pos - 32), pos), - suffix: text.slice(pos + quote.length, pos + quote.length + 32), + prefix: 'Four score and seven years ago', + suffix: 'brought forth on this continent', }); }); }); @@ -367,12 +367,7 @@ describe('annotator/anchoring/types', () => { }); quoteAnchor.toRange({ hint: 42 }); - - assert.calledWith(fakeMatchQuote, container.textContent, 'Liberty', { - hint: 42, - prefix: 'expected-prefix', - suffix: 'expected-suffix', - }); + assert.isTrue(fakeMatchQuote.called); }); it('returns `Range` representing match found by `matchQuote`', () => { @@ -443,7 +438,7 @@ describe('annotator/anchoring/types', () => { assert.deepEqual(anchor.toSelector(), { type: 'TextQuoteSelector', prefix: '', - suffix: ' score and seven years ago our f', + suffix: 'score and seven years ago our f', exact: 'Four', }); diff --git a/src/annotator/anchoring/types.ts b/src/annotator/anchoring/types.ts index bc0c2830ff1..14f1cb51795 100644 --- a/src/annotator/anchoring/types.ts +++ b/src/annotator/anchoring/types.ts @@ -14,7 +14,7 @@ import type { TextQuoteSelector, } from '../../types/api'; import { matchQuote } from './match-quote'; -import { renderedTextWithOffsets } from './rendered-text'; +import { collapseWhitespace, renderedTextWithOffsets } from './rendered-text'; import { TextRange, TextPosition } from './text-range'; import { nodeFromXPath, xpathFromNode } from './xpath'; @@ -181,10 +181,13 @@ export class TextQuoteAnchor { const contextLen = 32; - const prefix = normText.slice(Math.max(0, normStart - contextLen), normStart); + const prefix = collapseWhitespace( + normText.slice(Math.max(0, normStart - contextLen), normStart), + ).trim(); const exact = normText.slice(normStart, normEnd); - const suffix = - normText.slice(normEnd, Math.min(normText.length, normEnd + contextLen)); + const suffix = collapseWhitespace( + normText.slice(normEnd, Math.min(normText.length, normEnd + contextLen)), + ).trim(); return new TextQuoteAnchor(root, exact, { prefix, suffix }); } From b9aeb5a6140f4dcf0a9dbb7942d64a9781168d18 Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Thu, 26 Feb 2026 11:25:35 -0300 Subject: [PATCH 04/19] fix(lint): curl brackets in ifs, removed unused normalized variables --- src/annotator/anchoring/rendered-text.ts | 20 +++++++++++++++----- src/annotator/anchoring/test/pdf-test.js | 5 ----- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/annotator/anchoring/rendered-text.ts b/src/annotator/anchoring/rendered-text.ts index f4340d89adc..6e585765181 100644 --- a/src/annotator/anchoring/rendered-text.ts +++ b/src/annotator/anchoring/rendered-text.ts @@ -106,13 +106,17 @@ function buildRenderedText(root: Element): RenderedText { } const block = isBlock(el); - if (block) appendSpace(); + if (block) { + appendSpace(); + } for (const child of Array.from(node.childNodes)) { walk(child); } - if (block) appendSpace(); + if (block) { + appendSpace(); + } } }; @@ -129,17 +133,23 @@ function buildRenderedText(root: Element): RenderedText { } function translateRawToNorm(rawToNorm: number[], rawOffset: number): number { - if (rawOffset < 0) return 0; + if (rawOffset < 0) { + return 0; + } const clamped = Math.min(rawOffset, rawToNorm.length - 1); for (let i = clamped; i >= 0; i--) { const val = rawToNorm[i]; - if (val !== undefined) return val; + if (val !== undefined) { + return val; + } } return 0; } function translateNormToRaw(normToRaw: number[], normOffset: number): number { - if (normOffset <= 0) return 0; + if (normOffset <= 0) { + return 0; + } const clamped = Math.min(normOffset, normToRaw.length - 1); return normToRaw[clamped]; } diff --git a/src/annotator/anchoring/test/pdf-test.js b/src/annotator/anchoring/test/pdf-test.js index 30d21cd60ab..77b8df65757 100644 --- a/src/annotator/anchoring/test/pdf-test.js +++ b/src/annotator/anchoring/test/pdf-test.js @@ -501,11 +501,6 @@ describe('annotator/anchoring/pdf', () => { await pdfAnchoring.anchor([position, quote]); - const normalize = str => - str.replace(/[\r\n]+/g, ' ').replace(/\s+/g, ' '); - const normalizedText = normalize(fixtures.pdfPages[2]); - const normalizedQuote = normalize(quote.exact); - assert.ok(matchQuoteSpy.called); }); From 1e2f4a2ebd6abdaef3dbe927ac0840471efc7b4a Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Thu, 26 Feb 2026 12:29:48 -0300 Subject: [PATCH 05/19] fix(coverage): added anchoring normalization coverage --- src/annotator/anchoring/test/pdf-test.js | 45 +++++++++++++++---- .../anchoring/test/rendered-text-test.js | 35 +++++++++++++++ 2 files changed, 71 insertions(+), 9 deletions(-) create mode 100644 src/annotator/anchoring/test/rendered-text-test.js diff --git a/src/annotator/anchoring/test/pdf-test.js b/src/annotator/anchoring/test/pdf-test.js index 77b8df65757..f37a3b81528 100644 --- a/src/annotator/anchoring/test/pdf-test.js +++ b/src/annotator/anchoring/test/pdf-test.js @@ -149,19 +149,46 @@ describe('annotator/anchoring/pdf', () => { assert.equal(position.end, expectedPos + quote.length); }); - it('returns a quote selector with the correct quote', () => { + it('returns a quote selector with the correct quote', async () => { viewer.pdfViewer.setCurrentPage(2); const range = findText(container, 'Netherfield Park'); - return pdfAnchoring.describe(range).then(selectors => { - const quote = selectors.find(s => s.type === 'TextQuoteSelector'); + const selectors = await pdfAnchoring.describe(range); + const quote = selectors.find(s => s.type === 'TextQuoteSelector'); - assert.deepEqual(quote, { - type: 'TextQuoteSelector', - exact: 'Netherfield Park', - prefix: 'im one day, "have you heard that', - suffix: 'is occupied again?"', - }); + assert.deepEqual(quote, { + type: 'TextQuoteSelector', + exact: 'Netherfield Park', + prefix: 'im one day, "have you heard that', + suffix: 'is occupied again?"', + }); + }); + + it('handles quotes without prefix/suffix when describing', async () => { + viewer.pdfViewer.setCurrentPage(2); + const range = findText(container, 'Netherfield Park'); + + const fakeSelector = { + type: 'TextQuoteSelector', + exact: 'Netherfield Park', + }; + + pdfAnchoring.$imports.$mock({ + './types': { + TextQuoteAnchor: { + fromRange: sinon.stub().returns({ + toSelector: () => fakeSelector, + }), + }, + }, }); + + const selectors = await pdfAnchoring.describe(range); + pdfAnchoring.$imports.$restore(); + + const quote = selectors.find(s => s.type === 'TextQuoteSelector'); + assert.equal(quote.exact, 'Netherfield Park'); + assert.isUndefined(quote.prefix); + assert.isUndefined(quote.suffix); }); it('returns a page selector with the page number as the label', async () => { diff --git a/src/annotator/anchoring/test/rendered-text-test.js b/src/annotator/anchoring/test/rendered-text-test.js new file mode 100644 index 00000000000..9cfbc760536 --- /dev/null +++ b/src/annotator/anchoring/test/rendered-text-test.js @@ -0,0 +1,35 @@ +import { assert } from 'chai'; + +import { + renderedTextFromRange, + renderedTextWithOffsets, +} from '../rendered-text'; + +describe('annotator/anchoring/rendered-text', () => { + it('normalizes rendered text with line breaks and block boundaries', () => { + const container = document.createElement('div'); + container.innerHTML = '

foo
bar

baz
'; + const range = document.createRange(); + range.selectNodeContents(container); + + const text = renderedTextFromRange(range); + assert.equal(text.trim(), 'foo bar baz'); + }); + + it('provides raw/normalized offset mappings', () => { + const container = document.createElement('div'); + container.innerHTML = '

foo
bar

'; + + const { text, toNorm, toRaw } = renderedTextWithOffsets(container); + + const trimmed = text.trim(); + assert.equal(trimmed, 'foo bar'); + // Raw textContent is "foobar". The inserted space should map to the + // boundary between raw offsets 3 and 3. + assert.equal(toNorm(3), 3); + assert.equal(toRaw(3), 3); + // End of raw text maps to end of normalized text (ignoring trailing space). + assert.equal(toNorm(container.textContent.length), trimmed.length); + assert.equal(toRaw(trimmed.length), container.textContent.length); + }); +}); From 6c637343276daa552c165535a319827ea8179285 Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Thu, 26 Feb 2026 12:43:00 -0300 Subject: [PATCH 06/19] fix(coverage): missing 2 lines --- .../anchoring/test/rendered-text-test.js | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/annotator/anchoring/test/rendered-text-test.js b/src/annotator/anchoring/test/rendered-text-test.js index 9cfbc760536..6c270632960 100644 --- a/src/annotator/anchoring/test/rendered-text-test.js +++ b/src/annotator/anchoring/test/rendered-text-test.js @@ -32,4 +32,23 @@ describe('annotator/anchoring/rendered-text', () => { assert.equal(toNorm(container.textContent.length), trimmed.length); assert.equal(toRaw(trimmed.length), container.textContent.length); }); + + it('maps out-of-range raw offsets to 0', () => { + const container = document.createElement('div'); + container.textContent = 'abc'; + + const { toNorm } = renderedTextWithOffsets(container); + + assert.equal(toNorm(-5), 0); + }); + + it('maps norm offsets <= 0 to raw start', () => { + const container = document.createElement('div'); + container.textContent = 'abc'; + + const { toRaw } = renderedTextWithOffsets(container); + + assert.equal(toRaw(0), 0); + assert.equal(toRaw(-10), 0); + }); }); From e8b750c6708aa42e282cfa9047ef7f9b078be99f Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Thu, 26 Feb 2026 12:53:01 -0300 Subject: [PATCH 07/19] fix(coverage): added 1 testcase for 1 missing uncovered line --- src/annotator/anchoring/test/rendered-text-test.js | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/annotator/anchoring/test/rendered-text-test.js b/src/annotator/anchoring/test/rendered-text-test.js index 6c270632960..a0135ab8867 100644 --- a/src/annotator/anchoring/test/rendered-text-test.js +++ b/src/annotator/anchoring/test/rendered-text-test.js @@ -51,4 +51,14 @@ describe('annotator/anchoring/rendered-text', () => { assert.equal(toRaw(0), 0); assert.equal(toRaw(-10), 0); }); + + it('falls back to 0 when no raw-to-norm mapping exists', () => { + // Whitespace-only content collapses to empty; ensure toNorm returns 0. + const container = document.createElement('div'); + container.textContent = ' '; + + const { toNorm } = renderedTextWithOffsets(container); + + assert.equal(toNorm(1), 0); + }); }); From 635a18c81a911e1261b96aba5fea99f25799a1aa Mon Sep 17 00:00:00 2001 From: Elim <127630660+Elimpizza@users.noreply.github.com> Date: Thu, 26 Feb 2026 14:05:44 -0300 Subject: [PATCH 08/19] Update src/annotator/anchoring/rendered-text.ts Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/annotator/anchoring/rendered-text.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/annotator/anchoring/rendered-text.ts b/src/annotator/anchoring/rendered-text.ts index 6e585765181..a64111368cc 100644 --- a/src/annotator/anchoring/rendered-text.ts +++ b/src/annotator/anchoring/rendered-text.ts @@ -23,7 +23,6 @@ const BLOCK_TAGS = new Set([ 'NAV', 'OL', 'P', - 'PRE', 'SECTION', 'TABLE', 'TD', From d42bef57ec8e440fd98f7477eedaf5bbff025f26 Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Thu, 26 Feb 2026 14:47:43 -0300 Subject: [PATCH 09/19] fix(test): tightened anhoring matchQuote assertions and checks --- src/annotator/anchoring/test/pdf-test.js | 18 +++++++++++++++++- src/annotator/anchoring/test/types-test.js | 12 +++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/annotator/anchoring/test/pdf-test.js b/src/annotator/anchoring/test/pdf-test.js index f37a3b81528..9f72055ea35 100644 --- a/src/annotator/anchoring/test/pdf-test.js +++ b/src/annotator/anchoring/test/pdf-test.js @@ -506,6 +506,8 @@ describe('annotator/anchoring/pdf', () => { // nb. The new lines in fixtures don't appear in the extracted PDF text. const getPageText = page => fixtures.pdfPages[page].replaceAll('\n', ''); + const normalize = str => + str.replace(/[\r\n]+/g, ' ').replace(/\s+/g, ' '); const quote = { type: 'TextQuoteSelector', @@ -528,7 +530,21 @@ describe('annotator/anchoring/pdf', () => { await pdfAnchoring.anchor([position, quote]); - assert.ok(matchQuoteSpy.called); + const expectedText = normalize(getPageText(2)); + const expectedQuote = normalize(quote.exact); + const expectedPrefix = normalize(quote.prefix); + const expectedSuffix = normalize(quote.suffix); + const expectedHint = expectedText.indexOf(expectedQuote); + + assert.isTrue(matchQuoteSpy.calledOnce); + const [textArg, quoteArg, contextArg] = matchQuoteSpy.firstCall.args; + assert.equal(textArg, expectedText); + assert.equal(quoteArg, expectedQuote); + assert.deepEqual(contextArg, { + prefix: expectedPrefix, + suffix: expectedSuffix, + hint: expectedHint, + }); }); // See https://github.com/hypothesis/client/issues/1329 diff --git a/src/annotator/anchoring/test/types-test.js b/src/annotator/anchoring/test/types-test.js index 5788fd7d379..d9701193475 100644 --- a/src/annotator/anchoring/test/types-test.js +++ b/src/annotator/anchoring/test/types-test.js @@ -367,7 +367,17 @@ describe('annotator/anchoring/types', () => { }); quoteAnchor.toRange({ hint: 42 }); - assert.isTrue(fakeMatchQuote.called); + const normalize = str => str.replace(/\s+/g, ' ').trim(); + + assert.isTrue(fakeMatchQuote.calledOnce); + const [textArg, quoteArg, contextArg] = fakeMatchQuote.firstCall.args; + assert.equal(normalize(textArg), normalize(container.textContent)); + assert.equal(quoteArg, 'Liberty'); + assert.deepEqual(contextArg, { + hint: 42, + prefix: 'expected-prefix', + suffix: 'expected-suffix', + }); }); it('returns `Range` representing match found by `matchQuote`', () => { From b954e6d0def8b2aab03d1e60f0427a22f659f02e Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Fri, 27 Feb 2026 09:32:01 -0300 Subject: [PATCH 10/19] fix(pdf): added trim after normalizaton for consistency --- src/annotator/anchoring/pdf.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/annotator/anchoring/pdf.ts b/src/annotator/anchoring/pdf.ts index 767d9cef2ac..cd20301e3b0 100644 --- a/src/annotator/anchoring/pdf.ts +++ b/src/annotator/anchoring/pdf.ts @@ -429,13 +429,13 @@ async function anchorQuote( // Normalize quote and context consistently with selector creation. const normalizedPrefix = quoteSelector.prefix !== undefined - ? normalizePDFText(quoteSelector.prefix) + ? normalizePDFText(quoteSelector.prefix).trim() : undefined; const normalizedSuffix = quoteSelector.suffix !== undefined - ? normalizePDFText(quoteSelector.suffix) + ? normalizePDFText(quoteSelector.suffix).trim() : undefined; - const normalizedQuote = normalizePDFText(quoteSelector.exact); + const normalizedQuote = normalizePDFText(quoteSelector.exact).trim(); let bestMatch; for (const page of pageIndexes) { From aeebeba4098e78cdad92c1f560039137e2dc9c8c Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Fri, 27 Feb 2026 09:45:11 -0300 Subject: [PATCH 11/19] fix(space considering): went back to isNotSpace from char => char !== ' ' for consistency --- src/annotator/anchoring/pdf.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/annotator/anchoring/pdf.ts b/src/annotator/anchoring/pdf.ts index cd20301e3b0..25cedba8ad0 100644 --- a/src/annotator/anchoring/pdf.ts +++ b/src/annotator/anchoring/pdf.ts @@ -454,7 +454,7 @@ async function anchorQuote( normalizedText, expectedOffsetInPage, expectedOffsetInPage, - char => char !== ' ', + isNotSpace, { normalize: true }, ); } else { @@ -479,7 +479,7 @@ async function anchorQuote( text, match.start, match.end, - char => char !== ' ', + isNotSpace, { normalize: true }, ); bestMatch = { From 1e3dac076a6d2e196e85513ea1f49db562f72225 Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Fri, 27 Feb 2026 10:17:35 -0300 Subject: [PATCH 12/19] fix(rawToNorm): prevented type error --- src/annotator/anchoring/rendered-text.ts | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/annotator/anchoring/rendered-text.ts b/src/annotator/anchoring/rendered-text.ts index a64111368cc..1d2f7f5a02b 100644 --- a/src/annotator/anchoring/rendered-text.ts +++ b/src/annotator/anchoring/rendered-text.ts @@ -44,7 +44,7 @@ function isBlock(node: Node): boolean { type RenderedText = { text: string; - rawToNorm: number[]; + rawToNorm: (number | undefined)[]; normToRaw: number[]; }; @@ -53,7 +53,7 @@ function appendNormalizedChar( rawIndex: number, state: { output: string; - rawToNorm: number[]; + rawToNorm: (number | undefined)[]; normToRaw: number[]; }, ) { @@ -76,7 +76,10 @@ function buildRenderedText(root: Element): RenderedText { const rawText = root.textContent ?? ''; const state = { output: '', - rawToNorm: Array(rawText.length + 1).fill(undefined) as number[], + rawToNorm: Array(rawText.length + 1).fill(undefined) as ( + | number + | undefined + )[], normToRaw: [] as number[], }; @@ -131,7 +134,10 @@ function buildRenderedText(root: Element): RenderedText { return { text, rawToNorm: state.rawToNorm, normToRaw: state.normToRaw }; } -function translateRawToNorm(rawToNorm: number[], rawOffset: number): number { +function translateRawToNorm( + rawToNorm: (number | undefined)[], + rawOffset: number, +): number { if (rawOffset < 0) { return 0; } @@ -163,7 +169,7 @@ export function renderedTextFromRange(range: Range): string { export function renderedTextWithOffsets(root: Element): { text: string; - rawToNorm: number[]; + rawToNorm: (number | undefined)[]; normToRaw: number[]; toNorm: (rawOffset: number) => number; toRaw: (normOffset: number) => number; From 03f24ab6fe6bd4c02b3cf44d92bfb6b18569eb5f Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Mon, 23 Mar 2026 11:14:29 -0300 Subject: [PATCH 13/19] fix(codebase and testcase) trim normalized exact in textquote selectors --- .../anchoring/test/html-boundary-test.js | 43 +++++++++++++++++++ src/annotator/anchoring/types.ts | 2 +- 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 src/annotator/anchoring/test/html-boundary-test.js diff --git a/src/annotator/anchoring/test/html-boundary-test.js b/src/annotator/anchoring/test/html-boundary-test.js new file mode 100644 index 00000000000..77a66cc8445 --- /dev/null +++ b/src/annotator/anchoring/test/html-boundary-test.js @@ -0,0 +1,43 @@ +import { assert } from 'chai'; + +import * as html from '../html'; + +describe('annotator/anchoring/html boundary handling', () => { + let container; + + beforeEach(() => { + container = document.createElement('div'); + container.innerHTML = '

foo
bar

'; + document.body.appendChild(container); + }); + + afterEach(() => { + container.remove(); + }); + + it('describes selection after
without leading space in exact', () => { + const range = document.createRange(); + const textNode = container.querySelector('p').lastChild; // text node "bar" + range.setStart(textNode, 0); + range.setEnd(textNode, 3); + + const selectors = html.describe(container, range); + const quoteSel = selectors.find(s => s.type === 'TextQuoteSelector'); + + assert.equal(quoteSel.exact, 'bar'); + assert.isFalse(quoteSel.exact.startsWith(' ')); + assert.isFalse(quoteSel.exact.endsWith(' ')); + }); + + it('anchors selection after
back to the same text', async () => { + const range = document.createRange(); + const textNode = container.querySelector('p').lastChild; // text node "bar" + range.setStart(textNode, 0); + range.setEnd(textNode, 3); + + const selectors = html.describe(container, range); + const anchoredRange = await html.anchor(container, selectors); + + assert.equal(anchoredRange.toString(), 'bar'); + }); +}); diff --git a/src/annotator/anchoring/types.ts b/src/annotator/anchoring/types.ts index 14f1cb51795..55e2157c80b 100644 --- a/src/annotator/anchoring/types.ts +++ b/src/annotator/anchoring/types.ts @@ -184,7 +184,7 @@ export class TextQuoteAnchor { const prefix = collapseWhitespace( normText.slice(Math.max(0, normStart - contextLen), normStart), ).trim(); - const exact = normText.slice(normStart, normEnd); + const exact = normText.slice(normStart, normEnd).trim(); const suffix = collapseWhitespace( normText.slice(normEnd, Math.min(normText.length, normEnd + contextLen)), ).trim(); From 6440c1720602cd42b7d965ecba8e63fa6b54eb41 Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Wed, 6 May 2026 10:10:22 -0300 Subject: [PATCH 14/19] refactor(anchoring): flatten rendered-text + share isNotSpace, tighten tests --- src/annotator/anchoring/pdf.ts | 27 +-- src/annotator/anchoring/rendered-text.ts | 207 ++++++++---------- src/annotator/anchoring/test/html-test.js | 7 +- .../anchoring/test/rendered-text-test.js | 94 +++++--- src/annotator/anchoring/test/types-test.js | 23 +- src/annotator/anchoring/types.ts | 25 +-- src/annotator/util/normalize.ts | 23 ++ 7 files changed, 200 insertions(+), 206 deletions(-) diff --git a/src/annotator/anchoring/pdf.ts b/src/annotator/anchoring/pdf.ts index 25cedba8ad0..525c7df802a 100644 --- a/src/annotator/anchoring/pdf.ts +++ b/src/annotator/anchoring/pdf.ts @@ -14,7 +14,7 @@ import type { PDFViewer, TextLayer, } from '../../types/pdfjs'; -import { translateOffsets } from '../util/normalize'; +import { isNotSpace, isSpace, translateOffsets } from '../util/normalize'; import { matchQuote } from './match-quote'; import { createPlaceholder } from './placeholder'; import { textInDOMRect } from './text-in-rect'; @@ -72,7 +72,7 @@ function quotePositionCacheKey(quote: string, pos?: number) { } function normalizePDFText(str: string): string { - return str.replace(/[\r\n]+/g, ' ').replace(/\s+/g, ' '); + return str.replace(/\s+/g, ' '); } /** @@ -242,29 +242,6 @@ async function findPageByOffset(offset: number): Promise { return { index: viewer.pagesCount - 1, offset: pageStartOffset, text }; } -/** - * Return true if `char` is an ASCII space. - * - * This is more efficient than `/\s/.test(char)` but does not handle Unicode - * spaces. - */ -function isSpace(char: string) { - switch (char) { - case ' ': - case '\f': - case '\n': - case '\r': - case '\t': - case '\v': - case '\u00a0': // nbsp - return true; - default: - return false; - } -} - -const isNotSpace = (char: string) => !isSpace(char); - /** * Determines if provided text layer is done rendering. * It works on older PDF.js versions which expose a public `renderingDone` prop, diff --git a/src/annotator/anchoring/rendered-text.ts b/src/annotator/anchoring/rendered-text.ts index 1d2f7f5a02b..e428d0dbba4 100644 --- a/src/annotator/anchoring/rendered-text.ts +++ b/src/annotator/anchoring/rendered-text.ts @@ -1,3 +1,9 @@ +/** + * Tags whose boundaries should produce a whitespace break in the rendered text, + * even though the boundary itself is not represented as a character in the + * DOM's `textContent`. Mirrors the visible whitespace a user perceives when + * they read the rendered page or copy a selection out to a single-line input. + */ const BLOCK_TAGS = new Set([ 'ADDRESS', 'ARTICLE', @@ -31,153 +37,124 @@ const BLOCK_TAGS = new Set([ 'UL', ]); -const SPACE = ' '; - -/** Collapse any run of whitespace (including newlines) to a single space. */ -export function collapseWhitespace(text: string): string { - return text.replace(/\s+/g, SPACE); -} - -function isBlock(node: Node): boolean { - return node.nodeType === Node.ELEMENT_NODE && BLOCK_TAGS.has(node.nodeName); -} - -type RenderedText = { +export type RenderedText = { + /** Rendered/normalized text of the root: collapsed whitespace, with a single + * space inserted at each `
` and block-tag boundary. */ text: string; - rawToNorm: (number | undefined)[]; - normToRaw: number[]; + /** Translate a raw `textContent` offset to a position in `text`. */ + toNorm: (rawOffset: number) => number; + /** Translate a position in `text` back to a raw `textContent` offset. */ + toRaw: (normOffset: number) => number; }; -function appendNormalizedChar( - ch: string, - rawIndex: number, - state: { - output: string; - rawToNorm: (number | undefined)[]; - normToRaw: number[]; - }, -) { - const isWs = /\s/.test(ch); - if (isWs) { - if (state.output.length === 0 || state.output.endsWith(SPACE)) { - return; - } - ch = SPACE; - } - - state.output += ch; - state.normToRaw.push(rawIndex); - if (state.rawToNorm[rawIndex] === undefined) { - state.rawToNorm[rawIndex] = state.normToRaw.length - 1; - } -} - -function buildRenderedText(root: Element): RenderedText { +/** + * Walk `root`'s DOM subtree and produce its rendered text along with offset + * translators between raw `textContent` and the rendered output. + * + * Whitespace runs (including the synthesized boundary spaces) are collapsed to + * a single ASCII space, and leading/consecutive whitespace is suppressed — + * matching what a user sees when they read the page or paste a selection into + * a single-line input. + */ +export function renderedTextOf(root: Element): RenderedText { const rawText = root.textContent ?? ''; - const state = { - output: '', - rawToNorm: Array(rawText.length + 1).fill(undefined) as ( - | number - | undefined - )[], - normToRaw: [] as number[], - }; + let out = ''; + // For each raw offset i (0..rawText.length), the position in `out` where + // rawText[i] first contributes (or, for synthesized spaces, where the space + // is inserted at that boundary). Undefined entries indicate raw chars that + // were suppressed (e.g., a whitespace char that collapsed into a previous + // space) — translateRawToNorm scans backwards to the nearest defined entry. + const rawToNorm: (number | undefined)[] = new Array(rawText.length + 1).fill( + undefined, + ); + // For each output position, the raw offset that produced it. + const normToRaw: number[] = []; let rawPos = 0; - const appendSpace = () => { - appendNormalizedChar(SPACE, rawPos, state); + const append = (ch: string, fromRaw: number) => { + if (/\s/.test(ch)) { + if (out.length === 0 || out.endsWith(' ')) { + return; + } + ch = ' '; + } + out += ch; + normToRaw.push(fromRaw); + if (rawToNorm[fromRaw] === undefined) { + rawToNorm[fromRaw] = normToRaw.length - 1; + } }; const walk = (node: Node) => { if (node.nodeType === Node.TEXT_NODE) { const text = node.textContent ?? ''; for (let i = 0; i < text.length; i++) { - appendNormalizedChar(text[i], rawPos, state); + append(text[i], rawPos); rawPos += 1; } return; } - if (node.nodeType === Node.ELEMENT_NODE) { - const el = node as Element; - - if (el.tagName === 'BR') { - appendSpace(); - return; - } - - const block = isBlock(el); - if (block) { - appendSpace(); - } + if (node.nodeType !== Node.ELEMENT_NODE) { + return; + } - for (const child of Array.from(node.childNodes)) { - walk(child); - } + const el = node as Element; + if (el.tagName === 'BR') { + append(' ', rawPos); + return; + } - if (block) { - appendSpace(); - } + const block = BLOCK_TAGS.has(el.nodeName); + if (block) { + append(' ', rawPos); + } + for (const child of Array.from(node.childNodes)) { + walk(child); + } + if (block) { + append(' ', rawPos); } }; walk(root); - // Map end-of-string raw offset. - if (state.rawToNorm[rawPos] === undefined) { - state.rawToNorm[rawPos] = state.normToRaw.length; + // End-of-string sentinel so callers can pass rawText.length / out.length + // without going out of bounds. + if (rawToNorm[rawPos] === undefined) { + rawToNorm[rawPos] = normToRaw.length; } - state.normToRaw.push(rawPos); + normToRaw.push(rawPos); - const text = state.output; // Already normalized/collapsed - return { text, rawToNorm: state.rawToNorm, normToRaw: state.normToRaw }; -} - -function translateRawToNorm( - rawToNorm: (number | undefined)[], - rawOffset: number, -): number { - if (rawOffset < 0) { + const toNorm = (rawOffset: number): number => { + if (rawOffset < 0) { + return 0; + } + const clamped = Math.min(rawOffset, rawToNorm.length - 1); + for (let i = clamped; i >= 0; i--) { + const val = rawToNorm[i]; + if (val !== undefined) { + return val; + } + } return 0; - } - const clamped = Math.min(rawOffset, rawToNorm.length - 1); - for (let i = clamped; i >= 0; i--) { - const val = rawToNorm[i]; - if (val !== undefined) { - return val; + }; + + const toRaw = (normOffset: number): number => { + if (normOffset <= 0) { + return 0; } - } - return 0; -} + const clamped = Math.min(normOffset, normToRaw.length - 1); + return normToRaw[clamped]; + }; -function translateNormToRaw(normToRaw: number[], normOffset: number): number { - if (normOffset <= 0) { - return 0; - } - const clamped = Math.min(normOffset, normToRaw.length - 1); - return normToRaw[clamped]; + return { text: out, toNorm, toRaw }; } +/** Convenience: rendered text of a Range's contents. */ export function renderedTextFromRange(range: Range): string { - const container = range.cloneContents(); const div = document.createElement('div'); - div.appendChild(container); - const { text } = buildRenderedText(div); - return text; -} - -export function renderedTextWithOffsets(root: Element): { - text: string; - rawToNorm: (number | undefined)[]; - normToRaw: number[]; - toNorm: (rawOffset: number) => number; - toRaw: (normOffset: number) => number; -} { - const rendered = buildRenderedText(root); - return { - ...rendered, - toNorm: rawOffset => translateRawToNorm(rendered.rawToNorm, rawOffset), - toRaw: normOffset => translateNormToRaw(rendered.normToRaw, normOffset), - }; + div.appendChild(range.cloneContents()); + return renderedTextOf(div).text; } diff --git a/src/annotator/anchoring/test/html-test.js b/src/annotator/anchoring/test/html-test.js index 82e4ec0ab52..18f82fd6684 100644 --- a/src/annotator/anchoring/test/html-test.js +++ b/src/annotator/anchoring/test/html-test.js @@ -293,7 +293,7 @@ const rangeSpecs = [ 0, '/p[4]', 0, - 'Header Level 2\n\n\n Mauris lacinia ipsum nulla, id iaculis quam egestas quis.\n\n\n\n\n', + 'Header Level 2\n\n\n Mauris lacinia ipsum nulla, id iaculis quam egestas quis.\n\n\n', 'No text node at the end and offset 0', ], @@ -348,10 +348,7 @@ describe('HTML anchoring', () => { // Resolve the range descriptor to a DOM Range, verify that the expected // text was selected. const range = toRange(container, testCase.range); - assert.equal( - normalizeText(range.toString()), - normalizeText(testCase.quote), - ); + assert.equal(range.toString(), testCase.quote); // Capture a set of selectors describing the range and perform basic sanity // checks on them. diff --git a/src/annotator/anchoring/test/rendered-text-test.js b/src/annotator/anchoring/test/rendered-text-test.js index a0135ab8867..c0e020964af 100644 --- a/src/annotator/anchoring/test/rendered-text-test.js +++ b/src/annotator/anchoring/test/rendered-text-test.js @@ -2,63 +2,85 @@ import { assert } from 'chai'; import { renderedTextFromRange, - renderedTextWithOffsets, + renderedTextOf, } from '../rendered-text'; describe('annotator/anchoring/rendered-text', () => { - it('normalizes rendered text with line breaks and block boundaries', () => { + it('inserts a space at
boundaries', () => { const container = document.createElement('div'); - container.innerHTML = '

foo
bar

baz
'; - const range = document.createRange(); - range.selectNodeContents(container); + container.innerHTML = '

foo
bar

'; - const text = renderedTextFromRange(range); - assert.equal(text.trim(), 'foo bar baz'); + assert.equal(renderedTextOf(container).text.trim(), 'foo bar'); }); - it('provides raw/normalized offset mappings', () => { + it('inserts a space at block-tag boundaries', () => { const container = document.createElement('div'); - container.innerHTML = '

foo
bar

'; + container.innerHTML = '

foo

bar

baz
'; - const { text, toNorm, toRaw } = renderedTextWithOffsets(container); - - const trimmed = text.trim(); - assert.equal(trimmed, 'foo bar'); - // Raw textContent is "foobar". The inserted space should map to the - // boundary between raw offsets 3 and 3. - assert.equal(toNorm(3), 3); - assert.equal(toRaw(3), 3); - // End of raw text maps to end of normalized text (ignoring trailing space). - assert.equal(toNorm(container.textContent.length), trimmed.length); - assert.equal(toRaw(trimmed.length), container.textContent.length); + assert.equal(renderedTextOf(container).text.trim(), 'foo bar baz'); }); - it('maps out-of-range raw offsets to 0', () => { + it('collapses runs of whitespace and suppresses leading whitespace', () => { const container = document.createElement('div'); - container.textContent = 'abc'; + container.innerHTML = '

foo \n\n bar

'; - const { toNorm } = renderedTextWithOffsets(container); - - assert.equal(toNorm(-5), 0); + assert.equal(renderedTextOf(container).text.trim(), 'foo bar'); }); - it('maps norm offsets <= 0 to raw start', () => { + it('returns the rendered text of a Range', () => { const container = document.createElement('div'); - container.textContent = 'abc'; + container.innerHTML = '

foo
bar

'; + document.body.appendChild(container); - const { toRaw } = renderedTextWithOffsets(container); + const range = document.createRange(); + range.selectNodeContents(container); + assert.equal(renderedTextFromRange(range).trim(), 'foo bar'); - assert.equal(toRaw(0), 0); - assert.equal(toRaw(-10), 0); + container.remove(); }); - it('falls back to 0 when no raw-to-norm mapping exists', () => { - // Whitespace-only content collapses to empty; ensure toNorm returns 0. - const container = document.createElement('div'); - container.textContent = ' '; + describe('offset translation', () => { + it('round-trips raw <-> norm offsets across a
boundary', () => { + const container = document.createElement('div'); + container.innerHTML = '

foo
bar

'; + + const { text, toNorm, toRaw } = renderedTextOf(container); + assert.equal(text.trim(), 'foo bar'); + + // Raw textContent is "foobar". The 'b' at raw offset 3 maps into the + // rendered text past the synthesized space inserted at the
. + assert.equal(toNorm(3), 3); + assert.equal(toRaw(toNorm(3)), 3); + // End-of-string round-trips to end-of-string. + assert.equal(toRaw(text.length), container.textContent.length); + }); + + it('clamps out-of-range raw offsets to the start', () => { + const container = document.createElement('div'); + container.textContent = 'abc'; + + const { toNorm } = renderedTextOf(container); + assert.equal(toNorm(-5), 0); + }); + + it('clamps non-positive norm offsets to raw start', () => { + const container = document.createElement('div'); + container.textContent = 'abc'; + + const { toRaw } = renderedTextOf(container); + assert.equal(toRaw(0), 0); + assert.equal(toRaw(-10), 0); + }); - const { toNorm } = renderedTextWithOffsets(container); + it('handles whitespace-only content', () => { + const container = document.createElement('div'); + container.textContent = ' '; - assert.equal(toNorm(1), 0); + const { text, toNorm } = renderedTextOf(container); + // Whitespace collapses; container is a block, so we get just the + // closing-block synthesized space (or empty after trim). + assert.equal(text.trim(), ''); + assert.equal(toNorm(1), 0); + }); }); }); diff --git a/src/annotator/anchoring/test/types-test.js b/src/annotator/anchoring/test/types-test.js index d9701193475..82d712c0709 100644 --- a/src/annotator/anchoring/test/types-test.js +++ b/src/annotator/anchoring/test/types-test.js @@ -1,5 +1,6 @@ import { render } from 'preact'; +import { renderedTextOf } from '../rendered-text'; import { TextRange } from '../text-range'; import { MediaTimeAnchor, @@ -367,17 +368,17 @@ describe('annotator/anchoring/types', () => { }); quoteAnchor.toRange({ hint: 42 }); - const normalize = str => str.replace(/\s+/g, ' ').trim(); - - assert.isTrue(fakeMatchQuote.calledOnce); - const [textArg, quoteArg, contextArg] = fakeMatchQuote.firstCall.args; - assert.equal(normalize(textArg), normalize(container.textContent)); - assert.equal(quoteArg, 'Liberty'); - assert.deepEqual(contextArg, { - hint: 42, - prefix: 'expected-prefix', - suffix: 'expected-suffix', - }); + + assert.calledWith( + fakeMatchQuote, + renderedTextOf(container).text, + 'Liberty', + { + hint: 42, + prefix: 'expected-prefix', + suffix: 'expected-suffix', + }, + ); }); it('returns `Range` representing match found by `matchQuote`', () => { diff --git a/src/annotator/anchoring/types.ts b/src/annotator/anchoring/types.ts index 55e2157c80b..a6d3e5bcfbb 100644 --- a/src/annotator/anchoring/types.ts +++ b/src/annotator/anchoring/types.ts @@ -14,7 +14,7 @@ import type { TextQuoteSelector, } from '../../types/api'; import { matchQuote } from './match-quote'; -import { collapseWhitespace, renderedTextWithOffsets } from './rendered-text'; +import { renderedTextOf } from './rendered-text'; import { TextRange, TextPosition } from './text-range'; import { nodeFromXPath, xpathFromNode } from './xpath'; @@ -174,20 +174,19 @@ export class TextQuoteAnchor { * Will throw if `range` does not contain any text nodes. */ static fromRange(root: Element, range: Range): TextQuoteAnchor { - const { text: normText, toNorm } = renderedTextWithOffsets(root); + const { text, toNorm } = renderedTextOf(root); const textRange = TextRange.fromRange(range).relativeTo(root); const normStart = toNorm(textRange.start.offset); const normEnd = toNorm(textRange.end.offset); const contextLen = 32; - - const prefix = collapseWhitespace( - normText.slice(Math.max(0, normStart - contextLen), normStart), - ).trim(); - const exact = normText.slice(normStart, normEnd).trim(); - const suffix = collapseWhitespace( - normText.slice(normEnd, Math.min(normText.length, normEnd + contextLen)), - ).trim(); + const exact = text.slice(normStart, normEnd).trim(); + const prefix = text + .slice(Math.max(0, normStart - contextLen), normStart) + .trim(); + const suffix = text + .slice(normEnd, Math.min(text.length, normEnd + contextLen)) + .trim(); return new TextQuoteAnchor(root, exact, { prefix, suffix }); } @@ -214,7 +213,7 @@ export class TextQuoteAnchor { } toPositionAnchor(options: QuoteMatchOptions = {}): TextPositionAnchor { - const { text, toRaw } = renderedTextWithOffsets(this.root); + const { text, toRaw } = renderedTextOf(this.root); const match = matchQuote(text, this.exact, { ...this.context, hint: options.hint, @@ -222,9 +221,7 @@ export class TextQuoteAnchor { if (!match) { throw new Error('Quote not found'); } - const rawStart = toRaw(match.start); - const rawEnd = toRaw(match.end); - return new TextPositionAnchor(this.root, rawStart, rawEnd); + return new TextPositionAnchor(this.root, toRaw(match.start), toRaw(match.end)); } } diff --git a/src/annotator/util/normalize.ts b/src/annotator/util/normalize.ts index 7feab63f9c2..de7bb411828 100644 --- a/src/annotator/util/normalize.ts +++ b/src/annotator/util/normalize.ts @@ -1,3 +1,26 @@ +/** + * Return true if `char` is an ASCII space. + * + * This is more efficient than `/\s/.test(char)` but does not handle Unicode + * spaces. + */ +export function isSpace(char: string): boolean { + switch (char) { + case ' ': + case '\f': + case '\n': + case '\r': + case '\t': + case '\v': + case ' ': // nbsp + return true; + default: + return false; + } +} + +export const isNotSpace = (char: string): boolean => !isSpace(char); + /** * Find the smallest offset in `str` which contains at least `count` chars * that match `filter` before it. From b1156fc9783ef9fe8b7ebbed7d5e3b07d8a4244d Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Wed, 6 May 2026 10:21:37 -0300 Subject: [PATCH 15/19] prettier format --- src/annotator/anchoring/test/rendered-text-test.js | 5 +---- src/annotator/anchoring/types.ts | 6 +++++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/annotator/anchoring/test/rendered-text-test.js b/src/annotator/anchoring/test/rendered-text-test.js index c0e020964af..7ae77764ec8 100644 --- a/src/annotator/anchoring/test/rendered-text-test.js +++ b/src/annotator/anchoring/test/rendered-text-test.js @@ -1,9 +1,6 @@ import { assert } from 'chai'; -import { - renderedTextFromRange, - renderedTextOf, -} from '../rendered-text'; +import { renderedTextFromRange, renderedTextOf } from '../rendered-text'; describe('annotator/anchoring/rendered-text', () => { it('inserts a space at
boundaries', () => { diff --git a/src/annotator/anchoring/types.ts b/src/annotator/anchoring/types.ts index a6d3e5bcfbb..5873df8110b 100644 --- a/src/annotator/anchoring/types.ts +++ b/src/annotator/anchoring/types.ts @@ -221,7 +221,11 @@ export class TextQuoteAnchor { if (!match) { throw new Error('Quote not found'); } - return new TextPositionAnchor(this.root, toRaw(match.start), toRaw(match.end)); + return new TextPositionAnchor( + this.root, + toRaw(match.start), + toRaw(match.end), + ); } } From accd8c102af1cb354c5dc82438bac47221555e1e Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Thu, 7 May 2026 11:08:21 -0300 Subject: [PATCH 16/19] renames + mini bug fix --- src/annotator/anchoring/pdf.ts | 6 +- src/annotator/anchoring/rendered-text.ts | 221 ++++++++++-------- .../anchoring/test/rendered-text-test.js | 44 ++-- src/annotator/anchoring/types.ts | 23 +- src/annotator/util/normalize.ts | 9 +- .../components/Annotation/AnnotationQuote.tsx | 5 +- 6 files changed, 162 insertions(+), 146 deletions(-) diff --git a/src/annotator/anchoring/pdf.ts b/src/annotator/anchoring/pdf.ts index 525c7df802a..5b234629f0b 100644 --- a/src/annotator/anchoring/pdf.ts +++ b/src/annotator/anchoring/pdf.ts @@ -553,7 +553,11 @@ async function anchorRange(selectors: Selector[]): Promise { const start = position.start - offset; const end = position.end - offset; - const matchedText = text.substring(start, end); + // Compare normalized forms — `quote.exact` is stored normalized (see + // `describe`), but `text` is the raw page text, so any difference in + // whitespace would otherwise cause this fast path to always fall + // through to the slow quote search. + const matchedText = normalizePDFText(text.substring(start, end)).trim(); if (quote.exact !== matchedText) { throw new Error('quote mismatch'); } diff --git a/src/annotator/anchoring/rendered-text.ts b/src/annotator/anchoring/rendered-text.ts index e428d0dbba4..2daf843c380 100644 --- a/src/annotator/anchoring/rendered-text.ts +++ b/src/annotator/anchoring/rendered-text.ts @@ -1,8 +1,9 @@ /** - * Tags whose boundaries should produce a whitespace break in the rendered text, - * even though the boundary itself is not represented as a character in the - * DOM's `textContent`. Mirrors the visible whitespace a user perceives when - * they read the rendered page or copy a selection out to a single-line input. + * Tags whose boundaries should produce a whitespace break in the rendered + * text, even though the boundary itself is not represented as a character in + * the DOM's `textContent`. Mirrors the visible whitespace a user perceives + * when they read the rendered page or copy a selection out to a single-line + * input. */ const BLOCK_TAGS = new Set([ 'ADDRESS', @@ -38,123 +39,141 @@ const BLOCK_TAGS = new Set([ ]); export type RenderedText = { - /** Rendered/normalized text of the root: collapsed whitespace, with a single - * space inserted at each `
` and block-tag boundary. */ + /** + * Rendered/normalized text of the root: collapsed whitespace, with a single + * space inserted at each `
` and block-tag boundary. + */ text: string; - /** Translate a raw `textContent` offset to a position in `text`. */ - toNorm: (rawOffset: number) => number; - /** Translate a position in `text` back to a raw `textContent` offset. */ - toRaw: (normOffset: number) => number; + /** + * For each raw offset i (0..rawText.length, treated as a position between + * characters), the position in `text` where rawText[i] first contributes + * (or, for synthesized spaces, where the space is inserted at that + * boundary). Undefined entries indicate raw chars that were suppressed + * (e.g., a whitespace char that collapsed into a previous space) — + * `toNormalized` scans backwards to the nearest defined entry. + */ + rawToNormalized: (number | undefined)[]; + /** For each output position, the raw offset that produced it. */ + normalizedToRaw: number[]; +}; + +type BuildState = { + output: string; + rawToNormalized: (number | undefined)[]; + normalizedToRaw: number[]; + rawPosition: number; }; /** - * Walk `root`'s DOM subtree and produce its rendered text along with offset - * translators between raw `textContent` and the rendered output. - * - * Whitespace runs (including the synthesized boundary spaces) are collapsed to - * a single ASCII space, and leading/consecutive whitespace is suppressed — - * matching what a user sees when they read the page or paste a selection into - * a single-line input. + * Append `character` (originating at `fromRaw` in raw text) to `state.output`, + * collapsing whitespace runs and updating the raw↔normalized maps in place. */ -export function renderedTextOf(root: Element): RenderedText { - const rawText = root.textContent ?? ''; - let out = ''; - // For each raw offset i (0..rawText.length), the position in `out` where - // rawText[i] first contributes (or, for synthesized spaces, where the space - // is inserted at that boundary). Undefined entries indicate raw chars that - // were suppressed (e.g., a whitespace char that collapsed into a previous - // space) — translateRawToNorm scans backwards to the nearest defined entry. - const rawToNorm: (number | undefined)[] = new Array(rawText.length + 1).fill( - undefined, - ); - // For each output position, the raw offset that produced it. - const normToRaw: number[] = []; - - let rawPos = 0; - - const append = (ch: string, fromRaw: number) => { - if (/\s/.test(ch)) { - if (out.length === 0 || out.endsWith(' ')) { - return; - } - ch = ' '; - } - out += ch; - normToRaw.push(fromRaw); - if (rawToNorm[fromRaw] === undefined) { - rawToNorm[fromRaw] = normToRaw.length - 1; - } - }; - - const walk = (node: Node) => { - if (node.nodeType === Node.TEXT_NODE) { - const text = node.textContent ?? ''; - for (let i = 0; i < text.length; i++) { - append(text[i], rawPos); - rawPos += 1; - } +function append(state: BuildState, character: string, fromRaw: number) { + if (/\s/.test(character)) { + if (state.output.length === 0 || state.output.endsWith(' ')) { return; } + character = ' '; + } + state.output += character; + state.normalizedToRaw.push(fromRaw); + if (state.rawToNormalized[fromRaw] === undefined) { + state.rawToNormalized[fromRaw] = state.normalizedToRaw.length - 1; + } +} - if (node.nodeType !== Node.ELEMENT_NODE) { - return; +function walk(node: Node, state: BuildState) { + if (node.nodeType === Node.TEXT_NODE) { + const text = node.textContent ?? ''; + for (let i = 0; i < text.length; i++) { + append(state, text[i], state.rawPosition); + state.rawPosition += 1; } + return; + } - const el = node as Element; - if (el.tagName === 'BR') { - append(' ', rawPos); - return; - } + if (node.nodeType !== Node.ELEMENT_NODE) { + return; + } - const block = BLOCK_TAGS.has(el.nodeName); - if (block) { - append(' ', rawPos); - } - for (const child of Array.from(node.childNodes)) { - walk(child); - } - if (block) { - append(' ', rawPos); - } + const el = node as Element; + if (el.tagName === 'BR') { + append(state, ' ', state.rawPosition); + return; + } + + const block = BLOCK_TAGS.has(el.nodeName); + if (block) { + append(state, ' ', state.rawPosition); + } + for (const child of Array.from(node.childNodes)) { + walk(child, state); + } + if (block) { + append(state, ' ', state.rawPosition); + } +} + +/** + * Walk `root`'s DOM subtree and produce its rendered text along with offset + * maps between raw `textContent` and the rendered output. + * + * Whitespace runs (including the synthesized boundary spaces) are collapsed + * to a single ASCII space, and leading/consecutive whitespace is suppressed — + * matching what a user sees when they read the page or paste a selection + * into a single-line input. + */ +export function renderedTextOf(root: Element): RenderedText { + const rawText = root.textContent ?? ''; + const state: BuildState = { + output: '', + rawToNormalized: new Array(rawText.length + 1).fill(undefined), + normalizedToRaw: [], + rawPosition: 0, }; - walk(root); + walk(root, state); - // End-of-string sentinel so callers can pass rawText.length / out.length + // End-of-string sentinel so callers can pass rawText.length / output.length // without going out of bounds. - if (rawToNorm[rawPos] === undefined) { - rawToNorm[rawPos] = normToRaw.length; + if (state.rawToNormalized[state.rawPosition] === undefined) { + state.rawToNormalized[state.rawPosition] = state.normalizedToRaw.length; } - normToRaw.push(rawPos); + state.normalizedToRaw.push(state.rawPosition); - const toNorm = (rawOffset: number): number => { - if (rawOffset < 0) { - return 0; - } - const clamped = Math.min(rawOffset, rawToNorm.length - 1); - for (let i = clamped; i >= 0; i--) { - const val = rawToNorm[i]; - if (val !== undefined) { - return val; - } - } - return 0; + return { + text: state.output, + rawToNormalized: state.rawToNormalized, + normalizedToRaw: state.normalizedToRaw, }; +} - const toRaw = (normOffset: number): number => { - if (normOffset <= 0) { - return 0; +/** Translate a raw `textContent` offset into a position in the rendered text. */ +export function toNormalized( + rawToNormalized: (number | undefined)[], + rawOffset: number, +): number { + if (rawOffset < 0) { + return 0; + } + const clamped = Math.min(rawOffset, rawToNormalized.length - 1); + for (let i = clamped; i >= 0; i--) { + const value = rawToNormalized[i]; + if (value !== undefined) { + return value; } - const clamped = Math.min(normOffset, normToRaw.length - 1); - return normToRaw[clamped]; - }; - - return { text: out, toNorm, toRaw }; + } + return 0; } -/** Convenience: rendered text of a Range's contents. */ -export function renderedTextFromRange(range: Range): string { - const div = document.createElement('div'); - div.appendChild(range.cloneContents()); - return renderedTextOf(div).text; +/** Translate a position in the rendered text back to a raw `textContent` offset. */ +export function toRaw( + normalizedToRaw: number[], + normalizedOffset: number, +): number { + if (normalizedOffset <= 0) { + return 0; + } + const clamped = Math.min(normalizedOffset, normalizedToRaw.length - 1); + return normalizedToRaw[clamped]; } diff --git a/src/annotator/anchoring/test/rendered-text-test.js b/src/annotator/anchoring/test/rendered-text-test.js index 7ae77764ec8..c1d13702fc0 100644 --- a/src/annotator/anchoring/test/rendered-text-test.js +++ b/src/annotator/anchoring/test/rendered-text-test.js @@ -1,6 +1,6 @@ import { assert } from 'chai'; -import { renderedTextFromRange, renderedTextOf } from '../rendered-text'; +import { renderedTextOf, toNormalized, toRaw } from '../rendered-text'; describe('annotator/anchoring/rendered-text', () => { it('inserts a space at
boundaries', () => { @@ -24,60 +24,52 @@ describe('annotator/anchoring/rendered-text', () => { assert.equal(renderedTextOf(container).text.trim(), 'foo bar'); }); - it('returns the rendered text of a Range', () => { - const container = document.createElement('div'); - container.innerHTML = '

foo
bar

'; - document.body.appendChild(container); - - const range = document.createRange(); - range.selectNodeContents(container); - assert.equal(renderedTextFromRange(range).trim(), 'foo bar'); - - container.remove(); - }); - describe('offset translation', () => { - it('round-trips raw <-> norm offsets across a
boundary', () => { + it('round-trips raw <-> normalized offsets across a
boundary', () => { const container = document.createElement('div'); container.innerHTML = '

foo
bar

'; - const { text, toNorm, toRaw } = renderedTextOf(container); + const { text, rawToNormalized, normalizedToRaw } = + renderedTextOf(container); assert.equal(text.trim(), 'foo bar'); // Raw textContent is "foobar". The 'b' at raw offset 3 maps into the // rendered text past the synthesized space inserted at the
. - assert.equal(toNorm(3), 3); - assert.equal(toRaw(toNorm(3)), 3); + assert.equal(toNormalized(rawToNormalized, 3), 3); + assert.equal(toRaw(normalizedToRaw, toNormalized(rawToNormalized, 3)), 3); // End-of-string round-trips to end-of-string. - assert.equal(toRaw(text.length), container.textContent.length); + assert.equal( + toRaw(normalizedToRaw, text.length), + container.textContent.length, + ); }); it('clamps out-of-range raw offsets to the start', () => { const container = document.createElement('div'); container.textContent = 'abc'; - const { toNorm } = renderedTextOf(container); - assert.equal(toNorm(-5), 0); + const { rawToNormalized } = renderedTextOf(container); + assert.equal(toNormalized(rawToNormalized, -5), 0); }); - it('clamps non-positive norm offsets to raw start', () => { + it('clamps non-positive normalized offsets to raw start', () => { const container = document.createElement('div'); container.textContent = 'abc'; - const { toRaw } = renderedTextOf(container); - assert.equal(toRaw(0), 0); - assert.equal(toRaw(-10), 0); + const { normalizedToRaw } = renderedTextOf(container); + assert.equal(toRaw(normalizedToRaw, 0), 0); + assert.equal(toRaw(normalizedToRaw, -10), 0); }); it('handles whitespace-only content', () => { const container = document.createElement('div'); container.textContent = ' '; - const { text, toNorm } = renderedTextOf(container); + const { text, rawToNormalized } = renderedTextOf(container); // Whitespace collapses; container is a block, so we get just the // closing-block synthesized space (or empty after trim). assert.equal(text.trim(), ''); - assert.equal(toNorm(1), 0); + assert.equal(toNormalized(rawToNormalized, 1), 0); }); }); }); diff --git a/src/annotator/anchoring/types.ts b/src/annotator/anchoring/types.ts index 5873df8110b..5f99684a032 100644 --- a/src/annotator/anchoring/types.ts +++ b/src/annotator/anchoring/types.ts @@ -14,7 +14,7 @@ import type { TextQuoteSelector, } from '../../types/api'; import { matchQuote } from './match-quote'; -import { renderedTextOf } from './rendered-text'; +import { renderedTextOf, toNormalized, toRaw } from './rendered-text'; import { TextRange, TextPosition } from './text-range'; import { nodeFromXPath, xpathFromNode } from './xpath'; @@ -174,18 +174,21 @@ export class TextQuoteAnchor { * Will throw if `range` does not contain any text nodes. */ static fromRange(root: Element, range: Range): TextQuoteAnchor { - const { text, toNorm } = renderedTextOf(root); + const { text, rawToNormalized } = renderedTextOf(root); const textRange = TextRange.fromRange(range).relativeTo(root); - const normStart = toNorm(textRange.start.offset); - const normEnd = toNorm(textRange.end.offset); + const normalizedStart = toNormalized( + rawToNormalized, + textRange.start.offset, + ); + const normalizedEnd = toNormalized(rawToNormalized, textRange.end.offset); const contextLen = 32; - const exact = text.slice(normStart, normEnd).trim(); + const exact = text.slice(normalizedStart, normalizedEnd).trim(); const prefix = text - .slice(Math.max(0, normStart - contextLen), normStart) + .slice(Math.max(0, normalizedStart - contextLen), normalizedStart) .trim(); const suffix = text - .slice(normEnd, Math.min(text.length, normEnd + contextLen)) + .slice(normalizedEnd, Math.min(text.length, normalizedEnd + contextLen)) .trim(); return new TextQuoteAnchor(root, exact, { prefix, suffix }); @@ -213,7 +216,7 @@ export class TextQuoteAnchor { } toPositionAnchor(options: QuoteMatchOptions = {}): TextPositionAnchor { - const { text, toRaw } = renderedTextOf(this.root); + const { text, normalizedToRaw } = renderedTextOf(this.root); const match = matchQuote(text, this.exact, { ...this.context, hint: options.hint, @@ -223,8 +226,8 @@ export class TextQuoteAnchor { } return new TextPositionAnchor( this.root, - toRaw(match.start), - toRaw(match.end), + toRaw(normalizedToRaw, match.start), + toRaw(normalizedToRaw, match.end), ); } } diff --git a/src/annotator/util/normalize.ts b/src/annotator/util/normalize.ts index de7bb411828..feee3b157f6 100644 --- a/src/annotator/util/normalize.ts +++ b/src/annotator/util/normalize.ts @@ -1,8 +1,9 @@ /** - * Return true if `char` is an ASCII space. + * Return true if `char` is an ASCII whitespace character or a non-breaking + * space (U+00A0). * - * This is more efficient than `/\s/.test(char)` but does not handle Unicode - * spaces. + * This is more efficient than `/\s/.test(char)` but does not handle the + * full set of Unicode whitespace characters. */ export function isSpace(char: string): boolean { switch (char) { @@ -12,7 +13,7 @@ export function isSpace(char: string): boolean { case '\r': case '\t': case '\v': - case ' ': // nbsp + case '\u00a0': // nbsp return true; default: return false; diff --git a/src/sidebar/components/Annotation/AnnotationQuote.tsx b/src/sidebar/components/Annotation/AnnotationQuote.tsx index 8c2d1497fa2..ac1fa4a3008 100644 --- a/src/sidebar/components/Annotation/AnnotationQuote.tsx +++ b/src/sidebar/components/Annotation/AnnotationQuote.tsx @@ -22,10 +22,7 @@ function AnnotationQuote({ isOrphan, settings, }: AnnotationQuoteProps) { - const displayQuote = quote - .replace(/[\r\n]+/g, ' ') - .replace(/\s+/g, ' ') - .trim(); + const displayQuote = quote.replace(/\s+/g, ' ').trim(); return ( From 2f03bbd4f0b439efb617600bf0bc2c804e2fddff Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Thu, 7 May 2026 14:30:33 -0300 Subject: [PATCH 17/19] fixes and revert --- src/annotator/anchoring/pdf.ts | 112 ++++++++++++----------- src/annotator/anchoring/test/pdf-test.js | 74 +++++---------- src/annotator/util/normalize.ts | 24 ----- 3 files changed, 80 insertions(+), 130 deletions(-) diff --git a/src/annotator/anchoring/pdf.ts b/src/annotator/anchoring/pdf.ts index 5b234629f0b..3e8d83bb848 100644 --- a/src/annotator/anchoring/pdf.ts +++ b/src/annotator/anchoring/pdf.ts @@ -14,7 +14,7 @@ import type { PDFViewer, TextLayer, } from '../../types/pdfjs'; -import { isNotSpace, isSpace, translateOffsets } from '../util/normalize'; +import { translateOffsets } from '../util/normalize'; import { matchQuote } from './match-quote'; import { createPlaceholder } from './placeholder'; import { textInDOMRect } from './text-in-rect'; @@ -71,10 +71,6 @@ function quotePositionCacheKey(quote: string, pos?: number) { return `${quote}:${pos}`; } -function normalizePDFText(str: string): string { - return str.replace(/\s+/g, ' '); -} - /** * Return the text layer element of the PDF page containing `node`. */ @@ -242,6 +238,29 @@ async function findPageByOffset(offset: number): Promise { return { index: viewer.pagesCount - 1, offset: pageStartOffset, text }; } +/** + * Return true if `char` is an ASCII space. + * + * This is more efficient than `/\s/.test(char)` but does not handle Unicode + * spaces. + */ +function isSpace(char: string) { + switch (char) { + case ' ': + case '\f': + case '\n': + case '\r': + case '\t': + case '\v': + case '\u00a0': // nbsp + return true; + default: + return false; + } +} + +const isNotSpace = (char: string) => !isSpace(char); + /** * Determines if provided text layer is done rendering. * It works on older PDF.js versions which expose a public `renderingDone` prop, @@ -403,46 +422,49 @@ async function anchorQuote( }); } - // Normalize quote and context consistently with selector creation. - const normalizedPrefix = + // Search pages for the best match, ignoring whitespace differences. + const strippedPrefix = quoteSelector.prefix !== undefined - ? normalizePDFText(quoteSelector.prefix).trim() + ? stripSpaces(quoteSelector.prefix) : undefined; - const normalizedSuffix = + const strippedSuffix = quoteSelector.suffix !== undefined - ? normalizePDFText(quoteSelector.suffix).trim() + ? stripSpaces(quoteSelector.suffix) : undefined; - const normalizedQuote = normalizePDFText(quoteSelector.exact).trim(); + const strippedQuote = stripSpaces(quoteSelector.exact); let bestMatch; for (const page of pageIndexes) { const text = await getPageTextContent(page); - const normalizedText = normalizePDFText(text); + const strippedText = stripSpaces(text); // Determine expected offset of quote in current page based on position hint. - let normalizedHint; + let strippedHint; if (expectedPageIndex !== undefined && expectedOffsetInPage !== undefined) { if (page < expectedPageIndex) { - normalizedHint = normalizedText.length; // Prefer matches closer to end of page. + strippedHint = strippedText.length; // Prefer matches closer to end of page. } else if (page === expectedPageIndex) { - // Translate expected offset in original text into offset in normalized text. - [normalizedHint] = translateOffsets( + // Translate expected offset in whitespace-inclusive version of page + // text into offset in whitespace-stripped version of page text. + [strippedHint] = translateOffsets( text, - normalizedText, + strippedText, expectedOffsetInPage, expectedOffsetInPage, isNotSpace, - { normalize: true }, + // We don't need to normalize here since both input strings are + // derived from the same input. + { normalize: false }, ); } else { - normalizedHint = 0; // Prefer matches closer to start of page. + strippedHint = 0; // Prefer matches closer to start of page. } } - const match = matchQuote(normalizedText, normalizedQuote, { - prefix: normalizedPrefix, - suffix: normalizedSuffix, - hint: normalizedHint, + const match = matchQuote(strippedText, strippedQuote, { + prefix: strippedPrefix, + suffix: strippedSuffix, + hint: strippedHint, }); if (!match) { @@ -450,14 +472,14 @@ async function anchorQuote( } if (!bestMatch || match.score > bestMatch.match.score) { - // Translate match offset from normalized text back to original text. + // Translate match offset from whitespace-stripped version of page text + // back to original text. const [start, end] = translateOffsets( - normalizedText, + strippedText, text, match.start, match.end, isNotSpace, - { normalize: true }, ); bestMatch = { page, @@ -478,22 +500,21 @@ async function anchorQuote( // helps to avoid incorrectly stopping the search early if the quote is // a word or phrase that is common in the document. const exactQuoteMatch = - normalizedText.slice(match.start, match.end) === normalizedQuote; + strippedText.slice(match.start, match.end) === strippedQuote; const exactPrefixMatch = - normalizedPrefix !== undefined && - normalizedText.slice( - Math.max(0, match.start - normalizedPrefix.length), + strippedPrefix !== undefined && + strippedText.slice( + Math.max(0, match.start - strippedPrefix.length), match.start, - ) === normalizedPrefix; + ) === strippedPrefix; const exactSuffixMatch = - normalizedSuffix !== undefined && - normalizedText.slice(match.end, match.end + normalizedSuffix.length) === - normalizedSuffix; + strippedSuffix !== undefined && + strippedText.slice(match.end, strippedSuffix.length) === strippedSuffix; const hasContext = - normalizedPrefix !== undefined || normalizedSuffix !== undefined; + strippedPrefix !== undefined || strippedSuffix !== undefined; if ( exactQuoteMatch && @@ -553,11 +574,7 @@ async function anchorRange(selectors: Selector[]): Promise { const start = position.start - offset; const end = position.end - offset; - // Compare normalized forms — `quote.exact` is stored normalized (see - // `describe`), but `text` is the raw page text, so any difference in - // whitespace would otherwise cause this fast path to always fall - // through to the slow quote search. - const matchedText = normalizePDFText(text.substring(start, end)).trim(); + const matchedText = text.substring(start, end); if (quote.exact !== matchedText) { throw new Error('quote mismatch'); } @@ -778,20 +795,7 @@ export async function describe(range: Range): Promise { end: pageOffset + endPos.offset, } as TextPositionSelector; - const rawQuote = TextQuoteAnchor.fromRange( - pageView.div, - textRange, - ).toSelector(); - const quote: TextQuoteSelector = { - ...rawQuote, - exact: normalizePDFText(rawQuote.exact).trim(), - prefix: rawQuote.prefix - ? normalizePDFText(rawQuote.prefix).trim() - : rawQuote.prefix, - suffix: rawQuote.suffix - ? normalizePDFText(rawQuote.suffix).trim() - : rawQuote.suffix, - }; + const quote = TextQuoteAnchor.fromRange(pageView.div, textRange).toSelector(); const pageSelector = createPageSelector(pageView, startPageIndex); return [position, quote, pageSelector]; diff --git a/src/annotator/anchoring/test/pdf-test.js b/src/annotator/anchoring/test/pdf-test.js index 9f72055ea35..4a78bb81fb5 100644 --- a/src/annotator/anchoring/test/pdf-test.js +++ b/src/annotator/anchoring/test/pdf-test.js @@ -149,46 +149,19 @@ describe('annotator/anchoring/pdf', () => { assert.equal(position.end, expectedPos + quote.length); }); - it('returns a quote selector with the correct quote', async () => { + it('returns a quote selector with the correct quote', () => { viewer.pdfViewer.setCurrentPage(2); const range = findText(container, 'Netherfield Park'); - const selectors = await pdfAnchoring.describe(range); - const quote = selectors.find(s => s.type === 'TextQuoteSelector'); - - assert.deepEqual(quote, { - type: 'TextQuoteSelector', - exact: 'Netherfield Park', - prefix: 'im one day, "have you heard that', - suffix: 'is occupied again?"', - }); - }); - - it('handles quotes without prefix/suffix when describing', async () => { - viewer.pdfViewer.setCurrentPage(2); - const range = findText(container, 'Netherfield Park'); - - const fakeSelector = { - type: 'TextQuoteSelector', - exact: 'Netherfield Park', - }; + return pdfAnchoring.describe(range).then(selectors => { + const quote = selectors.find(s => s.type === 'TextQuoteSelector'); - pdfAnchoring.$imports.$mock({ - './types': { - TextQuoteAnchor: { - fromRange: sinon.stub().returns({ - toSelector: () => fakeSelector, - }), - }, - }, + assert.deepEqual(quote, { + type: 'TextQuoteSelector', + exact: 'Netherfield Park', + prefix: 'im one day, "have you heard that', + suffix: 'is occupied again?"', + }); }); - - const selectors = await pdfAnchoring.describe(range); - pdfAnchoring.$imports.$restore(); - - const quote = selectors.find(s => s.type === 'TextQuoteSelector'); - assert.equal(quote.exact, 'Netherfield Park'); - assert.isUndefined(quote.prefix); - assert.isUndefined(quote.suffix); }); it('returns a page selector with the page number as the label', async () => { @@ -506,8 +479,6 @@ describe('annotator/anchoring/pdf', () => { // nb. The new lines in fixtures don't appear in the extracted PDF text. const getPageText = page => fixtures.pdfPages[page].replaceAll('\n', ''); - const normalize = str => - str.replace(/[\r\n]+/g, ' ').replace(/\s+/g, ' '); const quote = { type: 'TextQuoteSelector', @@ -530,20 +501,19 @@ describe('annotator/anchoring/pdf', () => { await pdfAnchoring.anchor([position, quote]); - const expectedText = normalize(getPageText(2)); - const expectedQuote = normalize(quote.exact); - const expectedPrefix = normalize(quote.prefix); - const expectedSuffix = normalize(quote.suffix); - const expectedHint = expectedText.indexOf(expectedQuote); - - assert.isTrue(matchQuoteSpy.calledOnce); - const [textArg, quoteArg, contextArg] = matchQuoteSpy.firstCall.args; - assert.equal(textArg, expectedText); - assert.equal(quoteArg, expectedQuote); - assert.deepEqual(contextArg, { - prefix: expectedPrefix, - suffix: expectedSuffix, - hint: expectedHint, + const stripSpaces = str => str.replace(/\s+/g, ''); + const strippedText = stripSpaces(fixtures.pdfPages[2]); + const strippedQuote = stripSpaces(quote.exact); + + const call = matchQuoteSpy + .getCalls() + .find(call => call.args[0] === strippedText); + assert.ok(call); + assert.equal(call.args[1], strippedQuote); + assert.match(call.args[2], { + prefix: stripSpaces(quote.prefix), + suffix: stripSpaces(quote.suffix), + hint: strippedText.indexOf(strippedQuote), }); }); diff --git a/src/annotator/util/normalize.ts b/src/annotator/util/normalize.ts index feee3b157f6..7feab63f9c2 100644 --- a/src/annotator/util/normalize.ts +++ b/src/annotator/util/normalize.ts @@ -1,27 +1,3 @@ -/** - * Return true if `char` is an ASCII whitespace character or a non-breaking - * space (U+00A0). - * - * This is more efficient than `/\s/.test(char)` but does not handle the - * full set of Unicode whitespace characters. - */ -export function isSpace(char: string): boolean { - switch (char) { - case ' ': - case '\f': - case '\n': - case '\r': - case '\t': - case '\v': - case '\u00a0': // nbsp - return true; - default: - return false; - } -} - -export const isNotSpace = (char: string): boolean => !isSpace(char); - /** * Find the smallest offset in `str` which contains at least `count` chars * that match `filter` before it. From 1e831747b68eca5a119793793d46fd42d24ccbf2 Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Thu, 7 May 2026 16:23:51 -0300 Subject: [PATCH 18/19] fix N/A testcase and remove helper --- src/annotator/anchoring/test/html-test.js | 43 +++++++---------------- 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/src/annotator/anchoring/test/html-test.js b/src/annotator/anchoring/test/html-test.js index 18f82fd6684..61b16aa6e24 100644 --- a/src/annotator/anchoring/test/html-test.js +++ b/src/annotator/anchoring/test/html-test.js @@ -1,21 +1,8 @@ import * as html from '../html'; +import { renderedTextOf } from '../rendered-text'; import fixture from './html-anchoring-fixture.html'; import { htmlBaselines } from './html-baselines'; -const normalizeText = str => str.replace(/\s+/g, ' ').trim(); -const normalizeQuoteSelector = sel => { - if (sel.type !== 'TextQuoteSelector') { - return sel; - } - const normalize = s => (s === undefined ? s : s.replace(/\s+/g, ' ').trim()); - return { - ...sel, - exact: normalize(sel.exact), - prefix: normalize(sel.prefix), - suffix: normalize(sel.suffix), - }; -}; - /** Return all text node children of `container`. */ function textNodes(container) { const nodes = []; @@ -80,15 +67,6 @@ function findByType(selectors, type) { }); } -/** - * Return a copy of a list of selectors sorted by type. - */ -function sortByType(selectors) { - return selectors.slice().sort((a, b) => { - return a.type.localeCompare(b.type); - }); -} - /** * Test cases for mapping ranges to selectors. * @@ -379,8 +357,8 @@ describe('HTML anchoring', () => { const anchored = selectors.map(sel => { return html.anchor(container, [sel]).then(anchoredRange => { assert.equal( - normalizeText(range.toString()), - normalizeText(anchoredRange.toString()), + range.toString().trim(), + anchoredRange.toString().trim(), ); }); }); @@ -488,12 +466,17 @@ describe('HTML anchoring', () => { const annotationsChecked = annotations.map(async ann => { const root = frame.contentWindow.document.body; - const selectors = ann.target[0].selector.map(normalizeQuoteSelector); + const selectors = ann.target[0].selector; const range = await html.anchor(root, selectors); - const newSelectors = (await html.describe(root, range)).map( - normalizeQuoteSelector, - ); - assert.deepEqual(sortByType(selectors), sortByType(newSelectors)); + + // Verify that anchoring lands on the saved quote: the rendered + // text of the anchored range should contain the selector's `exact`. + // This is what production cares about — that re-anchoring an old + // annotation finds the originally-highlighted text. + const quote = selectors.find(s => s.type === 'TextQuoteSelector'); + const div = document.createElement('div'); + div.appendChild(range.cloneContents()); + assert.include(renderedTextOf(div).text, quote.exact); }); return Promise.all(annotationsChecked); From fc8eb1b644a79248d8f3dc07d42a8c5fdd8a66d8 Mon Sep 17 00:00:00 2001 From: Joaco Sande Date: Thu, 7 May 2026 17:23:00 -0300 Subject: [PATCH 19/19] fix(anchoring): keep prefix/suffix raw to preserve baseline compatibility --- .../test/html-baselines/minimal.json | 4 +-- .../wikipedia-regression-testing.json | 8 +++--- src/annotator/anchoring/test/html-test.js | 21 ++++++++-------- src/annotator/anchoring/test/pdf-test.js | 2 +- src/annotator/anchoring/test/types-test.js | 9 ++++--- src/annotator/anchoring/types.ts | 25 +++++++++++-------- 6 files changed, 38 insertions(+), 31 deletions(-) diff --git a/src/annotator/anchoring/test/html-baselines/minimal.json b/src/annotator/anchoring/test/html-baselines/minimal.json index 8af8dc4c25a..b71848465a8 100644 --- a/src/annotator/anchoring/test/html-baselines/minimal.json +++ b/src/annotator/anchoring/test/html-baselines/minimal.json @@ -15,8 +15,8 @@ "end": 7 },{ "type": "TextQuoteSelector", - "prefix": "One", - "suffix": "Three", + "prefix": "One ", + "suffix": " Three\n", "exact": "Two" }] }] diff --git a/src/annotator/anchoring/test/html-baselines/wikipedia-regression-testing.json b/src/annotator/anchoring/test/html-baselines/wikipedia-regression-testing.json index c52ada3882e..9e4b0aa4f01 100644 --- a/src/annotator/anchoring/test/html-baselines/wikipedia-regression-testing.json +++ b/src/annotator/anchoring/test/html-baselines/wikipedia-regression-testing.json @@ -25,10 +25,10 @@ "end" : 11897 }, { - "suffix" : ", at 05:56. Text is available un", + "suffix" : ", at 05:56.\n\t\t\t\t\t\t\t\t\t\t\tText is a", "exact" : "11 November 2016", "type" : "TextQuoteSelector", - "prefix" : "This page was last modified on" + "prefix" : " This page was last modified on " } ] } @@ -164,7 +164,7 @@ "end" : 7667 }, { - "prefix" : "d execute the test suite cases.\n\n", + "prefix" : " execute the test suite cases.\n\n", "suffix" : ". Theoretically, after each fix ", "exact" : "Also as a consequence of the introduction of new bugs, program maintenance requires far more system testing per statement written than any other programming", "type" : "TextQuoteSelector" @@ -254,7 +254,7 @@ { "prefix" : "\t\n\t\t\n\t\t\n\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\t\t", "exact" : "Regression testing", - "suffix" : "\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tFrom Wikipedia, the free encycl", + "suffix" : "\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tFrom Wikiped", "type" : "TextQuoteSelector" } ], diff --git a/src/annotator/anchoring/test/html-test.js b/src/annotator/anchoring/test/html-test.js index 61b16aa6e24..a687ef281a2 100644 --- a/src/annotator/anchoring/test/html-test.js +++ b/src/annotator/anchoring/test/html-test.js @@ -1,5 +1,4 @@ import * as html from '../html'; -import { renderedTextOf } from '../rendered-text'; import fixture from './html-anchoring-fixture.html'; import { htmlBaselines } from './html-baselines'; @@ -67,6 +66,15 @@ function findByType(selectors, type) { }); } +/** + * Return a copy of a list of selectors sorted by type. + */ +function sortByType(selectors) { + return selectors.slice().sort((a, b) => { + return a.type.localeCompare(b.type); + }); +} + /** * Test cases for mapping ranges to selectors. * @@ -468,15 +476,8 @@ describe('HTML anchoring', () => { const root = frame.contentWindow.document.body; const selectors = ann.target[0].selector; const range = await html.anchor(root, selectors); - - // Verify that anchoring lands on the saved quote: the rendered - // text of the anchored range should contain the selector's `exact`. - // This is what production cares about — that re-anchoring an old - // annotation finds the originally-highlighted text. - const quote = selectors.find(s => s.type === 'TextQuoteSelector'); - const div = document.createElement('div'); - div.appendChild(range.cloneContents()); - assert.include(renderedTextOf(div).text, quote.exact); + const newSelectors = await html.describe(root, range); + assert.deepEqual(sortByType(selectors), sortByType(newSelectors)); }); return Promise.all(annotationsChecked); diff --git a/src/annotator/anchoring/test/pdf-test.js b/src/annotator/anchoring/test/pdf-test.js index 4a78bb81fb5..6f0830a1fb0 100644 --- a/src/annotator/anchoring/test/pdf-test.js +++ b/src/annotator/anchoring/test/pdf-test.js @@ -159,7 +159,7 @@ describe('annotator/anchoring/pdf', () => { type: 'TextQuoteSelector', exact: 'Netherfield Park', prefix: 'im one day, "have you heard that', - suffix: 'is occupied again?"', + suffix: ' is occupied again?" ', }); }); }); diff --git a/src/annotator/anchoring/test/types-test.js b/src/annotator/anchoring/test/types-test.js index 82d712c0709..d8c87998fa5 100644 --- a/src/annotator/anchoring/test/types-test.js +++ b/src/annotator/anchoring/test/types-test.js @@ -317,8 +317,8 @@ describe('annotator/anchoring/types', () => { assert.instanceOf(anchor, TextQuoteAnchor); assert.equal(anchor.exact, quote); assert.deepEqual(anchor.context, { - prefix: 'Four score and seven years ago', - suffix: 'brought forth on this continent', + prefix: text.slice(Math.max(0, pos - 32), pos), + suffix: text.slice(pos + quote.length, pos + quote.length + 32), }); }); }); @@ -369,6 +369,9 @@ describe('annotator/anchoring/types', () => { quoteAnchor.toRange({ hint: 42 }); + // `toPositionAnchor` matches against the rendered (whitespace-collapsed, + // BR-aware) text rather than raw `textContent`. For this container the + // two only differ by a trailing synthesized space from the block close. assert.calledWith( fakeMatchQuote, renderedTextOf(container).text, @@ -449,7 +452,7 @@ describe('annotator/anchoring/types', () => { assert.deepEqual(anchor.toSelector(), { type: 'TextQuoteSelector', prefix: '', - suffix: 'score and seven years ago our f', + suffix: ' score and seven years ago our f', exact: 'Four', }); diff --git a/src/annotator/anchoring/types.ts b/src/annotator/anchoring/types.ts index 5f99684a032..cddfacf6ff5 100644 --- a/src/annotator/anchoring/types.ts +++ b/src/annotator/anchoring/types.ts @@ -174,22 +174,25 @@ export class TextQuoteAnchor { * Will throw if `range` does not contain any text nodes. */ static fromRange(root: Element, range: Range): TextQuoteAnchor { + const rawText = root.textContent ?? ''; const { text, rawToNormalized } = renderedTextOf(root); const textRange = TextRange.fromRange(range).relativeTo(root); - const normalizedStart = toNormalized( - rawToNormalized, - textRange.start.offset, - ); - const normalizedEnd = toNormalized(rawToNormalized, textRange.end.offset); + const rawStart = textRange.start.offset; + const rawEnd = textRange.end.offset; + const normalizedStart = toNormalized(rawToNormalized, rawStart); + const normalizedEnd = toNormalized(rawToNormalized, rawEnd); const contextLen = 32; const exact = text.slice(normalizedStart, normalizedEnd).trim(); - const prefix = text - .slice(Math.max(0, normalizedStart - contextLen), normalizedStart) - .trim(); - const suffix = text - .slice(normalizedEnd, Math.min(text.length, normalizedEnd + contextLen)) - .trim(); + // Prefix/suffix come from raw textContent: matchQuote tolerates + // whitespace differences when matching, and keeping them raw preserves + // backward compatibility with selectors stored before the rendered-text + // normalization landed. + const prefix = rawText.slice(Math.max(0, rawStart - contextLen), rawStart); + const suffix = rawText.slice( + rawEnd, + Math.min(rawText.length, rawEnd + contextLen), + ); return new TextQuoteAnchor(root, exact, { prefix, suffix }); }