Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
cd3c94d
fix(quote selectors): preserve quote spacing after linebreaks
Elimpizza Feb 25, 2026
379a9b7
fix(anchoring): normalize quote anchoring for html and pdf
Elimpizza Feb 26, 2026
47f73d9
fix(anchoring): normalize quote anchoring and baselines
Elimpizza Feb 26, 2026
b9aeb5a
fix(lint): curl brackets in ifs, removed unused normalized variables
Elimpizza Feb 26, 2026
1e2f4a2
fix(coverage): added anchoring normalization coverage
Elimpizza Feb 26, 2026
6c63734
fix(coverage): missing 2 lines
Elimpizza Feb 26, 2026
e8b750c
fix(coverage): added 1 testcase for 1 missing uncovered line
Elimpizza Feb 26, 2026
635a18c
Update src/annotator/anchoring/rendered-text.ts
Elimpizza Feb 26, 2026
d42bef5
fix(test): tightened anchoring matchQuote assertions and checks
Elimpizza Feb 26, 2026
b954e6d
fix(pdf): added trim after normalization for consistency
Elimpizza Feb 27, 2026
aeebeba
fix(space considering): went back to isNotSpace from char => char !==…
Elimpizza Feb 27, 2026
1e3dac0
fix(rawToNorm): prevented type error
Elimpizza Feb 27, 2026
03f24ab
fix(codebase and testcase) trim normalized exact in textquote selectors
Elimpizza Mar 23, 2026
6440c17
refactor(anchoring): flatten rendered-text + share isNotSpace, tighte…
Elimpizza May 6, 2026
b1156fc
prettier format
Elimpizza May 6, 2026
accd8c1
renames + mini bug fix
Elimpizza May 7, 2026
2f03bbd
fixes and revert
Elimpizza May 7, 2026
1e83174
fix N/A testcase and remove helper
Elimpizza May 7, 2026
fc8eb1b
fix(anchoring): keep prefix/suffix raw to preserve baseline compatibi…
Elimpizza May 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 179 additions & 0 deletions src/annotator/anchoring/rendered-text.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
/**
 * Upper-case tag names of elements whose start and end boundaries should
 * produce a whitespace break in the rendered text, even though the boundary
 * itself is not represented as a character in the DOM's `textContent`.
 * Mirrors the visible whitespace a user perceives when they read the rendered
 * page or copy a selection out to a single-line input.
 *
 * The list covers elements that browsers lay out as block, list-item or
 * table boxes by default. Children of containers that were already listed
 * (`DT`/`DD` for `DL`, `LEGEND` for `FIELDSET`, `CAPTION` for `TABLE`,
 * `SUMMARY` for `DETAILS`) are included too, otherwise the text on either
 * side of those boundaries would be glued together.
 */
const BLOCK_TAGS = new Set([
  'ADDRESS',
  'ARTICLE',
  'ASIDE',
  'BLOCKQUOTE',
  'CAPTION',
  'DD',
  'DETAILS',
  'DIALOG',
  'DIV',
  'DL',
  'DT',
  'FIELDSET',
  'FIGCAPTION',
  'FIGURE',
  'FOOTER',
  'FORM',
  'H1',
  'H2',
  'H3',
  'H4',
  'H5',
  'H6',
  'HEADER',
  'HGROUP',
  'HR',
  'LEGEND',
  'LI',
  'MAIN',
  'NAV',
  'OL',
  'P',
  'PRE',
  'SECTION',
  'SUMMARY',
  'TABLE',
  'TD',
  'TH',
  'TR',
  'UL',
]);

/**
* Result of `renderedTextOf`: the rendered text of a root element together
* with the two offset maps needed to translate between raw `textContent`
* offsets and positions in that rendered text.
*/
export type RenderedText = {
/**
* Rendered/normalized text of the root: every whitespace run is collapsed to
* a single ASCII space, leading whitespace is dropped, and a space is
* synthesized at each `<br>` and block-tag boundary (subject to the same
* collapsing). A trailing space is NOT trimmed.
*/
text: string;
/**
* For each raw offset i (0..rawText.length, treated as a position between
* characters), the position in `text` where rawText[i] first contributes
* (or, for synthesized spaces, where the space is inserted at that
* boundary). Only the first character emitted for a given raw offset is
* recorded, so when a synthesized space and a real character share an
* offset the entry points at the space. Undefined entries indicate raw
* chars that were suppressed (e.g., a whitespace char that collapsed into a
* previous space) — `toNormalized` scans backwards to the nearest defined
* entry. The array has one more entry than the raw text has characters; the
* entry for the final raw position acts as an end-of-string sentinel.
*/
rawToNormalized: (number | undefined)[];
/**
* For each output position, the raw offset that produced it. Holds one
* entry per character of `text` plus a trailing end-of-string sentinel
* carrying the final raw position.
*/
normalizedToRaw: number[];
};

/**
* Mutable accumulator threaded through `walk` and `append` while the rendered
* text is being built.
*/
type BuildState = {
// Rendered text produced so far.
output: string;
// Raw offset -> index in `output` of the first character emitted for it;
// `undefined` while nothing has been emitted for that offset.
rawToNormalized: (number | undefined)[];
// One entry per character of `output`: the raw offset it originated from.
normalizedToRaw: number[];
// Number of raw text characters consumed so far.
rawPosition: number;
};

/**
 * Emit one character into the rendered text held by `state`.
 *
 * Any whitespace character is emitted as a single ASCII space, and is dropped
 * entirely when the output is still empty or already ends with a space, so
 * whitespace runs collapse and leading whitespace never appears.
 *
 * When a character is emitted, `fromRaw` is recorded as its origin in
 * `normalizedToRaw`. `rawToNormalized[fromRaw]` is only set the first time
 * something is emitted for that raw offset, so later emissions for the same
 * offset do not overwrite it.
 */
function append(state: BuildState, character: string, fromRaw: number) {
  const isWhitespace = /\s/.test(character);
  if (isWhitespace) {
    // Empty string when nothing has been emitted yet.
    const previous = state.output.slice(-1);
    if (previous === '' || previous === ' ') {
      return;
    }
  }

  state.output += isWhitespace ? ' ' : character;

  // `push` returns the new length, so the entry just added sits one below it.
  const emittedAt = state.normalizedToRaw.push(fromRaw) - 1;
  if (state.rawToNormalized[fromRaw] === undefined) {
    state.rawToNormalized[fromRaw] = emittedAt;
  }
}

/**
 * Recursively feed the text beneath `node` into `state`, in document order.
 *
 * - Text nodes contribute their characters one by one; each consumed
 *   character advances `state.rawPosition`.
 * - `<br>` elements contribute a synthesized space and nothing else.
 * - Elements listed in `BLOCK_TAGS` contribute a synthesized space before and
 *   after their children.
 * - Other elements are transparent: only their children contribute.
 * - Every other node type (comments etc.) is ignored.
 *
 * Synthesized spaces do not advance `state.rawPosition`; they are attributed
 * to the raw offset at which the boundary occurs.
 *
 * NOTE(review): CDATA sections are skipped here because only `TEXT_NODE` is
 * handled — confirm whether XML documents containing them need support.
 */
function walk(node: Node, state: BuildState) {
  if (node.nodeType === Node.TEXT_NODE) {
    const text = node.textContent ?? '';
    for (let i = 0; i < text.length; i++) {
      append(state, text[i], state.rawPosition);
      state.rawPosition += 1;
    }
    return;
  }

  if (node.nodeType !== Node.ELEMENT_NODE) {
    return;
  }

  const el = node as Element;
  // `tagName`/`nodeName` are only upper-cased for HTML elements in HTML
  // documents; in XHTML/XML documents they keep their source case (and any
  // namespace prefix). Normalize via `localName` so that boundaries are
  // detected in both kinds of document.
  const tag = el.localName.toUpperCase();
  if (tag === 'BR') {
    append(state, ' ', state.rawPosition);
    return;
  }

  const block = BLOCK_TAGS.has(tag);
  if (block) {
    append(state, ' ', state.rawPosition);
  }
  for (const child of Array.from(node.childNodes)) {
    walk(child, state);
  }
  if (block) {
    append(state, ' ', state.rawPosition);
  }
}

/**
 * Compute the rendered text of `root`'s DOM subtree together with the offset
 * maps that relate it to the raw `textContent` of `root`.
 *
 * In the rendered text every whitespace run (including the spaces synthesized
 * at `<br>` and block boundaries) is collapsed to a single ASCII space and
 * leading whitespace is suppressed — matching what a user sees when they read
 * the page or paste a selection into a single-line input.
 *
 * Both maps receive an end-of-string sentinel so that callers may translate
 * `rawText.length` / `text.length` without indexing out of bounds.
 */
export function renderedTextOf(root: Element): RenderedText {
  const rawLength = (root.textContent ?? '').length;
  const state: BuildState = {
    output: '',
    // One slot per raw character plus one for the end-of-string position.
    rawToNormalized: new Array(rawLength + 1).fill(undefined),
    normalizedToRaw: [],
    rawPosition: 0,
  };

  walk(root, state);

  const { rawToNormalized, normalizedToRaw, rawPosition: endOfRaw } = state;

  // A synthesized trailing space may already have claimed the final raw
  // position; only fill in the sentinel when nothing did.
  if (rawToNormalized[endOfRaw] === undefined) {
    rawToNormalized[endOfRaw] = normalizedToRaw.length;
  }
  normalizedToRaw.push(endOfRaw);

  return { text: state.output, rawToNormalized, normalizedToRaw };
}

/**
 * Translate a raw `textContent` offset into a position in the rendered text.
 *
 * Offsets past the end of the map are clamped to its last entry. When the
 * entry for the offset is undefined (the raw character was suppressed), the
 * nearest earlier defined entry is used instead. Negative offsets, and
 * offsets with no defined entry at or before them, resolve to 0.
 */
export function toNormalized(
  rawToNormalized: (number | undefined)[],
  rawOffset: number,
): number {
  if (rawOffset >= 0) {
    let index = Math.min(rawOffset, rawToNormalized.length - 1);
    while (index >= 0) {
      const mapped = rawToNormalized[index];
      if (mapped !== undefined) {
        return mapped;
      }
      index -= 1;
    }
  }
  return 0;
}

/**
 * Translate a position in the rendered text back to a raw `textContent`
 * offset.
 *
 * Non-positive offsets resolve to raw offset 0 and offsets past the end of
 * the map are clamped to its last entry. If no entry can be read (an empty
 * map or a `NaN` offset), 0 is returned rather than letting `undefined`
 * escape through the `number` return type.
 */
export function toRaw(
  normalizedToRaw: number[],
  normalizedOffset: number,
): number {
  if (normalizedOffset <= 0) {
    return 0;
  }
  const clamped = Math.min(normalizedOffset, normalizedToRaw.length - 1);
  return normalizedToRaw[clamped] ?? 0;
}
43 changes: 43 additions & 0 deletions src/annotator/anchoring/test/html-boundary-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import { assert } from 'chai';

import * as html from '../html';

describe('annotator/anchoring/html boundary handling', () => {
  let container;

  /** Build a range spanning the whole "bar" text node that follows the <br>. */
  const selectTextAfterBreak = () => {
    const barNode = container.querySelector('p').lastChild;
    const selection = document.createRange();
    selection.setStart(barNode, 0);
    selection.setEnd(barNode, 3);
    return selection;
  };

  beforeEach(() => {
    container = document.createElement('div');
    container.innerHTML = '<p>foo<br>bar</p>';
    document.body.appendChild(container);
  });

  afterEach(() => {
    container.remove();
  });

  it('describes selection after <br> without leading space in exact', () => {
    const selectors = html.describe(container, selectTextAfterBreak());
    const { exact } = selectors.find(s => s.type === 'TextQuoteSelector');

    // The quote must be exactly the selected text, with no space carried
    // over from the <br> boundary on either side.
    assert.equal(exact, 'bar');
    assert.isFalse(exact.startsWith(' '));
    assert.isFalse(exact.endsWith(' '));
  });

  it('anchors selection after <br> back to the same text', async () => {
    const selectors = html.describe(container, selectTextAfterBreak());

    const anchoredRange = await html.anchor(container, selectors);

    assert.equal(anchoredRange.toString(), 'bar');
  });
});
5 changes: 4 additions & 1 deletion src/annotator/anchoring/test/html-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,10 @@ describe('HTML anchoring', () => {
// text. We test each selector in turn to make sure they are all valid.
const anchored = selectors.map(sel => {
return html.anchor(container, [sel]).then(anchoredRange => {
assert.equal(range.toString(), anchoredRange.toString());
assert.equal(
range.toString().trim(),
anchoredRange.toString().trim(),
);
});
});
return Promise.all(anchored);
Expand Down
75 changes: 75 additions & 0 deletions src/annotator/anchoring/test/rendered-text-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import { assert } from 'chai';

import { renderedTextOf, toNormalized, toRaw } from '../rendered-text';

// Tests for the rendered-text builder and its raw <-> normalized offset
// helpers. Rendered text is compared after `.trim()` because the builder does
// not strip the trailing space synthesized at a closing block boundary.
describe('annotator/anchoring/rendered-text', () => {
it('inserts a space at <br> boundaries', () => {
const container = document.createElement('div');
container.innerHTML = '<p>foo<br>bar</p>';

assert.equal(renderedTextOf(container).text.trim(), 'foo bar');
});

it('inserts a space at block-tag boundaries', () => {
const container = document.createElement('div');
container.innerHTML = '<p>foo</p><p>bar</p><div>baz</div>';

assert.equal(renderedTextOf(container).text.trim(), 'foo bar baz');
});

it('collapses runs of whitespace and suppresses leading whitespace', () => {
const container = document.createElement('div');
container.innerHTML = '<p> foo \n\n bar </p>';

assert.equal(renderedTextOf(container).text.trim(), 'foo bar');
});

describe('offset translation', () => {
it('round-trips raw <-> normalized offsets across a <br> boundary', () => {
const container = document.createElement('div');
container.innerHTML = '<p>foo<br>bar</p>';

const { text, rawToNormalized, normalizedToRaw } =
renderedTextOf(container);
assert.equal(text.trim(), 'foo bar');

// Raw textContent is "foobar". Raw offset 3 (the position just before
// 'b') is first claimed by the space synthesized at the <br>, so it maps
// to index 3 of the rendered text — the position of that space, with 'b'
// following at index 4.
assert.equal(toNormalized(rawToNormalized, 3), 3);
assert.equal(toRaw(normalizedToRaw, toNormalized(rawToNormalized, 3)), 3);
// End-of-string round-trips to end-of-string (via the sentinel entry).
assert.equal(
toRaw(normalizedToRaw, text.length),
container.textContent.length,
);
});

// Only the negative side of "out of range" is exercised here.
it('clamps out-of-range raw offsets to the start', () => {
const container = document.createElement('div');
container.textContent = 'abc';

const { rawToNormalized } = renderedTextOf(container);
assert.equal(toNormalized(rawToNormalized, -5), 0);
});

it('clamps non-positive normalized offsets to raw start', () => {
const container = document.createElement('div');
container.textContent = 'abc';

const { normalizedToRaw } = renderedTextOf(container);
assert.equal(toRaw(normalizedToRaw, 0), 0);
assert.equal(toRaw(normalizedToRaw, -10), 0);
});

it('handles whitespace-only content', () => {
const container = document.createElement('div');
container.textContent = ' ';

const { text, rawToNormalized } = renderedTextOf(container);
// Whitespace is suppressed while the output is still empty, so neither
// the raw space nor the container's block boundaries emit anything and
// the rendered text is empty. Raw offset 1 then resolves to the
// end-of-string sentinel, i.e. position 0.
assert.equal(text.trim(), '');
assert.equal(toNormalized(rawToNormalized, 1), 0);
});
});
});
19 changes: 14 additions & 5 deletions src/annotator/anchoring/test/types-test.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { render } from 'preact';

import { renderedTextOf } from '../rendered-text';
import { TextRange } from '../text-range';
import {
MediaTimeAnchor,
Expand Down Expand Up @@ -368,11 +369,19 @@ describe('annotator/anchoring/types', () => {

quoteAnchor.toRange({ hint: 42 });

assert.calledWith(fakeMatchQuote, container.textContent, 'Liberty', {
hint: 42,
prefix: 'expected-prefix',
suffix: 'expected-suffix',
});
// `toPositionAnchor` matches against the rendered (whitespace-collapsed,
// BR-aware) text rather than raw `textContent`. For this container the
// two only differ by a trailing synthesized space from the block close.
assert.calledWith(
fakeMatchQuote,
renderedTextOf(container).text,
'Liberty',
{
hint: 42,
prefix: 'expected-prefix',
suffix: 'expected-suffix',
},
);
});

it('returns `Range` representing match found by `matchQuote`', () => {
Expand Down
Loading
Loading