Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion packages/layout-engine/contracts/src/direction-context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,23 @@ export type ParagraphDirectionContext = {
* w:bdo (§17.3.2.3 override).
*/
export type RunBidiContext = {
/** w:rPr/w:rtl. Forces complex-script formatting; see RunScriptContext. */
/**
* w:rPr/w:rtl. Preserves the source OOXML signal that the run carries
* the `w:rtl` flag. Per §17.3.2.30, `w:rtl` does two things at the model
* level:
* 1. Forces the complex-script formatting stack (bCs, iCs, szCs,
* rFonts/@cs). See RunScriptContext for the formatting half.
* 2. Acts as a Character Directionality Override for weak/neutral
* characters in the run (NOT a forced visual flip of strong-LTR text;
* §17.3.2.30 explicitly says behavior on strong-LTR is unspecified).
*
* `rtl: true` is the source signal, NOT a directive that every consumer
* must project to `dir="rtl"` in the rendered DOM. The painter decides
* the DOM projection per its Word-parity rules (see
* `features/inline-direction/resolveRunDirectionAttribute`). Exporters
* must preserve `rtl: true` on round-trip regardless of paint decisions,
* since dropping it would lose the source `w:rPr/w:rtl` semantics.
*/
rtl: boolean;
/** w:dir; bidi embedding direction (RLE/LRE). Wave 1c. */
embedding?: BaseDirection;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,11 @@
*/

export { applyRtlStyles, shouldUseSegmentPositioning } from './rtl-styles.js';
export {
resolveRunDirectionAttribute,
normalizeRtlDateTokenForWordParity,
RTL_DATE_LIKE_TOKEN_RE,
STRONG_RTL_CHAR_RE,
LATIN_DIGIT_NEUTRAL_ONLY_RE,
type RunDirAttribute,
} from './run-direction.js';
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
import { describe, expect, it } from 'vitest';
import {
resolveRunDirectionAttribute,
normalizeRtlDateTokenForWordParity,
RTL_DATE_LIKE_TOKEN_RE,
STRONG_RTL_CHAR_RE,
LATIN_DIGIT_NEUTRAL_ONLY_RE,
} from './run-direction.js';

describe('resolveRunDirectionAttribute', () => {
  // Most cases pass the same string as both the model text and the rendered
  // text; the two cases with `runText: undefined` call the resolver directly.
  const resolveSameText = (text: string, isRtlTagged: boolean) =>
    resolveRunDirectionAttribute({ runText: text, effectiveText: text, isRtlTagged });

  describe('rtl-tagged runs', () => {
    const resolveTagged = (text: string) => resolveSameText(text, true);

    it('returns "rtl" for Hebrew text', () => {
      expect(resolveTagged('שלום')).toBe('rtl');
    });

    it('returns "rtl" for Arabic text', () => {
      expect(resolveTagged('مرحبا')).toBe('rtl');
    });

    it('returns null for Latin-only text (Word-parity: §17.3.2.30 unspecified)', () => {
      expect(resolveTagged('Hello')).toBeNull();
    });

    it('returns null for digit-only text', () => {
      expect(resolveTagged('2026')).toBeNull();
    });

    it('returns "rtl" for date-like numeric (isolates the date as RTL unit)', () => {
      expect(resolveTagged('2026-03-15')).toBe('rtl');
    });

    it('returns "rtl" for mixed strong-RTL + Latin (Hebrew present)', () => {
      expect(resolveTagged('first שלום')).toBe('rtl');
    });

    it('returns "rtl" for empty text (honor source signal when no content)', () => {
      expect(resolveTagged('')).toBe('rtl');
    });

    it('returns "rtl" for whitespace-only text', () => {
      expect(resolveTagged(' ')).toBe('rtl');
    });

    // Fail-safe branch: text that matches neither the Latin/digit/neutral
    // pattern nor the strong-RTL pattern keeps the source signal. East Asian
    // text, presentation forms and symbols outside the neutral set land here.
    it('returns "rtl" for text that is neither Latin nor strong-RTL', () => {
      expect(resolveTagged('世界')).toBe('rtl');
    });

    it('uses effectiveText when runText is undefined', () => {
      const dir = resolveRunDirectionAttribute({
        runText: undefined,
        effectiveText: 'שלום',
        isRtlTagged: true,
      });
      expect(dir).toBe('rtl');
    });
  });

  describe('non-rtl-tagged runs', () => {
    const resolveUntagged = (text: string) => resolveSameText(text, false);

    it('returns "ltr" for date-like numeric (Word-parity in RTL paragraph)', () => {
      expect(resolveUntagged('2026-03-15')).toBe('ltr');
    });

    it('returns null for plain Latin (let paragraph + UBA decide)', () => {
      expect(resolveUntagged('Hello')).toBeNull();
    });

    it('returns null for Hebrew text without w:rtl (paragraph context resolves)', () => {
      expect(resolveUntagged('שלום')).toBeNull();
    });

    it('returns null when runText is undefined (no date pattern to match)', () => {
      // The untagged path only inspects runText; effectiveText is not a fallback here.
      const dir = resolveRunDirectionAttribute({
        runText: undefined,
        effectiveText: '2026-03-15',
        isRtlTagged: false,
      });
      expect(dir).toBeNull();
    });
  });
});

describe('normalizeRtlDateTokenForWordParity', () => {
  const RLM = '\u200F';

  // Builds an expected string with exactly one RLM between each adjacent pair
  // of parts, i.e. every separator ends up with an RLM on both sides.
  const joinWithRlm = (...parts: string[]): string => parts.join(RLM);

  it('wraps separators with RLM in date-like text', () => {
    const expected = joinWithRlm('2026', '-', '03', '-', '15');
    expect(normalizeRtlDateTokenForWordParity('2026-03-15')).toBe(expected);
  });

  it('handles slash separators', () => {
    const expected = joinWithRlm('15', '/', '03', '/', '2026');
    expect(normalizeRtlDateTokenForWordParity('15/03/2026')).toBe(expected);
  });

  it('handles dot separators', () => {
    const expected = joinWithRlm('1', '.', '2', '.', '3');
    expect(normalizeRtlDateTokenForWordParity('1.2.3')).toBe(expected);
  });

  it('wraps the leading sign too (no special-case for leading "-")', () => {
    // The implementation replaces every `.`, `/` or `-`, and the leading sign
    // is a `-` as well, so it is wrapped like any separator (the empty first
    // part yields the RLM in front of it). Matches pre-extraction behavior.
    const expected = joinWithRlm('', '-', '2026', '-', '03');
    expect(normalizeRtlDateTokenForWordParity('-2026-03')).toBe(expected);
  });

  it('returns unchanged for non-date text', () => {
    // '2026' has no separator, so it is not date-like either.
    for (const text of ['Hello world', '2026', 'שלום']) {
      expect(normalizeRtlDateTokenForWordParity(text)).toBe(text);
    }
  });
});

describe('regex coverage smoke tests', () => {
  type MatchCase = readonly [input: string, shouldMatch: boolean];

  // Runs every (input, expected) pair against the given pattern.
  const expectMatches = (pattern: RegExp, cases: readonly MatchCase[]): void => {
    for (const [input, shouldMatch] of cases) {
      expect(pattern.test(input)).toBe(shouldMatch);
    }
  };

  it('RTL_DATE_LIKE_TOKEN_RE matches numeric dates', () => {
    expectMatches(RTL_DATE_LIKE_TOKEN_RE, [
      ['2026-03-15', true],
      ['15/03/2026', true],
      ['1.2.3', true],
      ['-2026-03', true],
      ['2026', false], // no separator
      ['a-b-c', false],
    ]);
  });

  it('STRONG_RTL_CHAR_RE matches Hebrew and Arabic core blocks', () => {
    expectMatches(STRONG_RTL_CHAR_RE, [
      ['שלום', true],
      ['مرحبا', true],
      ['Hello', false],
      ['2026', false],
    ]);
  });

  it('LATIN_DIGIT_NEUTRAL_ONLY_RE matches Latin + digit + neutral chars', () => {
    expectMatches(LATIN_DIGIT_NEUTRAL_ONLY_RE, [
      ['Hello world', true],
      ['copy 2', true],
      ['a/b-c.d', true],
      ['שלום', false],
      ['Hello שלום', false],
    ]);
  });
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/**
* Run-level direction helpers for DomPainter.
*
* These helpers encode paint-time decisions about how to project the OOXML
* `w:rPr/w:rtl` signal onto a rendered span's `dir` attribute, plus a narrow
* Word-parity workaround for RTL-tagged date-like numeric runs.
*
* The heuristic is intentionally scoped to current Word-parity fixtures
* (SD-3098 mixed-bidi date tokens). It is NOT a full implementation of
* §17.3.2.30 semantics - notably absent: `w:dir` embedding (§17.3.2.8),
* `w:bdo` override (§17.3.2.3), and `w:lang/@bidi` Hebrew vs Arabic numeric
* differences. Those gaps are tracked separately; see SD-2767 follow-ups.
*
* @spec ECMA-376 §17.3.2.30 (rtl), §17.17.4 (boolean property)
*/

/**
 * Numeric date-like token: an optional leading `-`, then two or more digit
 * groups joined by `.`, `/` or `-` (for example `2026-03-15`, `15/03/2026`,
 * `1.2.3`). The pattern is anchored, so the whole string must match; a bare
 * number such as `2026` does not.
 *
 * Shared by the run direction resolver and by the paint-time RLM injection
 * used for Word parity on RTL date strings.
 */
export const RTL_DATE_LIKE_TOKEN_RE = /^-?\d+(?:[./-]\d+)+$/;

/**
 * Tests whether a string contains at least one character in U+0590-U+08FF,
 * the range holding the Hebrew / Arabic / Syriac core blocks.
 *
 * Known gap: Hebrew presentation forms (FB1D-FB4F) and Arabic presentation
 * forms (FB50-FDFF, FE70-FEFF) fall outside this range. Tracked under the
 * SD-2767 follow-ups.
 */
export const STRONG_RTL_CHAR_RE = /[\u0590-\u08FF]/;

/**
 * Tests whether a non-empty string consists solely of whitespace, ASCII
 * digits, ASCII letters and the punctuation `. / - _ : , + ( )`.
 *
 * Serves as the "do not emit a per-run dir=rtl" guard: per §17.3.2.30 the
 * effect of w:rtl on strongly LTR text is unspecified, and Word's observed
 * output for such runs is not visually reordered.
 */
export const LATIN_DIGIT_NEUTRAL_ONLY_RE = /^[\s0-9A-Za-z./\-_:,+()]+$/;

/** U+200F RIGHT-TO-LEFT MARK. */
const RLM = '\u200F';

/**
 * Word-parity workaround for RTL date-like tokens.
 *
 * When `text` is a date-like token (see RTL_DATE_LIKE_TOKEN_RE), every `.`,
 * `/` and `-` in it, a leading sign included, is surrounded by an RLM on both
 * sides. Any other input is returned unchanged.
 *
 * Rationale: Word injects RLM around the numeric separators of RTL date
 * strings, which the browser's bidi algorithm alone does not reproduce. The
 * injection is paint-time only, so the DOM text differs from the PM model and
 * from the exported OOXML, both of which keep the original separators.
 *
 * Deliberately narrow: only date-like numeric patterns are touched so other
 * numeric content is unaffected. Scope is the current SD-3098 fixtures.
 *
 * @param text - Run text about to be painted.
 * @returns The text with RLM-wrapped separators, or `text` itself when it is
 *   not date-like.
 */
export const normalizeRtlDateTokenForWordParity = (text: string): string =>
  RTL_DATE_LIKE_TOKEN_RE.test(text)
    ? text.replace(/[./-]/g, (separator) => RLM + separator + RLM)
    : text;

/**
 * Value for a rendered run span's `dir` attribute; `null` means the attribute
 * is omitted.
 */
export type RunDirAttribute = 'rtl' | 'ltr' | null;

/**
 * Compute the `dir` attribute (if any) to apply to a rendered run span.
 *
 * Decision table:
 *  - NOT rtl-tagged + `runText` is a date-like numeric -> 'ltr' (Word-parity:
 *    keeps the date LTR-classified inside an RTL paragraph so digits don't
 *    drift). `runText` is tested as-is: no trimming and no fallback to
 *    `effectiveText`.
 *  - NOT rtl-tagged + anything else -> null (let paragraph + UBA decide)
 *
 *  For rtl-tagged runs the sample is `runText` (or `effectiveText` when
 *  `runText` is nullish), trimmed:
 *  - empty sample -> 'rtl' (no content to classify, honor the source signal)
 *  - date-like numeric -> 'rtl' (isolates the date as a unit)
 *  - contains strong-RTL chars -> 'rtl' (standard case)
 *  - only Latin/digit/neutral -> null (per §17.3.2.30 unspecified; Word does
 *    not visually reorder these, so omit dir and inherit from the paragraph)
 *  - anything else (e.g. East Asian, presentation forms) -> 'rtl' (fail-safe)
 *
 * @returns 'rtl', 'ltr', or null when no `dir` attribute should be set.
 */
export const resolveRunDirectionAttribute = (opts: {
  /** Run text as stored in the model. */
  runText: string | undefined;
  /** Text used for rendering after token resolution (e.g. field token expansion). */
  effectiveText: string;
  /** Whether the source OOXML carries `w:rPr/w:rtl` for this run. */
  isRtlTagged: boolean;
}): RunDirAttribute => {
  const { runText, effectiveText, isRtlTagged } = opts;

  if (!isRtlTagged) {
    // Only an untrimmed, defined runText can trigger the date override.
    const isDateToken = typeof runText === 'string' && RTL_DATE_LIKE_TOKEN_RE.test(runText);
    return isDateToken ? 'ltr' : null;
  }

  const sample = (runText ?? effectiveText).trim();

  // The only rtl-tagged case that drops the attribute: real content that is
  // neither date-like nor strong-RTL and is made up of Latin/digit/neutral
  // characters only. Everything else keeps the source signal.
  const isPlainLatinLike =
    sample !== '' &&
    !RTL_DATE_LIKE_TOKEN_RE.test(sample) &&
    !STRONG_RTL_CHAR_RE.test(sample) &&
    LATIN_DIGIT_NEUTRAL_ONLY_RE.test(sample);

  return isPlainLatinLike ? null : 'rtl';
};
Loading
Loading