diff --git a/.changeset/nice-weeks-enter.md b/.changeset/nice-weeks-enter.md new file mode 100644 index 0000000..dca2157 --- /dev/null +++ b/.changeset/nice-weeks-enter.md @@ -0,0 +1,5 @@ +--- +"unicode-segmenter": patch +--- + +Fix `Extend + Extended_Pictographic` cluster break diff --git a/README.md b/README.md index 3e58470..b5b7680 100644 --- a/README.md +++ b/README.md @@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb | Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) | |------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:| -| `unicode-segmenter/grapheme` | 17.0.0 | ✔️ | 11,935 | 7,781 | 3,868 | 3,118 | 3,997 | +| `unicode-segmenter/grapheme` | 17.0.0 | ✔️ | 11,964 | 7,780 | 3,872 | 3,102 | 3,996 | | `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 | 15,911 | | `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 | | `@formatjs/intl-segmenter`* | 17.0.0 | ✖️ | 268,301 | 176,759 | 45,988 | 31,701 | 45,370 | @@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb | Name | Bytecode size | Bytecode size (gzip)* | |------------------------------|--------------:|----------------------:| -| `unicode-segmenter/grapheme` | 21,427 | 12,048 | +| `unicode-segmenter/grapheme` | 21,419 | 12,075 | | `graphemer` | 134,085 | 31,770 | | `grapheme-splitter` | 63,942 | 19,165 | | `@formatjs/intl-segmenter` | 329,547 | 136,751 | diff --git a/src/grapheme.js b/src/grapheme.js index d0a6ef4..f9dc394 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -65,9 +65,12 @@ export function* graphemeSegments(input) { /** The number of RI codepoints preceding `cursor`. 
*/ let riCount = 0; + /** Tracks if Extended_Pictographic was seen in the current Extend* sequence for GB11 */ + let extPic = catBefore === 4; + /** * Emoji state for GB11: tracks if we've seen Extended_Pictographic followed by Extend* ZWJ - * Only relevant when catBefore === ZWJ && catAfter === Extended_Pictographic + * Only relevant when catBefore === ZWJ && extPic (catAfter === Extended_Pictographic) */ let emoji = false; @@ -112,7 +115,7 @@ export function* graphemeSegments(input) { boundary = false; } // GB11: ExtPic Extend* ZWJ × ExtPic - else if (catBefore === 14 && catAfter === 4) { + else if (catBefore === 14 && extPic) { boundary = !emoji; } // GB12, GB13: RI × RI (odd count means no break) @@ -149,6 +152,7 @@ export function* graphemeSegments(input) { }; // Reset segment state + extPic = catAfter === 4; emoji = false; consonant = false; riCount = 0; @@ -158,8 +162,8 @@ export function* graphemeSegments(input) { } // Update state for continuing segment else { - // emoji state for GB11 - if (catAfter === 14 && (catBefore === 3 || catBefore === 4)) { + // emoji state for GB11: ExtPic Extend* ZWJ × ExtPic + if (catAfter === 14 && extPic) { emoji = true; } // InCB state for GB9c diff --git a/test/grapheme.js b/test/grapheme.js index f91d4f2..5295105 100644 --- a/test/grapheme.js +++ b/test/grapheme.js @@ -308,6 +308,7 @@ test('counterexamples', async t => { ' କା', ' ଶ୍ୟା', 'ക് വ', + 'à‍❤', ]; for (let counter of counterExamples) {