Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/nice-weeks-enter.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"unicode-segmenter": patch
---

Fix `Extend + Extended_Pictographic` cluster break
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
| `unicode-segmenter/grapheme` | 17.0.0 | ✔️ | 11,935 | 7,781 | 3,868 | 3,118 | 3,997 |
| `unicode-segmenter/grapheme` | 17.0.0 | ✔️ | 11,964 | 7,780 | 3,872 | 3,102 | 3,996 |
| `graphemer` | 15.0.0 | ✖️ | 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
| `@formatjs/intl-segmenter`* | 17.0.0 | ✖️ | 268,301 | 176,759 | 45,988 | 31,701 | 45,370 |
Expand All @@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Bytecode size | Bytecode size (gzip)* |
|------------------------------|--------------:|----------------------:|
| `unicode-segmenter/grapheme` | 21,427 | 12,048 |
| `unicode-segmenter/grapheme` | 21,419 | 12,075 |
| `graphemer` | 134,085 | 31,770 |
| `grapheme-splitter` | 63,942 | 19,165 |
| `@formatjs/intl-segmenter` | 329,547 | 136,751 |
Expand Down
12 changes: 8 additions & 4 deletions src/grapheme.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,12 @@ export function* graphemeSegments(input) {
/** The number of RI codepoints preceding `cursor`. */
let riCount = 0;

/** Tracks if Extended_Pictographic was seen in the current Extend* sequence for GB11 */
let extPic = catBefore === 4;

/**
* Emoji state for GB11: tracks if we've seen Extended_Pictographic followed by Extend* ZWJ
* Only relevant when catBefore === ZWJ && catAfter === Extended_Pictographic
* Only relevant when catBefore === ZWJ && extPic (catAfter === Extended_Pictographic)
*/
let emoji = false;

Expand Down Expand Up @@ -112,7 +115,7 @@ export function* graphemeSegments(input) {
boundary = false;
}
// GB11: ExtPic Extend* ZWJ × ExtPic
else if (catBefore === 14 && catAfter === 4) {
else if (catBefore === 14 && extPic) {
boundary = !emoji;
}
// GB12, GB13: RI × RI (odd count means no break)
Expand Down Expand Up @@ -149,6 +152,7 @@ export function* graphemeSegments(input) {
};

// Reset segment state
extPic = catAfter === 4;
emoji = false;
consonant = false;
riCount = 0;
Expand All @@ -158,8 +162,8 @@ export function* graphemeSegments(input) {
}
// Update state for continuing segment
else {
// emoji state for GB11
if (catAfter === 14 && (catBefore === 3 || catBefore === 4)) {
Comment thread
cometkim marked this conversation as resolved.
// emoji state for GB11: ExtPic Extend* ZWJ × ExtPic
if (catAfter === 14 && extPic) {
emoji = true;
}
// InCB state for GB9c
Expand Down
1 change: 1 addition & 0 deletions test/grapheme.js
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ test('counterexamples', async t => {
' କା',
' ଶ୍ୟା',
'ക് വ',
'à‍❤',
];

for (let counter of counterExamples) {
Expand Down
Loading