diff --git a/.changeset/huge-banks-attend.md b/.changeset/huge-banks-attend.md new file mode 100644 index 0000000..5223c7e --- /dev/null +++ b/.changeset/huge-banks-attend.md @@ -0,0 +1,5 @@ +--- +"unicode-segmenter": patch +--- + +Fix GB9c rule; `ZWNJ`(InCB=None) handling was missing. Thanks to @spaceemotion for reporting this. diff --git a/src/grapheme.js b/src/grapheme.js index 3261869..f5e8a6a 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -166,8 +166,10 @@ export function* graphemeSegments(input) { if (!consonant && catBefore === 0) { consonant = isIndicConjunctConsonant(_hd); } - if (consonant && catAfter === 3) { - linker = linker + if (consonant && (catAfter === 3 || catAfter === 14)) { + // ZWNJ(U+200C) has InCB=None, it should break the GB9c pattern + linker = cp !== 0x200C && ( + linker || cp === 0x094D // Devanagari Sign Virama || cp === 0x09CD // Bengali Sign Virama || cp === 0x0A4D // Gurmukhi Sign Virama @@ -188,7 +190,8 @@ export function* graphemeSegments(input) { || cp === 0x1193E // Dives Akuru Virama || cp === 0x11A47 // Zanabazar Square Subjoiner || cp === 0x11A99 // Soyombo Subjoiner - || cp === 0x11F42; // Kawi Conjoiner + || cp === 0x11F42 // Kawi Conjoiner + ); } else { linker = false; } diff --git a/test/grapheme.js b/test/grapheme.js index b0daaba..d4efaed 100644 --- a/test/grapheme.js +++ b/test/grapheme.js @@ -280,6 +280,33 @@ test('counterexamples', async t => { 'क्‍त', '് ', '्क', + 'গ্‌ডু', + 'স্ট্‌মা', + 'আপি', + 'স্ট্‌মা', + 'ল্‌জ্ব', + 'এবং', + 'ল্‌ছ', + ' ক', + ' വ', + ' ക', + ' വൃ', + ' പു', + ' യ', + ' പോ', + ' ജോ', + ' നീ', + 'എന്നാ', + 'ഡ്‌ഢി', + 'ഇനി', + 'ഉദ്യോ', + ' താ', + ' ദു', + ' ബു', + 'ഒര', + ' जा', + ' କା', + ' ଶ୍ୟା', ]; for (let counter of counterExamples) {