From 37e97cfbc4a9ca67496482cd0152fcc69822cad2 Mon Sep 17 00:00:00 2001 From: ET Date: Mon, 26 Jan 2026 15:22:05 -0800 Subject: [PATCH 1/4] Fix text mapping for special characters --- .../CPP/DWriteWrapper/IClassification.h | 12 ++- .../CPP/DWriteWrapper/TextAnalyzer.cpp | 76 +++++++++++------- .../MS/internal/Classification.cs | 77 ++++++++++++++++++- .../internal/FontFace/PhysicalFontFamily.cs | 15 +++- 4 files changed, 144 insertions(+), 36 deletions(-) diff --git a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h index 74ea6c84c3c..09ecbffafba 100644 --- a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h +++ b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h @@ -21,10 +21,18 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface [System::Runtime::InteropServices::Out] bool% isIndic, [System::Runtime::InteropServices::Out] bool% isDigit, [System::Runtime::InteropServices::Out] bool% isLatin, - [System::Runtime::InteropServices::Out] bool% isStrong + [System::Runtime::InteropServices::Out] bool% isStrong, + [System::Runtime::InteropServices::Out] bool% isExtended ); + + /// + /// Check whether two Unicode scalar values belong to the same script. + /// This is used to determine if combining marks should stay with their base character + /// for font fallback purposes. 
(See PR #6857 / Issue #6801) + /// + bool IsSameScript(int unicodeScalar1, int unicodeScalar2); }; }}}}//MS::Internal::Text::TextInterface -#endif //__ICLASSIFICATION_H \ No newline at end of file +#endif //__ICLASSIFICATION_H diff --git a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp index b7186f36dfd..f6899134ecf 100644 --- a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp +++ b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp @@ -155,54 +155,78 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface bool isStrong; bool isExtended; + WCHAR ch = text[0]; classificationUtility->GetCharAttribute( - text[0], + ch, isCombining, needsCaretInfo, isIndic, isDigit, isLatin, - isStrong - ); - - isExtended = ItemizerHelper::IsExtendedCharacter(text[0]); + isStrong, + isExtended + ); UINT32 isDigitRangeStart = 0; UINT32 isDigitRangeEnd = 0; bool previousIsDigitValue = (numberCulture == nullptr) ? false : isDigit; bool currentIsDigitValue; + // Track base character for combining mark script comparison (PR #6857 / Issue #6801) + // A combining mark should only stay with its base character if they have the same script. + int baseChar = isCombining ? -1 : ch; + // pCharAttribute is assumed to have the same length as text. This is enforced by Itemize(). pCharAttribute[0] = (CharAttributeType) - (((isCombining) ? CharAttribute::IsCombining : CharAttribute::None) - | ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None) - | ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None) - | ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None) - | ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None) - | ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None)); + (((isCombining) ? 
CharAttribute::IsCombining : CharAttribute::None) + | ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None) + | ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None) + | ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None) + | ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None) + | ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None)); for (UINT32 i = 1; i < length; ++i) { + ch = text[i]; classificationUtility->GetCharAttribute( - text[i], - isCombining, - needsCaretInfo, - isIndic, - isDigit, - isLatin, - isStrong + ch, + isCombining, + needsCaretInfo, + isIndic, + isDigit, + isLatin, + isStrong, + isExtended ); - isExtended = ItemizerHelper::IsExtendedCharacter(text[i]); - + // For combining marks, check if they have the same script as the base character. + // If not, they should not be treated as combining with the base (PR #6857 / Issue #6801). + // However, script-agnostic combining marks (variation selectors, ZWJ, emoji modifiers, etc.) + // are designed to work with any base character regardless of script, so skip the check + // for them to allow emoji sequences to stay together. + bool isCombiningWithBase = isCombining; + if (isCombining && baseChar >= 0 && !isExtended) + { + if (!classificationUtility->IsSameScript(baseChar, ch)) + { + // Different script - this combining mark should not stay with the base character + isCombiningWithBase = false; + } + } + + // Update base character tracking + if (!isCombining) + { + baseChar = ch; + } pCharAttribute[i] = (CharAttributeType) - (((isCombining) ? CharAttribute::IsCombining : CharAttribute::None) - | ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None) - | ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None) - | ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None) - | ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None) - | ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None)); + (((isCombiningWithBase) ? 
CharAttribute::IsCombining : CharAttribute::None) + | ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None) + | ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None) + | ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None) + | ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None) + | ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None)); currentIsDigitValue = (numberCulture == nullptr) ? false : isDigit; diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs index 6ea88c585e2..0aac349f0d7 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs +++ b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs @@ -108,7 +108,8 @@ public void GetCharAttribute( out bool isIndic, out bool isDigit, out bool isLatin, - out bool isStrong + out bool isStrong, + out bool isExtended ) { CharacterAttribute charAttribute = Classification.CharAttributeOf((int)Classification.GetUnicodeClass(unicodeScalar)); @@ -134,6 +135,16 @@ out bool isStrong { isIndic = IsScriptIndic(scriptId); } + + isExtended = Classification.IsScriptAgnosticCombining(unicodeScalar); + } + + /// + /// Check whether two Unicode scalar values belong to the same script. + /// + public bool IsSameScript(int unicodeScalar1, int unicodeScalar2) + { + return Classification.IsSameScript(unicodeScalar1, unicodeScalar2); } /// @@ -159,6 +170,7 @@ private static bool IsScriptIndic(ScriptID scriptId) } } } + /// /// Hold the classification table pointers. 
/// @@ -253,16 +265,73 @@ public static short GetUnicodeClass(int unicodeScalar) /// - /// Lookup script ID for a Unicode scalar value + /// Check whether two Unicode scalar values belong to the same script /// - public static ScriptID GetScript(int unicodeScalar) + static public bool IsSameScript(int unicodeScalar1, int unicodeScalar2) { unsafe { - return (ScriptID)Classification.CharAttributeTable[GetUnicodeClass(unicodeScalar)].Script; + short unicodeClass1 = GetUnicodeClass(unicodeScalar1); + short unicodeClass2 = GetUnicodeClass(unicodeScalar2); + if (unicodeClass1 != unicodeClass2) + { + CharacterAttribute a1 = Classification.CharAttributeTable[unicodeClass1]; + CharacterAttribute a2 = Classification.CharAttributeTable[unicodeClass2]; + if (a1.Script != a2.Script) + { + return false; + } + } + + return true; } } + /// + /// Check whether the character is a script-agnostic combining mark that should + /// stay with its base character regardless of script differences. + /// + /// + /// This includes variation selectors and combining enclosing marks used in emoji + /// sequences like "1️⃣" (digit + VS16 + combining enclosing keycap). + /// These characters are designed to modify any base character regardless of script. 
+ /// + static public bool IsScriptAgnosticCombining(int unicodeScalar) + { + // ZWJ - used in many emoji/grapheme clusters + if (unicodeScalar == 0x200D) + return true; + + // Variation Selectors VS1-VS16 (U+FE00-U+FE0F) + if (unicodeScalar >= 0xFE00 && unicodeScalar <= 0xFE0F) + return true; + + // Ideographic Variation Selectors VS17-VS256 (U+E0100-U+E01EF) + if (IsIVS(unicodeScalar)) + return true; + + // Combining Diacritical Marks Extended (U+1AB0-U+1AFF) + if (unicodeScalar >= 0x1AB0 && unicodeScalar <= 0x1AFF) + return true; + + // Combining Diacritical Marks Supplement (U+1DC0-U+1DFF) + if (unicodeScalar >= 0x1DC0 && unicodeScalar <= 0x1DFF) + return true; + + // Combining Diacritical Marks for Symbols (U+20D0-U+20FF) - includes U+20E3 keycap + if (unicodeScalar >= 0x20D0 && unicodeScalar <= 0x20FF) + return true; + + // Combining Half Marks (U+FE20-U+FE2F) + if (unicodeScalar >= 0xFE20 && unicodeScalar <= 0xFE2F) + return true; + + // Emoji Modifiers / Skin tones (U+1F3FB-U+1F3FF) + if (unicodeScalar >= 0x1F3FB && unicodeScalar <= 0x1F3FF) + return true; + + return false; + } /// /// Compute Unicode scalar value from unicode codepoint stream diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs index dbc88663016..bd3173b215e 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs +++ b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs @@ -1,4 +1,4 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
// @@ -304,7 +304,11 @@ out sizeofChar { // continue to advance for combining mark with base char (can be precomposed by shaping engine) // except if it is a different script (#6801) - if (Classification.GetScript(baseChar) == Classification.GetScript(originalChar)) + // However, script-agnostic combining marks (variation selectors, combining enclosing marks) + // should stay with their base character regardless of script, to allow emoji sequences + // like "1️⃣" (digit + VS16 + combining enclosing keycap) to stay together. + if (Classification.IsScriptAgnosticCombining(originalChar) + || Classification.IsSameScript(baseChar, originalChar)) { continue; } @@ -359,10 +363,13 @@ out sizeofChar // // The same goes for joiner. Note that "hasBaseChar" here indicates if there is an invalid base // char in front. + // Script-agnostic combining marks (variation selectors, combining enclosing marks) should + // also stay with the base character regardless of script differences. if (Classification.IsJoiner(ch) - || (baseChar != NOBASE && Classification.IsCombining(ch) && Classification.GetScript(ch) == Classification.GetScript(baseChar)) + || (baseChar != NOBASE && Classification.IsCombining(ch) + && (Classification.IsScriptAgnosticCombining(ch) || Classification.IsSameScript(baseChar, ch))) ) - continue; + continue; // If we have a glyph it's valid. 
if (font.HasCharacter(checked((uint)ch))) From 8398c343f6514043b71aaa58a6241d7ea87ee38d Mon Sep 17 00:00:00 2001 From: ET Date: Tue, 27 Jan 2026 07:26:50 -0800 Subject: [PATCH 2/4] CR feedback --- .../CPP/DWriteWrapper/IClassification.h | 2 +- .../CPP/DWriteWrapper/TextAnalyzer.cpp | 35 +++++++++++-------- .../MS/internal/Classification.cs | 10 +++--- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h index 09ecbffafba..df29873f0f1 100644 --- a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h +++ b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h @@ -22,7 +22,7 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface [System::Runtime::InteropServices::Out] bool% isDigit, [System::Runtime::InteropServices::Out] bool% isLatin, [System::Runtime::InteropServices::Out] bool% isStrong, - [System::Runtime::InteropServices::Out] bool% isExtended + [System::Runtime::InteropServices::Out] bool% isScriptAgnosticCombining ); /// diff --git a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp index f6899134ecf..9a2b88e9b8c 100644 --- a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp +++ b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp @@ -154,6 +154,7 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface bool isLatin; bool isStrong; bool isExtended; + bool isScriptAgnosticCombining; WCHAR ch = text[0]; classificationUtility->GetCharAttribute( @@ -164,9 +165,11 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface isDigit, isLatin, isStrong, - isExtended + 
isScriptAgnosticCombining ); + isExtended = ItemizerHelper::IsExtendedCharacter(ch); + UINT32 isDigitRangeStart = 0; UINT32 isDigitRangeEnd = 0; bool previousIsDigitValue = (numberCulture == nullptr) ? false : isDigit; @@ -178,12 +181,12 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface // pCharAttribute is assumed to have the same length as text. This is enforced by Itemize(). pCharAttribute[0] = (CharAttributeType) - (((isCombining) ? CharAttribute::IsCombining : CharAttribute::None) - | ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None) - | ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None) - | ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None) - | ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None) - | ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None)); + (((isCombining) ? CharAttribute::IsCombining : CharAttribute::None) + | ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None) + | ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None) + | ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None) + | ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None) + | ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None)); for (UINT32 i = 1; i < length; ++i) { @@ -196,16 +199,18 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface isDigit, isLatin, isStrong, - isExtended + isScriptAgnosticCombining ); + isExtended = ItemizerHelper::IsExtendedCharacter(ch); + // For combining marks, check if they have the same script as the base character. // If not, they should not be treated as combining with the base (PR #6857 / Issue #6801). // However, script-agnostic combining marks (variation selectors, ZWJ, emoji modifiers, etc.) // are designed to work with any base character regardless of script, so skip the check // for them to allow emoji sequences to stay together. 
bool isCombiningWithBase = isCombining; - if (isCombining && baseChar >= 0 && !isExtended) + if (isCombining && baseChar >= 0 && !isScriptAgnosticCombining) { if (!classificationUtility->IsSameScript(baseChar, ch)) { @@ -221,12 +226,12 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface } pCharAttribute[i] = (CharAttributeType) - (((isCombiningWithBase) ? CharAttribute::IsCombining : CharAttribute::None) - | ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None) - | ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None) - | ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None) - | ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None) - | ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None)); + (((isCombiningWithBase) ? CharAttribute::IsCombining : CharAttribute::None) + | ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None) + | ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None) + | ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None) + | ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None) + | ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None)); currentIsDigitValue = (numberCulture == nullptr) ? 
false : isDigit; diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs index 0aac349f0d7..e0915f63d7b 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs +++ b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs @@ -109,7 +109,7 @@ public void GetCharAttribute( out bool isDigit, out bool isLatin, out bool isStrong, - out bool isExtended + out bool isScriptAgnosticCombining ) { CharacterAttribute charAttribute = Classification.CharAttributeOf((int)Classification.GetUnicodeClass(unicodeScalar)); @@ -120,7 +120,7 @@ out bool isExtended || Classification.IsIVS(unicodeScalar)); isStrong = (itemClass == (byte)ItemClass.StrongClass); - + int script = charAttribute.Script; needsCaretInfo = ScriptCaretInfo[script]; @@ -136,7 +136,7 @@ out bool isExtended isIndic = IsScriptIndic(scriptId); } - isExtended = Classification.IsScriptAgnosticCombining(unicodeScalar); + isScriptAgnosticCombining = Classification.IsScriptAgnosticCombining(unicodeScalar); } /// @@ -267,7 +267,7 @@ public static short GetUnicodeClass(int unicodeScalar) /// /// Check whether two Unicode scalar values belong to the same script /// - static public bool IsSameScript(int unicodeScalar1, int unicodeScalar2) + public static bool IsSameScript(int unicodeScalar1, int unicodeScalar2) { unsafe { @@ -296,7 +296,7 @@ static public bool IsSameScript(int unicodeScalar1, int unicodeScalar2) /// sequences like "1️⃣" (digit + VS16 + combining enclosing keycap). /// These characters are designed to modify any base character regardless of script. 
/// - static public bool IsScriptAgnosticCombining(int unicodeScalar) + public static bool IsScriptAgnosticCombining(int unicodeScalar) { // ZWJ - used in many emoji/grapheme clusters if (unicodeScalar == 0x200D) From 1a711ce4af33bbf46651cd5c032194e56924c0be Mon Sep 17 00:00:00 2001 From: ET Date: Thu, 19 Feb 2026 09:45:12 -0800 Subject: [PATCH 3/4] Fix font-fallback run-splitting for emoji grapheme clusters Keep variation selectors and ZWJ sequences together across font-fallback boundaries. --- .../MS/internal/Classification.cs | 18 +++++++----- .../internal/FontFace/PhysicalFontFamily.cs | 29 +++++++++++++++++-- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs index e0915f63d7b..4685cb30403 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs +++ b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs @@ -288,20 +288,22 @@ public static bool IsSameScript(int unicodeScalar1, int unicodeScalar2) } /// - /// Check whether the character is a script-agnostic combining mark that should + /// Check whether the character is a script-agnostic combining mark (font extender) that should /// stay with its base character regardless of script differences. /// /// - /// This includes variation selectors and combining enclosing marks used in emoji - /// sequences like "1️⃣" (digit + VS16 + combining enclosing keycap). - /// These characters are designed to modify any base character regardless of script. + /// Corresponds to a subset of DWriteCore's is_font_extender predicate, covering characters + /// that are not already handled by IsCombining + IsSameScript. 
These are combining marks + /// whose Unicode script is not the same as the base character's script, so that emoji + /// sequences like "1️⃣" (digit + VS16 + U+20E3 combining enclosing keycap) stay together. + /// + /// Note: ZWJ (U+200D) is NOT listed here because it is a JoinerClass character. + /// IsCombining() returns false for it, so this function would never be reached for ZWJ. + /// ZWJ is handled upstream by IsJoiner() and the prevWasJoiner logic in MapCharacters. + /// /// public static bool IsScriptAgnosticCombining(int unicodeScalar) { - // ZWJ - used in many emoji/grapheme clusters - if (unicodeScalar == 0x200D) - return true; - // Variation Selectors VS1-VS16 (U+FE00-U+FE0F) if (unicodeScalar >= 0xFE00 && unicodeScalar <= 0xFE0F) return true; diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs index bd3173b215e..f4cd4462c6c 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs +++ b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs @@ -340,6 +340,13 @@ out sizeofChar // UnicodeScalar won't return a sizeofChar that exceeds the string length. Debug.Assert(advance + sizeofChar <= unicodeString.Length); + // Track whether the previous character was a joiner. DWriteCore's font fallback + // algorithm extends the unmapped run to include the character immediately following + // a joiner (is_joiner(previous_char) in try_map_font). This keeps ZWJ emoji + // sequences like "👨‍👩‍👧" together in the unmapped run so they are sent to + // fallback as a unit. + bool prevWasJoiner = false; + for (nextValid = advance + sizeofChar; nextValid < unicodeString.Length; nextValid += sizeofChar) { // Get the character. @@ -351,6 +358,12 @@ out sizeofChar // Apply digit substitution, if any. 
int ch = digitMap[originalChar]; + if (Classification.IsJoiner(ch)) + { + prevWasJoiner = true; + continue; + } + // // Combining mark should always be shaped by the same font as the base char. // If the physical font is invalid for the base char, it should also be invalid for the @@ -361,15 +374,25 @@ out sizeofChar // as the base char such that they will eventually be resolved to the same physical font. // That means FamilyMap for the combining mark is not used when it follows a base char. // - // The same goes for joiner. Note that "hasBaseChar" here indicates if there is an invalid base - // char in front. // Script-agnostic combining marks (variation selectors, combining enclosing marks) should // also stay with the base character regardless of script differences. - if (Classification.IsJoiner(ch) + // + // If the previous character was a joiner, pull this character into the unmapped run + // regardless of whether it is a combining mark (mirrors DWriteCore is_joiner(previous_char)). + if (prevWasJoiner || (baseChar != NOBASE && Classification.IsCombining(ch) && (Classification.IsScriptAgnosticCombining(ch) || Classification.IsSameScript(baseChar, ch))) ) + { + // Update baseChar for any strong char pulled into the unmapped run by a joiner so + // that combining marks that follow it are associated with the correct base. + if (prevWasJoiner && !Classification.IsCombining(ch)) + baseChar = originalChar; + prevWasJoiner = false; continue; + } + + prevWasJoiner = false; // If we have a glyph it's valid. 
if (font.HasCharacter(checked((uint)ch))) From f5924d9120a866ee6e30fefc64d31c92500ea9ac Mon Sep 17 00:00:00 2001 From: ET Date: Thu, 19 Feb 2026 11:20:25 -0800 Subject: [PATCH 4/4] Fix emoji run-splitting: use originalChar for IsJoiner, ch for baseChar, update remarks --- .../src/PresentationCore/MS/internal/Classification.cs | 7 ++++--- .../MS/internal/FontFace/PhysicalFontFamily.cs | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs index 4685cb30403..a0ad0e5224b 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs +++ b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs @@ -293,9 +293,10 @@ public static bool IsSameScript(int unicodeScalar1, int unicodeScalar2) /// /// /// Corresponds to a subset of DWriteCore's is_font_extender predicate, covering characters - /// that are not already handled by IsCombining + IsSameScript. These are combining marks - /// whose Unicode script is not the same as the base character's script, so that emoji - /// sequences like "1️⃣" (digit + VS16 + U+20E3 combining enclosing keycap) stay together. + /// that require special handling to prevent run-splitting when script comparisons would + /// otherwise split them. These are combining marks whose Unicode script is not the same + /// as the base character's script, so that emoji sequences like "1️⃣" (digit + VS16 + + /// U+20E3 combining enclosing keycap) stay together. /// /// Note: ZWJ (U+200D) is NOT listed here because it is a JoinerClass character. /// IsCombining() returns false for it, so this function would never be reached for ZWJ. 
diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs index f4cd4462c6c..1a4597e4cfe 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs +++ b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs @@ -358,7 +358,7 @@ out sizeofChar // Apply digit substitution, if any. int ch = digitMap[originalChar]; - if (Classification.IsJoiner(ch)) + if (Classification.IsJoiner(originalChar)) { prevWasJoiner = true; continue; @@ -387,7 +387,7 @@ out sizeofChar // Update baseChar for any strong char pulled into the unmapped run by a joiner so // that combining marks that follow it are associated with the correct base. if (prevWasJoiner && !Classification.IsCombining(ch)) - baseChar = originalChar; + baseChar = ch; prevWasJoiner = false; continue; }