From 4da8cf20457614f24288391c16e6d29ec0601afe Mon Sep 17 00:00:00 2001 From: Shenglong Li Date: Mon, 29 Jun 2026 14:42:05 -0700 Subject: [PATCH 1/2] Fix escape sequence semantic highlighting --- .../Highlighting/SemanticTokenVisitorTests.cs | 26 ++++++ .../Highlighting/SemanticTokenVisitor.cs | 90 ++++++++++++++++++- 2 files changed, 112 insertions(+), 4 deletions(-) create mode 100644 src/Bicep.Core.UnitTests/Highlighting/SemanticTokenVisitorTests.cs diff --git a/src/Bicep.Core.UnitTests/Highlighting/SemanticTokenVisitorTests.cs b/src/Bicep.Core.UnitTests/Highlighting/SemanticTokenVisitorTests.cs new file mode 100644 index 00000000000..084a2a11abd --- /dev/null +++ b/src/Bicep.Core.UnitTests/Highlighting/SemanticTokenVisitorTests.cs @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using Bicep.Core.Highlighting; +using Bicep.Core.UnitTests.Utils; +using FluentAssertions; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Bicep.Core.UnitTests.Highlighting; + +[TestClass] +public class SemanticTokenVisitorTests +{ + [TestMethod] + public void Build_WithEscapeSequences_ExcludesEscapesFromStringTokens() + { + var bicepText = @"var foo = 'a\\b\'c\${d\n\r\t\u{1F600}z'"; + var result = CompilationHelper.Compile(bicepText); + + var stringTokenTexts = SemanticTokenVisitor.Build(result.Compilation.GetEntrypointSemanticModel()) + .Where(token => token.TokenType == SemanticTokenType.String) + .Select(token => bicepText.Substring(token.Positionable.Span.Position, token.Positionable.Span.Length)); + + stringTokenTexts.Should().Equal("'a", "b", "c", "d", "z'"); + } +} diff --git a/src/Bicep.Core/Highlighting/SemanticTokenVisitor.cs b/src/Bicep.Core/Highlighting/SemanticTokenVisitor.cs index c0cd36b36a3..fb83367d454 100644 --- a/src/Bicep.Core/Highlighting/SemanticTokenVisitor.cs +++ b/src/Bicep.Core/Highlighting/SemanticTokenVisitor.cs @@ -219,7 +219,7 @@ public override void 
VisitFunctionDeclarationSyntax(FunctionDeclarationSyntax sy base.VisitFunctionDeclarationSyntax(syntax); } - private void AddStringToken(Token token, string? start, string? end) + private void AddStringToken(Token token, string? start, string? end, bool isSingleLineString) { var endInterp = (token.Type, end) switch { @@ -248,7 +248,7 @@ private void AddStringToken(Token token, string? start, string? end) AddTokenType(token.GetSpanSlice(0, startOperatorLength), SemanticTokenType.Operator); } - AddTokenType(token.GetSpanSlice(startOperatorLength, token.Span.Length - startOperatorLength - endOperatorLength), SemanticTokenType.String); + this.AddStringContentTokens(token, startOperatorLength, token.Span.Length - startOperatorLength - endOperatorLength, isSingleLineString); if (hasEndOperator) { @@ -256,6 +256,86 @@ private void AddStringToken(Token token, string? start, string? end) } } + private void AddStringContentTokens(Token token, int start, int length, bool excludeEscapeSequences) + { + if (length <= 0) + { + return; + } + + if (!excludeEscapeSequences) + { + this.AddTokenType(token.GetSpanSlice(start, length), SemanticTokenType.String); + return; + } + + var end = start + length; + var segmentStart = start; + for (var position = start; position < end; position++) + { + if (TryGetEscapeSequenceLength(token.Text, position, end) is not { } escapeSequenceLength) + { + continue; + } + + this.AddStringSegmentToken(token, segmentStart, position); + position += escapeSequenceLength - 1; + segmentStart = position + 1; + } + + this.AddStringSegmentToken(token, segmentStart, end); + } + + private void AddStringSegmentToken(Token token, int start, int end) + { + if (end > start) + { + this.AddTokenType(token.GetSpanSlice(start, end - start), SemanticTokenType.String); + } + } + + private static int? 
TryGetEscapeSequenceLength(string text, int position, int end) + { + if (text[position] != '\\' || position + 1 >= end) + { + return null; + } + + return text[position + 1] switch + { + 'n' or 'r' or 't' or '\\' or '\'' => 2, + '$' when position + 2 < end && text[position + 2] == '{' => 3, + 'u' => TryGetUnicodeEscapeSequenceLength(text, position, end), + _ => null, + }; + } + + private static int? TryGetUnicodeEscapeSequenceLength(string text, int position, int end) + { + if (position + 3 >= end || text[position + 2] != '{') + { + return null; + } + + var hexDigitCount = 0; + for (var current = position + 3; current < end; current++) + { + if (text[current] == '}') + { + return hexDigitCount > 0 ? current - position + 1 : null; + } + + if (!Uri.IsHexDigit(text[current])) + { + return null; + } + + hexDigitCount++; + } + + return null; + } + public override void VisitTernaryOperationSyntax(TernaryOperationSyntax syntax) { AddTokenType(syntax.Colon, SemanticTokenType.Operator); @@ -266,10 +346,11 @@ public override void VisitTernaryOperationSyntax(TernaryOperationSyntax syntax) public override void VisitStringTypeLiteralSyntax(StringTypeLiteralSyntax syntax) { var startAndEndTokens = Lexer.TryGetStartAndEndTokens(syntax.StringTokens).ToImmutableArray(); + var isSingleLineString = syntax.StringTokens.Length == 0 || !Lexer.GetStringTokenInfo(syntax.StringTokens[0]).isMultiLine; for (var i = 0; i < syntax.StringTokens.Length; i++) { var result = startAndEndTokens[i]; - AddStringToken(syntax.StringTokens[i], result?.start, result?.end); + AddStringToken(syntax.StringTokens[i], result?.start, result?.end, isSingleLineString); } foreach (var expression in syntax.Expressions) { @@ -280,10 +361,11 @@ public override void VisitStringTypeLiteralSyntax(StringTypeLiteralSyntax syntax public override void VisitStringSyntax(StringSyntax syntax) { var startAndEndTokens = Lexer.TryGetStartAndEndTokens(syntax.StringTokens).ToImmutableArray(); + var isSingleLineString = 
syntax.StringTokens.Length == 0 || !Lexer.GetStringTokenInfo(syntax.StringTokens[0]).isMultiLine; for (var i = 0; i < syntax.StringTokens.Length; i++) { var result = startAndEndTokens[i]; - AddStringToken(syntax.StringTokens[i], result?.start, result?.end); + AddStringToken(syntax.StringTokens[i], result?.start, result?.end, isSingleLineString); } foreach (var expression in syntax.Expressions) { From 17d5c0417fd75a889f84c0aee0056b965baaecde Mon Sep 17 00:00:00 2001 From: Shenglong Li Date: Tue, 30 Jun 2026 10:49:28 -0700 Subject: [PATCH 2/2] Reuse lexer escape scanner for semantic tokens --- .../Parsing/LexerTests.cs | 1 + .../Highlighting/SemanticTokenVisitor.cs | 44 +----- src/Bicep.Core/Parsing/Lexer.cs | 134 +++++++++++------- src/Bicep.Core/Parsing/SlidingTextWindow.cs | 2 + 4 files changed, 89 insertions(+), 92 deletions(-) diff --git a/src/Bicep.Core.UnitTests/Parsing/LexerTests.cs b/src/Bicep.Core.UnitTests/Parsing/LexerTests.cs index be1aea66f9d..5bc6eb22385 100644 --- a/src/Bicep.Core.UnitTests/Parsing/LexerTests.cs +++ b/src/Bicep.Core.UnitTests/Parsing/LexerTests.cs @@ -19,6 +19,7 @@ public class LexerTests [DataRow(@"'test'", "test")] [DataRow(@"'hello there'", "hello there")] [DataRow(@"'\r\n\t\\\$\''", "\r\n\t\\$'")] + [DataRow(@"'\${foo}'", "${foo}")] [DataRow("'First line\\nSecond\\ttabbed\\tline'", "First line\nSecond\ttabbed\tline")] // escape ascii [DataRow(@"'\u{0}'", "\0")] diff --git a/src/Bicep.Core/Highlighting/SemanticTokenVisitor.cs b/src/Bicep.Core/Highlighting/SemanticTokenVisitor.cs index fb83367d454..7e00286e26e 100644 --- a/src/Bicep.Core/Highlighting/SemanticTokenVisitor.cs +++ b/src/Bicep.Core/Highlighting/SemanticTokenVisitor.cs @@ -273,7 +273,7 @@ private void AddStringContentTokens(Token token, int start, int length, bool exc var segmentStart = start; for (var position = start; position < end; position++) { - if (TryGetEscapeSequenceLength(token.Text, position, end) is not { } escapeSequenceLength) + if 
(!Lexer.TryScanStringEscapeSequence(token.Text.AsSpan(position, end - position), out var escapeSequenceLength)) { continue; } @@ -294,48 +294,6 @@ private void AddStringSegmentToken(Token token, int start, int end) } } - private static int? TryGetEscapeSequenceLength(string text, int position, int end) - { - if (text[position] != '\\' || position + 1 >= end) - { - return null; - } - - return text[position + 1] switch - { - 'n' or 'r' or 't' or '\\' or '\'' => 2, - '$' when position + 2 < end && text[position + 2] == '{' => 3, - 'u' => TryGetUnicodeEscapeSequenceLength(text, position, end), - _ => null, - }; - } - - private static int? TryGetUnicodeEscapeSequenceLength(string text, int position, int end) - { - if (position + 3 >= end || text[position + 2] != '{') - { - return null; - } - - var hexDigitCount = 0; - for (var current = position + 3; current < end; current++) - { - if (text[current] == '}') - { - return hexDigitCount > 0 ? current - position + 1 : null; - } - - if (!Uri.IsHexDigit(text[current])) - { - return null; - } - - hexDigitCount++; - } - - return null; - } - public override void VisitTernaryOperationSyntax(TernaryOperationSyntax syntax) { AddTokenType(syntax.Colon, SemanticTokenType.Operator); diff --git a/src/Bicep.Core/Parsing/Lexer.cs b/src/Bicep.Core/Parsing/Lexer.cs index 5a96df03e62..e4247775607 100644 --- a/src/Bicep.Core/Parsing/Lexer.cs +++ b/src/Bicep.Core/Parsing/Lexer.cs @@ -213,15 +213,14 @@ private static (string start, string end)? 
TryGetStartAndEndTokens(Token stringT } var (start, end) = result; - var contents = stringToken.Text.Substring(start.Length, stringToken.Text.Length - start.Length - end.Length); - var window = new SlidingTextWindow(contents); + var contents = stringToken.Text.AsSpan(start.Length, stringToken.Text.Length - start.Length - end.Length); // the value of the string will be shorter because escapes are longer than the characters they represent var buffer = new StringBuilder(contents.Length); - while (!window.IsAtEnd()) + for (var position = 0; position < contents.Length; position++) { - var nextChar = window.Next(); + var nextChar = contents[position]; if (nextChar == '\'') { @@ -230,68 +229,99 @@ private static (string start, string end)? TryGetStartAndEndTokens(Token stringT if (nextChar == '\\') { - // escape sequence begins - if (window.IsAtEnd()) + if (!TryScanStringEscapeSequence(contents[position..], out var escapeSequenceLength, buffer)) { return null; } - char escapeChar = window.Next(); + position += escapeSequenceLength - 1; + continue; + } - if (escapeChar == 'u') - { - // unicode escape - char openCurly = window.Next(); - if (openCurly != '{') - { - return null; - } + // regular string char - append to buffer + buffer.Append(nextChar); + } - var codePointText = ScanHexNumber(window); - if (!TryParseCodePoint(codePointText, out uint codePoint)) - { - // invalid codepoint - return null; - } + return buffer.ToString(); + } - char closeCurly = window.Next(); - if (closeCurly != '}') - { - return null; - } + internal static bool TryScanStringEscapeSequence(ReadOnlySpan<char> text, out int length, StringBuilder? 
buffer = null) + { + length = 0; - char charOrHighSurrogate = CodepointToString(codePoint, out char lowSurrogate); - buffer.Append(charOrHighSurrogate); - if (lowSurrogate != SlidingTextWindow.InvalidCharacter) - { - // previous char was a high surrogate - // also append the low surrogate - buffer.Append(lowSurrogate); - } + if (text.Length < 2 || text[0] != '\\') + { + return false; + } - continue; - } + var escapeChar = text[1]; + if (escapeChar == 'u') + { + return TryScanUnicodeEscapeSequence(text, out length, buffer); + } - if (SingleCharacterEscapes.TryGetValue(escapeChar, out char escapeCharValue) == false) - { - // invalid escape character - return null; - } + if (!SingleCharacterEscapes.TryGetValue(escapeChar, out var escapeCharValue)) + { + return false; + } - buffer.Append(escapeCharValue); + length = escapeChar == '$' && text.Length >= 3 && text[2] == '{' ? 3 : 2; + buffer?.Append(escapeCharValue); + if (length == 3) + { + buffer?.Append('{'); + } - // continue to next iteration - continue; - } + return true; + } - // regular string char - append to buffer - buffer.Append(nextChar); + private static bool TryScanUnicodeEscapeSequence(ReadOnlySpan<char> text, out int length, StringBuilder? 
buffer) + { + length = 0; + + if (text.Length < 4 || text[2] != '{') + { + return false; } - return buffer.ToString(); + var current = 3; + while (current < text.Length && IsHexDigit(text[current])) + { + current++; + } + + if (current == 3 || current >= text.Length || text[current] != '}') + { + return false; + } + + if (!TryParseCodePoint(text[3..current], out uint codePoint)) + { + return false; + } + + length = current + 1; + if (buffer is not null) + { + AppendCodePoint(buffer, codePoint); + } + + return true; } - private static bool TryParseCodePoint(string text, out uint codePoint) => uint.TryParse(text, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out codePoint) && codePoint <= 0x10FFFF; + private static bool TryParseCodePoint(string text, out uint codePoint) => TryParseCodePoint(text.AsSpan(), out codePoint); + + private static bool TryParseCodePoint(ReadOnlySpan<char> text, out uint codePoint) => uint.TryParse(text, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out codePoint) && codePoint <= 0x10FFFF; + + private static void AppendCodePoint(StringBuilder buffer, uint codePoint) + { + char charOrHighSurrogate = CodepointToString(codePoint, out char lowSurrogate); + buffer.Append(charOrHighSurrogate); + if (lowSurrogate != SlidingTextWindow.InvalidCharacter) + { + buffer.Append(lowSurrogate); + } + } /// <summary> /// Determines if the specified string is a valid identifier. To be considered a valid identifier, the string must start @@ -772,6 +802,12 @@ private TokenType ScanStringSegment(bool isAtStartOfString) return isAtStartOfString ? 
TokenType.StringComplete : TokenType.StringRightPiece; } + if (TryScanStringEscapeSequence(textWindow.GetTextFromPosition(escapeBeginPosition), out var escapeSequenceLength)) + { + textWindow.Advance(escapeSequenceLength - 1); + continue; + } + // the escape sequence has a char after the \ // consume it nextChar = textWindow.Peek(); diff --git a/src/Bicep.Core/Parsing/SlidingTextWindow.cs b/src/Bicep.Core/Parsing/SlidingTextWindow.cs index a10edab49fc..a3948ae6ec2 100644 --- a/src/Bicep.Core/Parsing/SlidingTextWindow.cs +++ b/src/Bicep.Core/Parsing/SlidingTextWindow.cs @@ -119,5 +119,7 @@ public string GetTextBetweenLineStartAndCurrentPosition() var positionAfterNewLine = indexOfPreviousNewLine + 1; return text[positionAfterNewLine..position]; } + + internal ReadOnlySpan<char> GetTextFromPosition(int absolutePosition) => text.AsSpan(absolutePosition); } }