From a70b9e4e626a0f39327bf62a640b8272c400fe11 Mon Sep 17 00:00:00 2001 From: jahav Date: Sat, 18 Oct 2025 23:51:11 +0200 Subject: [PATCH 1/3] Add a hand-made lexer for 3.0 This is a lexer for version 3.0 that should use Pratt parsing. The lexer is handmade, though representable in DFA if necessary. After experience with 2.0, it seems that too much was done in the lexer; the lexer should produce far fewer tokens and should be far more "relaxed". That means the parser will have to deal with more relaxed tokens that might not be valid in the context where they are used. Performance is acceptable (150ms for tokenization of 22MB of enron formulas), but that is because it's rather simple. A lot of logic will be in the parser. --- .../Lexers/LexerTests.cs | 330 +++++++++++++ src/ClosedXML.Parser/ParsingException.cs | 35 +- src/ClosedXML.Parser/Pratt/Lexer.cs | 446 ++++++++++++++++++ src/ClosedXML.Parser/Pratt/Token.cs | 14 + src/ClosedXML.Parser/Pratt/TokenType.cs | 137 ++++++ 5 files changed, 961 insertions(+), 1 deletion(-) create mode 100644 src/ClosedXML.Parser.Tests/Lexers/LexerTests.cs create mode 100644 src/ClosedXML.Parser/Pratt/Lexer.cs create mode 100644 src/ClosedXML.Parser/Pratt/Token.cs create mode 100644 src/ClosedXML.Parser/Pratt/TokenType.cs diff --git a/src/ClosedXML.Parser.Tests/Lexers/LexerTests.cs b/src/ClosedXML.Parser.Tests/Lexers/LexerTests.cs new file mode 100644 index 0000000..2eaa1f2 --- /dev/null +++ b/src/ClosedXML.Parser.Tests/Lexers/LexerTests.cs @@ -0,0 +1,330 @@ +using ClosedXML.Parser.Pratt; + +namespace ClosedXML.Parser.Tests.Lexers; + +public class LexerTests +{ + // ( [0-9] )+ + [InlineData("0")] + [InlineData("1")] + [InlineData("90")] + [InlineData("00050")] + + // ( [0-9] )+ '.' ( [0-9] )+ + [InlineData("0.0")] + [InlineData("1.2")] + [InlineData("0010.0020")] + [InlineData("999.99")] + + // '.' 
( [0-9] )+ + [InlineData(".0")] + [InlineData(".1")] + [InlineData(".0001")] + [InlineData(".987")] + + // ( [0-9] )+ [Ee] ( [0-9] )+ + [InlineData("0e0")] + [InlineData("0E0")] + [InlineData("1e2")] + [InlineData("1E2")] + [InlineData("987e12")] + + // ( [0-9] )+ '.' ( [0-9] )+ [Ee] ( [0-9] )+ + [InlineData("0.0e4")] + [InlineData("12.724e13")] + [InlineData("12.3E2")] + + // '.' ( [0-9] )+ [Ee] ( [0-9] )+ + [InlineData(".0e0")] + [InlineData(".1e2")] + [InlineData(".987e54")] + + // ( [0-9] )+ [Ee] [+-] ( [0-9] )+ + [InlineData("1e+7")] + [InlineData("74e-32")] + [InlineData("15E-0")] + [InlineData("0e+0")] + [InlineData("01e+7")] + + // ( [0-9] )+ '.' ( [0-9] )+ [Ee] [+-] ( [0-9] )+ + [InlineData("0.0e+0")] + [InlineData("1.2e+3")] + [InlineData("01.2e+3")] + [InlineData("1.2E+3")] + [InlineData("12.34e+56")] + + // '.' ( [0-9] )+ [Ee] [+-] ( [0-9] )+ + [InlineData(".0e+0")] + [InlineData(".1e+2")] + [InlineData(".12E+34")] + [InlineData(".012e+034")] + [Theory] + public void Number_ok(string input) + { + AssertToken(TokenType.Number, input); + } + + [InlineData("0e+")] + [InlineData(".0e+")] + [Theory] + public void Number_fails(string input) + { + AssertFail(input, "Number"); + } + + [InlineData("\"\"")] + [InlineData("\"Some text\"")] + [InlineData("\"Some \"\" text\"")] + [InlineData("\"\uD83E\uDD8A\"")] // Fox face through surrogates + [Theory] + public void Text_ok(string input) + { + AssertToken(TokenType.Text, input); + } + + [InlineData("\"")] + [InlineData("\"text")] + [InlineData("\"text\"\"")] + [InlineData("\"Some \"\" text")] + [Theory] + public void Text_must_be_terminated(string input) + { + AssertFail(input, "unterminated literal"); + } + + [InlineData("\"\u0015\"")] + [Theory] + public void Text_must_be_contain_xml_10_characters(string input) + { + AssertFail(input, "Invalid text character"); + } + + [InlineData("#DIV/0!")] + [InlineData("#GETTING_DATA")] + [InlineData("#N/A")] + [InlineData("#NAME?")] + [InlineData("#NULL!")] + 
+ [InlineData("#NUM!")] + [InlineData("#REF!")] + [InlineData("#VALUE!")] + [InlineData("#ref!")] + [Theory] + public void Error_ok(string input) + { + AssertToken(TokenType.Error, input); + } + + [Fact] + public void Lexer_throws_on_unpaired_surrogates() + { + // Either Visual Studio or xUnit is converting invalid surrogates to -1/65536. + var invalidCodeUnits = new[] + { + "\uD83E", // Unpaired high surrogate for Fox Face + "\uD83E*", // Unpaired high surrogate for Fox Face + "\uDD8A", // Unpaired low surrogate for Fox Face + "\uDD8A*\"", // Unpaired low surrogate for Fox Face + "\uDD8A\uD83E", // Low surrogate first + }; + foreach (var invalidText in invalidCodeUnits) + { + AssertFail(invalidText, "surrogate"); + } + } + + [InlineData("''")] + [InlineData("'[1]Something'")] + [InlineData("'Jane''s'")] + [InlineData("'New York'")] + [InlineData("'January 1st:December 31st'")] + [InlineData("'[7]Year 20:Year 25'")] + [InlineData("'[Book.xlsx]Year 20:Year 25'")] + [InlineData("'[End*Near.xlsx]Final'")] + [InlineData("''''''")] + [Theory] + public void QIdent_ok(string input) + { + AssertToken(TokenType.QIdent, input); + } + + [InlineData("'")] + [InlineData("'Jane''s")] + [InlineData("'''''")] + [Theory] + public void QIdent_must_be_terminated(string input) + { + AssertFail(input, "unterminated literal"); + } + + [InlineData("ABC")] + [InlineData("A1")] + [InlineData("$A$1")] + [InlineData("AEF$A$1")] + [InlineData("name")] + [InlineData("TRUE")] + [InlineData("FALSE")] + [InlineData("true")] + [InlineData("false")] + [InlineData("?name")] + [InlineData("\\name")] + [InlineData("_name")] + [InlineData("name?")] + [InlineData("name\\")] + [InlineData("name_")] + [InlineData("some.name")] + [InlineData("_xlfn.ACOT")] + [InlineData("\u05D0\u05D1\u05E0")] // stone in hebrew - Letters from other languages + [InlineData("\u05E9\u05B0\u05DC\u05D5\u05DD")] // shalom - A mark from other languages + [Theory] + public void Ident_ok(string input) + { + 
AssertToken(TokenType.Ident, input); + } + + [Fact] + public void Ident_stops_at_operators() + { + var operators = new Dictionary + { + { TokenType.Bang, "!" }, + { TokenType.Comma, "," }, + { TokenType.Semicolon, ";" }, + { TokenType.Pow, "^" }, + { TokenType.Mul, "*" }, + { TokenType.Div, "/" }, + { TokenType.Plus, "+" }, + { TokenType.Minus, "-" }, + { TokenType.Concat, "&" }, + { TokenType.Equal, "=" }, + { TokenType.NotEqual, "<>" }, + { TokenType.Less, "<" }, + { TokenType.LessEqual, "<=" }, + { TokenType.Greater, ">" }, + { TokenType.GreaterEqual, ">=" }, + { TokenType.Percent, "%" }, + { TokenType.Range, ":" }, + { TokenType.Spill, "#" }, + { TokenType.Intersection, "@" }, + { TokenType.LeftParen, "(" }, + { TokenType.RightParen, ")" }, + { TokenType.LeftCurly, "{" }, + { TokenType.RightCurly, "}" }, + { TokenType.Whitespace, " " }, + }; + + foreach (var (opType, opText) in operators) + { + var input = "name" + opText; + var lexer = new Lexer(input); + + var identToken = lexer.Consume(); + Assert.Equal(TokenType.Ident, identToken.Type); + Assert.Equal("name", identToken.GetText(input).ToString()); + + var opToken = lexer.Consume(); + Assert.Equal(opType, opToken.Type); + Assert.Equal(opText, opToken.GetText(input).ToString()); + } + } + + [InlineData("[1]")] + [InlineData("[]")] + [InlineData("['[]")] + [InlineData("[Book1.xlsx]")] + [InlineData("[#Data]")] + [InlineData("[[#Data]]")] + [InlineData("[[#Data],[#Headers]]")] + [InlineData("['#]")] + [InlineData("[985]")] + [InlineData("[Jan:Dec]")] + [InlineData("['['['[]")] + [InlineData("[']']']]")] + [Theory] + public void SquareIdent_ok(string input) + { + AssertToken(TokenType.SquareIdent, input); + } + + [InlineData("[Ja[[a]]]")] + [Theory] + public void SquareIdent_at_most_two_nested_brackets(string input) + { + // Mostly to keep within something DFA can do. 
+ AssertFail(input, "at most two nested square brackets"); + } + + [InlineData("[")] + [InlineData("[text")] + [InlineData("[[")] + [InlineData("[a[b")] + [InlineData("[Start[]and end")] + [InlineData("[Start[']and end']")] + [Theory] + public void SquareIdent_must_be_paired(string input) + { + AssertFail(input, "Unable to find closing square bracket"); + } + + [InlineData((int)TokenType.Bang, "!")] + [InlineData((int)TokenType.Range, ":")] + [InlineData((int)TokenType.Comma, ",")] + [InlineData((int)TokenType.Semicolon, ";")] + [InlineData((int)TokenType.Pow, "^")] + [InlineData((int)TokenType.Mul, "*")] + [InlineData((int)TokenType.Div, "/")] + [InlineData((int)TokenType.Plus, "+")] + [InlineData((int)TokenType.Minus, "-")] + [InlineData((int)TokenType.Concat, "&")] + [InlineData((int)TokenType.Percent, "%")] + [InlineData((int)TokenType.Spill, "#")] + [InlineData((int)TokenType.Intersection, "@")] + [InlineData((int)TokenType.LeftParen, "(")] + [InlineData((int)TokenType.RightParen, ")")] + [InlineData((int)TokenType.LeftCurly, "{")] + [InlineData((int)TokenType.RightCurly, "}")] + [Theory] + public void Single_char_tokens_ok(int token, string input) + { + // TODO: Dump xUnit. Can't even use internal classes as test fixtures, so I have to pass enum as int. 
+ AssertToken((TokenType)token, input); + } + + [InlineData((int)TokenType.Equal, "=")] + [InlineData((int)TokenType.NotEqual, "<>")] + [InlineData((int)TokenType.Less, "<")] + [InlineData((int)TokenType.LessEqual, "<=")] + [InlineData((int)TokenType.Greater, ">")] + [InlineData((int)TokenType.GreaterEqual, ">=")] + [Theory] + public void Comparison_tokens_ok(int token, string input) + { + AssertToken((TokenType)token, input); + } + + [InlineData("\t")] + [InlineData("\n")] + [InlineData("\r")] + [InlineData(" ")] + [InlineData("\t \r\n")] + [Theory] + public void Whitespace_ok(string input) + { + AssertToken(TokenType.Whitespace, input); + } + + private static void AssertToken(TokenType type, string input) + { + var lexer = new Lexer(input); + var token = lexer.Consume(); + Assert.Equal(token.Type, type); + Assert.Equal(token.Text, input); + } + + private static void AssertFail(string input, string exceptionSubstring) + { + var lexer = new Lexer(input); + var exception = Assert.Throws(() => lexer.Consume()); + Assert.NotNull(exception); + Assert.True(exception.Message.Contains(exceptionSubstring), $"Expected to find '{exceptionSubstring}', but not found in '{exception.Message}'."); + } +} diff --git a/src/ClosedXML.Parser/ParsingException.cs b/src/ClosedXML.Parser/ParsingException.cs index 9a70536..c4683e0 100644 --- a/src/ClosedXML.Parser/ParsingException.cs +++ b/src/ClosedXML.Parser/ParsingException.cs @@ -1,4 +1,5 @@ using System; +using ClosedXML.Parser.Pratt; namespace ClosedXML.Parser; @@ -10,4 +11,36 @@ public class ParsingException : Exception internal ParsingException(string message) : base(message) { } -} \ No newline at end of file + + /// + /// There are problems with underlying stream. 
+ /// + internal static ParsingException UnpairedSurrogate(int codepoint, int position) + { + throw new ParsingException($"Found an unpaired surrogate 0x{codepoint:X4} at {position}."); + } + + /// + /// Token has a start and end indicator, but no end indicator was found. + /// + internal static Exception UnterminatedLiteral(int start, char delimiter) + { + throw new ParsingException($"An unterminated literal (delimiter {delimiter}) found at position {start}."); + } + + /// + /// A token was started to be parsed, but is not complete or there is a problem with it. + /// + internal static Exception TokenPartialMatch(int start, TokenType type) + { + throw new ParsingException($"Token {type} was parsed from position {start}, but was only partially matched."); + } + + /// + /// Lexer has no idea which token it should start to parse. + /// + internal static Exception UnableToSelectToken(int start) + { + throw new ParsingException($"Unable to determine a token at position {start}."); + } +} diff --git a/src/ClosedXML.Parser/Pratt/Lexer.cs b/src/ClosedXML.Parser/Pratt/Lexer.cs new file mode 100644 index 0000000..cc35421 --- /dev/null +++ b/src/ClosedXML.Parser/Pratt/Lexer.cs @@ -0,0 +1,446 @@ +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Xml; + +namespace ClosedXML.Parser.Pratt; + +/// +/// A lexer for pratt parser. 
+/// +internal class Lexer +{ + private const string OPERATOR_CHARS = "!,;^*/+-&=<>%:#@(){} "; + private const int EOF = -1; + + private static readonly bool[] IsOp; + + private readonly Queue _queue = new(4); + private readonly string _input; + + private int _start; // The start index of currently parsed token in Next() + private int _i; // Index of current code point _c in _input + private int _c; // A current code point (including astral planes) or -1 if at the EOF + + + static Lexer() + { + IsOp = new bool[128]; + foreach (var op in OPERATOR_CHARS) + IsOp[op] = true; + } + + /// + /// Create a new instance of a lexer. + /// + /// Formula to tokenize. + public Lexer(string input) + { + _input = input ?? throw new ArgumentNullException(); + _i = -1; + } + + private bool IsEof => _c == EOF; + + public Token Consume() + { + if (_queue.Count == 0) + return Next(); + + return _queue.Dequeue(); + } + + public Token Peek() + { + if (_queue.Count == 0) + _queue.Enqueue(Next()); + + return _queue.Peek(); + } + + private Token Next() + { + if (_i < 0) + Advance(); + + if (IsEof) + return new Token(TokenType.Eof, string.Empty); + + _start = _i; + + // Number + if (IsDigit(_c)) + { + // Whole number part + DigitSequence(); + + // Fractional part + if (_c == '.') + { + Advance(); + DigitSequence(); + } + + ExponentPart(); + + return T(TokenType.Number); + } + if (_c is '.') + { + Advance(); + + // Fractional part + DigitSequence(); + ExponentPart(); + + return T(TokenType.Number); + } + + // Text + if (_c == '"') + { + Advance(); + + while (!IsEof) + { + if (_c == '"') + { + Advance(); + if (_c != '"') + return T(TokenType.Text); + } + + if (!IsXml10Char(_c)) + throw new ParsingException($"Invalid text character (codepoint {_c:x8})."); + + Advance(); + } + + throw ParsingException.UnterminatedLiteral(_start, '"'); + } + + // QIdent + if (_c == '\'') + { + while (!IsEof) + { + Advance(); + + if (_c == '\'') + { + Advance(); + if (_c != '\'') + return T(TokenType.QIdent); + } 
+ } + + throw ParsingException.UnterminatedLiteral(_start, '\''); + } + + if (IsIdentStart(_c)) + { + Advance(); + while (!IsEof && IsIdentNext(_c)) + Advance(); + + return T(TokenType.Ident); + } + + if (_c == '+') + return FoundToken(TokenType.Plus); + + if (_c == '-') + return FoundToken(TokenType.Minus); + + if (_c == '*') + return FoundToken(TokenType.Mul); + + if (_c == '/') + return FoundToken(TokenType.Div); + + if (_c == '^') + return FoundToken(TokenType.Pow); + + if (_c == '%') + return FoundToken(TokenType.Percent); + + if (_c == '&') + return FoundToken(TokenType.Concat); + + if (_c == '!') + return FoundToken(TokenType.Bang); + + if (_c == '(') + return FoundToken(TokenType.LeftParen); + + if (_c == ')') + return FoundToken(TokenType.RightParen); + + if (_c == '{') + return FoundToken(TokenType.LeftCurly); + + if (_c == '}') + return FoundToken(TokenType.RightCurly); + + if (_c == ',') + return FoundToken(TokenType.Comma); + + if (_c == ';') + return FoundToken(TokenType.Semicolon); + + if (_c == ':') + return FoundToken(TokenType.Range); + + if (_c == '@') + return FoundToken(TokenType.Intersection); + + // Comparison + if (_c == '=') + return FoundToken(TokenType.Equal); + + if (_c == '<') + { + var next = Advance(); + if (next == '>') + return FoundToken(TokenType.NotEqual); + + if (next == '=') + return FoundToken(TokenType.LessEqual); + + return T(TokenType.Less); + } + + if (_c == '>') + { + if (Advance() == '=') + { + return FoundToken(TokenType.GreaterEqual); + } + + return T(TokenType.Greater); + } + + if (IsWhitespace(_c)) + { + do + { + Advance(); + } while (IsWhitespace(_c)); + + return T(TokenType.Whitespace); + } + + // Spill operator and errors + if (_c == '#') + { + var char1 = Advance(); + switch (char1) + { + case 'D' or 'd': + return Error("#DIV/0!", 2); + case 'R' or 'r': + return Error("#REF!", 2); + case 'V' or 'v': + return Error("#VALUE!", 2); + case 'G' or 'g': + return Error("#GETTING_DATA", 2); + case 'N' or 'n': + { + var 
char2 = Advance(); + if (char2 == '/') + return Error("#N/A", 3); + + if (char2 == 'A') + return Error("#NAME?", 3); + + var char3 = Advance(); + if (char2 == 'U' && char3 == 'L') + return Error("#NULL!", 4); + + if (char2 == 'U' && char3 == 'M') + return Error("#NUM!", 4); + + throw ParsingException.TokenPartialMatch(_start, TokenType.Error); + } + } + + return T(TokenType.Spill); + } + + if (_c == '[') + { + var level = 0; + do + { + switch (_c) + { + case '[': + ++level; + break; + case ']': + --level; + break; + case '\'': + Advance(); // Escaped chars don't change level - skip + break; + } + + if (IsEof) + throw new ParsingException($"Unable to find closing square bracket for token from position {_start}."); + + if (level > 2) + throw new ParsingException($"There can be at most two nested square brackets in a token from position {_start}."); + + Advance(); + } while (level > 0); + + return T(TokenType.SquareIdent); + } + + throw ParsingException.UnableToSelectToken(_start); + + static bool IsWhitespace(int c) + { + return c is ' ' or '\r' or '\n' or '\t'; + } + + static bool IsDigit(int c) + { + return c is >= '0' and <= '9'; + } + + // Check [0-9]+ + void DigitSequence() + { + do + { + if (!IsDigit(_c)) + throw ParsingException.TokenPartialMatch(_start, TokenType.Number); + Advance(); + } + while (!IsEof && IsDigit(_c)); + } + + void ExponentPart() + { + if (_c is 'e' or 'E') + { + if (Advance() is '+' or '-') + Advance(); + + DigitSequence(); + } + } + + static bool IsIdentStart(int c) + { + // Ident must satisfy logical-literal, sheet-name, name and A1-cell/column/row + return + IsAsciiLetter(c) || // name + A1 + c == '$' || // A1 + (c is '_' or '\\' or '?') || // name + (c > 0x7F && IsLetterOrLetterMark(c)); // name + } + + static bool IsIdentNext(int c) + { + // Stop at operators + if (c < IsOp.Length && IsOp[c]) + return false; + + return IsIdentStart(c) || + c is >= '0' and <= '9' || // name, A1 + c == '.'; // name + future-functions + } + + Token 
Error(string error, int start) + { + foreach (var errorChar in error.AsSpan().Slice(start)) + { + Advance(); + if (ToUpperAlpha(_c) != errorChar) + throw ParsingException.TokenPartialMatch(_start, TokenType.Error); + } + + Advance(); + return T(TokenType.Error); + } + + // Token that ends at the Current has been found. Advance to next and return token. + Token FoundToken(TokenType type) + { + Advance(); + return T(type); + } + + // Convert a-z to A-Z, keep other codepoints same. + static int ToUpperAlpha(int codepoint) + { + return codepoint is >= 'a' and <= 'z' + ? 'A' + codepoint - 'a' + : codepoint; + } + + static bool IsAsciiLetter(int codepoint) + { + // Convert to lowercase, normalize 'a' to 0, and check if within 0 (~A)..25(~Z). + // Really cool use of cast int to uint (-1 to 0xFFFFFFFF), thus saving one comparison + // and avoiding potential pipeline stall. + return (uint)((codepoint | 32) - 97) <= 25U; + } + + static bool IsLetterOrLetterMark(int codepoint) + { + // TODO: Only netstandard 2.1 has a parameter of type int, 2.0 has only char. + if (codepoint > 0xFFFF) + return false; // No letters from astral planes for us :( + + // Letters are categories from 0 to OtherLetter category. Then there are NonSpacingMark (accents and such). + return CharUnicodeInfo.GetUnicodeCategory((char)codepoint) <= UnicodeCategory.NonSpacingMark; + } + + // Is codepoint a character per XML 1.0 spec (2.2)? 
+ static bool IsXml10Char(int codepoint) + { + // .NET is using a lookup table with properties + if (codepoint <= 0xFFFF) + return XmlConvert.IsXmlChar((char)codepoint); + + return codepoint <= 0x10FFFF; + } + } + + private Token T(TokenType type) + { + return new Token(type, _input.Substring(_start, _i - _start)); + } + + private int Advance() + { + if (++_i >= _input.Length) + { + _c = -1; + return (char)_c; + } + + var c = _input[_i]; + + if (char.IsLowSurrogate(c)) + throw ParsingException.UnpairedSurrogate(c, _i); + + if (char.IsHighSurrogate(c)) + { + if (_i + 1 >= _input.Length) + throw ParsingException.UnpairedSurrogate(c, _i); + + var low = _input[++_i]; + if (!char.IsLowSurrogate(low)) + throw ParsingException.UnpairedSurrogate(c, _i - 1); + + _c = char.ConvertToUtf32(c, low); + return _c; + } + + return _c = c; + } +} diff --git a/src/ClosedXML.Parser/Pratt/Token.cs b/src/ClosedXML.Parser/Pratt/Token.cs new file mode 100644 index 0000000..0717b44 --- /dev/null +++ b/src/ClosedXML.Parser/Pratt/Token.cs @@ -0,0 +1,14 @@ +namespace ClosedXML.Parser.Pratt; + +internal readonly struct Token +{ + public Token(TokenType type, string text) + { + Type = type; + Text = text; + } + + public TokenType Type { get; } + + public string Text { get; } +} \ No newline at end of file diff --git a/src/ClosedXML.Parser/Pratt/TokenType.cs b/src/ClosedXML.Parser/Pratt/TokenType.cs new file mode 100644 index 0000000..bbdbad5 --- /dev/null +++ b/src/ClosedXML.Parser/Pratt/TokenType.cs @@ -0,0 +1,137 @@ +namespace ClosedXML.Parser.Pratt; + +/// +/// A token types for a lexer. +/// +internal enum TokenType +{ + /// + /// An identifier in a formula. In most generic form, it's a name. + /// Essentially a text that doesn't start with a number without a whitespace. 
+ /// Includes the following rules from ABNF: + /// + /// A1-column, A1-row, A1-cell + /// name + /// logical-constant + /// sheet-name + /// + /// + /// + /// + /// Excel doesn't have clear distinction between identifiers and keywords or other things. + /// Example: LOG10 could be an A1 reference name (column LOG, row 10), + /// a function (LOG10(14)) or sheet name (LOG10!A1). To determine it, parser + /// needs a context. And context isn't available in the lexer. + /// + /// + /// Unlike A1, R1C1 has to be recognized in a parser. The minus sign along with square + /// brackets in a R[-1]C[-1] is a deal-breaker. + /// + /// + Ident, // TODO: sheet-name rule description is obviously not true, needs to be checked manually + + /// + /// A floating point number. The textual representation isn't limited to the maximum + /// precision of the IEEE 754 standard. + /// + Number, + + /// + /// A text inside double quotes. Double quotes are escaped by doubling. + /// + Text, + + /// + /// Error literal, e.g. #N/A. + /// + Error, + + /// + /// A token representing text between two single quotes. Single quotes are escaped by doubling. + /// + /// 'Jane''s' - sheet names with escaped character + /// 'New York' - sheet names with spaces + /// 'January 1st:December 31st' - 3D references of sheets with spaces + /// '[7]Year 20:Year 25' - 3D references to external workbook. + /// '[Book.xlsx]Year 20:Year 25' - 3D references to external workbook. + /// + /// + /// + /// The ABNF says + /// + /// sheet-name-special = sheet-name-base-character [*sheet-name-character-special sheet-name-base-character] + /// sheet-name-character-special = 2apostrophe / sheet-name-base-character + /// sheet-name-base-character = character; MUST NOT be ', *, [, ], \, :, /, ?, or Unicode character 'END OF TEXT' + /// character = as defined by the production Char in the [W3C-XML] section 2.2 + /// + /// but we accept everything in lexer. 
The [ and ] must be part of it due + /// to workbook index or * could be a valid name of a workbook. Since it has to be + /// filtered in the parser anyway, don't burden the lexer. + /// + QIdent, + + /// + /// A span of content inside square brackets. The token inside brackets includes escaped + /// brackets and structure reference keywords. + /// + /// + /// + /// There is a problem with it being either book + /// reference or structure reference. In addition, we might want to parse names of book files + /// in the future. Lexer should be doable through DFA and this is really hard, so just detect + /// token and leave decision to the parser. Nested square brackets are not allowed (must be + /// escaped), so there are at most two levels of nested brackets ([[#Header],[#Data]]), + /// which is doable by DFA (unlimited nesting isn't). + /// + /// + /// Examples: + /// + /// [1] - either structure reference to a column '1' or book index. + /// [] - structure reference to a whole table (from first to last). + /// ['[] - structure reference to a column '['. + /// [Book1.xlsx] - book reference, not part of official grammar. + /// [#Data] - structure reference to data portion of a table + /// [[#Data]] - Nested reference + /// ['#] - structure reference to a column '#' + /// + /// + /// + SquareIdent, + + /// + /// Bang !. It is used in sheet reference, bang names and bang references. 
+ /// + Bang, + + // Operators + Comma, + Semicolon, + Pow, + Mul, + Div, + Plus, + Minus, + Concat, + Equal, + NotEqual, + Less, + LessEqual, + Greater, + GreaterEqual, + Percent, + + // Reference operators + Range, + Spill, + Intersection, + + // Misc + LeftParen, + RightParen, + LeftCurly, + RightCurly, + + // Might contain even space, which is intersection operator, but depends on context + Whitespace, + Eof, +} From 5cb3225e80ad7b73d49729023e48e580867f0189 Mon Sep 17 00:00:00 2001 From: jahav Date: Sun, 19 Oct 2025 01:23:30 +0200 Subject: [PATCH 2/3] Avoid useless allocation Token doesn't need to allocate a substring; it can just store indices and combine them with the input. Due to the queue, it can't be turned into a ref struct. --- src/ClosedXML.Parser.Tests/Lexers/LexerTests.cs | 4 ++-- src/ClosedXML.Parser/Pratt/Lexer.cs | 4 ++-- src/ClosedXML.Parser/Pratt/Token.cs | 15 +++++++++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/ClosedXML.Parser.Tests/Lexers/LexerTests.cs b/src/ClosedXML.Parser.Tests/Lexers/LexerTests.cs index 2eaa1f2..6987c33 100644 --- a/src/ClosedXML.Parser.Tests/Lexers/LexerTests.cs +++ b/src/ClosedXML.Parser.Tests/Lexers/LexerTests.cs @@ -316,8 +316,8 @@ private static void AssertToken(TokenType type, string input) { var lexer = new Lexer(input); var token = lexer.Consume(); - Assert.Equal(token.Type, type); - Assert.Equal(token.Text, input); + Assert.Equal(type, token.Type); + Assert.Equal(input, token.GetText(input).ToString()); } private static void AssertFail(string input, string exceptionSubstring) diff --git a/src/ClosedXML.Parser/Pratt/Lexer.cs b/src/ClosedXML.Parser/Pratt/Lexer.cs index cc35421..2122088 100644 --- a/src/ClosedXML.Parser/Pratt/Lexer.cs +++ b/src/ClosedXML.Parser/Pratt/Lexer.cs @@ -64,7 +64,7 @@ private Token Next() Advance(); if (IsEof) - return new Token(TokenType.Eof, string.Empty); + return new Token(TokenType.Eof, 0, 0); _start = _i; @@ -412,7 +412,7 @@ static bool IsXml10Char(int codepoint) 
private Token T(TokenType type) { - return new Token(type, _input.Substring(_start, _i - _start)); + return new Token(type, _start, _i); } private int Advance() diff --git a/src/ClosedXML.Parser/Pratt/Token.cs b/src/ClosedXML.Parser/Pratt/Token.cs index 0717b44..1166cff 100644 --- a/src/ClosedXML.Parser/Pratt/Token.cs +++ b/src/ClosedXML.Parser/Pratt/Token.cs @@ -1,14 +1,21 @@ -namespace ClosedXML.Parser.Pratt; +using System; + +namespace ClosedXML.Parser.Pratt; internal readonly struct Token { - public Token(TokenType type, string text) + private readonly SymbolRange _text; + + public Token(TokenType type, int start, int end) { Type = type; - Text = text; + _text = new SymbolRange(start, end); } public TokenType Type { get; } - public string Text { get; } + public ReadOnlySpan GetText(string input) + { + return input.AsSpan(_text.Start, _text.Length); + } } \ No newline at end of file From 5c22f210e37735893c971c9b809451341a8cca7b Mon Sep 17 00:00:00 2001 From: jahav Date: Sun, 19 Oct 2025 20:48:44 +0200 Subject: [PATCH 3/3] Add info to token types --- src/ClosedXML.Parser/Pratt/TokenType.cs | 96 ++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/src/ClosedXML.Parser/Pratt/TokenType.cs b/src/ClosedXML.Parser/Pratt/TokenType.cs index bbdbad5..c0d6355 100644 --- a/src/ClosedXML.Parser/Pratt/TokenType.cs +++ b/src/ClosedXML.Parser/Pratt/TokenType.cs @@ -104,34 +104,124 @@ internal enum TokenType Bang, // Operators + /// + /// , - argument separator in function call, range union operator, or separator of + /// values in a row for an array literal. + /// Comma, + + /// + /// ; - separator of rows in array literal. + /// Semicolon, + + /// + /// ^ - power operator. + /// Pow, + + /// + /// * - multiplication operator. + /// Mul, + + /// + /// / - division operator. + /// Div, + + /// + /// + - prefix or binary plus operator. + /// Plus, + + /// + /// - - prefix or binary minus operator. 
+ /// Minus, + + /// + /// & - text concatenation operator. + /// Concat, + + /// + /// = equal comparison operator. + /// Equal, + + /// + /// <> not equals comparison operator. + /// NotEqual, + + /// + /// < less than comparison operator. + /// Less, + + /// + /// <= less than or equal comparison operator. + /// LessEqual, + + /// + /// > greater than comparison operator. + /// Greater, + + /// + /// >= greater than or equal comparison operator. + /// GreaterEqual, + + /// + /// % postfix operator. + /// Percent, - // Reference operators + /// + /// : - range of two references. + /// Range, + + /// + /// # - postfix reference operator. + /// Spill, + + /// + /// @ - implicit intersection of reference. + /// Intersection, - // Misc + /// + /// ( - a nested group operator or opening parenthesis of a function call. + /// LeftParen, + + /// + /// ) - a nested group operator or closing parenthesis of a function call. + /// RightParen, + + /// + /// { - opening token of array literal. + /// LeftCurly, + + /// + /// } - closing token of array literal. + /// RightCurly, - // Might contain even space, which is intersection operator, but depends on context + /// + /// - binary intersection operator or whitespace that will be ignored by parser. + /// Whitespace, + + /// + /// End of file. + /// Eof, }