diff --git a/src/ClosedXML.Parser.Tests/Lexers/LexerTests.cs b/src/ClosedXML.Parser.Tests/Lexers/LexerTests.cs
new file mode 100644
index 0000000..6987c33
--- /dev/null
+++ b/src/ClosedXML.Parser.Tests/Lexers/LexerTests.cs
@@ -0,0 +1,330 @@
+using ClosedXML.Parser.Pratt;
+
+namespace ClosedXML.Parser.Tests.Lexers;
+
+public class LexerTests
+{
+    // ( [0-9] )+
+    [InlineData("0")]
+    [InlineData("1")]
+    [InlineData("90")]
+    [InlineData("00050")]
+
+    // ( [0-9] )+ '.' ( [0-9] )+
+    [InlineData("0.0")]
+    [InlineData("1.2")]
+    [InlineData("0010.0020")]
+    [InlineData("999.99")]
+
+    // '.' ( [0-9] )+
+    [InlineData(".0")]
+    [InlineData(".1")]
+    [InlineData(".0001")]
+    [InlineData(".987")]
+
+    // ( [0-9] )+ [Ee] ( [0-9] )+
+    [InlineData("0e0")]
+    [InlineData("0E0")]
+    [InlineData("1e2")]
+    [InlineData("1E2")]
+    [InlineData("987e12")]
+
+    // ( [0-9] )+ '.' ( [0-9] )+ [Ee] ( [0-9] )+
+    [InlineData("0.0e4")]
+    [InlineData("12.724e13")]
+    [InlineData("12.3E2")]
+
+    // '.' ( [0-9] )+ [Ee] ( [0-9] )+
+    [InlineData(".0e0")]
+    [InlineData(".1e2")]
+    [InlineData(".987e54")]
+
+    // ( [0-9] )+ [Ee] [+-] ( [0-9] )+
+    [InlineData("1e+7")]
+    [InlineData("74e-32")]
+    [InlineData("15E-0")]
+    [InlineData("0e+0")]
+    [InlineData("01e+7")]
+
+    // ( [0-9] )+ '.' ( [0-9] )+ [Ee] [+-] ( [0-9] )+
+    [InlineData("0.0e+0")]
+    [InlineData("1.2e+3")]
+    [InlineData("01.2e+3")]
+    [InlineData("1.2E+3")]
+    [InlineData("12.34e+56")]
+
+    // '.' ( [0-9] )+ [Ee] [+-] ( [0-9] )+
+    [InlineData(".0e+0")]
+    [InlineData(".1e+2")]
+    [InlineData(".12E+34")]
+    [InlineData(".012e+034")]
+    [Theory]
+    public void Number_ok(string input)
+    {
+        AssertToken(TokenType.Number, input);
+    }
+
+    [InlineData("0e+")]
+    [InlineData(".0e+")]
+    [Theory]
+    public void Number_fails(string input)
+    {
+        AssertFail(input, "Number");
+    }
+
+    [InlineData("\"\"")]
+    [InlineData("\"Some text\"")]
+    [InlineData("\"Some \"\" text\"")]
+    [InlineData("\"\uD83E\uDD8A\"")] // Fox face through surrogates
+    [Theory]
+    public void Text_ok(string input)
+    {
+        AssertToken(TokenType.Text, input);
+    }
+
+    [InlineData("\"")]
+    [InlineData("\"text")]
+    [InlineData("\"text\"\"")]
+    [InlineData("\"Some \"\" text")]
+    [Theory]
+    public void Text_must_be_terminated(string input)
+    {
+        AssertFail(input, "unterminated literal");
+    }
+
+    [InlineData("\"\u0015\"")]
+    [Theory]
+    public void Text_must_be_contain_xml_10_characters(string input)
+    {
+        AssertFail(input, "Invalid text character");
+    }
+
+    [InlineData("#DIV/0!")]
+    [InlineData("#GETTING_DATA")]
+    [InlineData("#N/A")]
+    [InlineData("#NAME?")]
+    [InlineData("#NULL!")]
+    [InlineData("#NUM!")]
+    [InlineData("#REF!")]
+    [InlineData("#VALUE!")]
+    [InlineData("#ref!")]
+    [Theory]
+    public void Error_ok(string input)
+    {
+        AssertToken(TokenType.Error, input);
+    }
+
+    [Fact]
+    public void Lexer_throws_on_unpaired_surrogates()
+    {
+        // Kept as a Fact with a loop: either Visual Studio or the test runner converts invalid surrogates in test data to -1/65536.
+        var invalidCodeUnits = new[]
+        {
+            "\uD83E", // Unpaired high surrogate for Fox Face
+            "\uD83E*", // Unpaired high surrogate for Fox Face
+            "\uDD8A", // Unpaired low surrogate for Fox Face
+            "\uDD8A*\"", // Unpaired low surrogate for Fox Face
+            "\uDD8A\uD83E", // Low surrogate first
+        };
+        foreach (var invalidText in invalidCodeUnits)
+        {
+            AssertFail(invalidText, "surrogate");
+        }
+    }
+
+    [InlineData("''")]
+    [InlineData("'[1]Something'")]
+    [InlineData("'Jane''s'")]
+    [InlineData("'New York'")]
+    [InlineData("'January 1st:December 31st'")]
+    [InlineData("'[7]Year 20:Year 25'")]
+    [InlineData("'[Book.xlsx]Year 20:Year 25'")]
+    [InlineData("'[End*Near.xlsx]Final'")]
+    [InlineData("''''''")]
+    [Theory]
+    public void QIdent_ok(string input)
+    {
+        AssertToken(TokenType.QIdent, input);
+    }
+
+    [InlineData("'")]
+    [InlineData("'Jane''s")]
+    [InlineData("'''''")]
+    [Theory]
+    public void QIdent_must_be_terminated(string input)
+    {
+        AssertFail(input, "unterminated literal");
+    }
+
+    [InlineData("ABC")]
+    [InlineData("A1")]
+    [InlineData("$A$1")]
+    [InlineData("AEF$A$1")]
+    [InlineData("name")]
+    [InlineData("TRUE")]
+    [InlineData("FALSE")]
+    [InlineData("true")]
+    [InlineData("false")]
+    [InlineData("?name")]
+    [InlineData("\\name")]
+    [InlineData("_name")]
+    [InlineData("name?")]
+    [InlineData("name\\")]
+    [InlineData("name_")]
+    [InlineData("some.name")]
+    [InlineData("_xlfn.ACOT")]
+    [InlineData("\u05D0\u05D1\u05E0")] // stone in hebrew - Letters from other languages
+    [InlineData("\u05E9\u05B0\u05DC\u05D5\u05DD")] // shalom - A mark from other languages
+    [Theory]
+    public void Ident_ok(string input)
+    {
+        AssertToken(TokenType.Ident, input);
+    }
+
+    [Fact]
+    public void Ident_stops_at_operators()
+    {
+        var operators = new Dictionary<TokenType, string> // expected token type -> operator text appended after "name"
+        {
+            { TokenType.Bang, "!" },
+            { TokenType.Comma, "," },
+            { TokenType.Semicolon, ";" },
+            { TokenType.Pow, "^" },
+            { TokenType.Mul, "*" },
+            { TokenType.Div, "/" },
+            { TokenType.Plus, "+" },
+            { TokenType.Minus, "-" },
+            { TokenType.Concat, "&" },
+            { TokenType.Equal, "=" },
+            { TokenType.NotEqual, "<>" },
+            { TokenType.Less, "<" },
+            { TokenType.LessEqual, "<=" },
+            { TokenType.Greater, ">" },
+            { TokenType.GreaterEqual, ">=" },
+            { TokenType.Percent, "%" },
+            { TokenType.Range, ":" },
+            { TokenType.Spill, "#" },
+            { TokenType.Intersection, "@" },
+            { TokenType.LeftParen, "(" },
+            { TokenType.RightParen, ")" },
+            { TokenType.LeftCurly, "{" },
+            { TokenType.RightCurly, "}" },
+            { TokenType.Whitespace, " " },
+        };
+
+        foreach (var (opType, opText) in operators)
+        {
+            var input = "name" + opText;
+            var lexer = new Lexer(input);
+
+            var identToken = lexer.Consume();
+            Assert.Equal(TokenType.Ident, identToken.Type);
+            Assert.Equal("name", identToken.GetText(input).ToString());
+
+            var opToken = lexer.Consume();
+            Assert.Equal(opType, opToken.Type);
+            Assert.Equal(opText, opToken.GetText(input).ToString());
+        }
+    }
+
+    [InlineData("[1]")]
+    [InlineData("[]")]
+    [InlineData("['[]")]
+    [InlineData("[Book1.xlsx]")]
+    [InlineData("[#Data]")]
+    [InlineData("[[#Data]]")]
+    [InlineData("[[#Data],[#Headers]]")]
+    [InlineData("['#]")]
+    [InlineData("[985]")]
+    [InlineData("[Jan:Dec]")]
+    [InlineData("['['['[]")]
+    [InlineData("[']']']]")]
+    [Theory]
+    public void SquareIdent_ok(string input)
+    {
+        AssertToken(TokenType.SquareIdent, input);
+    }
+
+    [InlineData("[Ja[[a]]]")]
+    [Theory]
+    public void SquareIdent_at_most_two_nested_brackets(string input)
+    {
+        // The nesting limit mostly exists to keep the token within what a DFA can recognize.
+        AssertFail(input, "at most two nested square brackets");
+    }
+
+    [InlineData("[")]
+    [InlineData("[text")]
+    [InlineData("[[")]
+    [InlineData("[a[b")]
+    [InlineData("[Start[]and end")]
+    [InlineData("[Start[']and end']")]
+    [Theory]
+    public void SquareIdent_must_be_paired(string input)
+    {
+        AssertFail(input, "Unable to find closing square bracket");
+    }
+
+    [InlineData((int)TokenType.Bang, "!")]
+    [InlineData((int)TokenType.Range, ":")]
+    [InlineData((int)TokenType.Comma, ",")]
+    [InlineData((int)TokenType.Semicolon, ";")]
+    [InlineData((int)TokenType.Pow, "^")]
+    [InlineData((int)TokenType.Mul, "*")]
+    [InlineData((int)TokenType.Div, "/")]
+    [InlineData((int)TokenType.Plus, "+")]
+    [InlineData((int)TokenType.Minus, "-")]
+    [InlineData((int)TokenType.Concat, "&")]
+    [InlineData((int)TokenType.Percent, "%")]
+    [InlineData((int)TokenType.Spill, "#")]
+    [InlineData((int)TokenType.Intersection, "@")]
+    [InlineData((int)TokenType.LeftParen, "(")]
+    [InlineData((int)TokenType.RightParen, ")")]
+    [InlineData((int)TokenType.LeftCurly, "{")]
+    [InlineData((int)TokenType.RightCurly, "}")]
+    [Theory]
+    public void Single_char_tokens_ok(int token, string input)
+    {
+        // TODO: Dump xUnit. Can't even use internal classes as test fixtures, so I have to pass enum as int.
+        AssertToken((TokenType)token, input);
+    }
+
+    [InlineData((int)TokenType.Equal, "=")]
+    [InlineData((int)TokenType.NotEqual, "<>")]
+    [InlineData((int)TokenType.Less, "<")]
+    [InlineData((int)TokenType.LessEqual, "<=")]
+    [InlineData((int)TokenType.Greater, ">")]
+    [InlineData((int)TokenType.GreaterEqual, ">=")]
+    [Theory]
+    public void Comparison_tokens_ok(int token, string input)
+    {
+        AssertToken((TokenType)token, input);
+    }
+
+    [InlineData("\t")]
+    [InlineData("\n")]
+    [InlineData("\r")]
+    [InlineData(" ")]
+    [InlineData("\t \r\n")]
+    [Theory]
+    public void Whitespace_ok(string input)
+    {
+        AssertToken(TokenType.Whitespace, input);
+    }
+
+    private static void AssertToken(TokenType type, string input) // the first token must have the given type and span the whole input
+    {
+        var lexer = new Lexer(input);
+        var token = lexer.Consume();
+        Assert.Equal(type, token.Type);
+        Assert.Equal(input, token.GetText(input).ToString());
+    }
+
+    private static void AssertFail(string input, string exceptionSubstring) // the first Consume() must throw with a message containing the substring
+    {
+        var lexer = new Lexer(input);
+        var exception = Assert.Throws<ParsingException>(() => lexer.Consume());
+        Assert.NotNull(exception);
+        Assert.True(exception.Message.Contains(exceptionSubstring), $"Expected to find '{exceptionSubstring}', but not found in '{exception.Message}'.");
+    }
+}
diff --git a/src/ClosedXML.Parser/ParsingException.cs b/src/ClosedXML.Parser/ParsingException.cs
index 9a70536..c4683e0 100644
--- a/src/ClosedXML.Parser/ParsingException.cs
+++ b/src/ClosedXML.Parser/ParsingException.cs
@@ -1,4 +1,5 @@
 using System;
+using ClosedXML.Parser.Pratt;
 
 namespace ClosedXML.Parser;
 
@@ -10,4 +11,36 @@ public class ParsingException : Exception
     internal ParsingException(string message) : base(message)
     {
     }
-}
\ No newline at end of file
+
+    ///
+    /// There are problems with underlying stream.
+    ///
+    internal static ParsingException UnpairedSurrogate(int codepoint, int position)
+    {
+        return new ParsingException($"Found an unpaired surrogate 0x{codepoint:X4} at {position}."); // returned, not thrown: the caller writes `throw ParsingException.X(...)`
+    }
+
+    ///
+    /// Token has a start and end delimiter, but no end delimiter was found. Returns the exception for the caller to throw.
+    ///
+    internal static ParsingException UnterminatedLiteral(int start, char delimiter)
+    {
+        return new ParsingException($"An unterminated literal (delimiter {delimiter}) found at position {start}.");
+    }
+
+    ///
+    /// Parsing of a token has started, but the token is not complete or there is a problem with it. Returns the exception for the caller to throw.
+    ///
+    internal static ParsingException TokenPartialMatch(int start, TokenType type)
+    {
+        return new ParsingException($"Token {type} was parsed from position {start}, but was only partially matched.");
+    }
+
+    ///
+    /// Lexer has no idea which token it should start to parse. Returns the exception for the caller to throw.
+    ///
+    internal static ParsingException UnableToSelectToken(int start)
+    {
+        return new ParsingException($"Unable to determine a token at position {start}.");
+    }
+}
diff --git a/src/ClosedXML.Parser/Pratt/Lexer.cs b/src/ClosedXML.Parser/Pratt/Lexer.cs
new file mode 100644
index 0000000..2122088
--- /dev/null
+++ b/src/ClosedXML.Parser/Pratt/Lexer.cs
@@ -0,0 +1,446 @@
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.Xml;
+
+namespace ClosedXML.Parser.Pratt;
+
+///
+/// A lexer for pratt parser.
+///
+internal class Lexer
+{
+    private const string OPERATOR_CHARS = "!,;^*/+-&=<>%:#@(){} ";
+    private const int EOF = -1;
+
+    private static readonly bool[] IsOp; // IsOp[c] is true for every ASCII char listed in OPERATOR_CHARS
+
+    private readonly Queue<Token> _queue = new(4); // tokens produced by Peek() and not yet consumed
+    private readonly string _input;
+
+    private int _start; // The start index of currently parsed token in Next()
+    private int _i; // Index of the first code unit of the current code point _c in _input
+    private int _c; // A current code point (including astral planes) or -1 if at the EOF
+
+
+    static Lexer()
+    {
+        IsOp = new bool[128];
+        foreach (var op in OPERATOR_CHARS)
+            IsOp[op] = true;
+    }
+
+    ///
+    /// Create a new instance of a lexer.
+    ///
+    /// Formula to tokenize.
+    public Lexer(string input)
+    {
+        _input = input ?? throw new ArgumentNullException(nameof(input));
+        _i = -1; // nothing read yet; Next() performs the first Advance()
+    }
+
+    private bool IsEof => _c == EOF;
+
+    public Token Consume() // returns the next token and removes it from the stream
+    {
+        if (_queue.Count == 0)
+            return Next();
+
+        return _queue.Dequeue();
+    }
+
+    public Token Peek() // returns the next token without consuming it
+    {
+        if (_queue.Count == 0)
+            _queue.Enqueue(Next());
+
+        return _queue.Peek();
+    }
+
+    private Token Next() // lexes one token starting at the current code point; throws ParsingException on malformed input
+    {
+        if (_i < 0)
+            Advance();
+
+        if (IsEof)
+            return new Token(TokenType.Eof, 0, 0);
+
+        _start = _i;
+
+        // Number
+        if (IsDigit(_c))
+        {
+            // Whole number part
+            DigitSequence();
+
+            // Fractional part
+            if (_c == '.')
+            {
+                Advance();
+                DigitSequence();
+            }
+
+            ExponentPart();
+
+            return T(TokenType.Number);
+        }
+        if (_c is '.')
+        {
+            Advance();
+
+            // Fractional part
+            DigitSequence();
+            ExponentPart();
+
+            return T(TokenType.Number);
+        }
+
+        // Text
+        if (_c == '"')
+        {
+            Advance();
+
+            while (!IsEof)
+            {
+                if (_c == '"')
+                {
+                    Advance();
+                    if (_c != '"') // a doubled quote is an escaped quote, a single one ends the text
+                        return T(TokenType.Text);
+                }
+
+                if (!IsXml10Char(_c))
+                    throw new ParsingException($"Invalid text character (codepoint {_c:x8}).");
+
+                Advance();
+            }
+
+            throw ParsingException.UnterminatedLiteral(_start, '"');
+        }
+
+        // QIdent
+        if (_c == '\'')
+        {
+            while (!IsEof)
+            {
+                Advance();
+
+                if (_c == '\'')
+                {
+                    Advance();
+                    if (_c != '\'') // a doubled apostrophe is an escaped apostrophe, a single one ends the token
+                        return T(TokenType.QIdent);
+                }
+            }
+
+            throw ParsingException.UnterminatedLiteral(_start, '\'');
+        }
+
+        if (IsIdentStart(_c))
+        {
+            Advance();
+            while (!IsEof && IsIdentNext(_c))
+                Advance();
+
+            return T(TokenType.Ident);
+        }
+
+        if (_c == '+')
+            return FoundToken(TokenType.Plus);
+
+        if (_c == '-')
+            return FoundToken(TokenType.Minus);
+
+        if (_c == '*')
+            return FoundToken(TokenType.Mul);
+
+        if (_c == '/')
+            return FoundToken(TokenType.Div);
+
+        if (_c == '^')
+            return FoundToken(TokenType.Pow);
+
+        if (_c == '%')
+            return FoundToken(TokenType.Percent);
+
+        if (_c == '&')
+            return FoundToken(TokenType.Concat);
+
+        if (_c == '!')
+            return FoundToken(TokenType.Bang);
+
+        if (_c == '(')
+            return FoundToken(TokenType.LeftParen);
+
+        if (_c == ')')
+            return FoundToken(TokenType.RightParen);
+
+        if (_c == '{')
+            return FoundToken(TokenType.LeftCurly);
+
+        if (_c == '}')
+            return FoundToken(TokenType.RightCurly);
+
+        if (_c == ',')
+            return FoundToken(TokenType.Comma);
+
+        if (_c == ';')
+            return FoundToken(TokenType.Semicolon);
+
+        if (_c == ':')
+            return FoundToken(TokenType.Range);
+
+        if (_c == '@')
+            return FoundToken(TokenType.Intersection);
+
+        // Comparison
+        if (_c == '=')
+            return FoundToken(TokenType.Equal);
+
+        if (_c == '<')
+        {
+            var next = Advance();
+            if (next == '>')
+                return FoundToken(TokenType.NotEqual);
+
+            if (next == '=')
+                return FoundToken(TokenType.LessEqual);
+
+            return T(TokenType.Less);
+        }
+
+        if (_c == '>')
+        {
+            if (Advance() == '=')
+            {
+                return FoundToken(TokenType.GreaterEqual);
+            }
+
+            return T(TokenType.Greater);
+        }
+
+        if (IsWhitespace(_c))
+        {
+            do
+            {
+                Advance();
+            } while (IsWhitespace(_c));
+
+            return T(TokenType.Whitespace);
+        }
+
+        // Spill operator and errors
+        if (_c == '#')
+        {
+            var char1 = Advance();
+            switch (char1)
+            {
+                case 'D' or 'd':
+                    return Error("#DIV/0!", 2);
+                case 'R' or 'r':
+                    return Error("#REF!", 2);
+                case 'V' or 'v':
+                    return Error("#VALUE!", 2);
+                case 'G' or 'g':
+                    return Error("#GETTING_DATA", 2);
+                case 'N' or 'n':
+                    {
+                        var char2 = ToUpperAlpha(Advance()); // upper-cased so #name?, #null! and #num! match like the other error literals
+                        if (char2 == '/')
+                            return Error("#N/A", 3);
+
+                        if (char2 == 'A')
+                            return Error("#NAME?", 3);
+
+                        var char3 = ToUpperAlpha(Advance());
+                        if (char2 == 'U' && char3 == 'L')
+                            return Error("#NULL!", 4);
+
+                        if (char2 == 'U' && char3 == 'M')
+                            return Error("#NUM!", 4);
+
+                        throw ParsingException.TokenPartialMatch(_start, TokenType.Error);
+                    }
+            }
+
+            return T(TokenType.Spill);
+        }
+
+        if (_c == '[')
+        {
+            var level = 0;
+            do
+            {
+                switch (_c)
+                {
+                    case '[':
+                        ++level;
+                        break;
+                    case ']':
+                        --level;
+                        break;
+                    case '\'':
+                        Advance(); // Escaped chars don't change level - skip
+                        break;
+                }
+
+                if (IsEof)
+                    throw new ParsingException($"Unable to find closing square bracket for token from position {_start}.");
+
+                if (level > 2)
+                    throw new ParsingException($"There can be at most two nested square brackets in a token from position {_start}.");
+
+                Advance();
+            } while (level > 0);
+
+            return T(TokenType.SquareIdent);
+        }
+
+        throw ParsingException.UnableToSelectToken(_start);
+
+        static bool IsWhitespace(int c)
+        {
+            return c is ' ' or '\r' or '\n' or '\t';
+        }
+
+        static bool IsDigit(int c)
+        {
+            return c is >= '0' and <= '9';
+        }
+
+        // Consume [0-9]+, throw if there is not at least one digit
+        void DigitSequence()
+        {
+            do
+            {
+                if (!IsDigit(_c))
+                    throw ParsingException.TokenPartialMatch(_start, TokenType.Number);
+                Advance();
+            }
+            while (!IsEof && IsDigit(_c));
+        }
+
+        void ExponentPart() // optional: [Ee] [+-]? [0-9]+
+        {
+            if (_c is 'e' or 'E')
+            {
+                if (Advance() is '+' or '-')
+                    Advance();
+
+                DigitSequence();
+            }
+        }
+
+        static bool IsIdentStart(int c)
+        {
+            // Ident must satisfy logical-literal, sheet-name, name and A1-cell/column/row
+            return
+                IsAsciiLetter(c) || // name + A1
+                c == '$' || // A1
+                (c is '_' or '\\' or '?') || // name
+                (c > 0x7F && IsLetterOrLetterMark(c)); // name
+        }
+
+        static bool IsIdentNext(int c)
+        {
+            // Stop at operators
+            if (c < IsOp.Length && IsOp[c])
+                return false;
+
+            return IsIdentStart(c) ||
+                   c is >= '0' and <= '9' || // name, A1
+                   c == '.'; // name + future-functions
+        }
+
+        Token Error(string error, int start) // matches error[start..] case-insensitively against the input that follows the current code point
+        {
+            foreach (var errorChar in error.AsSpan().Slice(start))
+            {
+                Advance();
+                if (ToUpperAlpha(_c) != errorChar)
+                    throw ParsingException.TokenPartialMatch(_start, TokenType.Error);
+            }
+
+            Advance();
+            return T(TokenType.Error);
+        }
+
+        // Token that ends at the current code point has been found. Advance to next and return token.
+        Token FoundToken(TokenType type)
+        {
+            Advance();
+            return T(type);
+        }
+
+        // Convert a-z to A-Z, keep other codepoints same.
+        static int ToUpperAlpha(int codepoint)
+        {
+            return codepoint is >= 'a' and <= 'z'
+                ? 'A' + codepoint - 'a'
+                : codepoint;
+        }
+
+        static bool IsAsciiLetter(int codepoint)
+        {
+            // Convert to lowercase, normalize 'a' to 0, and check if within 0 (~A)..25(~Z).
+            // Really cool use of cast int to uint (-1 to 0xFFFFFFFF), thus saving one comparison
+            // and avoiding potential pipeline stall.
+            return (uint)((codepoint | 32) - 97) <= 25U;
+        }
+
+        static bool IsLetterOrLetterMark(int codepoint)
+        {
+            // TODO: Only netstandard 2.1 has a parameter of type int, 2.0 has only char.
+            if (codepoint > 0xFFFF)
+                return false; // No letters from astral planes for us :(
+
+            // Letters are categories from 0 to OtherLetter category. Then there are NonSpacingMark (accents and such).
+            return CharUnicodeInfo.GetUnicodeCategory((char)codepoint) <= UnicodeCategory.NonSpacingMark;
+        }
+
+        // Is codepoint a character per XML 1.0 spec (2.2)?
+        static bool IsXml10Char(int codepoint)
+        {
+            // .NET is using a lookup table with properties
+            if (codepoint <= 0xFFFF)
+                return XmlConvert.IsXmlChar((char)codepoint);
+
+            return codepoint <= 0x10FFFF;
+        }
+    }
+
+    private Token T(TokenType type) // token spanning from _start up to (not including) the current position
+    {
+        return new Token(type, _start, _i);
+    }
+
+    private int Advance() // moves to the next code point, stores it in _c and returns it; EOF (-1) past the end
+    {
+        if ((_i += _c > 0xFFFF ? 2 : 1) >= _input.Length) // an astral code point occupies two code units, step over both
+        {
+            _c = EOF;
+            return _c; // was (char)_c, which turned -1 into 0xFFFF
+        }
+
+        var c = _input[_i];
+
+        if (char.IsLowSurrogate(c))
+            throw ParsingException.UnpairedSurrogate(c, _i);
+
+        if (char.IsHighSurrogate(c))
+        {
+            if (_i + 1 >= _input.Length)
+                throw ParsingException.UnpairedSurrogate(c, _i);
+
+            var low = _input[_i + 1]; // _i stays on the high surrogate so token spans never split a pair
+            if (!char.IsLowSurrogate(low))
+                throw ParsingException.UnpairedSurrogate(c, _i);
+
+            _c = char.ConvertToUtf32(c, low);
+            return _c;
+        }
+
+        return _c = c;
+    }
+}
diff --git a/src/ClosedXML.Parser/Pratt/Token.cs b/src/ClosedXML.Parser/Pratt/Token.cs
new file mode 100644
index 0000000..1166cff
--- /dev/null
+++ b/src/ClosedXML.Parser/Pratt/Token.cs
@@ -0,0 +1,21 @@
+using System;
+
+namespace ClosedXML.Parser.Pratt;
+
+internal readonly struct Token
+{
+    private readonly SymbolRange _text;
+
+    public Token(TokenType type, int start, int end)
+    {
+        Type = type;
+        _text = new SymbolRange(start, end);
+    }
+
+    public TokenType Type { get; }
+
+    public ReadOnlySpan<char> GetText(string input) // slice of the formula text covered by this token
+    {
+        return input.AsSpan(_text.Start, _text.Length);
+    }
+}
\ No newline at end of file
diff --git a/src/ClosedXML.Parser/Pratt/TokenType.cs b/src/ClosedXML.Parser/Pratt/TokenType.cs
new file mode 100644
index 0000000..c0d6355
--- /dev/null
+++ b/src/ClosedXML.Parser/Pratt/TokenType.cs
@@ -0,0 +1,227 @@
+namespace ClosedXML.Parser.Pratt;
+
+///
+/// Token types for a lexer.
+///
+internal enum TokenType
+{
+    ///
+    /// An identifier in a formula. In most generic form, it's a name.
+    /// Essentially a text that doesn't start with a number without a whitespace.
+    /// Includes following rules from ABNF:
+    ///
+    /// A1-column, A1-row, A1-cell
+    /// name
+    /// logical-constant
+    /// sheet-name
+    ///
+    ///
+    ///
+    ///
+    /// Excel doesn't have clear distinction between identifiers and keywords or other things.
+    /// Example: LOG10 could be an A1 reference name (column LOG, row 10),
+    /// a function (LOG10(14)) or sheet name (LOG10!A1). To determine it, parser
+    /// needs a context. And context isn't available in the lexer.
+    ///
+    ///
+    /// Unlike A1, R1C1 has to be recognized in a parser. The minus sign along with square
+    /// brackets in a R[-1]C[-1] is a deal-breaker.
+    ///
+    ///
+    Ident, // TODO: sheet-name rule description is obviously not true, needs to be checked manually
+
+    ///
+    /// A floating point number. The textual representation isn't limited to the maximum
+    /// precision of IEEE 754 standard.
+    ///
+    Number,
+
+    ///
+    /// A text inside double quotes. Double quotes are escaped by doubling.
+    ///
+    Text,
+
+    ///
+    /// Error literal, e.g. #N/A.
+    ///
+    Error,
+
+    ///
+    /// A token representing text between two single quotes. Single quotes are escaped by doubling.
+    ///
+    /// 'Jane''s' - sheet names with escaped character
+    /// 'New York' - sheet names with spaces
+    /// 'January 1st:December 31st' - 3D references of sheets with spaces
+    /// '[7]Year 20:Year 25' - 3D references to external workbook.
+    /// '[Book.xlsx]Year 20:Year 25' - 3D references to external workbook.
+    ///
+    ///
+    ///
+    /// The ABNF says
+    ///
+    /// sheet-name-special = sheet-name-base-character [*sheet-name-character-special sheet-name-base-character]
+    /// sheet-name-character-special = 2apostrophe / sheet-name-base-character
+    /// sheet-name-base-character = character; MUST NOT be ', *, [, ], \, :, /, ?, or Unicode character 'END OF TEXT'
+    /// character = as defined by the production Char in the [W3C-XML] section 2.2
+    ///
+    /// but we accept everything in lexer. The [ and ] must be part of it due
+    /// to workbook index or * could be a valid name of a workbook. Since it has to be
+    /// filtered in the parser anyway, don't burden the lexer.
+    ///
+    QIdent,
+
+    ///
+    /// A span of content inside square brackets. The token inside brackets includes escaped
+    /// brackets and structure reference keywords.
+    ///
+    ///
+    ///
+    /// There is a problem with it being either book
+    /// reference or structure reference. In addition, we might want to parse names of book files
+    /// in the future. Lexer should be doable through DFA and this is really hard, so just detect
+    /// token and leave decision to the parser. Nested square brackets are not allowed (must be
+    /// escaped), so there are at most two level deep nested brackets ([[#Header],[#Data]]),
+    /// which is doable by DFA (unlimited nesting isn't).
+    ///
+    ///
+    /// Examples:
+    ///
+    /// [1] - either structure reference to a column '1' or book index.
+    /// [] - structure reference to a whole table (from first to last).
+    /// ['[] - structure reference to a column '['.
+    /// [Book1.xlsx] - book reference, not part of official grammar.
+    /// [#Data] - structure reference to data portion of a table
+    /// [[#Data]] - Nested reference
+    /// ['#] - structure reference to a column '#'
+    ///
+    ///
+    ///
+    SquareIdent,
+
+    ///
+    /// Bang !. It is used in sheet reference, bang names and bang references.
+    ///
+    Bang,
+
+    // Operators
+    ///
+    /// , - argument separator in function call, range union operator, or separator of
+    /// values in a row for an array literal.
+    ///
+    Comma,
+
+    ///
+    /// ; - separator of rows in array literal.
+    ///
+    Semicolon,
+
+    ///
+    /// ^ - power operator.
+    ///
+    Pow,
+
+    ///
+    /// * - multiplication operator.
+    ///
+    Mul,
+
+    ///
+    /// / - division operator.
+    ///
+    Div,
+
+    ///
+    /// + - prefix or binary plus operator.
+    ///
+    Plus,
+
+    ///
+    /// - - prefix or binary minus operator.
+    ///
+    Minus,
+
+    ///
+    /// & - text concatenation operator.
+    ///
+    Concat,
+
+    ///
+    /// = equal comparison operator.
+    ///
+    Equal,
+
+    ///
+    /// <> not equals comparison operator.
+    ///
+    NotEqual,
+
+    ///
+    /// < less than comparison operator.
+    ///
+    Less,
+
+    ///
+    /// <= less than or equal comparison operator.
+    ///
+    LessEqual,
+
+    ///
+    /// > greater than comparison operator.
+    ///
+    Greater,
+
+    ///
+    /// >= greater than or equal comparison operator.
+    ///
+    GreaterEqual,
+
+    ///
+    /// % postfix operator.
+    ///
+    Percent,
+
+    ///
+    /// : - range of two references.
+    ///
+    Range,
+
+    ///
+    /// # - postfix reference operator.
+    ///
+    Spill,
+
+    ///
+    /// @ - implicit intersection of reference.
+    ///
+    Intersection,
+
+    ///
+    /// ( - a nested group operator or opening parenthesis of a function call.
+    ///
+    LeftParen,
+
+    ///
+    /// ) - a nested group operator or closing parenthesis of a function call.
+    ///
+    RightParen,
+
+    ///
+    /// { - opening token of array literal.
+    ///
+    LeftCurly,
+
+    ///
+    /// } - closing token of array literal.
+    ///
+    RightCurly,
+
+    ///
+    /// Whitespace - binary intersection operator or whitespace that will be ignored by parser.
+    ///
+    Whitespace,
+
+    ///
+    /// End of file.
+    ///
+    Eof,
+}