diff --git a/src/ClosedXML.Parser.Ast/AstNode.cs b/src/ClosedXML.Parser.Ast/AstNode.cs
index e7ea1bc..7aad746 100644
--- a/src/ClosedXML.Parser.Ast/AstNode.cs
+++ b/src/ClosedXML.Parser.Ast/AstNode.cs
@@ -14,4 +14,7 @@ public abstract record AstNode
     public virtual bool Equals(AstNode? other) => other is not null && Children.SequenceEqual(other.Children);
 
-    public override int GetHashCode() => Children.Sum(child => child.GetHashCode());
-}
\ No newline at end of file
+    // Enumerable.Sum over int is overflow-checked, so summing arbitrary hash codes can throw
+    // OverflowException. Fold the child hashes with wrapping arithmetic instead.
+    public override int GetHashCode() =>
+        Children.Aggregate(0, (hash, child) => unchecked((hash * 31) + child.GetHashCode()));
+}
diff --git a/src/ClosedXML.Parser.Ast/BinaryNode.cs b/src/ClosedXML.Parser.Ast/BinaryNode.cs
index a8ad586..93c8834 100644
--- a/src/ClosedXML.Parser.Ast/BinaryNode.cs
+++ b/src/ClosedXML.Parser.Ast/BinaryNode.cs
@@ -24,8 +24,8 @@ public record BinaryNode(BinaryOperation Operation) : AstNode
     public BinaryNode(BinaryOperation operation, AstNode left, AstNode right)
         : this(operation)
     {
-        Children = new[] { left, right };
+        Children = [left, right];
     }
 
     public override string GetDisplayString(ReferenceStyle style) => OpNames[Operation];
-};
\ No newline at end of file
+}
diff --git a/src/ClosedXML.Parser.Ast/ValueNode.cs b/src/ClosedXML.Parser.Ast/ValueNode.cs
index 0a52355..026820c 100644
--- a/src/ClosedXML.Parser.Ast/ValueNode.cs
+++ b/src/ClosedXML.Parser.Ast/ValueNode.cs
@@ -1,4 +1,4 @@
-namespace ClosedXML.Parser;
+namespace ClosedXML.Parser;
 
 public record ValueNode(string Type, object Value) : AstNode
 {
@@ -11,4 +11,4 @@ public override string GetDisplayString(ReferenceStyle style)
     {
         return Value?.ToString() ?? "BLANK";
     }
-};
\ No newline at end of file
+}
diff --git a/src/ClosedXML.Parser.Tests/Lexers/PrattParserPrecedenceTests.cs b/src/ClosedXML.Parser.Tests/Lexers/PrattParserPrecedenceTests.cs
new file mode 100644
index 0000000..750d1d4
--- /dev/null
+++ b/src/ClosedXML.Parser.Tests/Lexers/PrattParserPrecedenceTests.cs
@@ -0,0 +1,85 @@
+using System.Diagnostics;
+using ClosedXML.Parser.Pratt;
+
+namespace ClosedXML.Parser.Tests.Lexers;
+
+public class PrattParserPrecedenceTests
+{
+    [Theory]
+    [InlineData("1+2+3+4", "(((1+2)+3)+4)")]
+    [InlineData("1-2-3-4", "(((1-2)-3)-4)")]
+    [InlineData("1-2+3-4+5", "((((1-2)+3)-4)+5)")]
+    [InlineData("1*2*3*4", "(((1*2)*3)*4)")]
+    [InlineData("1/2/3/4", "(((1/2)/3)/4)")]
+    [InlineData("1*2/3*4/5", "((((1*2)/3)*4)/5)")]
+    [InlineData("2^3^4^5", "(((2^3)^4)^5)")] // Even exponential is left-associative in Excel, contrary to standard convention
+    public void Operations_with_same_precedence_are_left_associative(string formula, string normalizedForm)
+    {
+        AssertSameFormulas(formula, normalizedForm);
+    }
+
+    [Theory]
+    [InlineData("1+(2+3+4)+((5+6)+7)", "((1+((2+3)+4))+((5+6)+7))")]
+    [InlineData("1-(2-3-4)-((5-6)-7)", "((1-((2-3)-4))-((5-6)-7))")]
+    [InlineData("1-(2+3-4)+((5-6)+7)", "((1-((2+3)-4))+((5-6)+7))")]
+    [InlineData("1*(2*3*4)*((5*6)*7)", "((1*((2*3)*4))*((5*6)*7))")]
+    [InlineData("1/(2/3/4)/((5/6)/7)", "((1/((2/3)/4))/((5/6)/7))")]
+    [InlineData("1/(2*3/4)*((5/6)*7)", "((1/((2*3)/4))*((5/6)*7))")]
+    [InlineData("2^(3^4)^5", "((2^(3^4))^5)")]
+    public void Groups_override_precedence(string formula, string normalizedForm)
+    {
+        AssertSameFormulas(formula, normalizedForm);
+    }
+
+    [Theory]
+    [InlineData("1+2*3+4/5*6^7-8", "(((1+(2*3))+((4/5)*(6^7)))-8)")]
+    [InlineData("1+2-3*4+5/6^7-8*9", "((((1+2)-(3*4))+(5/(6^7)))-(8*9))")]
+    public void Operations_are_grouped_by_precedence(string formula, string normalizedForm)
+    {
+        AssertSameFormulas(formula, normalizedForm);
+    }
+
+    [Fact]
+    public void Parser_can_be_reused_for_several_formulas()
+    {
+        var parser = ParserFactory.Create(new F());
+
+        var first = parser.ParseFormula("1+2", new Ctx());
+        var second = parser.ParseFormula("3*4", new Ctx());
+
+        Assert.Equal("(1+2)", GetNormalizedForm(first));
+        Assert.Equal("(3*4)", GetNormalizedForm(second));
+    }
+
+    [Theory]
+    [InlineData("1+2)")]
+    [InlineData("(1+2)3")]
+    public void Tokens_after_complete_expression_are_rejected(string formula)
+    {
+        var parser = ParserFactory.Create(new F());
+
+        Assert.Throws<System.InvalidOperationException>(() => parser.ParseFormula(formula, new Ctx()));
+    }
+
+    private static void AssertSameFormulas(string formula, string normalizedForm)
+    {
+        var parser = ParserFactory.Create(new F());
+        var root = parser.ParseFormula(formula, new Ctx());
+
+        Assert.Equal(normalizedForm, GetNormalizedForm(root));
+    }
+
+    private static string GetNormalizedForm(AstNode node)
+    {
+        return node switch
+        {
+            ValueNode value => value.GetDisplayString(A1),
+            BinaryNode binaryOp => "(" +
+                                   GetNormalizedForm(binaryOp.Children[0]) +
+                                   binaryOp.GetDisplayString(A1) +
+                                   GetNormalizedForm(binaryOp.Children[1]) +
+                                   ")",
+            _ => throw new UnreachableException()
+        };
+    }
+}
diff --git a/src/ClosedXML.Parser.sln.DotSettings b/src/ClosedXML.Parser.sln.DotSettings
index c24fb3c..c0b03b0 100644
--- a/src/ClosedXML.Parser.sln.DotSettings
+++ b/src/ClosedXML.Parser.sln.DotSettings
@@ -5,6 +5,7 @@
 	True
 	True
 	True
+	True
 	True
 	True
 	True
diff --git a/src/ClosedXML.Parser/Pratt/BindingPower.cs b/src/ClosedXML.Parser/Pratt/BindingPower.cs
new file mode 100644
index 0000000..a4a2765
--- /dev/null
+++ b/src/ClosedXML.Parser/Pratt/BindingPower.cs
@@ -0,0 +1,15 @@
+namespace ClosedXML.Parser.Pratt;
+
+/// <summary>
+/// Values of binding power for operators in an expression. Higher number = higher binding power.
+/// Precedence of operators is specified by ISO-29500:18.17.2.2. Operators that have the same
+/// precedence associate left-to-right.
+/// </summary>
+internal static class BindingPower
+{
+    internal const int Addition = 3;
+    internal const int Subtraction = 3;
+    internal const int Multiplication = 4;
+    internal const int Division = 4;
+    internal const int Exponentiation = 5;
+}
diff --git a/src/ClosedXML.Parser/Pratt/IParselet.cs b/src/ClosedXML.Parser/Pratt/IParselet.cs
new file mode 100644
index 0000000..74af3c5
--- /dev/null
+++ b/src/ClosedXML.Parser/Pratt/IParselet.cs
@@ -0,0 +1,19 @@
+namespace ClosedXML.Parser.Pratt;
+
+/// <summary>
+/// A parselet for an operator that follows an already parsed left operand.
+/// </summary>
+internal interface IParselet
+{
+    /// <summary>
+    /// Build a node from the already parsed <paramref name="left"/> operand and the already
+    /// consumed <paramref name="op"/> token.
+    /// </summary>
+    Node Parse(TContext ctx, Node left, Token op);
+
+    /// <summary>
+    /// Binding power of the operator. The parser applies the parselet only when the value is
+    /// greater than the minimum binding power of the expression that is being parsed.
+    /// </summary>
+    int GetBindingPower();
+}
diff --git a/src/ClosedXML.Parser/Pratt/IPrefixParselet.cs b/src/ClosedXML.Parser/Pratt/IPrefixParselet.cs
new file mode 100644
index 0000000..95c0f3a
--- /dev/null
+++ b/src/ClosedXML.Parser/Pratt/IPrefixParselet.cs
@@ -0,0 +1,12 @@
+namespace ClosedXML.Parser.Pratt;
+
+/// <summary>
+/// A parselet for a token that starts an expression.
+/// </summary>
+internal interface IPrefixParselet
+{
+    /// <summary>
+    /// Build a node from the already consumed <paramref name="token"/>.
+    /// </summary>
+    Node Parse(TContext ctx, Token token);
+}
diff --git a/src/ClosedXML.Parser/Pratt/Lexer.cs b/src/ClosedXML.Parser/Pratt/Lexer.cs
index 2122088..3ca760a 100644
--- a/src/ClosedXML.Parser/Pratt/Lexer.cs
+++ b/src/ClosedXML.Parser/Pratt/Lexer.cs
@@ -16,8 +16,7 @@ internal class Lexer
     private static readonly bool[] IsOp;
 
     private readonly Queue _queue = new(4);
-    private readonly string _input;
-
+    private string _input = string.Empty; // Currently tokenized formula
     private int _start; // The start index of currently parsed token in Next()
     private int _i; // Index of current code point _c in _input
     private int _c; // A current code point (including astral planes) or -1 if at the EOF
@@ -30,18 +29,35 @@ static Lexer()
             IsOp[op] = true;
     }
 
-    /// <summary>
-    /// Create a new instance of a lexer.
-    /// </summary>
-    /// <param name="input">Formula to tokenize.</param>
+    public Lexer()
+        : this(string.Empty)
+    {
+    }
+
     public Lexer(string input)
     {
-        _input = input ?? throw new ArgumentNullException();
-        _i = -1;
+        Reset(input);
     }
 
     private bool IsEof => _c == EOF;
 
+    /// <summary>
+    /// Prepare lexer to start tokenization of the <paramref name="formula"/>. Tokens that are
+    /// still buffered from a previously tokenized formula are discarded.
+    /// </summary>
+    /// <param name="formula">Formula to tokenize.</param>
+    /// <exception cref="ArgumentNullException">The <paramref name="formula"/> is <c>null</c>.</exception>
+    public void Reset(string formula)
+    {
+        _input = formula ?? throw new ArgumentNullException(nameof(formula));
+
+        // Anything still buffered in the queue belongs to the previous formula.
+        _queue.Clear();
+        _start = -1;
+        _i = -1;
+        _c = 0;
+    }
+
     public Token Consume()
     {
         if (_queue.Count == 0)
diff --git a/src/ClosedXML.Parser/Pratt/Node.cs b/src/ClosedXML.Parser/Pratt/Node.cs
new file mode 100644
index 0000000..7ae92be
--- /dev/null
+++ b/src/ClosedXML.Parser/Pratt/Node.cs
@@ -0,0 +1,50 @@
+namespace ClosedXML.Parser.Pratt;
+
+/// <summary>
+/// Information about a node that is used during parsing.
+/// </summary>
+/// <typeparam name="T">Type of the node value created by the AST factory.</typeparam>
+internal readonly struct Node
+{
+    public Node(T value, int start, int end)
+        : this(value, new SymbolRange(start, end))
+    {
+    }
+
+    public Node(T value, SymbolRange range)
+    {
+        Value = value;
+        Range = range;
+    }
+
+    /// <summary>
+    /// Parsed value of a node, created by the AST factory.
+    /// </summary>
+    public T Value { get; }
+
+    /// <summary>
+    /// The range of the formula text that was used to create the node.
+    /// </summary>
+    public SymbolRange Range { get; }
+
+    public static implicit operator T(Node node)
+    {
+        return node.Value;
+    }
+
+    /// <summary>
+    /// Get a node with the same value whose range also covers the <paramref name="token"/> to the left.
+    /// </summary>
+    internal Node ExtendLeft(Token token)
+    {
+        return new Node(Value, token.Range.ExtendRight(Range));
+    }
+
+    /// <summary>
+    /// Get a node with the same value whose range also covers the <paramref name="token"/> to the right.
+    /// </summary>
+    internal Node ExtendRight(Token token)
+    {
+        return new Node(Value, Range.ExtendRight(token.Range));
+    }
+}
diff --git a/src/ClosedXML.Parser/Pratt/Parselets/BinaryOpParselet.cs b/src/ClosedXML.Parser/Pratt/Parselets/BinaryOpParselet.cs
new file mode 100644
index 0000000..6305502
--- /dev/null
+++ b/src/ClosedXML.Parser/Pratt/Parselets/BinaryOpParselet.cs
@@ -0,0 +1,38 @@
+namespace ClosedXML.Parser.Pratt.Parselets;
+
+/// <summary>
+/// Parselet for a left-associative binary operator.
+/// </summary>
+internal class BinaryOpParselet : IParselet
+{
+    private readonly IAstFactory _factory;
+    private readonly Parser _parser;
+    private readonly BinaryOperation _op;
+    private readonly int _bp;
+
+    public BinaryOpParselet(IAstFactory factory, Parser parser, BinaryOperation op, int bp)
+    {
+        _factory = factory;
+        _parser = parser;
+        _op = op;
+        _bp = bp;
+    }
+
+    public Node Parse(TContext ctx, Node left, Token op)
+    {
+        // The right operand is parsed with the operator's own binding power as the minimum, so a
+        // following operator of the same power is not absorbed into it (left associativity).
+        var right = _parser.ParseExpression(ctx, _bp);
+        var nodeRange = left.Range
+            .ExtendRight(op.Range)
+            .ExtendRight(right.Range);
+
+        var node = _factory.BinaryNode(ctx, nodeRange, _op, left, right);
+        return new Node(node, nodeRange);
+    }
+
+    public int GetBindingPower()
+    {
+        return _bp;
+    }
+}
diff --git a/src/ClosedXML.Parser/Pratt/Parselets/GroupParselet.cs b/src/ClosedXML.Parser/Pratt/Parselets/GroupParselet.cs
new file mode 100644
index 0000000..f3c8013
--- /dev/null
+++ b/src/ClosedXML.Parser/Pratt/Parselets/GroupParselet.cs
@@ -0,0 +1,22 @@
+namespace ClosedXML.Parser.Pratt.Parselets;
+
+/// <summary>
+/// Parselet for a parenthesized expression. The group doesn't create a node of its own, the
+/// inner node only has its range extended over both parentheses.
+/// </summary>
+internal class GroupParselet : IPrefixParselet
+{
+    private readonly Parser _parser;
+
+    public GroupParselet(Parser parser)
+    {
+        _parser = parser;
+    }
+
+    public Node Parse(TContext ctx, Token leftParen)
+    {
+        var node = _parser.ParseExpression(ctx, 0);
+        var rightParen = _parser.Consume(TokenType.RightParen);
+        return node.ExtendLeft(leftParen).ExtendRight(rightParen);
+    }
+}
diff --git a/src/ClosedXML.Parser/Pratt/Parselets/NumberParselet.cs b/src/ClosedXML.Parser/Pratt/Parselets/NumberParselet.cs
new file mode 100644
index 0000000..28a95eb
--- /dev/null
+++ b/src/ClosedXML.Parser/Pratt/Parselets/NumberParselet.cs
@@ -0,0 +1,34 @@
+using System.Globalization;
+
+namespace ClosedXML.Parser.Pratt.Parselets;
+
+/// <summary>
+/// Get a number node from a token.
+/// </summary>
+/// <remarks>
+/// <c>double.Parse</c> parses even <c>NaN</c> or the infinity symbol, but we can never receive
+/// such text from the lexer.
+/// </remarks>
+internal class NumberParselet : IPrefixParselet
+{
+    private readonly IAstFactory _factory;
+    private readonly Parser _parser;
+
+    public NumberParselet(IAstFactory factory, Parser parser)
+    {
+        _factory = factory;
+        _parser = parser;
+    }
+
+    public Node Parse(TContext ctx, Token token)
+    {
+#if NETSTANDARD2_1
+        var text = token.GetText(_parser.Input);
+#else
+        var text = token.GetText(_parser.Input).ToString(); // NetFx has a double whammy, it's slow and gets extra memory to GC
+#endif
+        var number = double.Parse(text, NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent, CultureInfo.InvariantCulture);
+        var node = _factory.NumberNode(ctx, token.Range, number);
+        return new Node(node, token.Range);
+    }
+}
diff --git a/src/ClosedXML.Parser/Pratt/Parser.cs b/src/ClosedXML.Parser/Pratt/Parser.cs
new file mode 100644
index 0000000..254c270
--- /dev/null
+++ b/src/ClosedXML.Parser/Pratt/Parser.cs
@@ -0,0 +1,104 @@
+using System;
+using System.Collections.Generic;
+
+namespace ClosedXML.Parser.Pratt;
+
+/// <summary>
+/// Pratt parser.
+/// </summary>
+internal class Parser
+{
+    private readonly Lexer _lexer = new();
+    private readonly Dictionary> _prefixParselets = new();
+    private readonly Dictionary> _parselets = new();
+
+    /// <summary>
+    /// Text of the formula that is currently being parsed.
+    /// </summary>
+    internal string Input { get; private set; } = string.Empty;
+
+    /// <summary>
+    /// Parse a whole formula into a node.
+    /// </summary>
+    /// <param name="formula">Text of the formula.</param>
+    /// <param name="ctx">Context that is passed to the parselets.</param>
+    /// <returns>Value of the root node of the formula.</returns>
+    /// <exception cref="ArgumentNullException">The <paramref name="formula"/> is <c>null</c>.</exception>
+    /// <exception cref="InvalidOperationException">A token can't start an expression or isn't the expected one.</exception>
+    public T ParseFormula(string formula, TContext ctx)
+    {
+        // Reset first, it rejects null before any state of the parser is changed.
+        _lexer.Reset(formula);
+        Input = formula;
+
+        var root = ParseExpression(ctx, 0);
+
+        // The expression must span the whole formula. Without the check, tokens after the first
+        // complete expression (e.g. the parenthesis in "1+2)") would be silently ignored.
+        Consume(TokenType.Eof);
+        return root.Value;
+    }
+
+    /// <summary>
+    /// Parse an expression that consists of operators with binding power greater than
+    /// <paramref name="minBp"/>.
+    /// </summary>
+    internal Node ParseExpression(TContext ctx, int minBp)
+    {
+        var node = Prefix(ctx);
+
+        while (true)
+        {
+            var maybeOp = _lexer.Peek();
+            if (maybeOp.Type == TokenType.Eof)
+                break;
+
+            var isOp = _parselets.TryGetValue(maybeOp.Type, out var parselet);
+            if (!isOp)
+                break;
+
+            // An operator that doesn't bind stronger than the minimum belongs to the caller.
+            var bp = parselet!.GetBindingPower();
+            if (bp <= minBp)
+                break;
+
+            var op = _lexer.Consume();
+            node = parselet.Parse(ctx, node, op);
+        }
+
+        return node;
+    }
+
+    private Node Prefix(TContext ctx)
+    {
+        var token = _lexer.Consume();
+
+        if (!_prefixParselets.TryGetValue(token.Type, out var parselet))
+            throw new InvalidOperationException($"No parselet found for {token.Type}.");
+
+        return parselet.Parse(ctx, token);
+    }
+
+    /// <summary>
+    /// Consume the next token and check that it has the <paramref name="expectedType"/>.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The next token has a different type.</exception>
+    internal Token Consume(TokenType expectedType)
+    {
+        var token = _lexer.Consume();
+        if (token.Type != expectedType)
+            throw new InvalidOperationException($"Expected token of type {expectedType}, but received {token.Type}.");
+
+        return token;
+    }
+
+    internal void Register(TokenType type, IPrefixParselet parselet)
+    {
+        _prefixParselets.Add(type, parselet);
+    }
+
+    internal void Register(TokenType type, IParselet parselet)
+    {
+        _parselets.Add(type, parselet);
+    }
+}
diff --git a/src/ClosedXML.Parser/Pratt/ParserFactory.cs b/src/ClosedXML.Parser/Pratt/ParserFactory.cs
new file mode 100644
index 0000000..84f22cb
--- /dev/null
+++ b/src/ClosedXML.Parser/Pratt/ParserFactory.cs
@@ -0,0 +1,28 @@
+using ClosedXML.Parser.Pratt.Parselets;
+
+namespace ClosedXML.Parser.Pratt;
+
+internal static class ParserFactory
+{
+    /// <summary>
+    /// Create a parser with parselets for numbers, parenthesized groups and arithmetic operators.
+    /// </summary>
+    public static Parser Create(
+        IAstFactory factory)
+    {
+        var parser = new Parser();
+
+        // Register prefix parselets
+        parser.Register(TokenType.Number, new NumberParselet(factory, parser));
+        parser.Register(TokenType.LeftParen, new GroupParselet(parser));
+
+        // Register operation parselets
+        parser.Register(TokenType.Plus, new BinaryOpParselet(factory, parser, BinaryOperation.Addition, BindingPower.Addition));
+        parser.Register(TokenType.Minus, new BinaryOpParselet(factory, parser, BinaryOperation.Subtraction, BindingPower.Subtraction));
+        parser.Register(TokenType.Mul, new BinaryOpParselet(factory, parser, BinaryOperation.Multiplication, BindingPower.Multiplication));
+        parser.Register(TokenType.Div, new BinaryOpParselet(factory, parser, BinaryOperation.Division, BindingPower.Division));
+        parser.Register(TokenType.Pow, new BinaryOpParselet(factory, parser, BinaryOperation.Power, BindingPower.Exponentiation));
+
+        return parser;
+    }
+}
diff --git a/src/ClosedXML.Parser/Pratt/Token.cs b/src/ClosedXML.Parser/Pratt/Token.cs
index 1166cff..69103d9 100644
--- a/src/ClosedXML.Parser/Pratt/Token.cs
+++ b/src/ClosedXML.Parser/Pratt/Token.cs
@@ -4,18 +4,21 @@ namespace ClosedXML.Parser.Pratt;
 
 internal readonly struct Token
 {
-    private readonly SymbolRange _text;
-
     public Token(TokenType type, int start, int end)
     {
         Type = type;
-        _text = new SymbolRange(start, end);
+        Range = new SymbolRange(start, end);
     }
 
     public TokenType Type { get; }
 
+    /// <summary>
+    /// Range of the token in the tokenized text.
+    /// </summary>
+    public SymbolRange Range { get; }
+
     public ReadOnlySpan GetText(string input)
     {
-        return input.AsSpan(_text.Start, _text.Length);
+        return input.AsSpan(Range.Start, Range.Length);
     }
-}
\ No newline at end of file
+}
diff --git a/src/ClosedXML.Parser/SymbolRange.cs b/src/ClosedXML.Parser/SymbolRange.cs
index c1b1c4e..bfe42cc 100644
--- a/src/ClosedXML.Parser/SymbolRange.cs
+++ b/src/ClosedXML.Parser/SymbolRange.cs
@@ -1,4 +1,6 @@
-namespace ClosedXML.Parser;
+using System;
+
+namespace ClosedXML.Parser;
 
 /// <summary>
 /// A range of a symbol in formula text.
@@ -37,4 +39,17 @@ public override string ToString()
     {
         return $"[{Start}:{End}]";
     }
-}
\ No newline at end of file
+
+    /// <summary>
+    /// Get a range that starts at the start of this range and ends at the end of the
+    /// <paramref name="rangeToRight"/>. The other range must begin exactly where this one ends.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The ranges are not adjacent.</exception>
+    internal SymbolRange ExtendRight(SymbolRange rangeToRight)
+    {
+        if (End != rangeToRight.Start)
+            throw new InvalidOperationException($"The range end {End} doesn't match start of the range to the right {rangeToRight.Start}.");
+
+        return new SymbolRange(Start, rangeToRight.End);
+    }
+}