diff --git a/CLAUDE.md b/CLAUDE.md index e09f58c2..833629f0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -3,16 +3,20 @@ ## Pipeline ``` -Forward: HCL2 Text → Lark Parse Tree → LarkElement Tree → Python Dict/JSON +Forward: HCL2 Text → [PostLexer] → Lark Parse Tree → LarkElement Tree → Python Dict/JSON Reverse: Python Dict/JSON → LarkElement Tree → Lark Tree → HCL2 Text +Direct: HCL2 Text → [PostLexer] → Lark Parse Tree → LarkElement Tree → Lark Tree → HCL2 Text ``` +The **Direct** pipeline (`parse_to_tree` → `transform` → `to_lark` → `reconstruct`) skips serialization to dict, so all IR nodes (including `NewLineOrCommentRule` nodes for whitespace/comments) directly influence the reconstructed output. Any information discarded before the IR is lost in this pipeline. + ## Module Map | Module | Role | |---|---| | `hcl2/hcl2.lark` | Lark grammar definition | | `hcl2/api.py` | Public API (`load/loads/dump/dumps` + intermediate stages) | +| `hcl2/postlexer.py` | Token stream transforms between lexer and parser | | `hcl2/parser.py` | Lark parser factory with caching | | `hcl2/transformer.py` | Lark parse tree → LarkElement tree | | `hcl2/deserializer.py` | Python dict → LarkElement tree | @@ -73,6 +77,16 @@ jsontohcl2 --indent 4 --no-align file.json Add new options as `parser.add_argument()` calls in the relevant entry point module. +## PostLexer (`postlexer.py`) + +Lark's `postlex` parameter accepts a single object with a `process(stream)` method that transforms the token stream between the lexer and LALR parser. The `PostLexer` class is designed for extensibility: each transformation is a private method that accepts and yields tokens, and `process()` chains them together. + +Current passes: + +- `_merge_newlines_into_operators` + +To add a new pass: create a private method with the same `(self, stream) -> generator` signature, and add a `yield from` call in `process()`. + ## Hard Rules These are project-specific constraints that must not be violated: @@ -88,6 +102,7 @@ These are project-specific constraints that must not be violated: ## Adding a New Language Construct 1. Add grammar rules to `hcl2.lark` +1. If the new construct creates LALR ambiguities with `NL_OR_COMMENT`, add a postlexer pass in `postlexer.py` 1. Create rule class(es) in the appropriate `rules/` file 1. Add transformer method(s) in `transformer.py` 1. Implement `serialize()` in the rule class @@ -103,9 +118,9 @@ python -m unittest discover -s test -p "test_*.py" -v **Unit tests** (`test/unit/`): instantiate rule objects directly (no parsing). -- `test/unit/rules/` — one file per rules module -- `test/unit/cli/` — one file per CLI module -- `test/unit/test_api.py`, `test_builder.py`, `test_deserializer.py`, `test_formatter.py`, `test_reconstructor.py`, `test_utils.py` +- `rules/` — one file per rules module +- `cli/` — one file per CLI module +- `test_*.py` — tests for corresponding files from `hcl2/` directory Use concrete stubs when testing ABCs (e.g., `StubExpression(ExpressionRule)`). diff --git a/hcl2/parser.py b/hcl2/parser.py index a33fe5f8..d275a589 100644 --- a/hcl2/parser.py +++ b/hcl2/parser.py @@ -4,6 +4,8 @@ from lark import Lark +from hcl2.postlexer import PostLexer + PARSER_FILE = Path(__file__).absolute().resolve().parent / ".lark_cache.bin" @@ -17,4 +19,5 @@ def parser() -> Lark: cache=str(PARSER_FILE), # Disable/Delete file to effect changes to the grammar rel_to=__file__, propagate_positions=True, + postlex=PostLexer(), ) diff --git a/hcl2/postlexer.py b/hcl2/postlexer.py new file mode 100644 index 00000000..1d22cc03 --- /dev/null +++ b/hcl2/postlexer.py @@ -0,0 +1,79 @@ +"""Postlexer that transforms the token stream between the Lark lexer and parser. + +Each transformation is implemented as a private method that accepts and yields +tokens. The public ``process`` method chains them together, making it easy to +add new passes without touching existing logic. +""" + +from typing import FrozenSet, Iterator, Optional, Tuple + +from lark import Token + +# Type alias for a token stream consumed and produced by each pass. +TokenStream = Iterator[Token] + +# Operator token types that may legally follow a line-continuation newline. +# MINUS is excluded — it is also the unary negation operator, and merging a +# newline into MINUS would incorrectly consume statement-separating newlines +# before negative literals (e.g. "a = 1\nb = -2"). +OPERATOR_TYPES: FrozenSet[str] = frozenset( + { + "DOUBLE_EQ", + "NEQ", + "LT", + "GT", + "LEQ", + "GEQ", + "ASTERISK", + "SLASH", + "PERCENT", + "DOUBLE_AMP", + "DOUBLE_PIPE", + "PLUS", + "QMARK", + } +) + + +class PostLexer: + """Transform the token stream before it reaches the LALR parser.""" + + def process(self, stream: TokenStream) -> TokenStream: + """Chain all postlexer passes over the token stream.""" + yield from self._merge_newlines_into_operators(stream) + + def _merge_newlines_into_operators(self, stream: TokenStream) -> TokenStream: + """Merge NL_OR_COMMENT tokens into immediately following operator tokens. + + LALR parsers cannot distinguish a statement-ending newline from a + line-continuation newline before a binary operator. This pass resolves + the ambiguity by merging NL_OR_COMMENT into the operator token's value + when the next token is a binary operator or QMARK. The transformer + later extracts the newline prefix and creates a NewLineOrCommentRule + node, preserving round-trip fidelity. + """ + pending_nl: Optional[Token] = None + for token in stream: + if token.type == "NL_OR_COMMENT": + if pending_nl is not None: + yield pending_nl + pending_nl = token + else: + if pending_nl is not None: + if token.type in OPERATOR_TYPES: + token = token.update(value=str(pending_nl) + str(token)) + else: + yield pending_nl + pending_nl = None + yield token + if pending_nl is not None: + yield pending_nl + + @property + def always_accept(self) -> Tuple[()]: + """Terminal names the parser must accept even when not expected by LALR. + + Lark requires this property on postlexer objects. An empty tuple + means no extra terminals are injected. + """ + return () diff --git a/hcl2/reconstructor.py b/hcl2/reconstructor.py index 4760665a..cd1538ec 100644 --- a/hcl2/reconstructor.py +++ b/hcl2/reconstructor.py @@ -183,6 +183,11 @@ def _reconstruct_tree( # Check spacing BEFORE processing children, while _last_rule_name # still reflects the previous sibling (not a child of this tree). needs_space = self._should_add_space_before(tree, parent_rule_name) + if needs_space: + # A space will be inserted before this tree's output, so tell + # children that the last character was a space to prevent the + # first child from adding a duplicate leading space. + self._last_was_space = True if rule_name == UnaryOpRule.lark_name(): for i, child in enumerate(tree.children): diff --git a/hcl2/rules/expressions.py b/hcl2/rules/expressions.py index c29859a3..6636d4c8 100644 --- a/hcl2/rules/expressions.py +++ b/hcl2/rules/expressions.py @@ -122,6 +122,7 @@ class ConditionalRule(ExpressionRule): _children_layout: Tuple[ ExpressionRule, + Optional[NewLineOrCommentRule], QMARK, Optional[NewLineOrCommentRule], ExpressionRule, @@ -137,7 +138,7 @@ def lark_name() -> str: return "conditional" def __init__(self, children, meta: Optional[Meta] = None): - self._insert_optionals(children, [2, 4, 6]) + self._insert_optionals(children, [1, 3, 5, 7]) super().__init__(children, meta) @property @@ -148,12 +149,12 @@ def condition(self) -> ExpressionRule: @property def if_true(self) -> ExpressionRule: """Return the true-branch expression.""" - return self._children[3] + return self._children[4] @property def if_false(self) -> ExpressionRule: """Return the false-branch expression.""" - return self._children[7] + return self._children[8] def serialize( self, options=SerializationOptions(), context=SerializationContext() @@ -179,6 +180,7 @@ class BinaryTermRule(ExpressionRule): """Rule for the operator+operand portion of a binary operation.""" _children_layout: Tuple[ + Optional[NewLineOrCommentRule], BinaryOperatorRule, Optional[NewLineOrCommentRule], ExprTermRule, @@ -190,18 +192,18 @@ def lark_name() -> str: return "binary_term" def __init__(self, children, meta: Optional[Meta] = None): - self._insert_optionals(children, [1]) + self._insert_optionals(children, [0, 2]) super().__init__(children, meta) @property def binary_operator(self) -> BinaryOperatorRule: """Return the binary operator.""" - return self._children[0] + return self._children[1] @property def expr_term(self) -> ExprTermRule: """Return the right-hand operand.""" - return self._children[2] + return self._children[3] def serialize( self, options=SerializationOptions(), context=SerializationContext() diff --git a/hcl2/transformer.py b/hcl2/transformer.py index d0a09630..f6e94004 100644 --- a/hcl2/transformer.py +++ b/hcl2/transformer.py @@ -81,9 +81,15 @@ def __init__(self, discard_new_line_or_comments: bool = False): def __default_token__(self, token: Token) -> StringToken: # TODO make this return StaticStringToken where applicable - if token.value in StaticStringToken.classes_by_value: - return StaticStringToken.classes_by_value[token.value]() - return StringToken[token.type](token.value) # type: ignore[misc] + value = token.value + # The EQ terminal /[ \t]*=(?!=|>)/ captures leading whitespace. + # Strip it so parsed and deserialized tokens produce the same "=" value, + # preventing the reconstructor from emitting double spaces. + if token.type == "EQ": + value = value.lstrip(" \t") + if value in StaticStringToken.classes_by_value: + return StaticStringToken.classes_by_value[value]() + return StringToken[token.type](value) # type: ignore[misc] # pylint: disable=C0103 def FLOAT_LITERAL(self, token: Token) -> FloatLiteral: @@ -164,8 +170,32 @@ def heredoc_template_trim(self, meta: Meta, args) -> HeredocTrimTemplateRule: def expr_term(self, meta: Meta, args) -> ExprTermRule: return ExprTermRule(args, meta) + def _extract_nl_prefix(self, token): + """Strip leading newlines from a token value. + + If the token contains a newline prefix (from the postlexer merging a + line-continuation newline into the operator token), strip it and + return a NewLineOrCommentRule. Otherwise return None. + """ + value = str(token.value) + stripped = value.lstrip("\n \t") + if len(stripped) == len(value): + return None + nl_text = value[: len(value) - len(stripped)] + token.set_value(stripped) + if self.discard_new_line_or_comments: + return None + return NewLineOrCommentRule.from_string(nl_text) + @v_args(meta=True) def conditional(self, meta: Meta, args) -> ConditionalRule: + # args: [condition, QMARK, NL?, if_true, NL?, COLON, NL?, if_false] + # QMARK is at index 1 — check for NL prefix from the postlexer + qmark_token = args[1] + nl_rule = self._extract_nl_prefix(qmark_token) + if nl_rule is not None: + args = list(args) + args.insert(1, nl_rule) return ConditionalRule(args, meta) @v_args(meta=True) @@ -174,6 +204,12 @@ def binary_operator(self, meta: Meta, args) -> BinaryOperatorRule: @v_args(meta=True) def binary_term(self, meta: Meta, args) -> BinaryTermRule: + # args: [BinaryOperatorRule, NL?, ExprTermRule] + # The operator's token may contain a NL prefix from the postlexer + op_rule = args[0] + nl_rule = self._extract_nl_prefix(op_rule.token) + if nl_rule is not None: + args = [nl_rule] + list(args) return BinaryTermRule(args, meta) @v_args(meta=True) diff --git a/test/integration/hcl2_original/smoke.tf b/test/integration/hcl2_original/smoke.tf index 99537532..32e22466 100644 --- a/test/integration/hcl2_original/smoke.tf +++ b/test/integration/hcl2_original/smoke.tf @@ -43,6 +43,28 @@ block label1 label2 { } } +block multiline_ternary { + foo = ( + bar + ? baz(foo) + : foo == "bar" + ? "baz" + : foo + ) +} + +block multiline_binary_ops { + expr = { + for k, v in local.map_a : k => v + if lookup(local.map_b[v.id + ], "enabled", false) + || ( + contains(local.map_c, v.id) + && contains(local.map_d, v.id) + ) + } +} + block { route53_forwarding_rule_shares = { for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : diff --git a/test/integration/hcl2_reconstructed/smoke.tf b/test/integration/hcl2_reconstructed/smoke.tf index c8529e70..655ee440 100644 --- a/test/integration/hcl2_reconstructed/smoke.tf +++ b/test/integration/hcl2_reconstructed/smoke.tf @@ -39,6 +39,20 @@ block label1 label2 { } +block multiline_ternary { + foo = (bar ? baz(foo) : foo == "bar" ? "baz" : foo) +} + + +block multiline_binary_ops { + expr = { + for k, v in local.map_a : + k => v + if lookup(local.map_b[v.id], "enabled", false) || (contains(local.map_c, v.id) && contains(local.map_d, v.id)) + } +} + + block { route53_forwarding_rule_shares = { for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : diff --git a/test/integration/json_reserialized/smoke.json b/test/integration/json_reserialized/smoke.json index dbff114f..ec003473 100644 --- a/test/integration/json_reserialized/smoke.json +++ b/test/integration/json_reserialized/smoke.json @@ -48,6 +48,18 @@ } } }, + { + "multiline_ternary": { + "foo": "${(bar ? baz(foo) : foo == \"bar\" ? \"baz\" : foo)}", + "__is_block__": true + } + }, + { + "multiline_binary_ops": { + "expr": "${{for k, v in local.map_a : k => v if lookup(local.map_b[v.id], \"enabled\", false) || (contains(local.map_c, v.id) && contains(local.map_d, v.id))}}", + "__is_block__": true + } + }, { "route53_forwarding_rule_shares": "${{for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : \"${forwarding_rule_key}\" => {aws_account_ids = [for account_name in var.route53_resolver_forwarding_rule_shares[forwarding_rule_key].aws_account_names : module.remote_state_subaccounts.map[account_name].outputs[\"aws_account_id\"]]}... if substr(bucket_name, 0, 1) == \"l\"}}", "__is_block__": true diff --git a/test/integration/json_serialized/smoke.json b/test/integration/json_serialized/smoke.json index dbff114f..ec003473 100644 --- a/test/integration/json_serialized/smoke.json +++ b/test/integration/json_serialized/smoke.json @@ -48,6 +48,18 @@ } } }, + { + "multiline_ternary": { + "foo": "${(bar ? baz(foo) : foo == \"bar\" ? \"baz\" : foo)}", + "__is_block__": true + } + }, + { + "multiline_binary_ops": { + "expr": "${{for k, v in local.map_a : k => v if lookup(local.map_b[v.id], \"enabled\", false) || (contains(local.map_c, v.id) && contains(local.map_d, v.id))}}", + "__is_block__": true + } + }, { "route53_forwarding_rule_shares": "${{for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : \"${forwarding_rule_key}\" => {aws_account_ids = [for account_name in var.route53_resolver_forwarding_rule_shares[forwarding_rule_key].aws_account_names : module.remote_state_subaccounts.map[account_name].outputs[\"aws_account_id\"]]}... if substr(bucket_name, 0, 1) == \"l\"}}", "__is_block__": true diff --git a/test/unit/rules/test_expressions.py b/test/unit/rules/test_expressions.py index 974885b5..8ab7e8db 100644 --- a/test/unit/rules/test_expressions.py +++ b/test/unit/rules/test_expressions.py @@ -194,8 +194,8 @@ def test_lark_name(self): def test_construction_inserts_optional_slots(self): rule = self._make_conditional() - # Should have 8 children after _insert_optionals at [2, 4, 6] - self.assertEqual(len(rule.children), 8) + # Should have 9 children after _insert_optionals at [1, 3, 5, 7] + self.assertEqual(len(rule.children), 9) def test_condition_property(self): cond = StubExpression("cond") @@ -274,9 +274,10 @@ def test_lark_name(self): def test_construction_inserts_optional(self): rule = _make_binary_term("+", "b") - # [BinaryOperatorRule, None, ExprTermRule] - self.assertEqual(len(rule.children), 3) - self.assertIsNone(rule.children[1]) + # [None, BinaryOperatorRule, None, ExprTermRule] + self.assertEqual(len(rule.children), 4) + self.assertIsNone(rule.children[0]) + self.assertIsNone(rule.children[2]) def test_binary_operator_property(self): op = _make_binary_operator("+") diff --git a/test/unit/test_postlexer.py b/test/unit/test_postlexer.py new file mode 100644 index 00000000..07a8164b --- /dev/null +++ b/test/unit/test_postlexer.py @@ -0,0 +1,150 @@ +# pylint: disable=C0103,C0114,C0115,C0116 +"""Unit tests for hcl2.postlexer. + +Tests parse real HCL2 snippets through the full pipeline to verify that the +postlexer correctly handles newlines before binary operators and QMARK. +""" + +from unittest import TestCase + +from lark import Token + +from hcl2.api import loads +from hcl2.postlexer import OPERATOR_TYPES, PostLexer + + +class TestMergeNewlinesIntoOperators(TestCase): + """Test _merge_newlines_into_operators at the token-stream level.""" + + def _run(self, tokens): + """Run the postlexer pass and return a list of tokens.""" + return list(PostLexer()._merge_newlines_into_operators(iter(tokens))) + + def test_no_newlines_passes_through(self): + tokens = [Token("NAME", "a"), Token("PLUS", "+"), Token("NAME", "b")] + result = self._run(tokens) + self.assertEqual(len(result), 3) + self.assertEqual(result[1].type, "PLUS") + self.assertEqual(str(result[1]), "+") + + def test_newline_before_operator_is_merged(self): + tokens = [ + Token("NAME", "a"), + Token("NL_OR_COMMENT", "\n "), + Token("PLUS", "+"), + Token("NAME", "b"), + ] + result = self._run(tokens) + self.assertEqual(len(result), 3) + self.assertEqual(result[1].type, "PLUS") + self.assertEqual(str(result[1]), "\n +") + + def test_newline_before_non_operator_is_preserved(self): + tokens = [ + Token("NAME", "a"), + Token("NL_OR_COMMENT", "\n"), + Token("NAME", "b"), + ] + result = self._run(tokens) + self.assertEqual(len(result), 3) + self.assertEqual(result[1].type, "NL_OR_COMMENT") + + def test_consecutive_newlines_first_yielded(self): + tokens = [ + Token("NL_OR_COMMENT", "\n"), + Token("NL_OR_COMMENT", "\n "), + Token("PLUS", "+"), + ] + result = self._run(tokens) + self.assertEqual(len(result), 2) + self.assertEqual(result[0].type, "NL_OR_COMMENT") + self.assertEqual(str(result[0]), "\n") + self.assertEqual(result[1].type, "PLUS") + self.assertEqual(str(result[1]), "\n +") + + def test_trailing_newline_is_yielded(self): + tokens = [Token("NAME", "a"), Token("NL_OR_COMMENT", "\n")] + result = self._run(tokens) + self.assertEqual(len(result), 2) + self.assertEqual(result[1].type, "NL_OR_COMMENT") + + def test_all_operator_types_are_merged(self): + for op_type in sorted(OPERATOR_TYPES): + with self.subTest(op_type=op_type): + tokens = [ + Token("NL_OR_COMMENT", "\n"), + Token(op_type, "x"), + ] + result = self._run(tokens) + self.assertEqual(len(result), 1) + self.assertEqual(result[0].type, op_type) + self.assertTrue(str(result[0]).startswith("\n")) + + def test_minus_not_in_operator_types(self): + self.assertNotIn("MINUS", OPERATOR_TYPES) + + +class TestMultilineOperatorParsing(TestCase): + """Test that HCL2 snippets with multiline operators parse correctly.""" + + def test_multiline_ternary(self): + hcl = 'x = (\n a\n ? "yes"\n : "no"\n)\n' + result = loads(hcl) + self.assertEqual(result["x"], '${(a ? "yes" : "no")}') + + def test_multiline_logical_or(self): + hcl = "x = (\n a\n || b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a || b)}") + + def test_multiline_logical_and(self): + hcl = "x = (\n a\n && b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a && b)}") + + def test_multiline_equality(self): + hcl = "x = (\n a\n == b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a == b)}") + + def test_multiline_not_equal(self): + hcl = "x = (\n a\n != b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a != b)}") + + def test_multiline_comparison(self): + hcl = "x = (\n a\n >= b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a >= b)}") + + def test_multiline_addition(self): + hcl = "x = (\n a\n + b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a + b)}") + + def test_multiline_multiplication(self): + hcl = "x = (\n a\n * b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a * b)}") + + def test_multiline_chained_operators(self): + hcl = "x = (\n a\n && b\n && c\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a && b && c)}") + + def test_multiline_nested_ternary(self): + hcl = 'x = (\n a\n ? b\n : c == "d"\n ? "e"\n : f\n)\n' + result = loads(hcl) + self.assertEqual(result["x"], '${(a ? b : c == "d" ? "e" : f)}') + + def test_minus_on_new_line_is_separate_attribute(self): + """MINUS is excluded from merging — newline before - starts a new statement.""" + hcl = "a = 1\nb = -2\n" + result = loads(hcl) + self.assertEqual(result["a"], 1) + self.assertIn("b", result) + + def test_single_line_operators_still_work(self): + hcl = "x = a + b\n" + result = loads(hcl) + self.assertEqual(result["x"], "${a + b}")