From ba0039faea5d03e7614638145f2925e4ce8b0086 Mon Sep 17 00:00:00 2001 From: Kamil Kozik Date: Tue, 10 Mar 2026 14:26:58 +0100 Subject: [PATCH 1/4] Add postlexer to support multiline binary operators and ternary expressions --- CLAUDE.md | 23 ++- hcl2/parser.py | 3 + hcl2/postlexer.py | 80 ++++++++++ hcl2/reconstructor.py | 5 + hcl2/rules/expressions.py | 14 +- hcl2/transformer.py | 42 ++++- test/integration/hcl2_original/smoke.tf | 22 +++ test/integration/hcl2_reconstructed/smoke.tf | 14 ++ test/integration/json_reserialized/smoke.json | 12 ++ test/integration/json_serialized/smoke.json | 12 ++ test/unit/rules/test_expressions.py | 11 +- test/unit/test_postlexer.py | 150 ++++++++++++++++++ 12 files changed, 370 insertions(+), 18 deletions(-) create mode 100644 hcl2/postlexer.py create mode 100644 test/unit/test_postlexer.py diff --git a/CLAUDE.md b/CLAUDE.md index e09f58c2..833629f0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -3,16 +3,20 @@ ## Pipeline ``` -Forward: HCL2 Text → Lark Parse Tree → LarkElement Tree → Python Dict/JSON +Forward: HCL2 Text → [PostLexer] → Lark Parse Tree → LarkElement Tree → Python Dict/JSON Reverse: Python Dict/JSON → LarkElement Tree → Lark Tree → HCL2 Text +Direct: HCL2 Text → [PostLexer] → Lark Parse Tree → LarkElement Tree → Lark Tree → HCL2 Text ``` +The **Direct** pipeline (`parse_to_tree` → `transform` → `to_lark` → `reconstruct`) skips serialization to dict, so all IR nodes (including `NewLineOrCommentRule` nodes for whitespace/comments) directly influence the reconstructed output. Any information discarded before the IR is lost in this pipeline. 
+ ## Module Map | Module | Role | |---|---| | `hcl2/hcl2.lark` | Lark grammar definition | | `hcl2/api.py` | Public API (`load/loads/dump/dumps` + intermediate stages) | +| `hcl2/postlexer.py` | Token stream transforms between lexer and parser | | `hcl2/parser.py` | Lark parser factory with caching | | `hcl2/transformer.py` | Lark parse tree → LarkElement tree | | `hcl2/deserializer.py` | Python dict → LarkElement tree | @@ -73,6 +77,16 @@ jsontohcl2 --indent 4 --no-align file.json Add new options as `parser.add_argument()` calls in the relevant entry point module. +## PostLexer (`postlexer.py`) + +Lark's `postlex` parameter accepts a single object with a `process(stream)` method that transforms the token stream between the lexer and LALR parser. The `PostLexer` class is designed for extensibility: each transformation is a private method that accepts and yields tokens, and `process()` chains them together. + +Current passes: + +- `_merge_newlines_into_operators` + +To add a new pass: create a private method with the same `(self, stream) -> generator` signature, and add a `yield from` call in `process()`. + ## Hard Rules These are project-specific constraints that must not be violated: @@ -88,6 +102,7 @@ These are project-specific constraints that must not be violated: ## Adding a New Language Construct 1. Add grammar rules to `hcl2.lark` +1. If the new construct creates LALR ambiguities with `NL_OR_COMMENT`, add a postlexer pass in `postlexer.py` 1. Create rule class(es) in the appropriate `rules/` file 1. Add transformer method(s) in `transformer.py` 1. Implement `serialize()` in the rule class @@ -103,9 +118,9 @@ python -m unittest discover -s test -p "test_*.py" -v **Unit tests** (`test/unit/`): instantiate rule objects directly (no parsing). 
-- `test/unit/rules/` — one file per rules module -- `test/unit/cli/` — one file per CLI module -- `test/unit/test_api.py`, `test_builder.py`, `test_deserializer.py`, `test_formatter.py`, `test_reconstructor.py`, `test_utils.py` +- `rules/` — one file per rules module +- `cli/` — one file per CLI module +- `test_*.py` — tests for corresponding files from `hcl2/` directory Use concrete stubs when testing ABCs (e.g., `StubExpression(ExpressionRule)`). diff --git a/hcl2/parser.py b/hcl2/parser.py index a33fe5f8..d275a589 100644 --- a/hcl2/parser.py +++ b/hcl2/parser.py @@ -4,6 +4,8 @@ from lark import Lark +from hcl2.postlexer import PostLexer + PARSER_FILE = Path(__file__).absolute().resolve().parent / ".lark_cache.bin" @@ -17,4 +19,5 @@ def parser() -> Lark: cache=str(PARSER_FILE), # Disable/Delete file to effect changes to the grammar rel_to=__file__, propagate_positions=True, + postlex=PostLexer(), ) diff --git a/hcl2/postlexer.py b/hcl2/postlexer.py new file mode 100644 index 00000000..0418f362 --- /dev/null +++ b/hcl2/postlexer.py @@ -0,0 +1,80 @@ +"""Postlexer that transforms the token stream between the Lark lexer and parser. + +Each transformation is implemented as a private method that accepts and yields +tokens. The public ``process`` method chains them together, making it easy to +add new passes without touching existing logic. +""" + +from collections.abc import Iterator +from typing import FrozenSet, Optional, Tuple + +from lark import Token + +# Type alias for a token stream consumed and produced by each pass. +TokenStream = Iterator[Token] + +# Operator token types that may legally follow a line-continuation newline. +# MINUS is excluded — it is also the unary negation operator, and merging a +# newline into MINUS would incorrectly consume statement-separating newlines +# before negative literals (e.g. "a = 1\nb = -2"). 
+OPERATOR_TYPES: FrozenSet[str] = frozenset( + { + "DOUBLE_EQ", + "NEQ", + "LT", + "GT", + "LEQ", + "GEQ", + "ASTERISK", + "SLASH", + "PERCENT", + "DOUBLE_AMP", + "DOUBLE_PIPE", + "PLUS", + "QMARK", + } +) + + +class PostLexer: + """Transform the token stream before it reaches the LALR parser.""" + + def process(self, stream: TokenStream) -> TokenStream: + """Chain all postlexer passes over the token stream.""" + yield from self._merge_newlines_into_operators(stream) + + def _merge_newlines_into_operators(self, stream: TokenStream) -> TokenStream: + """Merge NL_OR_COMMENT tokens into immediately following operator tokens. + + LALR parsers cannot distinguish a statement-ending newline from a + line-continuation newline before a binary operator. This pass resolves + the ambiguity by merging NL_OR_COMMENT into the operator token's value + when the next token is a binary operator or QMARK. The transformer + later extracts the newline prefix and creates a NewLineOrCommentRule + node, preserving round-trip fidelity. + """ + pending_nl: Optional[Token] = None + for token in stream: + if token.type == "NL_OR_COMMENT": + if pending_nl is not None: + yield pending_nl + pending_nl = token + else: + if pending_nl is not None: + if token.type in OPERATOR_TYPES: + token = token.update(value=str(pending_nl) + str(token)) + else: + yield pending_nl + pending_nl = None + yield token + if pending_nl is not None: + yield pending_nl + + @property + def always_accept(self) -> Tuple[()]: + """Terminal names the parser must accept even when not expected by LALR. + + Lark requires this property on postlexer objects. An empty tuple + means no extra terminals are injected. 
+ """ + return () diff --git a/hcl2/reconstructor.py b/hcl2/reconstructor.py index 4760665a..cd1538ec 100644 --- a/hcl2/reconstructor.py +++ b/hcl2/reconstructor.py @@ -183,6 +183,11 @@ def _reconstruct_tree( # Check spacing BEFORE processing children, while _last_rule_name # still reflects the previous sibling (not a child of this tree). needs_space = self._should_add_space_before(tree, parent_rule_name) + if needs_space: + # A space will be inserted before this tree's output, so tell + # children that the last character was a space to prevent the + # first child from adding a duplicate leading space. + self._last_was_space = True if rule_name == UnaryOpRule.lark_name(): for i, child in enumerate(tree.children): diff --git a/hcl2/rules/expressions.py b/hcl2/rules/expressions.py index c29859a3..6636d4c8 100644 --- a/hcl2/rules/expressions.py +++ b/hcl2/rules/expressions.py @@ -122,6 +122,7 @@ class ConditionalRule(ExpressionRule): _children_layout: Tuple[ ExpressionRule, + Optional[NewLineOrCommentRule], QMARK, Optional[NewLineOrCommentRule], ExpressionRule, @@ -137,7 +138,7 @@ def lark_name() -> str: return "conditional" def __init__(self, children, meta: Optional[Meta] = None): - self._insert_optionals(children, [2, 4, 6]) + self._insert_optionals(children, [1, 3, 5, 7]) super().__init__(children, meta) @property @@ -148,12 +149,12 @@ def condition(self) -> ExpressionRule: @property def if_true(self) -> ExpressionRule: """Return the true-branch expression.""" - return self._children[3] + return self._children[4] @property def if_false(self) -> ExpressionRule: """Return the false-branch expression.""" - return self._children[7] + return self._children[8] def serialize( self, options=SerializationOptions(), context=SerializationContext() @@ -179,6 +180,7 @@ class BinaryTermRule(ExpressionRule): """Rule for the operator+operand portion of a binary operation.""" _children_layout: Tuple[ + Optional[NewLineOrCommentRule], BinaryOperatorRule, 
Optional[NewLineOrCommentRule], ExprTermRule, @@ -190,18 +192,18 @@ def lark_name() -> str: return "binary_term" def __init__(self, children, meta: Optional[Meta] = None): - self._insert_optionals(children, [1]) + self._insert_optionals(children, [0, 2]) super().__init__(children, meta) @property def binary_operator(self) -> BinaryOperatorRule: """Return the binary operator.""" - return self._children[0] + return self._children[1] @property def expr_term(self) -> ExprTermRule: """Return the right-hand operand.""" - return self._children[2] + return self._children[3] def serialize( self, options=SerializationOptions(), context=SerializationContext() diff --git a/hcl2/transformer.py b/hcl2/transformer.py index d0a09630..f6e94004 100644 --- a/hcl2/transformer.py +++ b/hcl2/transformer.py @@ -81,9 +81,15 @@ def __init__(self, discard_new_line_or_comments: bool = False): def __default_token__(self, token: Token) -> StringToken: # TODO make this return StaticStringToken where applicable - if token.value in StaticStringToken.classes_by_value: - return StaticStringToken.classes_by_value[token.value]() - return StringToken[token.type](token.value) # type: ignore[misc] + value = token.value + # The EQ terminal /[ \t]*=(?!=|>)/ captures leading whitespace. + # Strip it so parsed and deserialized tokens produce the same "=" value, + # preventing the reconstructor from emitting double spaces. + if token.type == "EQ": + value = value.lstrip(" \t") + if value in StaticStringToken.classes_by_value: + return StaticStringToken.classes_by_value[value]() + return StringToken[token.type](value) # type: ignore[misc] # pylint: disable=C0103 def FLOAT_LITERAL(self, token: Token) -> FloatLiteral: @@ -164,8 +170,32 @@ def heredoc_template_trim(self, meta: Meta, args) -> HeredocTrimTemplateRule: def expr_term(self, meta: Meta, args) -> ExprTermRule: return ExprTermRule(args, meta) + def _extract_nl_prefix(self, token): + """Strip leading newlines from a token value. 
+ + If the token contains a newline prefix (from the postlexer merging a + line-continuation newline into the operator token), strip it and + return a NewLineOrCommentRule. Otherwise return None. + """ + value = str(token.value) + stripped = value.lstrip("\n \t") + if len(stripped) == len(value): + return None + nl_text = value[: len(value) - len(stripped)] + token.set_value(stripped) + if self.discard_new_line_or_comments: + return None + return NewLineOrCommentRule.from_string(nl_text) + @v_args(meta=True) def conditional(self, meta: Meta, args) -> ConditionalRule: + # args: [condition, QMARK, NL?, if_true, NL?, COLON, NL?, if_false] + # QMARK is at index 1 — check for NL prefix from the postlexer + qmark_token = args[1] + nl_rule = self._extract_nl_prefix(qmark_token) + if nl_rule is not None: + args = list(args) + args.insert(1, nl_rule) return ConditionalRule(args, meta) @v_args(meta=True) @@ -174,6 +204,12 @@ def binary_operator(self, meta: Meta, args) -> BinaryOperatorRule: @v_args(meta=True) def binary_term(self, meta: Meta, args) -> BinaryTermRule: + # args: [BinaryOperatorRule, NL?, ExprTermRule] + # The operator's token may contain a NL prefix from the postlexer + op_rule = args[0] + nl_rule = self._extract_nl_prefix(op_rule.token) + if nl_rule is not None: + args = [nl_rule] + list(args) return BinaryTermRule(args, meta) @v_args(meta=True) diff --git a/test/integration/hcl2_original/smoke.tf b/test/integration/hcl2_original/smoke.tf index 99537532..32e22466 100644 --- a/test/integration/hcl2_original/smoke.tf +++ b/test/integration/hcl2_original/smoke.tf @@ -43,6 +43,28 @@ block label1 label2 { } } +block multiline_ternary { + foo = ( + bar + ? baz(foo) + : foo == "bar" + ? 
"baz" + : foo + ) +} + +block multiline_binary_ops { + expr = { + for k, v in local.map_a : k => v + if lookup(local.map_b[v.id + ], "enabled", false) + || ( + contains(local.map_c, v.id) + && contains(local.map_d, v.id) + ) + } +} + block { route53_forwarding_rule_shares = { for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : diff --git a/test/integration/hcl2_reconstructed/smoke.tf b/test/integration/hcl2_reconstructed/smoke.tf index c8529e70..655ee440 100644 --- a/test/integration/hcl2_reconstructed/smoke.tf +++ b/test/integration/hcl2_reconstructed/smoke.tf @@ -39,6 +39,20 @@ block label1 label2 { } +block multiline_ternary { + foo = (bar ? baz(foo) : foo == "bar" ? "baz" : foo) +} + + +block multiline_binary_ops { + expr = { + for k, v in local.map_a : + k => v + if lookup(local.map_b[v.id], "enabled", false) || (contains(local.map_c, v.id) && contains(local.map_d, v.id)) + } +} + + block { route53_forwarding_rule_shares = { for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : diff --git a/test/integration/json_reserialized/smoke.json b/test/integration/json_reserialized/smoke.json index dbff114f..ec003473 100644 --- a/test/integration/json_reserialized/smoke.json +++ b/test/integration/json_reserialized/smoke.json @@ -48,6 +48,18 @@ } } }, + { + "multiline_ternary": { + "foo": "${(bar ? baz(foo) : foo == \"bar\" ? 
\"baz\" : foo)}", + "__is_block__": true + } + }, + { + "multiline_binary_ops": { + "expr": "${{for k, v in local.map_a : k => v if lookup(local.map_b[v.id], \"enabled\", false) || (contains(local.map_c, v.id) && contains(local.map_d, v.id))}}", + "__is_block__": true + } + }, { "route53_forwarding_rule_shares": "${{for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : \"${forwarding_rule_key}\" => {aws_account_ids = [for account_name in var.route53_resolver_forwarding_rule_shares[forwarding_rule_key].aws_account_names : module.remote_state_subaccounts.map[account_name].outputs[\"aws_account_id\"]]}... if substr(bucket_name, 0, 1) == \"l\"}}", "__is_block__": true diff --git a/test/integration/json_serialized/smoke.json b/test/integration/json_serialized/smoke.json index dbff114f..ec003473 100644 --- a/test/integration/json_serialized/smoke.json +++ b/test/integration/json_serialized/smoke.json @@ -48,6 +48,18 @@ } } }, + { + "multiline_ternary": { + "foo": "${(bar ? baz(foo) : foo == \"bar\" ? \"baz\" : foo)}", + "__is_block__": true + } + }, + { + "multiline_binary_ops": { + "expr": "${{for k, v in local.map_a : k => v if lookup(local.map_b[v.id], \"enabled\", false) || (contains(local.map_c, v.id) && contains(local.map_d, v.id))}}", + "__is_block__": true + } + }, { "route53_forwarding_rule_shares": "${{for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : \"${forwarding_rule_key}\" => {aws_account_ids = [for account_name in var.route53_resolver_forwarding_rule_shares[forwarding_rule_key].aws_account_names : module.remote_state_subaccounts.map[account_name].outputs[\"aws_account_id\"]]}... 
if substr(bucket_name, 0, 1) == \"l\"}}", "__is_block__": true diff --git a/test/unit/rules/test_expressions.py b/test/unit/rules/test_expressions.py index 974885b5..8ab7e8db 100644 --- a/test/unit/rules/test_expressions.py +++ b/test/unit/rules/test_expressions.py @@ -194,8 +194,8 @@ def test_lark_name(self): def test_construction_inserts_optional_slots(self): rule = self._make_conditional() - # Should have 8 children after _insert_optionals at [2, 4, 6] - self.assertEqual(len(rule.children), 8) + # Should have 9 children after _insert_optionals at [1, 3, 5, 7] + self.assertEqual(len(rule.children), 9) def test_condition_property(self): cond = StubExpression("cond") @@ -274,9 +274,10 @@ def test_lark_name(self): def test_construction_inserts_optional(self): rule = _make_binary_term("+", "b") - # [BinaryOperatorRule, None, ExprTermRule] - self.assertEqual(len(rule.children), 3) - self.assertIsNone(rule.children[1]) + # [None, BinaryOperatorRule, None, ExprTermRule] + self.assertEqual(len(rule.children), 4) + self.assertIsNone(rule.children[0]) + self.assertIsNone(rule.children[2]) def test_binary_operator_property(self): op = _make_binary_operator("+") diff --git a/test/unit/test_postlexer.py b/test/unit/test_postlexer.py new file mode 100644 index 00000000..07a8164b --- /dev/null +++ b/test/unit/test_postlexer.py @@ -0,0 +1,150 @@ +# pylint: disable=C0103,C0114,C0115,C0116 +"""Unit tests for hcl2.postlexer. + +Tests parse real HCL2 snippets through the full pipeline to verify that the +postlexer correctly handles newlines before binary operators and QMARK. 
+""" + +from unittest import TestCase + +from lark import Token + +from hcl2.api import loads +from hcl2.postlexer import OPERATOR_TYPES, PostLexer + + +class TestMergeNewlinesIntoOperators(TestCase): + """Test _merge_newlines_into_operators at the token-stream level.""" + + def _run(self, tokens): + """Run the postlexer pass and return a list of tokens.""" + return list(PostLexer()._merge_newlines_into_operators(iter(tokens))) + + def test_no_newlines_passes_through(self): + tokens = [Token("NAME", "a"), Token("PLUS", "+"), Token("NAME", "b")] + result = self._run(tokens) + self.assertEqual(len(result), 3) + self.assertEqual(result[1].type, "PLUS") + self.assertEqual(str(result[1]), "+") + + def test_newline_before_operator_is_merged(self): + tokens = [ + Token("NAME", "a"), + Token("NL_OR_COMMENT", "\n "), + Token("PLUS", "+"), + Token("NAME", "b"), + ] + result = self._run(tokens) + self.assertEqual(len(result), 3) + self.assertEqual(result[1].type, "PLUS") + self.assertEqual(str(result[1]), "\n +") + + def test_newline_before_non_operator_is_preserved(self): + tokens = [ + Token("NAME", "a"), + Token("NL_OR_COMMENT", "\n"), + Token("NAME", "b"), + ] + result = self._run(tokens) + self.assertEqual(len(result), 3) + self.assertEqual(result[1].type, "NL_OR_COMMENT") + + def test_consecutive_newlines_first_yielded(self): + tokens = [ + Token("NL_OR_COMMENT", "\n"), + Token("NL_OR_COMMENT", "\n "), + Token("PLUS", "+"), + ] + result = self._run(tokens) + self.assertEqual(len(result), 2) + self.assertEqual(result[0].type, "NL_OR_COMMENT") + self.assertEqual(str(result[0]), "\n") + self.assertEqual(result[1].type, "PLUS") + self.assertEqual(str(result[1]), "\n +") + + def test_trailing_newline_is_yielded(self): + tokens = [Token("NAME", "a"), Token("NL_OR_COMMENT", "\n")] + result = self._run(tokens) + self.assertEqual(len(result), 2) + self.assertEqual(result[1].type, "NL_OR_COMMENT") + + def test_all_operator_types_are_merged(self): + for op_type in 
sorted(OPERATOR_TYPES): + with self.subTest(op_type=op_type): + tokens = [ + Token("NL_OR_COMMENT", "\n"), + Token(op_type, "x"), + ] + result = self._run(tokens) + self.assertEqual(len(result), 1) + self.assertEqual(result[0].type, op_type) + self.assertTrue(str(result[0]).startswith("\n")) + + def test_minus_not_in_operator_types(self): + self.assertNotIn("MINUS", OPERATOR_TYPES) + + +class TestMultilineOperatorParsing(TestCase): + """Test that HCL2 snippets with multiline operators parse correctly.""" + + def test_multiline_ternary(self): + hcl = 'x = (\n a\n ? "yes"\n : "no"\n)\n' + result = loads(hcl) + self.assertEqual(result["x"], '${(a ? "yes" : "no")}') + + def test_multiline_logical_or(self): + hcl = "x = (\n a\n || b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a || b)}") + + def test_multiline_logical_and(self): + hcl = "x = (\n a\n && b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a && b)}") + + def test_multiline_equality(self): + hcl = "x = (\n a\n == b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a == b)}") + + def test_multiline_not_equal(self): + hcl = "x = (\n a\n != b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a != b)}") + + def test_multiline_comparison(self): + hcl = "x = (\n a\n >= b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a >= b)}") + + def test_multiline_addition(self): + hcl = "x = (\n a\n + b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a + b)}") + + def test_multiline_multiplication(self): + hcl = "x = (\n a\n * b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a * b)}") + + def test_multiline_chained_operators(self): + hcl = "x = (\n a\n && b\n && c\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a && b && c)}") + + def test_multiline_nested_ternary(self): + hcl = 'x = (\n a\n ? b\n : c == "d"\n ? "e"\n : f\n)\n' + result = loads(hcl) + self.assertEqual(result["x"], '${(a ? 
b : c == "d" ? "e" : f)}') + + def test_minus_on_new_line_is_separate_attribute(self): + """MINUS is excluded from merging — newline before - starts a new statement.""" + hcl = "a = 1\nb = -2\n" + result = loads(hcl) + self.assertEqual(result["a"], 1) + self.assertIn("b", result) + + def test_single_line_operators_still_work(self): + hcl = "x = a + b\n" + result = loads(hcl) + self.assertEqual(result["x"], "${a + b}") From a3bc62cbaead219648d5e635d79b5e15713d8672 Mon Sep 17 00:00:00 2001 From: Kamil Kozik Date: Tue, 10 Mar 2026 15:05:23 +0100 Subject: [PATCH 2/4] more robust whitespace handling during reconstruction --- hcl2/formatter.py | 12 ++++-- hcl2/hcl2.lark | 2 +- hcl2/reconstructor.py | 45 +++++++++++++++++--- hcl2/rules/expressions.py | 7 ++- hcl2/transformer.py | 8 ++-- test/integration/hcl2_original/smoke.tf | 2 +- test/integration/hcl2_reconstructed/smoke.tf | 2 +- test/integration/test_round_trip.py | 35 +++++++++++++++ 8 files changed, 95 insertions(+), 18 deletions(-) diff --git a/hcl2/formatter.py b/hcl2/formatter.py index 20c3dced..5fc1c1b3 100644 --- a/hcl2/formatter.py +++ b/hcl2/formatter.py @@ -272,8 +272,8 @@ def _align_attributes_sequence(self, attributes_sequence: List[AttributeRule]): for attribute in attributes_sequence: name_length = len(attribute.identifier.token.value) spaces_to_add = max_length - name_length - base = attribute.children[1].value.lstrip(" ") - attribute.children[1].set_value(" " * spaces_to_add + base) + base = attribute.children[1].value.lstrip(" \t") + attribute.children[1].set_value(" " * (spaces_to_add + 1) + base) def _vertically_align_object_elems(self, rule: ObjectRule): max_length = max(self._key_text_width(elem.key) for elem in rule.elements) @@ -286,8 +286,12 @@ def _vertically_align_object_elems(self, rule: ObjectRule): if isinstance(separator, COLON): # type: ignore[misc] spaces_to_add += 1 - base = separator.value.lstrip(" ") - elem.children[1].set_value(" " * spaces_to_add + base) + base = 
separator.value.lstrip(" \t") + # EQ gets +1 for the base space (reconstructor no longer adds it + # when the token already has leading whitespace); COLON already + # has its own +1 above and the reconstructor doesn't add space. + extra = 1 if not isinstance(separator, COLON) else 0 # type: ignore[misc] + elem.children[1].set_value(" " * (spaces_to_add + extra) + base) @staticmethod def _key_text_width(key: LarkElement) -> int: diff --git a/hcl2/hcl2.lark b/hcl2/hcl2.lark index f0248290..a5c5fba6 100644 --- a/hcl2/hcl2.lark +++ b/hcl2/hcl2.lark @@ -50,7 +50,7 @@ RSQB : "]" COMMA : "," DOT : "." EQ : /[ \t]*=(?!=|>)/ -COLON : ":" +COLON : /[ \t]*:(?!:)/ DBLQUOTE : "\"" // Interpolation diff --git a/hcl2/reconstructor.py b/hcl2/reconstructor.py index cd1538ec..2a9ca708 100644 --- a/hcl2/reconstructor.py +++ b/hcl2/reconstructor.py @@ -4,6 +4,7 @@ from lark import Tree, Token from hcl2.rules import tokens from hcl2.rules.base import BlockRule +from hcl2.rules.containers import ObjectElemRule from hcl2.rules.for_expressions import ForIntroRule, ForTupleExprRule, ForObjectExprRule from hcl2.rules.literal_rules import IdentifierRule from hcl2.rules.strings import StringRule @@ -76,14 +77,20 @@ def _should_add_space_before( or self._last_token_name in [tokens.COLON.lark_name(), tokens.QMARK.lark_name()] ): + # COLON may already carry leading whitespace from the grammar + if token_type == tokens.COLON.lark_name() and str( + current_node + ).startswith((" ", "\t")): + return False return True - # Space after + # Space before colon in for_intro if ( parent_rule_name == ForIntroRule.lark_name() and token_type == tokens.COLON.lark_name() ): - + if str(current_node).startswith((" ", "\t")): + return False return True # Space after commas in tuples and function arguments... @@ -120,10 +127,28 @@ def _should_add_space_before( return True # Space after ellipsis in function arguments + # ... 
except before newlines which provide their own whitespace if self._last_token_name == tokens.ELLIPSIS.lark_name(): + if token_type == "NL_OR_COMMENT": + return False return True - if tokens.EQ.lark_name() in [token_type, self._last_token_name]: + # Space around EQ and COLON separators in attributes/object elements. + # Both terminals may carry leading whitespace from the original + # source (e.g. " =" for aligned attributes, " :" for object + # elements). Skip the automatic space when the token already + # provides it. COLON only gets space if it already has leading + # whitespace (unlike EQ which always gets at least one space). + if token_type == tokens.EQ.lark_name(): + if str(current_node).startswith((" ", "\t")): + return False + return True + if token_type == tokens.COLON.lark_name(): + return False + if self._last_token_name == tokens.EQ.lark_name(): + # Don't add space before newlines which provide their own whitespace + if token_type == "NL_OR_COMMENT": + return False return True # Don't add space around operator tokens inside unary_op @@ -158,14 +183,16 @@ def _should_add_space_before( ): return True - # Space after colon in for expressions (before value expression, - # but not before newline/comment which provides its own whitespace) + # Space after colon in for expressions and object elements + # (before value expression, but not before newline/comment + # which provides its own whitespace) if ( self._last_token_name == tokens.COLON.lark_name() and parent_rule_name in [ ForTupleExprRule.lark_name(), ForObjectExprRule.lark_name(), + ObjectElemRule.lark_name(), ] and rule_name != "new_line_or_comment" ): @@ -253,6 +280,14 @@ def reconstruct(self, tree: Tree, postproc=None) -> str: if postproc: result = postproc(result) + # The grammar's body rule ends with an optional new_line_or_comment + # which captures the final newline. 
The parser often produces two + # NL_OR_COMMENT tokens for a single trailing newline (the statement + # separator plus the EOF newline), resulting in a spurious blank line. + # Strip exactly one trailing newline when there are two or more. + if result.endswith("\n\n"): + result = result[:-1] + # Ensure file ends with newline if result and not result.endswith("\n"): result += "\n" diff --git a/hcl2/rules/expressions.py b/hcl2/rules/expressions.py index 6636d4c8..f29eea5a 100644 --- a/hcl2/rules/expressions.py +++ b/hcl2/rules/expressions.py @@ -248,7 +248,9 @@ def serialize( """Serialize to 'lhs operator rhs' string.""" with context.modify(inside_dollar_string=True): lhs = self.expr_term.serialize(options, context) - operator = self.binary_term.binary_operator.serialize(options, context) + operator = str( + self.binary_term.binary_operator.serialize(options, context) + ).strip() rhs = self.binary_term.expr_term.serialize(options, context) result = f"{lhs} {operator} {rhs}" @@ -286,7 +288,8 @@ def serialize( ) -> Any: """Serialize to 'operator operand' string.""" with context.modify(inside_dollar_string=True): - result = f"{self.operator}{self.expr_term.serialize(options, context)}" + operator = self.operator.rstrip() + result = f"{operator}{self.expr_term.serialize(options, context)}" if not context.inside_dollar_string: result = to_dollar_string(result) diff --git a/hcl2/transformer.py b/hcl2/transformer.py index f6e94004..7b067462 100644 --- a/hcl2/transformer.py +++ b/hcl2/transformer.py @@ -83,10 +83,10 @@ def __default_token__(self, token: Token) -> StringToken: # TODO make this return StaticStringToken where applicable value = token.value # The EQ terminal /[ \t]*=(?!=|>)/ captures leading whitespace. - # Strip it so parsed and deserialized tokens produce the same "=" value, - # preventing the reconstructor from emitting double spaces. 
- if token.type == "EQ": - value = value.lstrip(" \t") + # Preserve it so the direct pipeline (to_lark → reconstruct) retains + # original alignment. The reconstructor skips its own space insertion + # when the EQ token already carries leading whitespace. + if value in StaticStringToken.classes_by_value: return StaticStringToken.classes_by_value[value]() return StringToken[token.type](value) # type: ignore[misc] diff --git a/test/integration/hcl2_original/smoke.tf b/test/integration/hcl2_original/smoke.tf index 32e22466..e2cb9bb8 100644 --- a/test/integration/hcl2_original/smoke.tf +++ b/test/integration/hcl2_original/smoke.tf @@ -3,7 +3,7 @@ block label1 label2 { a = 5 b = 1256.5 c = 15 + (10 * 12) - d = (- a) + d = (-a) e = ( a == b ? true : false diff --git a/test/integration/hcl2_reconstructed/smoke.tf b/test/integration/hcl2_reconstructed/smoke.tf index 655ee440..54b3286c 100644 --- a/test/integration/hcl2_reconstructed/smoke.tf +++ b/test/integration/hcl2_reconstructed/smoke.tf @@ -61,7 +61,7 @@ block { for account_name in var.route53_resolver_forwarding_rule_shares[forwarding_rule_key].aws_account_names : module.remote_state_subaccounts.map[account_name].outputs["aws_account_id"] ] - } ... + } ... 
if substr(bucket_name, 0, 1) == "l" } } diff --git a/test/integration/test_round_trip.py b/test/integration/test_round_trip.py index 67217f07..b78141cb 100644 --- a/test/integration/test_round_trip.py +++ b/test/integration/test_round_trip.py @@ -79,6 +79,14 @@ def _parse_and_serialize(hcl_text: str, options=None) -> dict: return rules.serialize() +def _direct_reconstruct(hcl_text: str) -> str: + """Parse HCL text, transform to IR, convert to Lark tree, and reconstruct.""" + parsed_tree = parses_to_tree(hcl_text) + rules = RuleTransformer().transform(parsed_tree) + lark_tree = rules.to_lark() + return HCLReconstructor().reconstruct(lark_tree) + + def _deserialize_and_reserialize(serialized: dict) -> dict: """Deserialize a Python dict back through the rule tree and reserialize.""" deserializer = BaseDeserializer() @@ -120,6 +128,33 @@ def test_hcl_to_json(self): ) +class TestDirectReconstruction(TestCase): + """Test HCL2 → IR → HCL2 direct pipeline. + + Parse HCL, transform to IR, convert directly to Lark tree (skipping + serialization to dict), reconstruct HCL, then verify the result + re-parses to the same JSON as the original. + """ + + maxDiff = None + + def test_direct_reconstruct(self): + for suite in _get_suites(): + with self.subTest(suite=suite): + hcl_path = _get_suite_file(suite, SuiteStep.ORIGINAL) + original_hcl = hcl_path.read_text() + + # Direct: HCL → IR → Lark → HCL + reconstructed_hcl = _direct_reconstruct(original_hcl) + + self.assertMultiLineEqual( + reconstructed_hcl, + original_hcl, + f"Direct reconstruction mismatch for suite {suite}: " + f"HCL → IR → HCL did not match original HCL", + ) + + class TestRoundTripReserialization(TestCase): """Test JSON → JSON reserialization. 
From c9a10d96f6781c00aca4277e0adc2881f362ee63 Mon Sep 17 00:00:00 2001 From: Kamil Kozik Date: Tue, 10 Mar 2026 14:26:58 +0100 Subject: [PATCH 3/4] Add postlexer to support multiline binary operators and ternary expressions --- CLAUDE.md | 23 ++- hcl2/parser.py | 3 + hcl2/postlexer.py | 79 +++++++++ hcl2/reconstructor.py | 5 + hcl2/rules/expressions.py | 14 +- hcl2/transformer.py | 42 ++++- test/integration/hcl2_original/smoke.tf | 22 +++ test/integration/hcl2_reconstructed/smoke.tf | 14 ++ test/integration/json_reserialized/smoke.json | 12 ++ test/integration/json_serialized/smoke.json | 12 ++ test/unit/rules/test_expressions.py | 11 +- test/unit/test_postlexer.py | 150 ++++++++++++++++++ 12 files changed, 369 insertions(+), 18 deletions(-) create mode 100644 hcl2/postlexer.py create mode 100644 test/unit/test_postlexer.py diff --git a/CLAUDE.md b/CLAUDE.md index e09f58c2..833629f0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -3,16 +3,20 @@ ## Pipeline ``` -Forward: HCL2 Text → Lark Parse Tree → LarkElement Tree → Python Dict/JSON +Forward: HCL2 Text → [PostLexer] → Lark Parse Tree → LarkElement Tree → Python Dict/JSON Reverse: Python Dict/JSON → LarkElement Tree → Lark Tree → HCL2 Text +Direct: HCL2 Text → [PostLexer] → Lark Parse Tree → LarkElement Tree → Lark Tree → HCL2 Text ``` +The **Direct** pipeline (`parse_to_tree` → `transform` → `to_lark` → `reconstruct`) skips serialization to dict, so all IR nodes (including `NewLineOrCommentRule` nodes for whitespace/comments) directly influence the reconstructed output. Any information discarded before the IR is lost in this pipeline. 
+ ## Module Map | Module | Role | |---|---| | `hcl2/hcl2.lark` | Lark grammar definition | | `hcl2/api.py` | Public API (`load/loads/dump/dumps` + intermediate stages) | +| `hcl2/postlexer.py` | Token stream transforms between lexer and parser | | `hcl2/parser.py` | Lark parser factory with caching | | `hcl2/transformer.py` | Lark parse tree → LarkElement tree | | `hcl2/deserializer.py` | Python dict → LarkElement tree | @@ -73,6 +77,16 @@ jsontohcl2 --indent 4 --no-align file.json Add new options as `parser.add_argument()` calls in the relevant entry point module. +## PostLexer (`postlexer.py`) + +Lark's `postlex` parameter accepts a single object with a `process(stream)` method that transforms the token stream between the lexer and LALR parser. The `PostLexer` class is designed for extensibility: each transformation is a private method that accepts and yields tokens, and `process()` chains them together. + +Current passes: + +- `_merge_newlines_into_operators` + +To add a new pass: create a private method with the same `(self, stream) -> generator` signature, and add a `yield from` call in `process()`. + ## Hard Rules These are project-specific constraints that must not be violated: @@ -88,6 +102,7 @@ These are project-specific constraints that must not be violated: ## Adding a New Language Construct 1. Add grammar rules to `hcl2.lark` +1. If the new construct creates LALR ambiguities with `NL_OR_COMMENT`, add a postlexer pass in `postlexer.py` 1. Create rule class(es) in the appropriate `rules/` file 1. Add transformer method(s) in `transformer.py` 1. Implement `serialize()` in the rule class @@ -103,9 +118,9 @@ python -m unittest discover -s test -p "test_*.py" -v **Unit tests** (`test/unit/`): instantiate rule objects directly (no parsing). 
-- `test/unit/rules/` — one file per rules module -- `test/unit/cli/` — one file per CLI module -- `test/unit/test_api.py`, `test_builder.py`, `test_deserializer.py`, `test_formatter.py`, `test_reconstructor.py`, `test_utils.py` +- `rules/` — one file per rules module +- `cli/` — one file per CLI module +- `test_*.py` — tests for corresponding files from `hcl2/` directory Use concrete stubs when testing ABCs (e.g., `StubExpression(ExpressionRule)`). diff --git a/hcl2/parser.py b/hcl2/parser.py index a33fe5f8..d275a589 100644 --- a/hcl2/parser.py +++ b/hcl2/parser.py @@ -4,6 +4,8 @@ from lark import Lark +from hcl2.postlexer import PostLexer + PARSER_FILE = Path(__file__).absolute().resolve().parent / ".lark_cache.bin" @@ -17,4 +19,5 @@ def parser() -> Lark: cache=str(PARSER_FILE), # Disable/Delete file to effect changes to the grammar rel_to=__file__, propagate_positions=True, + postlex=PostLexer(), ) diff --git a/hcl2/postlexer.py b/hcl2/postlexer.py new file mode 100644 index 00000000..1d22cc03 --- /dev/null +++ b/hcl2/postlexer.py @@ -0,0 +1,79 @@ +"""Postlexer that transforms the token stream between the Lark lexer and parser. + +Each transformation is implemented as a private method that accepts and yields +tokens. The public ``process`` method chains them together, making it easy to +add new passes without touching existing logic. +""" + +from typing import FrozenSet, Iterator, Optional, Tuple + +from lark import Token + +# Type alias for a token stream consumed and produced by each pass. +TokenStream = Iterator[Token] + +# Operator token types that may legally follow a line-continuation newline. +# MINUS is excluded — it is also the unary negation operator, and merging a +# newline into MINUS would incorrectly consume statement-separating newlines +# before negative literals (e.g. "a = 1\nb = -2"). 
+OPERATOR_TYPES: FrozenSet[str] = frozenset( + { + "DOUBLE_EQ", + "NEQ", + "LT", + "GT", + "LEQ", + "GEQ", + "ASTERISK", + "SLASH", + "PERCENT", + "DOUBLE_AMP", + "DOUBLE_PIPE", + "PLUS", + "QMARK", + } +) + + +class PostLexer: + """Transform the token stream before it reaches the LALR parser.""" + + def process(self, stream: TokenStream) -> TokenStream: + """Chain all postlexer passes over the token stream.""" + yield from self._merge_newlines_into_operators(stream) + + def _merge_newlines_into_operators(self, stream: TokenStream) -> TokenStream: + """Merge NL_OR_COMMENT tokens into immediately following operator tokens. + + LALR parsers cannot distinguish a statement-ending newline from a + line-continuation newline before a binary operator. This pass resolves + the ambiguity by merging NL_OR_COMMENT into the operator token's value + when the next token is a binary operator or QMARK. The transformer + later extracts the newline prefix and creates a NewLineOrCommentRule + node, preserving round-trip fidelity. + """ + pending_nl: Optional[Token] = None + for token in stream: + if token.type == "NL_OR_COMMENT": + if pending_nl is not None: + yield pending_nl + pending_nl = token + else: + if pending_nl is not None: + if token.type in OPERATOR_TYPES: + token = token.update(value=str(pending_nl) + str(token)) + else: + yield pending_nl + pending_nl = None + yield token + if pending_nl is not None: + yield pending_nl + + @property + def always_accept(self) -> Tuple[()]: + """Terminal names the parser must accept even when not expected by LALR. + + Lark requires this property on postlexer objects. An empty tuple + means no extra terminals are injected. 
+ """ + return () diff --git a/hcl2/reconstructor.py b/hcl2/reconstructor.py index 4760665a..cd1538ec 100644 --- a/hcl2/reconstructor.py +++ b/hcl2/reconstructor.py @@ -183,6 +183,11 @@ def _reconstruct_tree( # Check spacing BEFORE processing children, while _last_rule_name # still reflects the previous sibling (not a child of this tree). needs_space = self._should_add_space_before(tree, parent_rule_name) + if needs_space: + # A space will be inserted before this tree's output, so tell + # children that the last character was a space to prevent the + # first child from adding a duplicate leading space. + self._last_was_space = True if rule_name == UnaryOpRule.lark_name(): for i, child in enumerate(tree.children): diff --git a/hcl2/rules/expressions.py b/hcl2/rules/expressions.py index c29859a3..6636d4c8 100644 --- a/hcl2/rules/expressions.py +++ b/hcl2/rules/expressions.py @@ -122,6 +122,7 @@ class ConditionalRule(ExpressionRule): _children_layout: Tuple[ ExpressionRule, + Optional[NewLineOrCommentRule], QMARK, Optional[NewLineOrCommentRule], ExpressionRule, @@ -137,7 +138,7 @@ def lark_name() -> str: return "conditional" def __init__(self, children, meta: Optional[Meta] = None): - self._insert_optionals(children, [2, 4, 6]) + self._insert_optionals(children, [1, 3, 5, 7]) super().__init__(children, meta) @property @@ -148,12 +149,12 @@ def condition(self) -> ExpressionRule: @property def if_true(self) -> ExpressionRule: """Return the true-branch expression.""" - return self._children[3] + return self._children[4] @property def if_false(self) -> ExpressionRule: """Return the false-branch expression.""" - return self._children[7] + return self._children[8] def serialize( self, options=SerializationOptions(), context=SerializationContext() @@ -179,6 +180,7 @@ class BinaryTermRule(ExpressionRule): """Rule for the operator+operand portion of a binary operation.""" _children_layout: Tuple[ + Optional[NewLineOrCommentRule], BinaryOperatorRule, 
Optional[NewLineOrCommentRule], ExprTermRule, @@ -190,18 +192,18 @@ def lark_name() -> str: return "binary_term" def __init__(self, children, meta: Optional[Meta] = None): - self._insert_optionals(children, [1]) + self._insert_optionals(children, [0, 2]) super().__init__(children, meta) @property def binary_operator(self) -> BinaryOperatorRule: """Return the binary operator.""" - return self._children[0] + return self._children[1] @property def expr_term(self) -> ExprTermRule: """Return the right-hand operand.""" - return self._children[2] + return self._children[3] def serialize( self, options=SerializationOptions(), context=SerializationContext() diff --git a/hcl2/transformer.py b/hcl2/transformer.py index d0a09630..f6e94004 100644 --- a/hcl2/transformer.py +++ b/hcl2/transformer.py @@ -81,9 +81,15 @@ def __init__(self, discard_new_line_or_comments: bool = False): def __default_token__(self, token: Token) -> StringToken: # TODO make this return StaticStringToken where applicable - if token.value in StaticStringToken.classes_by_value: - return StaticStringToken.classes_by_value[token.value]() - return StringToken[token.type](token.value) # type: ignore[misc] + value = token.value + # The EQ terminal /[ \t]*=(?!=|>)/ captures leading whitespace. + # Strip it so parsed and deserialized tokens produce the same "=" value, + # preventing the reconstructor from emitting double spaces. + if token.type == "EQ": + value = value.lstrip(" \t") + if value in StaticStringToken.classes_by_value: + return StaticStringToken.classes_by_value[value]() + return StringToken[token.type](value) # type: ignore[misc] # pylint: disable=C0103 def FLOAT_LITERAL(self, token: Token) -> FloatLiteral: @@ -164,8 +170,32 @@ def heredoc_template_trim(self, meta: Meta, args) -> HeredocTrimTemplateRule: def expr_term(self, meta: Meta, args) -> ExprTermRule: return ExprTermRule(args, meta) + def _extract_nl_prefix(self, token): + """Strip leading newlines from a token value. 
+ + If the token contains a newline prefix (from the postlexer merging a + line-continuation newline into the operator token), strip it and + return a NewLineOrCommentRule. Otherwise return None. + """ + value = str(token.value) + stripped = value.lstrip("\n \t") + if len(stripped) == len(value): + return None + nl_text = value[: len(value) - len(stripped)] + token.set_value(stripped) + if self.discard_new_line_or_comments: + return None + return NewLineOrCommentRule.from_string(nl_text) + @v_args(meta=True) def conditional(self, meta: Meta, args) -> ConditionalRule: + # args: [condition, QMARK, NL?, if_true, NL?, COLON, NL?, if_false] + # QMARK is at index 1 — check for NL prefix from the postlexer + qmark_token = args[1] + nl_rule = self._extract_nl_prefix(qmark_token) + if nl_rule is not None: + args = list(args) + args.insert(1, nl_rule) return ConditionalRule(args, meta) @v_args(meta=True) @@ -174,6 +204,12 @@ def binary_operator(self, meta: Meta, args) -> BinaryOperatorRule: @v_args(meta=True) def binary_term(self, meta: Meta, args) -> BinaryTermRule: + # args: [BinaryOperatorRule, NL?, ExprTermRule] + # The operator's token may contain a NL prefix from the postlexer + op_rule = args[0] + nl_rule = self._extract_nl_prefix(op_rule.token) + if nl_rule is not None: + args = [nl_rule] + list(args) return BinaryTermRule(args, meta) @v_args(meta=True) diff --git a/test/integration/hcl2_original/smoke.tf b/test/integration/hcl2_original/smoke.tf index 99537532..32e22466 100644 --- a/test/integration/hcl2_original/smoke.tf +++ b/test/integration/hcl2_original/smoke.tf @@ -43,6 +43,28 @@ block label1 label2 { } } +block multiline_ternary { + foo = ( + bar + ? baz(foo) + : foo == "bar" + ? 
"baz" + : foo + ) +} + +block multiline_binary_ops { + expr = { + for k, v in local.map_a : k => v + if lookup(local.map_b[v.id + ], "enabled", false) + || ( + contains(local.map_c, v.id) + && contains(local.map_d, v.id) + ) + } +} + block { route53_forwarding_rule_shares = { for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : diff --git a/test/integration/hcl2_reconstructed/smoke.tf b/test/integration/hcl2_reconstructed/smoke.tf index c8529e70..655ee440 100644 --- a/test/integration/hcl2_reconstructed/smoke.tf +++ b/test/integration/hcl2_reconstructed/smoke.tf @@ -39,6 +39,20 @@ block label1 label2 { } +block multiline_ternary { + foo = (bar ? baz(foo) : foo == "bar" ? "baz" : foo) +} + + +block multiline_binary_ops { + expr = { + for k, v in local.map_a : + k => v + if lookup(local.map_b[v.id], "enabled", false) || (contains(local.map_c, v.id) && contains(local.map_d, v.id)) + } +} + + block { route53_forwarding_rule_shares = { for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : diff --git a/test/integration/json_reserialized/smoke.json b/test/integration/json_reserialized/smoke.json index dbff114f..ec003473 100644 --- a/test/integration/json_reserialized/smoke.json +++ b/test/integration/json_reserialized/smoke.json @@ -48,6 +48,18 @@ } } }, + { + "multiline_ternary": { + "foo": "${(bar ? baz(foo) : foo == \"bar\" ? 
\"baz\" : foo)}", + "__is_block__": true + } + }, + { + "multiline_binary_ops": { + "expr": "${{for k, v in local.map_a : k => v if lookup(local.map_b[v.id], \"enabled\", false) || (contains(local.map_c, v.id) && contains(local.map_d, v.id))}}", + "__is_block__": true + } + }, { "route53_forwarding_rule_shares": "${{for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : \"${forwarding_rule_key}\" => {aws_account_ids = [for account_name in var.route53_resolver_forwarding_rule_shares[forwarding_rule_key].aws_account_names : module.remote_state_subaccounts.map[account_name].outputs[\"aws_account_id\"]]}... if substr(bucket_name, 0, 1) == \"l\"}}", "__is_block__": true diff --git a/test/integration/json_serialized/smoke.json b/test/integration/json_serialized/smoke.json index dbff114f..ec003473 100644 --- a/test/integration/json_serialized/smoke.json +++ b/test/integration/json_serialized/smoke.json @@ -48,6 +48,18 @@ } } }, + { + "multiline_ternary": { + "foo": "${(bar ? baz(foo) : foo == \"bar\" ? \"baz\" : foo)}", + "__is_block__": true + } + }, + { + "multiline_binary_ops": { + "expr": "${{for k, v in local.map_a : k => v if lookup(local.map_b[v.id], \"enabled\", false) || (contains(local.map_c, v.id) && contains(local.map_d, v.id))}}", + "__is_block__": true + } + }, { "route53_forwarding_rule_shares": "${{for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : \"${forwarding_rule_key}\" => {aws_account_ids = [for account_name in var.route53_resolver_forwarding_rule_shares[forwarding_rule_key].aws_account_names : module.remote_state_subaccounts.map[account_name].outputs[\"aws_account_id\"]]}... 
if substr(bucket_name, 0, 1) == \"l\"}}", "__is_block__": true diff --git a/test/unit/rules/test_expressions.py b/test/unit/rules/test_expressions.py index 974885b5..8ab7e8db 100644 --- a/test/unit/rules/test_expressions.py +++ b/test/unit/rules/test_expressions.py @@ -194,8 +194,8 @@ def test_lark_name(self): def test_construction_inserts_optional_slots(self): rule = self._make_conditional() - # Should have 8 children after _insert_optionals at [2, 4, 6] - self.assertEqual(len(rule.children), 8) + # Should have 9 children after _insert_optionals at [1, 3, 5, 7] + self.assertEqual(len(rule.children), 9) def test_condition_property(self): cond = StubExpression("cond") @@ -274,9 +274,10 @@ def test_lark_name(self): def test_construction_inserts_optional(self): rule = _make_binary_term("+", "b") - # [BinaryOperatorRule, None, ExprTermRule] - self.assertEqual(len(rule.children), 3) - self.assertIsNone(rule.children[1]) + # [None, BinaryOperatorRule, None, ExprTermRule] + self.assertEqual(len(rule.children), 4) + self.assertIsNone(rule.children[0]) + self.assertIsNone(rule.children[2]) def test_binary_operator_property(self): op = _make_binary_operator("+") diff --git a/test/unit/test_postlexer.py b/test/unit/test_postlexer.py new file mode 100644 index 00000000..07a8164b --- /dev/null +++ b/test/unit/test_postlexer.py @@ -0,0 +1,150 @@ +# pylint: disable=C0103,C0114,C0115,C0116 +"""Unit tests for hcl2.postlexer. + +Tests parse real HCL2 snippets through the full pipeline to verify that the +postlexer correctly handles newlines before binary operators and QMARK. 
+""" + +from unittest import TestCase + +from lark import Token + +from hcl2.api import loads +from hcl2.postlexer import OPERATOR_TYPES, PostLexer + + +class TestMergeNewlinesIntoOperators(TestCase): + """Test _merge_newlines_into_operators at the token-stream level.""" + + def _run(self, tokens): + """Run the postlexer pass and return a list of tokens.""" + return list(PostLexer()._merge_newlines_into_operators(iter(tokens))) + + def test_no_newlines_passes_through(self): + tokens = [Token("NAME", "a"), Token("PLUS", "+"), Token("NAME", "b")] + result = self._run(tokens) + self.assertEqual(len(result), 3) + self.assertEqual(result[1].type, "PLUS") + self.assertEqual(str(result[1]), "+") + + def test_newline_before_operator_is_merged(self): + tokens = [ + Token("NAME", "a"), + Token("NL_OR_COMMENT", "\n "), + Token("PLUS", "+"), + Token("NAME", "b"), + ] + result = self._run(tokens) + self.assertEqual(len(result), 3) + self.assertEqual(result[1].type, "PLUS") + self.assertEqual(str(result[1]), "\n +") + + def test_newline_before_non_operator_is_preserved(self): + tokens = [ + Token("NAME", "a"), + Token("NL_OR_COMMENT", "\n"), + Token("NAME", "b"), + ] + result = self._run(tokens) + self.assertEqual(len(result), 3) + self.assertEqual(result[1].type, "NL_OR_COMMENT") + + def test_consecutive_newlines_first_yielded(self): + tokens = [ + Token("NL_OR_COMMENT", "\n"), + Token("NL_OR_COMMENT", "\n "), + Token("PLUS", "+"), + ] + result = self._run(tokens) + self.assertEqual(len(result), 2) + self.assertEqual(result[0].type, "NL_OR_COMMENT") + self.assertEqual(str(result[0]), "\n") + self.assertEqual(result[1].type, "PLUS") + self.assertEqual(str(result[1]), "\n +") + + def test_trailing_newline_is_yielded(self): + tokens = [Token("NAME", "a"), Token("NL_OR_COMMENT", "\n")] + result = self._run(tokens) + self.assertEqual(len(result), 2) + self.assertEqual(result[1].type, "NL_OR_COMMENT") + + def test_all_operator_types_are_merged(self): + for op_type in 
sorted(OPERATOR_TYPES): + with self.subTest(op_type=op_type): + tokens = [ + Token("NL_OR_COMMENT", "\n"), + Token(op_type, "x"), + ] + result = self._run(tokens) + self.assertEqual(len(result), 1) + self.assertEqual(result[0].type, op_type) + self.assertTrue(str(result[0]).startswith("\n")) + + def test_minus_not_in_operator_types(self): + self.assertNotIn("MINUS", OPERATOR_TYPES) + + +class TestMultilineOperatorParsing(TestCase): + """Test that HCL2 snippets with multiline operators parse correctly.""" + + def test_multiline_ternary(self): + hcl = 'x = (\n a\n ? "yes"\n : "no"\n)\n' + result = loads(hcl) + self.assertEqual(result["x"], '${(a ? "yes" : "no")}') + + def test_multiline_logical_or(self): + hcl = "x = (\n a\n || b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a || b)}") + + def test_multiline_logical_and(self): + hcl = "x = (\n a\n && b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a && b)}") + + def test_multiline_equality(self): + hcl = "x = (\n a\n == b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a == b)}") + + def test_multiline_not_equal(self): + hcl = "x = (\n a\n != b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a != b)}") + + def test_multiline_comparison(self): + hcl = "x = (\n a\n >= b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a >= b)}") + + def test_multiline_addition(self): + hcl = "x = (\n a\n + b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a + b)}") + + def test_multiline_multiplication(self): + hcl = "x = (\n a\n * b\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a * b)}") + + def test_multiline_chained_operators(self): + hcl = "x = (\n a\n && b\n && c\n)\n" + result = loads(hcl) + self.assertEqual(result["x"], "${(a && b && c)}") + + def test_multiline_nested_ternary(self): + hcl = 'x = (\n a\n ? b\n : c == "d"\n ? "e"\n : f\n)\n' + result = loads(hcl) + self.assertEqual(result["x"], '${(a ? 
b : c == "d" ? "e" : f)}')
+
+    def test_minus_on_new_line_is_separate_attribute(self):
+        """MINUS is excluded from merging — newline before - starts a new statement."""
+        hcl = "a = 1\nb = -2\n"
+        result = loads(hcl)
+        self.assertEqual(result["a"], 1)
+        self.assertIn("b", result)
+
+    def test_single_line_operators_still_work(self):
+        hcl = "x = a + b\n"
+        result = loads(hcl)
+        self.assertEqual(result["x"], "${a + b}")

From 83c58a3ca886e3a576b2b20871df0a6a85c2d60c Mon Sep 17 00:00:00 2001
From: Kamil Kozik
Date: Wed, 11 Mar 2026 11:53:07 +0100
Subject: [PATCH 4/4] CLAUDE.md - add pre-commit section

---
 CLAUDE.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CLAUDE.md b/CLAUDE.md
index 833629f0..6823678a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -131,6 +131,14 @@ Use concrete stubs when testing ABCs (e.g., `StubExpression(ExpressionRule)`).

 Always run round-trip full test suite after any modification.

+## Pre-commit Checks
+
+Hooks are defined in `.pre-commit-config.yaml` (includes black, mypy, pylint, and others). All changed files must pass these checks before committing. When writing or modifying code:
+
+- Format Python with **black** (Python 3.8 target).
+- Ensure **mypy** and **pylint** pass. Pylint config is in `pylintrc`, scoped to `hcl2/` and `test/`.
+- End files with a newline; strip trailing whitespace (except under `test/integration/(hcl2_reconstructed|serialized)/`).
+
 ## Keeping Docs Current

 Update this file when architecture, modules, API surface, or testing conventions change. Also update `README.md` and `docs/usage.md` when changes affect the public API, CLI flags, or option fields.