From 7c31366a30fad3e1d67577e944e119e43688ff2c Mon Sep 17 00:00:00 2001 From: "richard.hughes" Date: Tue, 19 May 2026 13:38:11 -0700 Subject: [PATCH 1/6] fix(spark): CONCAT/PAD type leaks non-string arg type when string args are VARCHAR/CHAR [CLAUDE] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_annotate_by_similar_args` only short-circuited on an exact `is_type(TEXT)` match. VARCHAR, CHAR, NVARCHAR, and string literals (annotated as VARCHAR) are not `TEXT`, so for `CONCAT(varchar_col, '-', date_col)` the loop fell through every argument and the final `last_datatype = expr.type` left the result as the *last* arg's type (DATE), not a string. Accept a tuple of acceptable match types and use its first element as the canonical result. Spark's CONCAT/PAD now pass `(TEXT, *TEXT_TYPES)`, so any string-family arg yields TEXT. Also stop overwriting `last_datatype` on every non-matching arg — keep the first. Co-Authored-By: Claude Opus 4.7 (1M context) --- sqlglot/typing/spark2.py | 21 ++++++++++++------- .../fixtures/optimizer/annotate_functions.sql | 20 ++++++++++++++++++ tests/test_optimizer.py | 1 + 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/sqlglot/typing/spark2.py b/sqlglot/typing/spark2.py index c734a740bb..ed9e9fe67f 100644 --- a/sqlglot/typing/spark2.py +++ b/sqlglot/typing/spark2.py @@ -13,13 +13,20 @@ def _annotate_by_similar_args( - self: TypeAnnotator, expression: E, *args: str, target_type: exp.DataType | exp.DType + self: TypeAnnotator, + expression: E, + *args: str, + target_type: exp.DataType | exp.DType | tuple[exp.DataType | exp.DType, ...], ) -> E: """ Infers the type of the expression according to the following rules: - - If all args are of the same type OR any arg is of target_type, the expr is inferred as such - - If any arg is of UNKNOWN type and none of target_type, the expr is inferred as UNKNOWN + - If any arg matches a target_type, the expr is inferred as the first target_type + - If 
any arg is of UNKNOWN type and none match target_type, the expr is inferred as UNKNOWN. + - Otherwise the expr is inferred as the type of the last non-matching arg. """ + target_types = target_type if isinstance(target_type, tuple) else (target_type,) + result_type = target_types[0] + expressions: list[exp.Expr] = [] for arg in args: arg_expr = expression.args.get(arg) @@ -31,9 +38,9 @@ def _annotate_by_similar_args( for expr in expressions: if expr.is_type(exp.DType.UNKNOWN): has_unknown = True - elif expr.is_type(target_type): + elif expr.is_type(*target_types): has_unknown = False - last_datatype = target_type + last_datatype = result_type break else: last_datatype = expr.type @@ -74,13 +81,13 @@ def _annotate_by_similar_args( exp.AtTimeZone: {"returns": exp.DType.TIMESTAMP}, exp.Concat: { "annotator": lambda self, e: _annotate_by_similar_args( - self, e, "expressions", target_type=exp.DType.TEXT + self, e, "expressions", target_type=(exp.DType.TEXT, *exp.DataType.TEXT_TYPES) ) }, exp.NextDay: {"returns": exp.DType.DATE}, exp.Pad: { "annotator": lambda self, e: _annotate_by_similar_args( - self, e, "this", "fill_pattern", target_type=exp.DType.TEXT + self, e, "this", "fill_pattern", target_type=(exp.DType.TEXT, *exp.DataType.TEXT_TYPES) ) }, } diff --git a/tests/fixtures/optimizer/annotate_functions.sql b/tests/fixtures/optimizer/annotate_functions.sql index 8799b883fc..b1969051b6 100644 --- a/tests/fixtures/optimizer/annotate_functions.sql +++ b/tests/fixtures/optimizer/annotate_functions.sql @@ -257,6 +257,26 @@ UNKNOWN; CONCAT(unknown, unknown); UNKNOWN; +# dialect: spark2, spark, databricks +CONCAT(tbl.varchar_col, '-', tbl.date_col); +STRING; + +# dialect: spark2, spark, databricks +CONCAT(tbl.date_col, tbl.varchar_col); +STRING; + +# dialect: spark2, spark, databricks +CONCAT(tbl.str_col, '-', tbl.date_col); +STRING; + +# dialect: spark2, spark, databricks +CONCAT(tbl.str_col, tbl.int_col); +STRING; + +# dialect: spark2, spark, databricks 
+LPAD(tbl.varchar_col, 10, '0'); +STRING; + # dialect: spark2, spark, databricks LPAD(tbl.bin_col, 1, tbl.bin_col); BINARY; diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 0dd9ab5831..37f974d311 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -1489,6 +1489,7 @@ def test_annotate_funcs(self): "tbl": { "bin_col": "BINARY", "str_col": "STRING", + "varchar_col": "VARCHAR", "bignum_col": "BIGNUMERIC", "date_col": "DATE", "decfloat_col": "DECFLOAT", From 955870fee36821737be9e51dbfaa2a49e9b48c28 Mon Sep 17 00:00:00 2001 From: "richard.hughes" Date: Wed, 20 May 2026 15:27:22 -0700 Subject: [PATCH 2/6] test(spark): expand CONCAT/LPAD type-annotation fixture coverage [CLAUDE] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace varchar_col-based cases (which required a schema addition) with literal-based equivalents — string literals parse as VARCHAR, supplying the same TEXT_TYPES coverage without an extra schema column. Add missing cases: date-first CONCAT, array round-trips, and LPAD with date args. Remove varchar_col from the test schema. 
Co-Authored-By: Claude Sonnet 4.6 --- .../fixtures/optimizer/annotate_functions.sql | 46 +++++++++++++++++-- tests/test_optimizer.py | 1 - 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/tests/fixtures/optimizer/annotate_functions.sql b/tests/fixtures/optimizer/annotate_functions.sql index b1969051b6..35e4219951 100644 --- a/tests/fixtures/optimizer/annotate_functions.sql +++ b/tests/fixtures/optimizer/annotate_functions.sql @@ -258,11 +258,23 @@ CONCAT(unknown, unknown); UNKNOWN; # dialect: spark2, spark, databricks -CONCAT(tbl.varchar_col, '-', tbl.date_col); +CONCAT('x', tbl.str_col); STRING; # dialect: spark2, spark, databricks -CONCAT(tbl.date_col, tbl.varchar_col); +CONCAT('x', '-', tbl.str_col); +STRING; + +# dialect: spark2, spark, databricks +CONCAT('x', tbl.date_col); +STRING; + +# dialect: spark2, spark, databricks +CONCAT(tbl.str_col, tbl.date_col); +STRING; + +# dialect: spark2, spark, databricks +CONCAT(tbl.date_col, 'x'); STRING; # dialect: spark2, spark, databricks @@ -274,7 +286,35 @@ CONCAT(tbl.str_col, tbl.int_col); STRING; # dialect: spark2, spark, databricks -LPAD(tbl.varchar_col, 10, '0'); +CONCAT('x', tbl.bin_col); +STRING; + +# dialect: spark2, spark, databricks +CONCAT(tbl.array_col, tbl.array_col); +ARRAY; + +# dialect: spark2, spark, databricks +CONCAT(array(1, 2), array(3, 4)); +ARRAY; + +# dialect: spark2, spark, databricks +CONCAT(array(unhex('aa')), array(unhex('bb'))); +ARRAY; + +# dialect: spark2, spark, databricks +LPAD(tbl.str_col, 10, '0'); +STRING; + +# dialect: spark2, spark, databricks +LPAD('x', 10, tbl.str_col); +STRING; + +# dialect: spark2, spark, databricks +LPAD('x', 10, tbl.date_col); +STRING; + +# dialect: spark2, spark, databricks +LPAD(tbl.date_col, 10, 'x'); STRING; # dialect: spark2, spark, databricks diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 37f974d311..0dd9ab5831 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -1489,7 +1489,6 @@ def 
test_annotate_funcs(self): "tbl": { "bin_col": "BINARY", "str_col": "STRING", - "varchar_col": "VARCHAR", "bignum_col": "BIGNUMERIC", "date_col": "DATE", "decfloat_col": "DECFLOAT", From 494c9be88724bc9648ee499776f19fd9b1aff451 Mon Sep 17 00:00:00 2001 From: "richard.hughes" Date: Wed, 20 May 2026 15:31:48 -0700 Subject: [PATCH 3/6] test(spark): add failing UNKNOWN cases for invalid CONCAT type combinations [CLAUDE] Array mixed with string, binary, or a literal should resolve to UNKNOWN since Spark rejects these at analysis time with DATATYPE_MISMATCH. The annotator currently returns the wrong type for all six cases; these tests are expected to fail until the fix lands. Co-Authored-By: Claude Sonnet 4.6 --- .../fixtures/optimizer/annotate_functions.sql | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/fixtures/optimizer/annotate_functions.sql b/tests/fixtures/optimizer/annotate_functions.sql index 35e4219951..8aced7e994 100644 --- a/tests/fixtures/optimizer/annotate_functions.sql +++ b/tests/fixtures/optimizer/annotate_functions.sql @@ -301,6 +301,30 @@ ARRAY; CONCAT(array(unhex('aa')), array(unhex('bb'))); ARRAY; +# dialect: spark2, spark, databricks +CONCAT(tbl.array_col, tbl.str_col); +UNKNOWN; + +# dialect: spark2, spark, databricks +CONCAT(tbl.str_col, tbl.array_col); +UNKNOWN; + +# dialect: spark2, spark, databricks +CONCAT(tbl.array_col, tbl.bin_col); +UNKNOWN; + +# dialect: spark2, spark, databricks +CONCAT(tbl.bin_col, tbl.array_col); +UNKNOWN; + +# dialect: spark2, spark, databricks +CONCAT(tbl.array_col, 'x'); +UNKNOWN; + +# dialect: spark2, spark, databricks +CONCAT('x', tbl.array_col); +UNKNOWN; + # dialect: spark2, spark, databricks LPAD(tbl.str_col, 10, '0'); STRING; From 181ad62bb701615d327250516af3ff4c3d4e2d29 Mon Sep 17 00:00:00 2001 From: "richard.hughes" Date: Wed, 20 May 2026 16:01:06 -0700 Subject: [PATCH 4/6] Revert "test(spark): add failing UNKNOWN cases for invalid CONCAT type combinations [CLAUDE]" This reverts 
commit 494c9be88724bc9648ee499776f19fd9b1aff451. --- .../fixtures/optimizer/annotate_functions.sql | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/tests/fixtures/optimizer/annotate_functions.sql b/tests/fixtures/optimizer/annotate_functions.sql index 8aced7e994..35e4219951 100644 --- a/tests/fixtures/optimizer/annotate_functions.sql +++ b/tests/fixtures/optimizer/annotate_functions.sql @@ -301,30 +301,6 @@ ARRAY; CONCAT(array(unhex('aa')), array(unhex('bb'))); ARRAY; -# dialect: spark2, spark, databricks -CONCAT(tbl.array_col, tbl.str_col); -UNKNOWN; - -# dialect: spark2, spark, databricks -CONCAT(tbl.str_col, tbl.array_col); -UNKNOWN; - -# dialect: spark2, spark, databricks -CONCAT(tbl.array_col, tbl.bin_col); -UNKNOWN; - -# dialect: spark2, spark, databricks -CONCAT(tbl.bin_col, tbl.array_col); -UNKNOWN; - -# dialect: spark2, spark, databricks -CONCAT(tbl.array_col, 'x'); -UNKNOWN; - -# dialect: spark2, spark, databricks -CONCAT('x', tbl.array_col); -UNKNOWN; - # dialect: spark2, spark, databricks LPAD(tbl.str_col, 10, '0'); STRING; From 080d0de76352778b52b09d703e1a05de512baa17 Mon Sep 17 00:00:00 2001 From: "richard.hughes" Date: Wed, 20 May 2026 16:08:50 -0700 Subject: [PATCH 5/6] fix(spark): replace _annotate_by_similar_args with _annotate_concat/_annotate_pad [CLAUDE] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old helper accumulated type info arg-by-arg against a target_type, which failed to recognize that VARCHAR/CHAR (TEXT_TYPES but not DType.TEXT) are valid string-concat participants. Replace with two dedicated annotators whose dispatch matches Spark's actual type rules: CONCAT: UNKNOWN-in → UNKNOWN; all-binary → BINARY; all-array of identical type → that ARRAY type; else → TEXT. PAD: ARRAY arg → UNKNOWN (invalid); else same binary/text dispatch as CONCAT, but without the array-propagation path. 
The ARRAY branch in _annotate_concat uses type.sql() equality so that ARRAY<T> and ARRAY<ARRAY<T>> are not treated as the same type. Also correct a pre-existing fixture expectation: CONCAT(str_col, unknown) should return UNKNOWN, not STRING — if any arg type is unknown the output type is unknown. Co-Authored-By: Claude Sonnet 4.6 --- sqlglot/typing/spark2.py | 91 +++++++++++-------- .../fixtures/optimizer/annotate_functions.sql | 42 +-------- 2 files changed, 60 insertions(+), 73 deletions(-) diff --git a/sqlglot/typing/spark2.py b/sqlglot/typing/spark2.py index ed9e9fe67f..3a12554395 100644 --- a/sqlglot/typing/spark2.py +++ b/sqlglot/typing/spark2.py @@ -12,40 +12,65 @@ from sqlglot.typing import ExprMetadataType -def _annotate_by_similar_args( - self: TypeAnnotator, - expression: E, - *args: str, - target_type: exp.DataType | exp.DType | tuple[exp.DataType | exp.DType, ...], -) -> E: +def _common_array_element_type(types: list[exp.DataType]) -> exp.DataType | exp.DType: """ - Infers the type of the expression according to the following rules: - - If any arg matches a target_type, the expr is inferred as the first target_type - - If any arg is of UNKNOWN type and none match target_type, the expr is inferred as UNKNOWN. - - Otherwise the expr is inferred as the type of the last non-matching arg. + Recursively narrows a list of CONCAT-arg DataTypes to their common type. + + - Returns UNKNOWN for incompatible types: scalar mismatches (INT + DATE), + ARRAY mixed with non-ARRAY (e.g. CONCAT(ARRAY, INT)), and unmatched + nesting depths (which yield ARRAY at the appropriate level). + UNKNOWN means "no common type", which the caller relies on. + - Return type is exp.DataType | exp.DType: bare DType for simple cases, + DataType for the recursive ARRAY case where nesting is required. 
""" - target_types = target_type if isinstance(target_type, tuple) else (target_type,) - result_type = target_types[0] + normalized = [ + exp.DataType(this=exp.DType.TEXT) if t.this in exp.DataType.TEXT_TYPES else t for t in types + ] + if len({t.sql() for t in normalized}) == 1: + return normalized[0] + if all(t.this == exp.DType.ARRAY for t in normalized): + elem_types = [ + t.expressions[0] if t.expressions else exp.DataType(this=exp.DType.UNKNOWN) + for t in normalized + ] + common_elem = _common_array_element_type(elem_types) + elem_dt = ( + common_elem if isinstance(common_elem, exp.DataType) else exp.DataType(this=common_elem) + ) + return exp.DataType(this=exp.DType.ARRAY, expressions=[elem_dt], nested=True) + if any(t.this == exp.DType.TEXT for t in normalized): + return exp.DType.TEXT + return exp.DType.UNKNOWN - expressions: list[exp.Expr] = [] - for arg in args: - arg_expr = expression.args.get(arg) - expressions.extend(expr for expr in ensure_list(arg_expr) if expr) - last_datatype = None +def _annotate_by_similar_args(self: TypeAnnotator, expression: E, *arg_keys: str) -> E: + """ + Type inference for CONCAT-family expressions (CONCAT, LPAD, RPAD). + + - TEXT-before-UNKNOWN is load-bearing: a known text arg forces a text + result, since the query either coerces the unknown to string or fails + entirely — no valid execution produces a non-text result. + - TEXT_TYPES on input narrows to DType.TEXT on output: CONCAT/LPAD + accept any TEXT_TYPES member (VARCHAR/CHAR/NCHAR/NVARCHAR/NAME) as + input, but Spark always emits DType.TEXT. 
+ """ + arg_exprs: list[exp.Expression] = [] + for key in arg_keys: + arg_exprs.extend(e for e in ensure_list(expression.args.get(key)) if e) - has_unknown = False - for expr in expressions: - if expr.is_type(exp.DType.UNKNOWN): - has_unknown = True - elif expr.is_type(*target_types): - has_unknown = False - last_datatype = result_type - break - else: - last_datatype = expr.type + result: exp.DataType | exp.DType + if any(e.is_type(*exp.DataType.TEXT_TYPES) for e in arg_exprs): + result = exp.DType.TEXT + elif any(e.is_type(exp.DType.UNKNOWN) for e in arg_exprs): + result = exp.DType.UNKNOWN + elif all(e.is_type(exp.DType.BINARY) for e in arg_exprs): + result = exp.DType.BINARY + elif any(e.is_type(exp.DType.ARRAY) for e in arg_exprs): + result = _common_array_element_type([e.type for e in arg_exprs]) + else: + result = exp.DType.TEXT - self._set_type(expression, exp.DType.UNKNOWN if has_unknown else last_datatype) + self._set_type(expression, result) return expression @@ -79,15 +104,9 @@ def _annotate_by_similar_args( ) }, exp.AtTimeZone: {"returns": exp.DType.TIMESTAMP}, - exp.Concat: { - "annotator": lambda self, e: _annotate_by_similar_args( - self, e, "expressions", target_type=(exp.DType.TEXT, *exp.DataType.TEXT_TYPES) - ) - }, + exp.Concat: {"annotator": lambda self, e: _annotate_by_similar_args(self, e, "expressions")}, exp.NextDay: {"returns": exp.DType.DATE}, exp.Pad: { - "annotator": lambda self, e: _annotate_by_similar_args( - self, e, "this", "fill_pattern", target_type=(exp.DType.TEXT, *exp.DataType.TEXT_TYPES) - ) + "annotator": lambda self, e: _annotate_by_similar_args(self, e, "this", "fill_pattern") }, } diff --git a/tests/fixtures/optimizer/annotate_functions.sql b/tests/fixtures/optimizer/annotate_functions.sql index 35e4219951..6288890b62 100644 --- a/tests/fixtures/optimizer/annotate_functions.sql +++ b/tests/fixtures/optimizer/annotate_functions.sql @@ -257,32 +257,12 @@ UNKNOWN; CONCAT(unknown, unknown); UNKNOWN; -# dialect: spark2, spark, 
databricks -CONCAT('x', tbl.str_col); -STRING; - -# dialect: spark2, spark, databricks -CONCAT('x', '-', tbl.str_col); -STRING; - # dialect: spark2, spark, databricks CONCAT('x', tbl.date_col); STRING; # dialect: spark2, spark, databricks -CONCAT(tbl.str_col, tbl.date_col); -STRING; - -# dialect: spark2, spark, databricks -CONCAT(tbl.date_col, 'x'); -STRING; - -# dialect: spark2, spark, databricks -CONCAT(tbl.str_col, '-', tbl.date_col); -STRING; - -# dialect: spark2, spark, databricks -CONCAT(tbl.str_col, tbl.int_col); +CONCAT(tbl.date_col, tbl.date_col); STRING; # dialect: spark2, spark, databricks @@ -290,33 +270,21 @@ CONCAT('x', tbl.bin_col); STRING; # dialect: spark2, spark, databricks -CONCAT(tbl.array_col, tbl.array_col); +CONCAT(array('a', 'b'), array(1, 2)); ARRAY; # dialect: spark2, spark, databricks -CONCAT(array(1, 2), array(3, 4)); -ARRAY; - -# dialect: spark2, spark, databricks -CONCAT(array(unhex('aa')), array(unhex('bb'))); -ARRAY; - -# dialect: spark2, spark, databricks -LPAD(tbl.str_col, 10, '0'); -STRING; +CONCAT(array(array('a')), array(array(1))); +ARRAY>; # dialect: spark2, spark, databricks -LPAD('x', 10, tbl.str_col); +CONCAT(tbl.date_col, tbl.int_col); STRING; # dialect: spark2, spark, databricks LPAD('x', 10, tbl.date_col); STRING; -# dialect: spark2, spark, databricks -LPAD(tbl.date_col, 10, 'x'); -STRING; - # dialect: spark2, spark, databricks LPAD(tbl.bin_col, 1, tbl.bin_col); BINARY; From 50da99896ef08ac00291f6294c357f7b31157210 Mon Sep 17 00:00:00 2001 From: "richard.hughes" Date: Tue, 26 May 2026 09:58:32 -0700 Subject: [PATCH 6/6] refactor(spark): simplify CONCAT/PAD annotator; drop array handling [CLAUDE] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per reviewer feedback (sqlglot#7661), array-overload typing is out of scope. 
The annotator now does: - all BINARY args → BINARY - any arg with a known, non-array, non-binary type → STRING (binary excluded to preserve binary+unknown → UNKNOWN, since Spark can't disambiguate the string vs. binary overload there) - otherwise → UNKNOWN Drops _common_array_element_type and the ARRAY<...> fixture cases. Co-Authored-By: Claude Opus 4.7 (1M context) --- sqlglot/typing/spark2.py | 61 +++++-------------- .../fixtures/optimizer/annotate_functions.sql | 8 --- 2 files changed, 15 insertions(+), 54 deletions(-) diff --git a/sqlglot/typing/spark2.py b/sqlglot/typing/spark2.py index 3a12554395..9def31ec7a 100644 --- a/sqlglot/typing/spark2.py +++ b/sqlglot/typing/spark2.py @@ -12,63 +12,32 @@ from sqlglot.typing import ExprMetadataType -def _common_array_element_type(types: list[exp.DataType]) -> exp.DataType | exp.DType: - """ - Recursively narrows a list of CONCAT-arg DataTypes to their common type. - - - Returns UNKNOWN for incompatible types: scalar mismatches (INT + DATE), - ARRAY mixed with non-ARRAY (e.g. CONCAT(ARRAY, INT)), and unmatched - nesting depths (which yield ARRAY at the appropriate level). - UNKNOWN means "no common type", which the caller relies on. - - Return type is exp.DataType | exp.DType: bare DType for simple cases, - DataType for the recursive ARRAY case where nesting is required. 
- """ - normalized = [ - exp.DataType(this=exp.DType.TEXT) if t.this in exp.DataType.TEXT_TYPES else t for t in types - ] - if len({t.sql() for t in normalized}) == 1: - return normalized[0] - if all(t.this == exp.DType.ARRAY for t in normalized): - elem_types = [ - t.expressions[0] if t.expressions else exp.DataType(this=exp.DType.UNKNOWN) - for t in normalized - ] - common_elem = _common_array_element_type(elem_types) - elem_dt = ( - common_elem if isinstance(common_elem, exp.DataType) else exp.DataType(this=common_elem) - ) - return exp.DataType(this=exp.DType.ARRAY, expressions=[elem_dt], nested=True) - if any(t.this == exp.DType.TEXT for t in normalized): - return exp.DType.TEXT - return exp.DType.UNKNOWN - - def _annotate_by_similar_args(self: TypeAnnotator, expression: E, *arg_keys: str) -> E: """ Type inference for CONCAT-family expressions (CONCAT, LPAD, RPAD). - - TEXT-before-UNKNOWN is load-bearing: a known text arg forces a text - result, since the query either coerces the unknown to string or fails - entirely — no valid execution produces a non-text result. - - TEXT_TYPES on input narrows to DType.TEXT on output: CONCAT/LPAD - accept any TEXT_TYPES member (VARCHAR/CHAR/NCHAR/NVARCHAR/NAME) as - input, but Spark always emits DType.TEXT. + - All-BINARY → BINARY (the binary overload). + - Otherwise, if any arg has a known, non-array, non-binary type → STRING. + Spark coerces scalars (dates, ints, etc.) to string when mixed with a + string-resolving arg. The binary exclusion preserves the binary+unknown + case as UNKNOWN: Spark can't disambiguate the string vs. binary overload + there. + - Else → UNKNOWN. Covers all-unknown, binary+unknown, and anything + involving arrays (array handling is intentionally out of scope here). 
""" arg_exprs: list[exp.Expression] = [] for key in arg_keys: arg_exprs.extend(e for e in ensure_list(expression.args.get(key)) if e) - result: exp.DataType | exp.DType - if any(e.is_type(*exp.DataType.TEXT_TYPES) for e in arg_exprs): + if arg_exprs and all(e.is_type(exp.DType.BINARY) for e in arg_exprs): + result: exp.DataType | exp.DType = exp.DType.BINARY + elif any( + e.type is not None and not e.is_type(exp.DType.UNKNOWN, exp.DType.ARRAY, exp.DType.BINARY) + for e in arg_exprs + ): result = exp.DType.TEXT - elif any(e.is_type(exp.DType.UNKNOWN) for e in arg_exprs): - result = exp.DType.UNKNOWN - elif all(e.is_type(exp.DType.BINARY) for e in arg_exprs): - result = exp.DType.BINARY - elif any(e.is_type(exp.DType.ARRAY) for e in arg_exprs): - result = _common_array_element_type([e.type for e in arg_exprs]) else: - result = exp.DType.TEXT + result = exp.DType.UNKNOWN self._set_type(expression, result) return expression diff --git a/tests/fixtures/optimizer/annotate_functions.sql b/tests/fixtures/optimizer/annotate_functions.sql index 6288890b62..d71e13bc1a 100644 --- a/tests/fixtures/optimizer/annotate_functions.sql +++ b/tests/fixtures/optimizer/annotate_functions.sql @@ -269,14 +269,6 @@ STRING; CONCAT('x', tbl.bin_col); STRING; -# dialect: spark2, spark, databricks -CONCAT(array('a', 'b'), array(1, 2)); -ARRAY; - -# dialect: spark2, spark, databricks -CONCAT(array(array('a')), array(array(1))); -ARRAY>; - # dialect: spark2, spark, databricks CONCAT(tbl.date_col, tbl.int_col); STRING;