From dc96f373ec6573d899b367bb15419616c78761c3 Mon Sep 17 00:00:00 2001 From: "richard.hughes" Date: Tue, 26 May 2026 19:23:50 -0700 Subject: [PATCH 1/2] test(databricks): assert LCT string promotion for COALESCE/IF/CASE [CLAUDE] Databricks string-promotes least-common-type functions when an argument is text and the rest are non-boolean/non-binary atomics. Verified on a Databricks serverless (always-ANSI) warehouse: typeof(coalesce(cast(1 as int), 'abc')) -> string typeof(coalesce(cast(1.5 as double), 'abc')) -> string typeof(coalesce(cast('2020-01-01' as date), 'abc')) -> string typeof(coalesce(interval 1 day, 'abc')) -> string typeof(if(true, cast(1 as int), 'abc')) -> string typeof(case when true then cast(1 as int) else 'abc' end) -> string boolean+string and binary+string raise DATATYPE_MISMATCH in Databricks, so those rows keep their current (best-effort) annotation. These fixtures fail until the annotator fix lands. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../fixtures/optimizer/annotate_functions.sql | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/tests/fixtures/optimizer/annotate_functions.sql b/tests/fixtures/optimizer/annotate_functions.sql index 4487444e34..2cef6f271f 100644 --- a/tests/fixtures/optimizer/annotate_functions.sql +++ b/tests/fixtures/optimizer/annotate_functions.sql @@ -307,11 +307,11 @@ STRING; # dialect: databricks IF(cond, tbl.str_col, tbl.double_col); -DOUBLE; +STRING; # dialect: databricks IF(cond, tbl.double_col, tbl.str_col); -DOUBLE; +STRING; # dialect: hive, spark2, spark IF(cond, tbl.date_col, tbl.str_col); @@ -323,11 +323,11 @@ STRING; # dialect: databricks IF(cond, tbl.date_col, tbl.str_col); -DATE; +STRING; # dialect: databricks IF(cond, tbl.str_col, tbl.date_col); -DATE; +STRING; # dialect: hive, spark2, spark, databricks IF(cond, tbl.date_col, tbl.timestamp_col); @@ -371,19 +371,19 @@ STRING; # dialect: databricks COALESCE(tbl.str_col, tbl.bigint_col); -BIGINT; +STRING; # 
dialect: databricks COALESCE(tbl.bigint_col, tbl.str_col); -BIGINT; +STRING; # dialect: databricks COALESCE(tbl.str_col, NULL, tbl.bigint_col); -BIGINT; +STRING; # dialect: databricks COALESCE(tbl.bigint_col, NULL, tbl.str_col); -BIGINT; +STRING; # dialect: databricks COALESCE(tbl.bool_col, tbl.str_col); @@ -395,12 +395,28 @@ STRING; # dialect: databricks COALESCE(tbl.interval_col, tbl.str_col); -INTERVAL; +STRING; # dialect: databricks COALESCE(tbl.bin_col, tbl.str_col); BINARY; +# dialect: databricks +COALESCE(tbl.int_col, tbl.str_col); +STRING; + +# dialect: databricks +NVL(tbl.int_col, tbl.str_col); +STRING; + +# dialect: databricks +CASE WHEN cond THEN tbl.int_col ELSE tbl.str_col END; +STRING; + +# dialect: databricks +COALESCE(tbl.int_col, tbl.bigint_col); +BIGINT; + # dialect: spark, databricks LOCALTIMESTAMP(); TIMESTAMPNTZ; From 6d5187ef6ce9e8da3bbcc102accee1031732ba64 Mon Sep 17 00:00:00 2001 From: "richard.hughes" Date: Tue, 26 May 2026 19:34:45 -0700 Subject: [PATCH 2/2] fix(databricks): string-promote COALESCE/IF/CASE per findWiderCommonType [CLAUDE] Databricks resolves least-common-type functions (COALESCE/IFNULL/NVL, IF, CASE) with Spark's findWiderCommonType string promotion: when a value argument is text and the rest are non-boolean/non-binary atomics, the result is text. Previously these folded through Databricks' COERCES_TO lattice (text coerces into numeric/temporal), so coalesce(int_col, str_col) annotated as BIGINT instead of STRING, independent of argument order. Verified on a Databricks serverless (always-ANSI) warehouse: coalesce/if/case of text + numeric/date/interval -> string; text + boolean/binary raises DATATYPE_MISMATCH, so those defer to the existing numeric-widening fallback. Note this diverges from open-source Spark AnsiTypeCoercion (which promotes string+int to long under ANSI) -- Databricks SQL string-promotes these functions regardless of ANSI mode. 
Scoped to Databricks; Spark/Hive (non-ANSI, already string-promoting via their inverted lattice) and other dialects are unaffected. Co-Authored-By: Claude Opus 4.7 (1M context) --- sqlglot/dialects/databricks.py | 44 ++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/sqlglot/dialects/databricks.py b/sqlglot/dialects/databricks.py index 44291626d0..ce3b9c411f 100644 --- a/sqlglot/dialects/databricks.py +++ b/sqlglot/dialects/databricks.py @@ -9,6 +9,43 @@ from sqlglot.parsers.databricks import DatabricksParser from sqlglot.tokens import TokenType from sqlglot.optimizer.annotate_types import TypeAnnotator +from sqlglot.typing.spark import EXPRESSION_METADATA as SPARK_EXPRESSION_METADATA + + +def _string_promotes(values: list[exp.Expression]) -> bool: + """ + Whether a least-common-type function string-promotes given its value arguments. + + Databricks resolves COALESCE/IF/CASE via Spark's findWiderCommonType string + promotion: when an argument is text and the rest are non-boolean/non-binary + atomics, the common type is text. boolean+text and binary+text have no common + type (query-time DATATYPE_MISMATCH), so we defer those to numeric widening. 
+ """ + return any(v.is_type(*exp.DataType.TEXT_TYPES) for v in values) and not any( + v.is_type(exp.DType.BOOLEAN, exp.DType.BINARY) for v in values + ) + + +def _annotate_coalesce(self: TypeAnnotator, e: exp.Coalesce) -> exp.Coalesce: + if _string_promotes([v for v in (e.this, *e.expressions) if v]): + self._set_type(e, exp.DType.TEXT) + return e + return self._annotate_by_args(e, "this", "expressions", promote=True) + + +def _annotate_if(self: TypeAnnotator, e: exp.If) -> exp.If: + if _string_promotes([v for v in (e.args.get("true"), e.args.get("false")) if v]): + self._set_type(e, exp.DType.TEXT) + return e + return self._annotate_by_args(e, "true", "false", promote=True) + + +def _annotate_case(self: TypeAnnotator, e: exp.Case) -> exp.Case: + thens = [if_expr.args["true"] for if_expr in e.args["ifs"]] + if _string_promotes([v for v in (*thens, e.args.get("default")) if v]): + self._set_type(e, exp.DType.TEXT) + return e + return self._annotate_by_args(e, *thens, "default") class Databricks(Spark): @@ -25,6 +62,13 @@ class Databricks(Spark): exp.DType.INTERVAL, } + EXPRESSION_METADATA = { + **SPARK_EXPRESSION_METADATA, + exp.Coalesce: {"annotator": _annotate_coalesce}, + exp.If: {"annotator": _annotate_if}, + exp.Case: {"annotator": _annotate_case}, + } + class JSONPathTokenizer(Spark.JSONPathTokenizer): IDENTIFIERS = ["`", '"']