diff --git a/sqlglot-integration-tests b/sqlglot-integration-tests index 6b0ede1e19..8f9acede6d 160000 --- a/sqlglot-integration-tests +++ b/sqlglot-integration-tests @@ -1 +1 @@ -Subproject commit 6b0ede1e19ff20ebbc3939e48eeb8f5f6db67fac +Subproject commit 8f9acede6de5aa48db36258db1d90cb628d59d17 diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py index aa6304c41e..1c5487cf24 100644 --- a/sqlglot/dialects/dialect.py +++ b/sqlglot/dialects/dialect.py @@ -379,6 +379,12 @@ class Dialect(metaclass=_Dialect): e.g JSON_VALUE vs JSON_EXTRACT_SCALAR in BigQuery """ + PRESERVE_ORIGINAL_OUTPUT_NAME_CASE: bool = False + """ + Whether the dialect preserves the original case of column aliases. When True, + qualify_outputs will not apply normalize_identifier to synthesized aliases. + """ + LOG_BASE_FIRST: bool | None = True """ Whether the base comes first in the `LOG` function. diff --git a/sqlglot/dialects/spark.py b/sqlglot/dialects/spark.py index 0cc9ef7d07..4cf52dde15 100644 --- a/sqlglot/dialects/spark.py +++ b/sqlglot/dialects/spark.py @@ -14,6 +14,7 @@ class Spark(Spark2): SUPPORTS_LIMIT_ALL = True SUPPORTS_NULL_TYPE = True ARRAY_FUNCS_PROPAGATES_NULLS = True + PRESERVE_ORIGINAL_OUTPUT_NAME_CASE = True EXPRESSION_METADATA = EXPRESSION_METADATA.copy() class Tokenizer(Spark2.Tokenizer): diff --git a/sqlglot/generators/tsql.py b/sqlglot/generators/tsql.py index 88de0bfc25..60c86a9f2a 100644 --- a/sqlglot/generators/tsql.py +++ b/sqlglot/generators/tsql.py @@ -74,6 +74,7 @@ def qualify_derived_table_outputs(expression: exp.Expr) -> exp.Expr: and isinstance(alias, exp.TableAlias) and not alias.columns ): + from sqlglot.dialects.tsql import TSQL from sqlglot.optimizer.qualify_columns import qualify_outputs # We keep track of the unaliased column projection indexes instead of the expressions @@ -84,7 +85,7 @@ def qualify_derived_table_outputs(expression: exp.Expr) -> exp.Expr: i for i, c in enumerate(query.selects) if isinstance(c, exp.Column) and not c.alias ) - qualify_outputs(query) + qualify_outputs(query, dialect=TSQL()) # Preserve the quoting information of columns for newly added Alias nodes query_selects = query.selects diff --git a/sqlglot/optimizer/qualify_columns.py b/sqlglot/optimizer/qualify_columns.py index a976c02a57..162d80ef18 100644 --- a/sqlglot/optimizer/qualify_columns.py +++ b/sqlglot/optimizer/qualify_columns.py @@ -101,7 +101,7 @@ def qualify_columns( pseudocolumns, annotator, ) - qualify_outputs(scope) + qualify_outputs(scope, dialect=dialect) _expand_group_by(scope, dialect) @@ -831,7 +831,8 @@ def _expand_stars( continue for table in tables: - if table not in scope.sources: + source = scope.sources.get(table) + if source is None: raise OptimizeError(f"Unknown table: {table}") columns = resolver.get_source_columns(table, only_visible=True) @@ -848,6 +849,15 @@ def _expand_stars( renamed_columns = rename_columns.get(table_id, {}) replaced_columns = replace_columns.get(table_id, {}) + # Preserve case-sensitivity of quoted source columns when expanding stars, + # so the generated alias isn't folded by dialect normalization + source_expression = source.expression if isinstance(source, Scope) else None + quoted_columns = ( + {s.output_name: _output_identifier_quoted(s) for s in source_expression.selects} + if isinstance(source_expression, exp.Query) + else {} + ) + if pivot: pivot_columns = pivot.output_columns(columns) or pivot.alias_column_names @@ -875,7 +885,19 @@ def _expand_stars( ) else: alias_ = renamed_columns.get(name, name) - selection_expr = replaced_columns.get(name) or exp.column(name, table=table) + quoted = quoted_columns.get(name) or ( + # if it has characters that the dialect would have changed, infer that it was quoted. + isinstance(source, exp.Table) and dialect.case_sensitive(name) + ) + selection_expr = replaced_columns.get(name) or exp.column( + name, table=table, quoted=quoted + ) + if ( + quoted + and isinstance(selection_expr, exp.Column) + and not selection_expr.this.quoted + ): + selection_expr.this.set("quoted", True) new_selections.append( alias(selection_expr, alias_, copy=False) if alias_ != name @@ -887,6 +909,18 @@ def _expand_stars( scope_expression.set("expressions", new_selections) +def _output_identifier_quoted(selection: exp.Expr) -> bool: + """Whether a projection's output column name is a quoted (case-sensitive) identifier.""" + if isinstance(selection, exp.Alias): + identifier = selection.args.get("alias") + elif isinstance(selection, exp.Column): + identifier = selection.this + else: + identifier = None + + return isinstance(identifier, exp.Identifier) and identifier.quoted + + def _add_ilike_columns(expression: exp.Expr) -> str | None: ilike = expression.args.get("ilike") @@ -936,7 +970,7 @@ def _add_replace_columns( replace_columns[id(table)] = columns -def qualify_outputs(scope_or_expression: Scope | exp.Expr) -> None: +def qualify_outputs(scope_or_expression: Scope | exp.Expr, dialect: Dialect) -> None: """Ensure all output columns are aliased""" if isinstance(scope_or_expression, exp.Expr): scope = build_scope(scope_or_expression) @@ -960,13 +994,21 @@ def qualify_outputs(scope_or_expression: Scope | exp.Expr) -> None: if isinstance(selection, exp.Subquery): if not selection.output_name: - selection.set("alias", exp.TableAlias(this=exp.to_identifier(f"_col_{i}"))) + alias_identifier = exp.to_identifier(f"_col_{i}") + if dialect and not (dialect.PRESERVE_ORIGINAL_OUTPUT_NAME_CASE): + dialect.normalize_identifier(alias_identifier) + selection.set("alias", exp.TableAlias(this=alias_identifier)) elif not isinstance(selection, (exp.Alias, exp.Aliases)) and not selection.is_star: + source_quoted = isinstance(selection, exp.Column) and selection.this.quoted selection = alias( selection, alias=selection.output_name or f"_col_{i}", copy=False, ) + if source_quoted: + selection.args["alias"].set("quoted", True) + if dialect and not (dialect.PRESERVE_ORIGINAL_OUTPUT_NAME_CASE): + dialect.normalize_identifier(selection.args["alias"]) if aliased_column: selection.set("alias", exp.to_identifier(aliased_column)) diff --git a/tests/fixtures/optimizer/optimizer.sql b/tests/fixtures/optimizer/optimizer.sql index 21fdab1c65..bea5ffa2d4 100644 --- a/tests/fixtures/optimizer/optimizer.sql +++ b/tests/fixtures/optimizer/optimizer.sql @@ -1588,5 +1588,5 @@ CROSS JOIN LATERAL FLATTEN(input => "OBJ"."DATA") AS "F"("SEQ", "KEY", "PATH", " SELECT array_agg(id) WITHIN GROUP (ORDER BY id) OVER (PARTITION BY grp) FROM t; SELECT ARRAY_AGG("T"."ID") WITHIN GROUP (ORDER BY - "T"."ID") OVER (PARTITION BY "T"."GRP") AS "_col_0" + "T"."ID") OVER (PARTITION BY "T"."GRP") AS "_COL_0" FROM "T" AS "T"; \ No newline at end of file diff --git a/tests/fixtures/optimizer/pushdown_projections.sql b/tests/fixtures/optimizer/pushdown_projections.sql index 02bdda7404..bf6f141290 100644 --- a/tests/fixtures/optimizer/pushdown_projections.sql +++ b/tests/fixtures/optimizer/pushdown_projections.sql @@ -122,11 +122,11 @@ SELECT _0.a AS a, _0.b AS b FROM (WITH cte1 AS (SELECT 1 AS a, 2 AS b) SELECT ct # dialect: snowflake SELECT OBJECT_CONSTRUCT(*) FROM (SELECT a, b FROM x) AS t; -SELECT OBJECT_CONSTRUCT(*) AS _col_0 FROM (SELECT a AS a, b AS b FROM x AS x) AS t; +SELECT OBJECT_CONSTRUCT(*) AS _COL_0 FROM (SELECT a AS A, b AS B FROM x AS x) AS t; # dialect: snowflake WITH base AS (SELECT 1 AS a, 2 AS b, 3 AS c, 4 AS d) SELECT OBJECT_INSERT(OBJECT_CONSTRUCT(*), 'e', 5) FROM base; -WITH base AS (SELECT 1 AS a, 2 AS b, 3 AS c, 4 AS d) SELECT OBJECT_INSERT(OBJECT_CONSTRUCT(*), 'e', 5) AS _col_0 FROM base AS base; +WITH base AS (SELECT 1 AS a, 2 AS b, 3 AS c, 4 AS d) SELECT OBJECT_INSERT(OBJECT_CONSTRUCT(*), 'e', 5) AS _COL_0 FROM base AS base; # dialect: snowflake WITH base AS (SELECT 1 AS a, 2 AS b, 3 AS c, 4 AS d) SELECT obj:A, obj:B FROM (SELECT OBJECT_INSERT(OBJECT_CONSTRUCT(*), 'e', 5) AS obj, a FROM base) AS t; @@ -134,11 +134,11 @@ WITH base AS (SELECT 1 AS a, 2 AS b, 3 AS c, 4 AS d) SELECT GET_PATH(t.obj, 'A') # dialect: snowflake WITH cte AS (SELECT 1 AS a, 2 as b) SELECT HASH_AGG(*) FROM cte; -WITH cte AS (SELECT 1 AS a, 2 AS b) SELECT HASH_AGG(*) AS _col_0 FROM cte AS cte; +WITH cte AS (SELECT 1 AS a, 2 AS b) SELECT HASH_AGG(*) AS _COL_0 FROM cte AS cte; # dialect: snowflake WITH cte AS (SELECT a, b FROM x) SELECT COUNT(* EXCLUDE a) FROM cte; -WITH cte AS (SELECT a AS a, b AS b FROM x AS x) SELECT COUNT(* EXCLUDE (a)) AS _col_0 FROM cte AS cte; +WITH cte AS (SELECT a AS A, b AS B FROM x AS x) SELECT COUNT(* EXCLUDE (a)) AS _COL_0 FROM cte AS cte; WITH cte1 AS (SELECT a, SUM(b) AS sale FROM x GROUP BY a), cte2 AS (SELECT cte1.a, COUNT(*) AS cnt FROM cte1 GROUP BY cte1.a) SELECT a, cnt FROM cte2; WITH cte1 AS (SELECT x.a AS a FROM x AS x GROUP BY x.a), cte2 AS (SELECT cte1.a AS a, COUNT(*) AS cnt FROM cte1 AS cte1 GROUP BY cte1.a) SELECT cte2.a AS a, cte2.cnt AS cnt FROM cte2 AS cte2; diff --git a/tests/fixtures/optimizer/qualify_columns.sql b/tests/fixtures/optimizer/qualify_columns.sql index f75a0a3ac4..35a5824fea 100644 --- a/tests/fixtures/optimizer/qualify_columns.sql +++ b/tests/fixtures/optimizer/qualify_columns.sql @@ -5,7 +5,7 @@ SELECT a FROM x; SELECT x.a AS a FROM x AS x; SELECT "a" FROM x; -SELECT x."a" AS a FROM x AS x; +SELECT x."a" AS "a" FROM x AS x; # execute: false SELECT a FROM zz GROUP BY a ORDER BY a; @@ -110,7 +110,7 @@ SELECT T."col" AS "col" FROM TBL T; # execute: false # dialect: oracle WITH base AS (SELECT x.dummy AS COL_1 FROM dual x) SELECT b."COL_1" FROM base b; -WITH BASE AS (SELECT X.DUMMY AS COL_1 FROM DUAL X) SELECT B."COL_1" AS COL_1 FROM BASE B; +WITH BASE AS (SELECT X.DUMMY AS COL_1 FROM DUAL X) SELECT B."COL_1" AS "COL_1" FROM BASE B; # execute: false -- this query seems to be invalid in postgres and duckdb but valid in bigquery diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 24a8c6b80e..a792f14688 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -1345,9 +1345,8 @@ def test_canonicalize_internal_names(self): self.assertEqual(canon_pg_a, canon_pg_qa) # In Snowflake (upper-folding), unquoted `a` becomes `A`, while quoted `"a"` stays - # lowercase — they reference *different* columns. Base-table names are preserved, - # and the quote state on the lowercase column is retained because dropping it - # would let Snowflake re-case-fold `a` back to `A` (changing semantics). + # lowercase — they reference *different* columns. The generated alias for the quoted + # column keeps its exact spelling, since folding it would re-case-fold `a` back to `A`. sf_schema = {"X": {"A": "INT", '"a"': "INT"}} canon_sf = qualify_then_canonicalize( parse_one('SELECT a, "a" FROM x', dialect="snowflake"), @@ -2424,7 +2423,7 @@ def test_quotes(self): schema = { "example": { '"source"': { - "id": "text", + '"ID"': "text", '"name"': "text", '"payload"': "text", } @@ -2762,7 +2761,7 @@ def _parse_and_optimize(query: str, dialect: str) -> exp.Expr: sql = _parse_and_optimize("SELECT col:A.a, col:a.A FROM t", dialect="snowflake") assert ( sql - == '''SELECT GET_PATH("T"."COL", 'A.a') AS "a", GET_PATH("T"."COL", 'a.A') AS "A" FROM "T" AS "T"''' + == '''SELECT GET_PATH("T"."COL", 'A.a') AS "A", GET_PATH("T"."COL", 'a.A') AS "A" FROM "T" AS "T"''' ) query = parse_one(