Skip to content
Open
2 changes: 1 addition & 1 deletion sqlglot-integration-tests
6 changes: 6 additions & 0 deletions sqlglot/dialects/dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,12 @@ class Dialect(metaclass=_Dialect):
e.g JSON_VALUE vs JSON_EXTRACT_SCALAR in BigQuery
"""

PRESERVE_ORIGINAL_OUTPUT_NAME_CASE: bool = False
"""
Whether the dialect preserves the original case of column aliases. When True,
qualify_outputs will not apply normalize_identifier to synthesized aliases.
"""

LOG_BASE_FIRST: bool | None = True
"""
Whether the base comes first in the `LOG` function.
Expand Down
1 change: 1 addition & 0 deletions sqlglot/dialects/spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class Spark(Spark2):
SUPPORTS_LIMIT_ALL = True
SUPPORTS_NULL_TYPE = True
ARRAY_FUNCS_PROPAGATES_NULLS = True
PRESERVE_ORIGINAL_OUTPUT_NAME_CASE = True
EXPRESSION_METADATA = EXPRESSION_METADATA.copy()

class Tokenizer(Spark2.Tokenizer):
Expand Down
3 changes: 2 additions & 1 deletion sqlglot/generators/tsql.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def qualify_derived_table_outputs(expression: exp.Expr) -> exp.Expr:
and isinstance(alias, exp.TableAlias)
and not alias.columns
):
from sqlglot.dialects.tsql import TSQL
from sqlglot.optimizer.qualify_columns import qualify_outputs

# We keep track of the unaliased column projection indexes instead of the expressions
Expand All @@ -84,7 +85,7 @@ def qualify_derived_table_outputs(expression: exp.Expr) -> exp.Expr:
i for i, c in enumerate(query.selects) if isinstance(c, exp.Column) and not c.alias
)

qualify_outputs(query)
qualify_outputs(query, dialect=TSQL())

# Preserve the quoting information of columns for newly added Alias nodes
query_selects = query.selects
Expand Down
52 changes: 47 additions & 5 deletions sqlglot/optimizer/qualify_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def qualify_columns(
pseudocolumns,
annotator,
)
qualify_outputs(scope)
qualify_outputs(scope, dialect=dialect)

_expand_group_by(scope, dialect)

Expand Down Expand Up @@ -831,7 +831,8 @@ def _expand_stars(
continue

for table in tables:
if table not in scope.sources:
source = scope.sources.get(table)
if source is None:
raise OptimizeError(f"Unknown table: {table}")

columns = resolver.get_source_columns(table, only_visible=True)
Expand All @@ -848,6 +849,15 @@ def _expand_stars(
renamed_columns = rename_columns.get(table_id, {})
replaced_columns = replace_columns.get(table_id, {})

# Preserve case-sensitivity of quoted source columns when expanding stars,
# so the generated alias isn't folded by dialect normalization
source_expression = source.expression if isinstance(source, Scope) else None
quoted_columns = (
{s.output_name: _output_identifier_quoted(s) for s in source_expression.selects}
if isinstance(source_expression, exp.Query)
else {}
)
Comment on lines +855 to +859

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: I'd make this a set.

Suggested change
quoted_columns = (
{s.output_name: _output_identifier_quoted(s) for s in source_expression.selects}
if isinstance(source_expression, exp.Query)
else {}
)
quoted_columns = {
s.output_name
for s in source_expression.selects
if isinstance(source_expression, exp.Query) and _output_identifier_quoted(s)
}

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the way, is this costly to construct inside the `for table in tables` loop instead of once? Doesn't this repeat work? Is there a better way to implement this?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure there's a better way. For example, I don't see whether (or how) this could be reliably accessed in the `for name in columns:` loop.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah feel free to ignore my last comment above. It's fine.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

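That snippet crashes the test suite. The `if isinstance(source_expression, exp.Query)` check needs to run outside of the loop, not inside: the comprehension evaluates `source_expression.selects` before the filter is applied, so it fails whenever `source_expression` is `None`.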


if pivot:
pivot_columns = pivot.output_columns(columns) or pivot.alias_column_names

Expand Down Expand Up @@ -875,7 +885,19 @@ def _expand_stars(
)
else:
alias_ = renamed_columns.get(name, name)
selection_expr = replaced_columns.get(name) or exp.column(name, table=table)
quoted = quoted_columns.get(name) or (
# if it has characters that the dialect would have changed, infer that it was quoted.
isinstance(source, exp.Table) and dialect.case_sensitive(name)
)
selection_expr = replaced_columns.get(name) or exp.column(
name, table=table, quoted=quoted
)
if (
quoted
and isinstance(selection_expr, exp.Column)
and not selection_expr.this.quoted
):
selection_expr.this.set("quoted", True)
new_selections.append(
alias(selection_expr, alias_, copy=False)
if alias_ != name
Expand All @@ -887,6 +909,18 @@ def _expand_stars(
scope_expression.set("expressions", new_selections)


def _output_identifier_quoted(selection: exp.Expr) -> bool:
    """
    Tell whether the identifier that names a projection's output column is quoted.

    Only two projection shapes carry such an identifier: an `exp.Alias` (its `alias`
    arg) and an `exp.Column` (its `this` arg). Any other projection, or one whose
    naming node is not an `exp.Identifier`, is reported as not quoted.
    """
    identifier = None
    if isinstance(selection, exp.Alias):
        identifier = selection.args.get("alias")
    elif isinstance(selection, exp.Column):
        identifier = selection.this

    # Guard first: the naming node may be absent or a non-identifier expression.
    if not isinstance(identifier, exp.Identifier):
        return False

    return identifier.quoted


def _add_ilike_columns(expression: exp.Expr) -> str | None:
ilike = expression.args.get("ilike")

Expand Down Expand Up @@ -936,7 +970,7 @@ def _add_replace_columns(
replace_columns[id(table)] = columns


def qualify_outputs(scope_or_expression: Scope | exp.Expr) -> None:
def qualify_outputs(scope_or_expression: Scope | exp.Expr, dialect: Dialect) -> None:
"""Ensure all output columns are aliased"""
if isinstance(scope_or_expression, exp.Expr):
scope = build_scope(scope_or_expression)
Expand All @@ -960,13 +994,21 @@ def qualify_outputs(scope_or_expression: Scope | exp.Expr) -> None:

if isinstance(selection, exp.Subquery):
if not selection.output_name:
selection.set("alias", exp.TableAlias(this=exp.to_identifier(f"_col_{i}")))
alias_identifier = exp.to_identifier(f"_col_{i}")
if dialect and not (dialect.PRESERVE_ORIGINAL_OUTPUT_NAME_CASE):
dialect.normalize_identifier(alias_identifier)
selection.set("alias", exp.TableAlias(this=alias_identifier))
elif not isinstance(selection, (exp.Alias, exp.Aliases)) and not selection.is_star:
source_quoted = isinstance(selection, exp.Column) and selection.this.quoted
Comment thread
georgesittas marked this conversation as resolved.
selection = alias(
selection,
alias=selection.output_name or f"_col_{i}",
copy=False,
)
if source_quoted:
selection.args["alias"].set("quoted", True)
if dialect and not (dialect.PRESERVE_ORIGINAL_OUTPUT_NAME_CASE):
dialect.normalize_identifier(selection.args["alias"])
if aliased_column:
selection.set("alias", exp.to_identifier(aliased_column))

Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/optimizer/optimizer.sql
Original file line number Diff line number Diff line change
Expand Up @@ -1588,5 +1588,5 @@ CROSS JOIN LATERAL FLATTEN(input => "OBJ"."DATA") AS "F"("SEQ", "KEY", "PATH", "
SELECT array_agg(id) WITHIN GROUP (ORDER BY id) OVER (PARTITION BY grp) FROM t;
SELECT
ARRAY_AGG("T"."ID") WITHIN GROUP (ORDER BY
"T"."ID") OVER (PARTITION BY "T"."GRP") AS "_col_0"
"T"."ID") OVER (PARTITION BY "T"."GRP") AS "_COL_0"
FROM "T" AS "T";
8 changes: 4 additions & 4 deletions tests/fixtures/optimizer/pushdown_projections.sql
Original file line number Diff line number Diff line change
Expand Up @@ -122,23 +122,23 @@ SELECT _0.a AS a, _0.b AS b FROM (WITH cte1 AS (SELECT 1 AS a, 2 AS b) SELECT ct

# dialect: snowflake
SELECT OBJECT_CONSTRUCT(*) FROM (SELECT a, b FROM x) AS t;
SELECT OBJECT_CONSTRUCT(*) AS _col_0 FROM (SELECT a AS a, b AS b FROM x AS x) AS t;
SELECT OBJECT_CONSTRUCT(*) AS _COL_0 FROM (SELECT a AS A, b AS B FROM x AS x) AS t;

# dialect: snowflake
WITH base AS (SELECT 1 AS a, 2 AS b, 3 AS c, 4 AS d) SELECT OBJECT_INSERT(OBJECT_CONSTRUCT(*), 'e', 5) FROM base;
WITH base AS (SELECT 1 AS a, 2 AS b, 3 AS c, 4 AS d) SELECT OBJECT_INSERT(OBJECT_CONSTRUCT(*), 'e', 5) AS _col_0 FROM base AS base;
WITH base AS (SELECT 1 AS a, 2 AS b, 3 AS c, 4 AS d) SELECT OBJECT_INSERT(OBJECT_CONSTRUCT(*), 'e', 5) AS _COL_0 FROM base AS base;

# dialect: snowflake
WITH base AS (SELECT 1 AS a, 2 AS b, 3 AS c, 4 AS d) SELECT obj:A, obj:B FROM (SELECT OBJECT_INSERT(OBJECT_CONSTRUCT(*), 'e', 5) AS obj, a FROM base) AS t;
WITH base AS (SELECT 1 AS a, 2 AS b, 3 AS c, 4 AS d) SELECT GET_PATH(t.obj, 'A') AS A, GET_PATH(t.obj, 'B') AS B FROM (SELECT OBJECT_INSERT(OBJECT_CONSTRUCT(*), 'e', 5) AS obj FROM base AS base) AS t;

# dialect: snowflake
WITH cte AS (SELECT 1 AS a, 2 as b) SELECT HASH_AGG(*) FROM cte;
WITH cte AS (SELECT 1 AS a, 2 AS b) SELECT HASH_AGG(*) AS _col_0 FROM cte AS cte;
WITH cte AS (SELECT 1 AS a, 2 AS b) SELECT HASH_AGG(*) AS _COL_0 FROM cte AS cte;

# dialect: snowflake
WITH cte AS (SELECT a, b FROM x) SELECT COUNT(* EXCLUDE a) FROM cte;
WITH cte AS (SELECT a AS a, b AS b FROM x AS x) SELECT COUNT(* EXCLUDE (a)) AS _col_0 FROM cte AS cte;
WITH cte AS (SELECT a AS A, b AS B FROM x AS x) SELECT COUNT(* EXCLUDE (a)) AS _COL_0 FROM cte AS cte;

WITH cte1 AS (SELECT a, SUM(b) AS sale FROM x GROUP BY a), cte2 AS (SELECT cte1.a, COUNT(*) AS cnt FROM cte1 GROUP BY cte1.a) SELECT a, cnt FROM cte2;
WITH cte1 AS (SELECT x.a AS a FROM x AS x GROUP BY x.a), cte2 AS (SELECT cte1.a AS a, COUNT(*) AS cnt FROM cte1 AS cte1 GROUP BY cte1.a) SELECT cte2.a AS a, cte2.cnt AS cnt FROM cte2 AS cte2;
Expand Down
4 changes: 2 additions & 2 deletions tests/fixtures/optimizer/qualify_columns.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ SELECT a FROM x;
SELECT x.a AS a FROM x AS x;

SELECT "a" FROM x;
SELECT x."a" AS a FROM x AS x;
SELECT x."a" AS "a" FROM x AS x;

# execute: false
SELECT a FROM zz GROUP BY a ORDER BY a;
Expand Down Expand Up @@ -110,7 +110,7 @@ SELECT T."col" AS "col" FROM TBL T;
# execute: false
# dialect: oracle
WITH base AS (SELECT x.dummy AS COL_1 FROM dual x) SELECT b."COL_1" FROM base b;
WITH BASE AS (SELECT X.DUMMY AS COL_1 FROM DUAL X) SELECT B."COL_1" AS COL_1 FROM BASE B;
WITH BASE AS (SELECT X.DUMMY AS COL_1 FROM DUAL X) SELECT B."COL_1" AS "COL_1" FROM BASE B;

# execute: false
-- this query seems to be invalid in postgres and duckdb but valid in bigquery
Expand Down
9 changes: 4 additions & 5 deletions tests/test_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1345,9 +1345,8 @@ def test_canonicalize_internal_names(self):
self.assertEqual(canon_pg_a, canon_pg_qa)

# In Snowflake (upper-folding), unquoted `a` becomes `A`, while quoted `"a"` stays
# lowercase — they reference *different* columns. Base-table names are preserved,
# and the quote state on the lowercase column is retained because dropping it
# would let Snowflake re-case-fold `a` back to `A` (changing semantics).
# lowercase — they reference *different* columns. The generated alias for the quoted
# column keeps its exact spelling, since folding it would re-case-fold `a` back to `A`.
sf_schema = {"X": {"A": "INT", '"a"': "INT"}}
canon_sf = qualify_then_canonicalize(
parse_one('SELECT a, "a" FROM x', dialect="snowflake"),
Expand Down Expand Up @@ -2424,7 +2423,7 @@ def test_quotes(self):
schema = {
"example": {
'"source"': {
"id": "text",
'"ID"': "text",
'"name"': "text",
'"payload"': "text",
}
Expand Down Expand Up @@ -2762,7 +2761,7 @@ def _parse_and_optimize(query: str, dialect: str) -> exp.Expr:
sql = _parse_and_optimize("SELECT col:A.a, col:a.A FROM t", dialect="snowflake")
assert (
sql
== '''SELECT GET_PATH("T"."COL", 'A.a') AS "a", GET_PATH("T"."COL", 'a.A') AS "A" FROM "T" AS "T"'''
== '''SELECT GET_PATH("T"."COL", 'A.a') AS "A", GET_PATH("T"."COL", 'a.A') AS "A" FROM "T" AS "T"'''
Comment thread
georgesittas marked this conversation as resolved.
)

query = parse_one(
Expand Down
Loading