Skip to content

Commit 3117d76

Browse files
authored
Fix!: exclude Semicolon expressions from model state (#4257)
1 parent 99cca24 commit 3117d76

File tree

3 files changed

+164
-1
lines changed

3 files changed

+164
-1
lines changed

sqlmesh/core/model/common.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,9 @@ def parse_expression(
262262

263263
if isinstance(v, list):
264264
return [
265-
d.parse_one(e, dialect=dialect) if not isinstance(e, exp.Expression) else e for e in v
265+
e if isinstance(e, exp.Expression) else d.parse_one(e, dialect=dialect)
266+
for e in v
267+
if not isinstance(e, exp.Semicolon)
266268
]
267269

268270
if isinstance(v, str):
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
"""
2+
This script's goal is to warn users if there are two adjacent expressions in a SQL
3+
model that are equivalent.
4+
5+
Context:
6+
7+
We used to include `Semicolon` expressions in the model's state, which led to a bug
8+
where the expression preceding the semicolon would be duplicated in pre_statements
9+
or post_statements. For example, the query in the model below would be incorrectly
10+
included in its post_statements list:
11+
12+
```
13+
MODEL (
14+
name test
15+
);
16+
17+
SELECT 1 AS c;
18+
19+
-- foo
20+
```
21+
22+
We now don't include `Semicolon` expressions in the model's state, which fixes this
23+
issue, but unfortunately migrating existing snapshots is not possible because we do
24+
not have a signal in state to detect whether an expression was incorrectly duplicated.
25+
26+
If a SQL model suffered from this issue, then there would be two adjacent equivalent
27+
expressions in it, so we use that as a heuristic to warn the user accordingly.
28+
"""
29+
30+
import json
31+
32+
from sqlglot import exp
33+
34+
from sqlmesh.core.console import get_console
35+
36+
37+
def migrate(state_sync, **kwargs): # type: ignore
38+
engine_adapter = state_sync.engine_adapter
39+
schema = state_sync.schema
40+
snapshots_table = "_snapshots"
41+
if schema:
42+
snapshots_table = f"{schema}.{snapshots_table}"
43+
44+
warning = (
45+
"SQLMesh detected that it may not be able to fully migrate the state database. This should not impact "
46+
"the migration process, but may result in unexpected changes being reported by the next `sqlmesh plan` "
47+
"command. Please run `sqlmesh diff prod` after the migration has completed, before making any new "
48+
"changes. If any unexpected changes are reported, consider running a forward-only plan to apply these "
49+
"changes and avoid unnecessary backfills: sqlmesh plan prod --forward-only. "
50+
"See https://sqlmesh.readthedocs.io/en/stable/concepts/plans/#forward-only-plans for more details.\n"
51+
)
52+
53+
for (snapshot,) in engine_adapter.fetchall(
54+
exp.select("snapshot").from_(snapshots_table), quote_identifiers=True
55+
):
56+
parsed_snapshot = json.loads(snapshot)
57+
node = parsed_snapshot["node"]
58+
59+
if node.get("source_type") == "sql":
60+
expressions = [
61+
*node.get("pre_statements", []),
62+
node["query"],
63+
*node.get("post_statements", []),
64+
]
65+
for e1, e2 in zip(expressions, expressions[1:]):
66+
if e1 == e2:
67+
get_console().log_warning(warning)
68+
return

tests/core/test_model.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9332,6 +9332,7 @@ def test_python_env_references_are_unequal_but_point_to_same_definition(tmp_path
93329332

93339333
db_path = str(tmp_path / "db.db")
93349334
db_connection = DuckDBConnectionConfig(database=db_path)
9335+
93359336
config = Config(
93369337
gateways={"duckdb": GatewayConfig(connection=db_connection)},
93379338
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
@@ -9447,3 +9448,95 @@ def f():
94479448

94489449
with pytest.raises(SQLMeshError, match=r"duplicate definitions found"):
94499450
Context(paths=tmp_path, config=config)
9451+
9452+
9453+
def test_semicolon_is_not_included_in_model_state(tmp_path, assert_exp_eq):
9454+
init_example_project(tmp_path, dialect="duckdb", template=ProjectTemplate.EMPTY)
9455+
9456+
db_connection = DuckDBConnectionConfig(database=str(tmp_path / "db.db"))
9457+
config = Config(
9458+
gateways={"duckdb": GatewayConfig(connection=db_connection)},
9459+
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
9460+
)
9461+
9462+
model_file = tmp_path / "models" / "model_with_semicolon.sql"
9463+
model_file.write_text(
9464+
"""
9465+
MODEL (
9466+
name sqlmesh_example.incremental_model_with_semicolon,
9467+
kind INCREMENTAL_BY_TIME_RANGE (
9468+
time_column event_date
9469+
),
9470+
start '2020-01-01',
9471+
cron '@daily',
9472+
grain (id, event_date)
9473+
);
9474+
9475+
SELECT
9476+
1 AS id,
9477+
1 AS item_id,
9478+
CAST('2020-01-01' AS DATE) AS event_date
9479+
;
9480+
9481+
--Just a comment
9482+
"""
9483+
)
9484+
9485+
ctx = Context(paths=tmp_path, config=config)
9486+
model = ctx.get_model("sqlmesh_example.incremental_model_with_semicolon")
9487+
9488+
assert not model.pre_statements
9489+
assert not model.post_statements
9490+
9491+
assert_exp_eq(
9492+
model.render_query(),
9493+
'SELECT 1 AS "id", 1 AS "item_id", CAST(\'2020-01-01\' AS DATE) AS "event_date"',
9494+
)
9495+
ctx.format()
9496+
9497+
assert (
9498+
model_file.read_text()
9499+
== """MODEL (
9500+
name sqlmesh_example.incremental_model_with_semicolon,
9501+
kind INCREMENTAL_BY_TIME_RANGE (
9502+
time_column event_date
9503+
),
9504+
start '2020-01-01',
9505+
cron '@daily',
9506+
grain (id, event_date)
9507+
);
9508+
9509+
SELECT
9510+
1 AS id,
9511+
1 AS item_id,
9512+
'2020-01-01'::DATE AS event_date;
9513+
9514+
/* Just a comment */"""
9515+
)
9516+
9517+
ctx.plan(no_prompts=True, auto_apply=True)
9518+
9519+
model_file = tmp_path / "models" / "model_with_semicolon.sql"
9520+
model_file.write_text(
9521+
"""
9522+
MODEL (
9523+
name sqlmesh_example.incremental_model_with_semicolon,
9524+
kind INCREMENTAL_BY_TIME_RANGE (
9525+
time_column event_date
9526+
),
9527+
start '2020-01-01',
9528+
cron '@daily',
9529+
grain (id, event_date)
9530+
);
9531+
9532+
SELECT
9533+
1 AS id,
9534+
1 AS item_id,
9535+
CAST('2020-01-01' AS DATE) AS event_date
9536+
"""
9537+
)
9538+
9539+
ctx.load()
9540+
plan = ctx.plan(no_prompts=True, auto_apply=True)
9541+
9542+
assert not plan.context_diff.modified_snapshots

0 commit comments

Comments
 (0)