Skip to content

Commit 765b737

Browse files
authored
Fix: Create a temporary view to source the model schema instead of parsing the Redshift plan (#2308)
1 parent c1ce829 commit 765b737

File tree

4 files changed

+52
-882
lines changed

4 files changed

+52
-882
lines changed

sqlmesh/core/engine_adapter/base_postgres.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,11 @@ def columns(
3535
sql = (
3636
exp.select("column_name", "data_type")
3737
.from_(self.COLUMNS_TABLE)
38-
.where(
39-
f"table_name = '{table.alias_or_name}' AND table_schema = '{table.args['db'].name}'"
40-
)
38+
.where(f"table_name = '{table.alias_or_name}'")
4139
)
40+
if table.args.get("db"):
41+
sql = sql.where(f"table_schema = '{table.args['db'].name}'")
42+
4243
self.execute(sql)
4344
resp = self.cursor.fetchall()
4445
if not resp:

sqlmesh/core/engine_adapter/redshift.py

Lines changed: 24 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
import pandas as pd
66
from sqlglot import exp
7-
from sqlglot.errors import ErrorLevel
87

98
from sqlmesh.core.dialect import to_schema
109
from sqlmesh.core.engine_adapter.base_postgres import BasePostgresEngineAdapter
@@ -20,7 +19,7 @@
2019
SourceQuery,
2120
set_catalog,
2221
)
23-
from sqlmesh.utils.errors import SQLMeshError
22+
from sqlmesh.utils import random_id
2423

2524
if t.TYPE_CHECKING:
2625
from sqlmesh.core._typing import SchemaName, TableName
@@ -121,68 +120,36 @@ def _build_create_table_exp(
121120
and statement.expression.args.get("limit") is not None
122121
and statement.expression.args["limit"].expression.this == "0"
123122
):
123+
assert not isinstance(table_name_or_schema, exp.Schema)
124124
# redshift has a bug where CTAS statements have non deterministic types. if a limit
125125
# is applied to a ctas statement, VARCHAR types default to 1 in some instances.
126126
# this checks the explain plan from redshift and tries to detect when these optimizer
127127
# bugs occur and force a cast
128-
explain_statement = statement.copy()
129-
for select in explain_statement.find_all(exp.Select):
130-
if select.args.get("from"):
131-
select.set("limit", None)
132-
select.set("where", None)
128+
select_statement = statement.expression.copy()
129+
for select_or_union in select_statement.find_all(exp.Select, exp.Union):
130+
select_or_union.set("limit", None)
131+
select_or_union.set("where", None)
132+
133+
temp_view_name = exp.table_(f"#sqlmesh__{random_id()}")
134+
self.create_view(
135+
temp_view_name, select_statement, replace=False, no_schema_binding=False
136+
)
137+
columns_to_types_from_view = self.columns(temp_view_name)
133138

134-
explain_statement_sql = explain_statement.sql(
135-
dialect=self.dialect, identify=True, unsupported_level=ErrorLevel.IGNORE, copy=False
139+
schema = self._build_schema_exp(
140+
exp.to_table(table_name_or_schema),
141+
columns_to_types_from_view,
136142
)
137-
plan = parse_plan(
138-
"\n".join(r[0] for r in self.fetchall(f"EXPLAIN VERBOSE {explain_statement_sql}"))
143+
statement = super()._build_create_table_exp(
144+
schema,
145+
None,
146+
exists=exists,
147+
replace=replace,
148+
columns_to_types=columns_to_types_from_view,
149+
table_description=table_description,
150+
**kwargs,
139151
)
140152

141-
if plan:
142-
select = exp.Select().from_(statement.expression.subquery("_subquery"))
143-
statement.expression.replace(select)
144-
145-
for target in plan["targetlist"]: # type: ignore
146-
if target["name"] == "TARGETENTRY":
147-
resdom = target["resdom"]
148-
resname = resdom["resname"]
149-
if resname == "<>":
150-
# A synthetic column added by Redshift to compute a window function.
151-
continue
152-
if resname == "? column ?":
153-
table_name_str = (
154-
table_name_or_schema
155-
if isinstance(table_name_or_schema, str)
156-
else table_name_or_schema.sql(dialect=self.dialect)
157-
)
158-
raise SQLMeshError(f"Missing column name for table '{table_name_str}'")
159-
# https://github.com/postgres/postgres/blob/master/src/include/catalog/pg_type.dat
160-
restype = resdom["restype"]
161-
data_type: t.Optional[str] = None
162-
if restype == "1043":
163-
size = (
164-
int(resdom["restypmod"]) - 4
165-
if resdom["restypmod"] != "- 1"
166-
else "MAX"
167-
)
168-
# Cast NULL instead of the original projection to trick the planner into assigning a
169-
# correct type to the column.
170-
data_type = f"VARCHAR({size})"
171-
else:
172-
data_type = REDSHIFT_PLAN_TYPE_MAPPINGS.get(restype)
173-
174-
if data_type:
175-
select.select(
176-
exp.cast(
177-
exp.null(),
178-
data_type,
179-
dialect=self.dialect,
180-
).as_(resname),
181-
copy=False,
182-
)
183-
else:
184-
select.select(resname, copy=False)
185-
186153
return statement
187154

188155
def create_view(
@@ -209,7 +176,7 @@ def create_view(
209176
materialized,
210177
table_description=table_description,
211178
column_descriptions=column_descriptions,
212-
no_schema_binding=True,
179+
no_schema_binding=create_kwargs.pop("no_schema_binding", True),
213180
**create_kwargs,
214181
)
215182

@@ -309,90 +276,3 @@ def _get_data_objects(
309276
)
310277
for row in df.itertuples()
311278
]
312-
313-
314-
def parse_plan(plan: str) -> t.Optional[t.Dict]:
315-
"""Parse the output of a redshift explain verbose query plan into a Python dict."""
316-
from sqlglot import Tokenizer, TokenType
317-
from sqlglot.tokens import Token
318-
319-
tokens = Tokenizer().tokenize(plan)
320-
i = 0
321-
terminal_tokens = {TokenType.L_PAREN, TokenType.R_PAREN, TokenType.R_BRACE, TokenType.COLON}
322-
323-
def curr() -> t.Optional[TokenType]:
324-
return tokens[i].token_type if i < len(tokens) else None
325-
326-
def advance() -> Token:
327-
nonlocal i
328-
i += 1
329-
return tokens[i - 1]
330-
331-
def match(token_type: TokenType, raise_unmatched: bool = False) -> t.Optional[Token]:
332-
if curr() == token_type:
333-
return advance()
334-
if raise_unmatched:
335-
raise Exception(f"Expected {token_type}")
336-
return None
337-
338-
def parse_value() -> t.Any:
339-
if match(TokenType.L_PAREN):
340-
values = []
341-
while not match(TokenType.R_PAREN):
342-
values.append(parse_value())
343-
return values
344-
345-
nested = parse_nested()
346-
347-
if nested:
348-
return nested
349-
350-
value = []
351-
352-
while not curr() in terminal_tokens:
353-
value.append(advance().text)
354-
355-
return " ".join(value)
356-
357-
def parse_nested() -> t.Optional[t.Dict]:
358-
if not match(TokenType.L_BRACE):
359-
return None
360-
query_plan = {}
361-
query_plan["name"] = advance().text
362-
363-
while match(TokenType.COLON):
364-
key = advance().text
365-
366-
while match(TokenType.DOT):
367-
key += f".{advance().text}"
368-
369-
query_plan[key] = parse_value()
370-
371-
match(TokenType.R_BRACE, True)
372-
return query_plan
373-
374-
while curr():
375-
nested = parse_nested()
376-
377-
if nested and nested.get("name") in ("RESULT", "SEQSCAN"):
378-
return nested
379-
advance()
380-
return None
381-
382-
383-
# https://github.com/postgres/postgres/blob/master/src/include/catalog/pg_type.dat
384-
REDSHIFT_PLAN_TYPE_MAPPINGS = {
385-
"16": "BOOL",
386-
"18": "CHAR",
387-
"21": "SMALLINT",
388-
"23": "INT",
389-
"20": "BIGINT",
390-
"1700": "NUMERIC",
391-
"700": "FLOAT",
392-
"701": "DOUBLE",
393-
"1114": "TIMESTAMP",
394-
"1184": "TIMESTAMPTZ",
395-
"1083": "TIME",
396-
"1266": "TIMETZ",
397-
"1082": "DATE",
398-
}

sqlmesh/core/snapshot/evaluator.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1094,10 +1094,12 @@ def create(
10941094
column_descriptions=model.column_descriptions if is_table_deployable else None,
10951095
)
10961096

1097-
# only sql models have queries that can be tested
1098-
# additionally, we always create temp tables and sometimes
1099-
# we additionally created prod tables, so we only need to test one.
1100-
if model.is_sql and not is_table_deployable:
1097+
# Only sql models have queries that can be tested.
1098+
# Additionally, we always create temp tables and sometimes we additionally create prod tables,
1099+
# we need to make sure that we only dry run once.
1100+
# We also need to make sure that we don't dry run on Redshift because its planner / optimizer sometimes
1101+
# breaks on our CTAS queries due to us relying on the WHERE FALSE LIMIT 0 combo.
1102+
if model.is_sql and not is_table_deployable and self.adapter.dialect != "redshift":
11011103
logger.info("Dry running model '%s'", model.name)
11021104
self.adapter.fetchall(ctas_query)
11031105
else:

0 commit comments

Comments
 (0)