Skip to content

Commit 68ea9f6

Browse files
authored
Feat: add support for YAML dictionaries in unit tests (MVP) (#2264)
* Feat: add support for YAML dictionaries in unit tests
* Replace applymap with map since it got deprecated after pandas 2.1.0
* Revert dict logic
* Refactor
1 parent c73f3e5 commit 68ea9f6

File tree

7 files changed

+62
-46
lines changed

7 files changed

+62
-46
lines changed

sqlmesh/core/dialect.py

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,6 @@
2525

2626
SQLMESH_MACRO_PREFIX = "@"
2727

28-
JSON_TYPE = exp.DataType.build("json")
29-
3028
TABLES_META = "sqlmesh.tables"
3129

3230

@@ -942,13 +940,12 @@ def _transform(node: exp.Expression) -> exp.Expression:
942940
def transform_values(
943941
values: t.Tuple[t.Any, ...], columns_to_types: t.Dict[str, exp.DataType]
944942
) -> t.Iterator[t.Any]:
945-
"""Perform transformations on values given columns_to_types.
946-
947-
Currently, the only transformation is wrapping JSON columns with PARSE_JSON().
948-
"""
943+
"""Perform transformations on values given columns_to_types."""
949944
for value, col_type in zip(values, columns_to_types.values()):
950-
if col_type == JSON_TYPE:
945+
if col_type.is_type(exp.DataType.Type.JSON):
951946
yield exp.func("PARSE_JSON", f"'{value}'")
947+
elif isinstance(value, dict) and col_type.is_type(*exp.DataType.STRUCT_TYPES):
948+
yield _dict_to_struct(value)
952949
else:
953950
yield value
954951

@@ -994,3 +991,13 @@ def _unquote_schema(schema: t.Dict) -> t.Dict:
994991
return {
995992
k.strip('"'): _unquote_schema(v) if isinstance(v, dict) else v for k, v in schema.items()
996993
}
994+
995+
996+
def _dict_to_struct(values: t.Dict) -> exp.Struct:
997+
expressions = []
998+
for key, value in values.items():
999+
key = exp.to_identifier(key)
1000+
value = _dict_to_struct(value) if isinstance(value, dict) else exp.convert(value)
1001+
expressions.append(exp.PropertyEQ(this=key, expression=value))
1002+
1003+
return exp.Struct(expressions=expressions)

sqlmesh/core/engine_adapter/base.py

Lines changed: 6 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1000,20 +1000,17 @@ def insert_append(
10001000
)
10011001

10021002
@t.overload
1003-
@classmethod
1004-
def _escape_json(cls, value: Query) -> Query: ...
1003+
def _escape_json(self, value: Query) -> Query: ...
10051004

10061005
@t.overload
1007-
@classmethod
1008-
def _escape_json(cls, value: str) -> str: ...
1006+
def _escape_json(self, value: str) -> str: ...
10091007

1010-
@classmethod
1011-
def _escape_json(cls, value: Query | str) -> Query | str:
1008+
def _escape_json(self, value: Query | str) -> Query | str:
10121009
"""
10131010
Some engines need to add an extra escape to literals that contain JSON values. By default we don't do this
10141011
though
10151012
"""
1016-
if cls.ESCAPE_JSON:
1013+
if self.ESCAPE_JSON:
10171014
if isinstance(value, str):
10181015
return double_escape(value)
10191016
return t.cast(
@@ -1093,9 +1090,8 @@ def insert_overwrite_by_time_partition(
10931090
)
10941091
self._insert_overwrite_by_condition(table_name, source_queries, columns_to_types, where)
10951092

1096-
@classmethod
10971093
def _values_to_sql(
1098-
cls,
1094+
self,
10991095
values: t.List[PandasNamedTuple],
11001096
columns_to_types: t.Dict[str, exp.DataType],
11011097
batch_start: int,
@@ -1111,7 +1107,7 @@ def _values_to_sql(
11111107
alias=alias,
11121108
)
11131109
if contains_json:
1114-
query = t.cast(exp.Select, cls._escape_json(query))
1110+
query = t.cast(exp.Select, self._escape_json(query))
11151111
return query
11161112

11171113
def _insert_overwrite_by_condition(

sqlmesh/core/test/definition.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -82,10 +82,10 @@ def setUp(self) -> None:
8282
rows = values["rows"]
8383
if not columns_to_types and rows:
8484
for i, v in rows[0].items():
85-
# convert ruamel into python
86-
v = v.real if hasattr(v, "real") else v
8785
v_type = annotate_types(exp.convert(v)).type or type(v).__name__
88-
columns_to_types[i] = exp.maybe_parse(v_type, into=exp.DataType)
86+
columns_to_types[i] = exp.maybe_parse(
87+
v_type, into=exp.DataType, dialect=self.dialect
88+
)
8989

9090
test_fixture_table = _fully_qualified_test_fixture_table(table_name, self.dialect)
9191
if test_fixture_table.db:
@@ -138,6 +138,7 @@ def _to_hashable(x: t.Any) -> t.Any:
138138
actual = actual.sort_values(by=actual.columns.to_list()).reset_index(drop=True)
139139
expected = expected.apply(lambda col: col.map(_to_hashable))
140140
expected = expected.sort_values(by=expected.columns.to_list()).reset_index(drop=True)
141+
141142
try:
142143
pd.testing.assert_frame_equal(
143144
expected,

tests/core/engine_adapter/test_base.py

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -842,9 +842,9 @@ def test_merge_upsert(make_mocked_engine_adapter: t.Callable, assert_exp_eq):
842842
target_table="target",
843843
source_table=t.cast(exp.Select, parse_one('SELECT "ID", ts, val FROM source')),
844844
columns_to_types={
845-
"ID": exp.DataType.Type.INT,
846-
"ts": exp.DataType.Type.TIMESTAMP,
847-
"val": exp.DataType.Type.INT,
845+
"ID": exp.DataType.build("int"),
846+
"ts": exp.DataType.build("timestamp"),
847+
"val": exp.DataType.build("int"),
848848
},
849849
unique_key=[exp.to_identifier("ID", quoted=True)],
850850
)
@@ -873,9 +873,9 @@ def test_merge_upsert(make_mocked_engine_adapter: t.Callable, assert_exp_eq):
873873
target_table="target",
874874
source_table=parse_one("SELECT id, ts, val FROM source"),
875875
columns_to_types={
876-
"id": exp.DataType.Type.INT,
877-
"ts": exp.DataType.Type.TIMESTAMP,
878-
"val": exp.DataType.Type.INT,
876+
"id": exp.DataType.build("int"),
877+
"ts": exp.DataType.build("timestamp"),
878+
"val": exp.DataType.build("int"),
879879
},
880880
unique_key=[exp.column("id"), exp.column("ts")],
881881
)
@@ -894,9 +894,9 @@ def test_merge_upsert_pandas(make_mocked_engine_adapter: t.Callable):
894894
target_table="target",
895895
source_table=df,
896896
columns_to_types={
897-
"id": exp.DataType.Type.INT,
898-
"ts": exp.DataType.Type.TIMESTAMP,
899-
"val": exp.DataType.Type.INT,
897+
"id": exp.DataType.build("int"),
898+
"ts": exp.DataType.build("timestamp"),
899+
"val": exp.DataType.build("int"),
900900
},
901901
unique_key=[exp.to_identifier("id")],
902902
)
@@ -911,9 +911,9 @@ def test_merge_upsert_pandas(make_mocked_engine_adapter: t.Callable):
911911
target_table="target",
912912
source_table=df,
913913
columns_to_types={
914-
"id": exp.DataType.Type.INT,
915-
"ts": exp.DataType.Type.TIMESTAMP,
916-
"val": exp.DataType.Type.INT,
914+
"id": exp.DataType.build("int"),
915+
"ts": exp.DataType.build("timestamp"),
916+
"val": exp.DataType.build("int"),
917917
},
918918
unique_key=[exp.to_identifier("id"), exp.to_identifier("ts")],
919919
)
@@ -931,9 +931,9 @@ def test_merge_when_matched(make_mocked_engine_adapter: t.Callable, assert_exp_e
931931
target_table="target",
932932
source_table=t.cast(exp.Select, parse_one('SELECT "ID", ts, val FROM source')),
933933
columns_to_types={
934-
"ID": exp.DataType.Type.INT,
935-
"ts": exp.DataType.Type.TIMESTAMP,
936-
"val": exp.DataType.Type.INT,
934+
"ID": exp.DataType.build("int"),
935+
"ts": exp.DataType.build("timestamp"),
936+
"val": exp.DataType.build("int"),
937937
},
938938
unique_key=[exp.to_identifier("ID", quoted=True)],
939939
when_matched=exp.When(

tests/core/engine_adapter/test_databricks.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,9 @@ def test_replace_query_pandas_exists(mocker: MockFixture, make_mocked_engine_ada
6565
)
6666
adapter = make_mocked_engine_adapter(DatabricksEngineAdapter)
6767
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
68-
adapter.replace_query("test_table", df, {"a": "int", "b": "int"})
68+
adapter.replace_query(
69+
"test_table", df, {"a": exp.DataType.build("int"), "b": exp.DataType.build("int")}
70+
)
6971

7072
assert to_sql_calls(adapter) == [
7173
"INSERT OVERWRITE TABLE `test_table` (`a`, `b`) SELECT CAST(`a` AS INT) AS `a`, CAST(`b` AS INT) AS `b` FROM VALUES (1, 4), (2, 5), (3, 6) AS `t`(`a`, `b`)",

tests/core/engine_adapter/test_redshift.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -166,7 +166,7 @@ def test_values_to_sql(adapter: t.Callable, mocker: MockerFixture):
166166
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
167167
result = adapter._values_to_sql(
168168
values=list(df.itertuples(index=False, name=None)),
169-
columns_to_types={"a": "int", "b": "int"},
169+
columns_to_types={"a": exp.DataType.build("int"), "b": exp.DataType.build("int")},
170170
batch_start=0,
171171
batch_end=2,
172172
)

tests/core/test_test.py

Lines changed: 19 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -667,28 +667,38 @@ def test_source_func() -> None:
667667

668668

669669
def test_nested_data_types() -> None:
670+
raw = _create_model(
671+
"SELECT array::INT[], struct::STRUCT(x INT[], y VARCHAR, z INT, w STRUCT(a INT)) FROM sushi.unknown",
672+
meta="MODEL (name sushi.raw, kind FULL)",
673+
default_catalog="memory",
674+
)
675+
context = Context(config=Config(model_defaults=ModelDefaultsConfig(dialect="duckdb")))
676+
context.upsert_model(raw)
677+
670678
result = _create_test(
671679
body=load_yaml(
672680
"""
673681
test_foo:
674682
model: sushi.foo
675683
inputs:
676-
raw:
677-
- value: [1, 2, 3]
678-
- value:
684+
sushi.raw:
685+
- array: [1, 2, 3]
686+
struct: {'x': [1, 2, 3], 'y': 'foo', 'z': 1, 'w': {'a': 5}}
687+
- array:
679688
- 2
680689
- 3
681-
- value: [0, 4, 1]
690+
- array: [0, 4, 1]
682691
outputs:
683692
query:
684-
- value: [0, 4, 1]
685-
- value: [1, 2, 3]
686-
- value: [2, 3]
693+
- array: [0, 4, 1]
694+
- array: [1, 2, 3]
695+
struct: {'x': [1, 2, 3], 'y': 'foo', 'z': 1, 'w': {'a': 5}}
696+
- array: [2, 3]
687697
"""
688698
),
689699
test_name="test_foo",
690-
model=_create_model("SELECT value FROM raw"),
691-
context=Context(config=Config(model_defaults=ModelDefaultsConfig(dialect="duckdb"))),
700+
model=_create_model("SELECT array, struct FROM sushi.raw", default_catalog="memory"),
701+
context=context,
692702
).run()
693703

694704
_check_successful_or_raise(result)

0 commit comments

Comments (0)