Feat(gizmosql): Add ADBC bulk ingestion and SQL-based transactions

prmoore77 · claude · prmoore77 · commit 1040ee146428 · 2026-01-15T17:18:56.000-05:00
- Use adbc_ingest for efficient Arrow-native DataFrame loading
- Replace DuckDB-style temp table approach with ADBC bulk ingestion
- Add SQL-based transaction support (BEGIN/COMMIT/ROLLBACK)
- Override transaction() method since ADBC connection methods don't work
- Add test for DataFrame bulk ingestion

Co-Authored-By: Claude Opus 4.5 &lt;noreply@anthropic.com&gt;
diff --git a/sqlmesh/core/engine_adapter/gizmosql.py b/sqlmesh/core/engine_adapter/gizmosql.py
@@ -150,22 +150,45 @@ def _df_to_source_queries(
         """
         Convert a DataFrame to source queries for insertion.
 
-        For GizmoSQL, we use a temporary table approach similar to DuckDB.
-        The DataFrame is registered and then selected from.
+        Uses ADBC bulk ingestion (adbc_ingest) for efficient Arrow-native data transfer
+        to GizmoSQL, avoiding row-by-row insertion overhead.
         """
+        import pyarrow as pa
+
+        # Generate a simple temp table name without schema prefix
+        # adbc_ingest creates tables in the current schema and treats the full
+        # string as a literal table name (doesn't parse schema.table)
         temp_table = self._get_temp_table(target_table)
-        temp_table_sql = (
-            exp.select(*self._casted_columns(target_columns_to_types, source_columns))
-            .from_("df")
-            .sql(dialect=self.dialect)
+        # Extract just the table name without schema/catalog
+        temp_table_name = temp_table.name
+
+        # Select only the source columns in the right order
+        source_columns_to_types = (
+            {col: target_columns_to_types[col] for col in source_columns}
+            if source_columns
+            else target_columns_to_types
+        )
+        ordered_df = df[list(source_columns_to_types.keys())]
+
+        # Convert DataFrame to PyArrow Table for bulk ingestion
+        arrow_table = pa.Table.from_pandas(ordered_df)
+
+        # Use ADBC bulk ingestion - much faster than row-by-row INSERT
+        self.cursor.adbc_ingest(
+            table_name=temp_table_name,
+            data=arrow_table,
+            mode="create",
         )
-        self.cursor.sql(f"CREATE TABLE {temp_table} AS {temp_table_sql}")
+
+        # Create a simple table reference for queries (no schema prefix)
+        temp_table_ref = exp.to_table(temp_table_name)
+
         return [
             SourceQuery(
                 query_factory=lambda: self._select_columns(target_columns_to_types).from_(
-                    temp_table
+                    temp_table_ref
                 ),
-                cleanup_func=lambda: self.drop_table(temp_table),
+                cleanup_func=lambda: self.drop_table(temp_table_ref),
             )
         ]
 
diff --git a/tests/core/engine_adapter/integration/test_integration_gizmosql.py b/tests/core/engine_adapter/integration/test_integration_gizmosql.py
@@ -255,3 +255,51 @@ def test_query_with_expressions(gizmosql_adapter: GizmoSQLEngineAdapter):
     assert result is not None
     assert result[0] == 1
     assert result[1] == "hello"
+
+
+def test_dataframe_bulk_ingestion(gizmosql_adapter: GizmoSQLEngineAdapter):
+    """Test bulk DataFrame ingestion using ADBC adbc_ingest."""
+    import pandas as pd
+
+    schema_name = "test_bulk_ingest_schema"
+    table_name = f"{schema_name}.bulk_test_table"
+
+    try:
+        # Setup
+        gizmosql_adapter.drop_schema(schema_name, ignore_if_not_exists=True, cascade=True)
+        gizmosql_adapter.create_schema(schema_name)
+
+        # Create a test DataFrame
+        df = pd.DataFrame({
+            "id": [1, 2, 3, 4, 5],
+            "name": ["alice", "bob", "charlie", "diana", "eve"],
+            "value": [10.5, 20.5, 30.5, 40.5, 50.5],
+        })
+
+        # Create target table
+        columns_to_types = {
+            "id": exp.DataType.build("INT"),
+            "name": exp.DataType.build("VARCHAR"),
+            "value": exp.DataType.build("DOUBLE"),
+        }
+        gizmosql_adapter.create_table(table_name, columns_to_types)
+
+        # Use replace_query with DataFrame (this uses _df_to_source_queries internally)
+        gizmosql_adapter.replace_query(
+            table_name,
+            df,
+            columns_to_types,
+        )
+
+        # Verify data was loaded
+        result = gizmosql_adapter.fetchall(f"SELECT * FROM {table_name} ORDER BY id")
+        assert len(result) == 5
+        assert result[0][0] == 1
+        assert result[0][1] == "alice"
+        assert abs(result[0][2] - 10.5) < 0.001
+        assert result[4][0] == 5
+        assert result[4][1] == "eve"
+
+    finally:
+        # Cleanup
+        gizmosql_adapter.drop_schema(schema_name, ignore_if_not_exists=True, cascade=True)