Skip to content

Commit 570870f

Browse files
tobymao and izeigerman
committed
feat: use multiprocessing to speed up loading (#3077)
Co-authored-by: Iaroslav Zeigerman <zeigerman.ia@gmail.com>
1 parent 4c2f843 commit 570870f

File tree

4 files changed

+166
-44
lines changed

4 files changed

+166
-44
lines changed

sqlmesh/core/loader.py

Lines changed: 111 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
import abc
44
import linecache
55
import logging
6+
import multiprocessing as mp
67
import os
78
import typing as t
89
from collections import defaultdict
10+
from concurrent.futures import ProcessPoolExecutor, as_completed
911
from dataclasses import dataclass
1012
from pathlib import Path
1113

@@ -23,6 +25,7 @@
2325
ModelCache,
2426
OptimizedQueryCache,
2527
SeedModel,
28+
SqlModel,
2629
create_external_model,
2730
load_sql_based_model,
2831
)
@@ -42,39 +45,6 @@
4245
logger = logging.getLogger(__name__)
4346

4447

45-
# TODO: consider moving this to context
46-
def update_model_schemas(
47-
dag: DAG[str],
48-
models: UniqueKeyDict[str, Model],
49-
context_path: Path,
50-
) -> None:
51-
schema = MappingSchema(normalize=False)
52-
optimized_query_cache: OptimizedQueryCache = OptimizedQueryCache(context_path / c.CACHE)
53-
54-
for name in dag.sorted:
55-
model = models.get(name)
56-
57-
# External models don't exist in the context, so we need to skip them
58-
if not model:
59-
continue
60-
61-
try:
62-
model.update_schema(schema)
63-
optimized_query_cache.with_optimized_query(model)
64-
65-
columns_to_types = model.columns_to_types
66-
if columns_to_types is not None:
67-
schema.add_table(
68-
model.fqn, columns_to_types, dialect=model.dialect, normalize=False
69-
)
70-
except SchemaError as e:
71-
if "nesting level:" in str(e):
72-
logger.error(
73-
"SQLMesh requires all model names and references to have the same level of nesting."
74-
)
75-
raise
76-
77-
7848
@dataclass
7949
class LoadedProject:
8050
macros: MacroRegistry
@@ -568,3 +538,111 @@ def _model_cache_entry_id(self, model_path: Path) -> str:
568538
or self._loader._context.config.default_gateway_name,
569539
]
570540
)
541+
542+
543+
# TODO: consider moving this to context
544+
def update_model_schemas(
545+
dag: DAG[str],
546+
models: UniqueKeyDict[str, Model],
547+
context_path: Path,
548+
) -> None:
549+
schema = MappingSchema(normalize=False)
550+
optimized_query_cache: OptimizedQueryCache = OptimizedQueryCache(context_path / c.CACHE)
551+
552+
if not hasattr(os, "fork") or "PYTEST_CURRENT_TEST" in os.environ:
553+
_update_model_schemas_sequential(dag, models, schema, optimized_query_cache)
554+
else:
555+
_update_model_schemas_parallel(dag, models, schema, optimized_query_cache)
556+
557+
558+
def _update_schema_with_model(schema: MappingSchema, model: Model) -> None:
559+
columns_to_types = model.columns_to_types
560+
if columns_to_types:
561+
try:
562+
schema.add_table(model.fqn, columns_to_types, dialect=model.dialect, normalize=False)
563+
except SchemaError as e:
564+
if "nesting level:" in str(e):
565+
logger.error(
566+
"SQLMesh requires all model names and references to have the same level of nesting."
567+
)
568+
raise
569+
570+
571+
def _update_model_schemas_sequential(
572+
dag: DAG[str],
573+
models: UniqueKeyDict[str, Model],
574+
schema: MappingSchema,
575+
optimized_query_cache: OptimizedQueryCache,
576+
) -> None:
577+
for name in dag.sorted:
578+
model = models.get(name)
579+
580+
# External models don't exist in the context, so we need to skip them
581+
if not model:
582+
continue
583+
584+
model.update_schema(schema)
585+
optimized_query_cache.with_optimized_query(model)
586+
_update_schema_with_model(schema, model)
587+
588+
589+
def _update_model_schemas_parallel(
590+
dag: DAG[str],
591+
models: UniqueKeyDict[str, Model],
592+
schema: MappingSchema,
593+
optimized_query_cache: OptimizedQueryCache,
594+
) -> None:
595+
futures = set()
596+
graph = {
597+
model: {dep for dep in deps if dep in models}
598+
for model, deps in dag._dag.items()
599+
if model in models
600+
}
601+
602+
def process_models(completed_model: t.Optional[Model] = None) -> None:
603+
for name in list(graph):
604+
deps = graph[name]
605+
606+
if completed_model:
607+
deps.discard(completed_model.fqn)
608+
609+
if not deps:
610+
del graph[name]
611+
model = models[name]
612+
model.update_schema(schema)
613+
futures.add(executor.submit(_load_optimized_query_cache, model))
614+
615+
with ProcessPoolExecutor(
616+
mp_context=mp.get_context("fork"),
617+
initializer=_init_optimized_query_cache,
618+
initargs=(optimized_query_cache,),
619+
) as executor:
620+
process_models()
621+
622+
while futures:
623+
for future in as_completed(futures):
624+
futures.remove(future)
625+
fqn, entry_name = future.result()
626+
model = models[fqn]
627+
if entry_name:
628+
optimized_query_cache.with_optimized_query(model, entry_name)
629+
630+
_update_schema_with_model(schema, model)
631+
process_models(completed_model=model)
632+
633+
634+
_optimized_query_cache: t.Optional[OptimizedQueryCache] = None
635+
636+
637+
def _init_optimized_query_cache(optimized_query_cache: OptimizedQueryCache) -> None:
638+
global _optimized_query_cache
639+
_optimized_query_cache = optimized_query_cache
640+
641+
642+
def _load_optimized_query_cache(model: Model) -> t.Tuple[str, t.Optional[str]]:
643+
assert _optimized_query_cache
644+
if isinstance(model, SqlModel):
645+
entry_name = _optimized_query_cache.put(model)
646+
else:
647+
entry_name = None
648+
return model.fqn, entry_name

sqlmesh/core/model/cache.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,16 +72,17 @@ def __init__(self, path: Path):
7272
path, prefix="optimized_query"
7373
)
7474

75-
def with_optimized_query(self, model: Model) -> bool:
75+
def with_optimized_query(self, model: Model, name: t.Optional[str] = None) -> bool:
7676
"""Adds an optimized query to the model's in-memory cache.
7777
7878
Args:
7979
model: The model to add the optimized query to.
80+
name: The cache entry name of the model.
8081
"""
8182
if not isinstance(model, SqlModel):
8283
return False
8384

84-
name = self._entry_name(model)
85+
name = self._entry_name(model) if name is None else name
8586
cache_entry = self._file_cache.get(name)
8687
if cache_entry:
8788
try:
@@ -101,15 +102,17 @@ def with_optimized_query(self, model: Model) -> bool:
101102
self._put(name, model)
102103
return False
103104

104-
def put(self, model: Model) -> None:
105+
def put(self, model: Model) -> t.Optional[str]:
105106
if not isinstance(model, SqlModel):
106-
return
107+
return None
107108

108109
name = self._entry_name(model)
110+
109111
if self._file_cache.exists(name):
110-
return
112+
return name
111113

112114
self._put(name, model)
115+
return name
113116

114117
def _put(self, name: str, model: SqlModel) -> None:
115118
optimized_query = model.render_query()

sqlmesh/core/renderer.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def _render(
7777
table_mapping: t.Optional[t.Dict[str, str]] = None,
7878
deployability_index: t.Optional[DeployabilityIndex] = None,
7979
runtime_stage: RuntimeStage = RuntimeStage.LOADING,
80+
normalize_identifiers: bool = True,
8081
**kwargs: t.Any,
8182
) -> t.List[t.Optional[exp.Expression]]:
8283
"""Renders a expression, expanding macros with provided kwargs
@@ -89,14 +90,15 @@ def _render(
8990
table_mapping: Table mapping of physical locations. Takes precedence over snapshot mappings.
9091
deployability_index: Determines snapshots that are deployable in the context of this evaluation.
9192
runtime_stage: Indicates the current runtime stage, for example if we're still loading the project, etc.
93+
normalize_identifiers: Whether or not to normalize and quote identifiers.
9294
kwargs: Additional kwargs to pass to the renderer.
9395
9496
Returns:
9597
The rendered expressions.
9698
"""
9799

98100
should_cache = self._should_cache(
99-
runtime_stage, start, end, execution_time, *kwargs.values()
101+
runtime_stage, start, end, execution_time, not normalize_identifiers, *kwargs.values()
100102
)
101103

102104
if should_cache and self._cache:
@@ -193,7 +195,7 @@ def _render(
193195
raise_config_error(f"Failed to resolve macro for expression. {ex}", self._path)
194196

195197
for expression in t.cast(t.List[exp.Expression], transformed_expressions):
196-
with self._normalize_and_quote(expression) as expression:
198+
with self._normalize_and_quote(expression, normalize_identifiers) as expression:
197199
if hasattr(expression, "selects"):
198200
for select in expression.selects:
199201
if not isinstance(select, exp.Alias) and select.output_name not in (
@@ -295,8 +297,8 @@ def _expand(node: exp.Expression) -> exp.Expression:
295297
return expression
296298

297299
@contextmanager
298-
def _normalize_and_quote(self, query: E) -> t.Iterator[E]:
299-
if self._normalize_identifiers:
300+
def _normalize_and_quote(self, query: E, normalize_identifiers: bool = True) -> t.Iterator[E]:
301+
if self._normalize_identifiers and normalize_identifiers:
300302
with d.normalize_and_quote(
301303
query, self._dialect, self._default_catalog, quote=self._quote_identifiers
302304
) as query:
@@ -400,10 +402,10 @@ def render(
400402
"""
401403

402404
should_cache = self._should_cache(
403-
runtime_stage, start, end, execution_time, *kwargs.values()
405+
runtime_stage, start, end, execution_time, not optimize, *kwargs.values()
404406
)
405407

406-
if should_cache and self._optimized_cache and optimize:
408+
if should_cache and self._optimized_cache:
407409
query = self._optimized_cache
408410
else:
409411
try:
@@ -415,6 +417,7 @@ def render(
415417
table_mapping=table_mapping,
416418
deployability_index=deployability_index,
417419
runtime_stage=runtime_stage,
420+
normalize_identifiers=optimize,
418421
**kwargs,
419422
)
420423
except ParsetimeAdapterCallError:

tests/core/test_model.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5677,3 +5677,41 @@ def test_cache():
56775677
model = load_sql_based_model(expressions)
56785678
assert model.depends_on == {'"y"'}
56795679
assert model.copy(update={"depends_on_": {'"z"'}}).depends_on == {'"z"', '"y"'}
5680+
5681+
5682+
def test_parallel_load(assert_exp_eq, mocker):
5683+
import os
5684+
from sqlmesh.core import loader
5685+
5686+
pytest_current_test = os.environ.pop("PYTEST_CURRENT_TEST")
5687+
try:
5688+
spy = mocker.spy(loader, "_update_model_schemas_parallel")
5689+
context = Context(paths="examples/sushi")
5690+
5691+
if hasattr(os, "fork"):
5692+
spy.assert_called()
5693+
5694+
assert_exp_eq(
5695+
context.render("sushi.customers"),
5696+
"""
5697+
WITH "current_marketing" AS (
5698+
SELECT
5699+
"marketing"."customer_id" AS "customer_id",
5700+
"marketing"."status" AS "status"
5701+
FROM "memory"."sushi"."marketing" AS "marketing"
5702+
WHERE
5703+
"marketing"."valid_to" IS NULL
5704+
)
5705+
SELECT DISTINCT
5706+
CAST("o"."customer_id" AS INT) AS "customer_id", /* this comment should not be registered */
5707+
"m"."status" AS "status",
5708+
"d"."zip" AS "zip"
5709+
FROM "memory"."sushi"."orders" AS "o"
5710+
LEFT JOIN "current_marketing" AS "m"
5711+
ON "m"."customer_id" = "o"."customer_id"
5712+
LEFT JOIN "memory"."raw"."demographics" AS "d"
5713+
ON "d"."customer_id" = "o"."customer_id"
5714+
""",
5715+
)
5716+
finally:
5717+
os.environ["PYTEST_CURRENT_TEST"] = pytest_current_test

0 commit comments

Comments
 (0)