Skip to content

Commit 8478778

Browse files
tobymao and izeigerman
authored and committed
feat: improve cold start of snapshot cache with multi processing (#3084)
1 parent 570870f commit 8478778

File tree

13 files changed

+186
-99
lines changed

13 files changed

+186
-99
lines changed

Makefile

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,13 +91,13 @@ engine-up: engine-mssql-up engine-mysql-up engine-postgres-up engine-spark-up en
9191
engine-down: engine-mssql-down engine-mysql-down engine-postgres-down engine-spark-down engine-trino-down
9292

9393
fast-test:
94-
pytest -n auto -m "fast and not cicdonly"
94+
pytest -n auto -m "fast and not cicdonly" && pytest -m "isolated"
9595

9696
slow-test:
97-
pytest -n auto -m "(fast or slow) and not cicdonly"
97+
pytest -n auto -m "(fast or slow) and not cicdonly" && pytest -m "isolated"
9898

9999
cicd-test:
100-
pytest -n auto -m "fast or slow" --junitxml=test-results/junit-cicd.xml
100+
pytest -n auto -m "fast or slow" --junitxml=test-results/junit-cicd.xml && pytest -m "isolated"
101101

102102
core-fast-test:
103103
pytest -n auto -m "fast and not web and not github and not dbt and not airflow and not jupyter"
@@ -199,4 +199,7 @@ databricks-test: guard-DATABRICKS_CATALOG guard-DATABRICKS_SERVER_HOSTNAME guard
199199
pytest -n auto -x -m "databricks" --junitxml=test-results/junit-databricks.xml
200200

201201
redshift-test: guard-REDSHIFT_HOST guard-REDSHIFT_USER guard-REDSHIFT_PASSWORD guard-REDSHIFT_DATABASE engine-redshift-install
202-
pytest -n auto -x -m "redshift" --junitxml=test-results/junit-redshift.xml
202+
pytest -n auto -x -m "redshift" --retries 3 --junitxml=test-results/junit-redshift.xml
203+
204+
clickhouse-cloud-test: guard-CLICKHOUSE_CLOUD_HOST guard-CLICKHOUSE_CLOUD_USERNAME guard-CLICKHOUSE_CLOUD_PASSWORD engine-clickhouse-install
205+
pytest -n auto -x -m "clickhouse_cloud" --retries 3 --junitxml=test-results/junit-clickhouse-cloud.xml

docs/reference/configuration.md

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@ This section describes the other root level configuration parameters.
1616

1717
Configuration options for SQLMesh project directories.
1818

19-
| Option | Description | Type | Required |
20-
| ----------------- | ------------------------------------------------------------------------------------------------------------------ | :----------: | :------: |
21-
| `ignore_patterns` | Files that match glob patterns specified in this list are ignored when scanning the project folder (Default: `[]`) | list[string] | N |
22-
| `project` | The project name of this config. Used for [multi-repo setups](../guides/multi_repo.md). | string | N |
19+
| Option | Description | Type | Required |
20+
| ------------------ | ------------------------------------------------------------------------------------------------------------------ | :----------: | :------: |
21+
| `ignore_patterns` | Files that match glob patterns specified in this list are ignored when scanning the project folder (Default: `[]`) | list[string] | N |
22+
| `project` | The project name of this config. Used for [multi-repo setups](../guides/multi_repo.md). | string | N |
2323

2424
### Environments
2525

@@ -291,3 +291,8 @@ You can disable collection of anonymized usage information with these methods:
291291

292292
- Set the root `disable_anonymized_analytics: true` key in your SQLMesh project configuration file
293293
- Execute SQLMesh commands with an environment variable `SQLMESH__DISABLE_ANONYMIZED_ANALYTICS` set to `1`, `true`, `t`, `yes`, or `y`
294+
295+
## Parallel loading
296+
SQLMesh by default uses all of your cores when loading models and snapshots. It takes advantage of `fork`, which is not available on Windows. The default is to use the same number of workers as cores on your machine if fork is available.
297+
298+
You can override this setting by setting the environment variable `MAX_FORK_WORKERS`. A value of 1 will disable forking and load things sequentially.

pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ markers =
77
docker: test that involves interacting with a Docker container
88
remote: test that involves interacting with a remote DB
99
cicdonly: test that only runs on CI/CD
10+
isolated: tests that need to run sequentially usually because they use fork
1011

1112
# Test Domain Markers
1213
# default: core functionality

sqlmesh/core/constants.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from __future__ import annotations
22

33
import datetime
4+
import os
5+
import typing as t
46
from pathlib import Path
57

68
SQLMESH = "sqlmesh"
@@ -28,6 +30,21 @@
2830
MAX_MODEL_DEFINITION_SIZE = 10000
2931
"""Maximum number of characters in a model definition"""
3032

33+
34+
# The maximum number of fork processes, used for loading projects
35+
# None means default to process pool, 1 means don't fork, :N is number of processes
36+
# Factors in the number of available CPUs even if the process is bound to a subset of them
37+
# (e.g. via taskset) to avoid oversubscribing the system and causing kill signals
38+
if hasattr(os, "fork"):
39+
try:
40+
MAX_FORK_WORKERS: t.Optional[int] = int(os.getenv("MAX_FORK_WORKERS")) # type: ignore
41+
except TypeError:
42+
MAX_FORK_WORKERS = (
43+
len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else None
44+
)
45+
else:
46+
MAX_FORK_WORKERS = 1
47+
3148
EPOCH = datetime.date(1970, 1, 1)
3249

3350
DEFAULT_MAX_LIMIT = 1000

sqlmesh/core/loader.py

Lines changed: 5 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@
33
import abc
44
import linecache
55
import logging
6-
import multiprocessing as mp
76
import os
87
import typing as t
98
from collections import defaultdict
10-
from concurrent.futures import ProcessPoolExecutor, as_completed
9+
from concurrent.futures import as_completed
1110
from dataclasses import dataclass
1211
from pathlib import Path
1312

@@ -25,10 +24,10 @@
2524
ModelCache,
2625
OptimizedQueryCache,
2726
SeedModel,
28-
SqlModel,
2927
create_external_model,
3028
load_sql_based_model,
3129
)
30+
from sqlmesh.core.model.cache import optimized_query_cache_pool, load_optimized_query_cache
3231
from sqlmesh.core.model import model as model_registry
3332
from sqlmesh.utils import UniqueKeyDict
3433
from sqlmesh.utils.dag import DAG
@@ -549,7 +548,7 @@ def update_model_schemas(
549548
schema = MappingSchema(normalize=False)
550549
optimized_query_cache: OptimizedQueryCache = OptimizedQueryCache(context_path / c.CACHE)
551550

552-
if not hasattr(os, "fork") or "PYTEST_CURRENT_TEST" in os.environ:
551+
if c.MAX_FORK_WORKERS == 1:
553552
_update_model_schemas_sequential(dag, models, schema, optimized_query_cache)
554553
else:
555554
_update_model_schemas_parallel(dag, models, schema, optimized_query_cache)
@@ -610,13 +609,9 @@ def process_models(completed_model: t.Optional[Model] = None) -> None:
610609
del graph[name]
611610
model = models[name]
612611
model.update_schema(schema)
613-
futures.add(executor.submit(_load_optimized_query_cache, model))
612+
futures.add(executor.submit(load_optimized_query_cache, model))
614613

615-
with ProcessPoolExecutor(
616-
mp_context=mp.get_context("fork"),
617-
initializer=_init_optimized_query_cache,
618-
initargs=(optimized_query_cache,),
619-
) as executor:
614+
with optimized_query_cache_pool(optimized_query_cache) as executor:
620615
process_models()
621616

622617
while futures:
@@ -629,20 +624,3 @@ def process_models(completed_model: t.Optional[Model] = None) -> None:
629624

630625
_update_schema_with_model(schema, model)
631626
process_models(completed_model=model)
632-
633-
634-
_optimized_query_cache: t.Optional[OptimizedQueryCache] = None
635-
636-
637-
def _init_optimized_query_cache(optimized_query_cache: OptimizedQueryCache) -> None:
638-
global _optimized_query_cache
639-
_optimized_query_cache = optimized_query_cache
640-
641-
642-
def _load_optimized_query_cache(model: Model) -> t.Tuple[str, t.Optional[str]]:
643-
assert _optimized_query_cache
644-
if isinstance(model, SqlModel):
645-
entry_name = _optimized_query_cache.put(model)
646-
else:
647-
entry_name = None
648-
return model.fqn, entry_name

sqlmesh/core/model/cache.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
from __future__ import annotations
22

33
import logging
4+
import multiprocessing as mp
45
import typing as t
6+
from concurrent.futures import ProcessPoolExecutor
57
from pathlib import Path
68

79
from sqlglot import exp
810
from sqlglot.optimizer.simplify import gen
911

12+
from sqlmesh.core import constants as c
1013
from sqlmesh.core.model.definition import Model, SqlModel, _Model
1114
from sqlmesh.utils.cache import FileCache
1215
from sqlmesh.utils.hashing import crc32
@@ -15,6 +18,8 @@
1518

1619
logger = logging.getLogger(__name__)
1720

21+
T = t.TypeVar("T")
22+
1823

1924
class ModelCache:
2025
"""File-based cache implementation for model definitions.
@@ -128,6 +133,49 @@ def _entry_name(model: SqlModel) -> str:
128133
return f"{model.name}_{crc32(hash_data)}"
129134

130135

136+
def optimized_query_cache_pool(optimized_query_cache: OptimizedQueryCache) -> ProcessPoolExecutor:
137+
return ProcessPoolExecutor(
138+
mp_context=mp.get_context("fork"),
139+
initializer=_init_optimized_query_cache,
140+
initargs=(optimized_query_cache,),
141+
max_workers=c.MAX_FORK_WORKERS,
142+
)
143+
144+
145+
@t.overload
146+
def load_optimized_query_cache(
147+
model_or_tuple: t.Tuple[Model, T],
148+
) -> t.Tuple[T, t.Optional[str]]: ...
149+
150+
151+
@t.overload
152+
def load_optimized_query_cache(model_or_tuple: Model) -> t.Tuple[str, t.Optional[str]]: ...
153+
154+
155+
def load_optimized_query_cache(model_or_tuple): # type: ignore
156+
assert _optimized_query_cache
157+
158+
if isinstance(model_or_tuple, _Model):
159+
model = model_or_tuple
160+
key = None
161+
else:
162+
model, key = model_or_tuple
163+
164+
if isinstance(model, SqlModel):
165+
entry_name = _optimized_query_cache.put(model)
166+
else:
167+
entry_name = None
168+
return key or model.fqn, entry_name
169+
170+
171+
_optimized_query_cache: t.Optional[OptimizedQueryCache] = None
172+
173+
174+
def _init_optimized_query_cache(optimized_query_cache: OptimizedQueryCache) -> None:
175+
global _optimized_query_cache
176+
_optimized_query_cache = optimized_query_cache
177+
178+
131179
def _mapping_schema_hash_data(schema: t.Dict[str, t.Any]) -> t.List[str]:
132180
keys = sorted(schema) if all(isinstance(v, dict) for v in schema.values()) else schema
133181

sqlmesh/core/model/definition.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -960,7 +960,7 @@ def full_depends_on(self) -> t.Set[str]:
960960
if self._full_depends_on is None:
961961
depends_on = self.depends_on_ or set()
962962

963-
query = self.render_query(optimize=False)
963+
query = self.render_query(needs_optimization=False)
964964
if query is not None:
965965
depends_on |= d.find_tables(
966966
query, default_catalog=self.default_catalog, dialect=self.dialect

sqlmesh/core/renderer.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ def render(
376376
table_mapping: t.Optional[t.Dict[str, str]] = None,
377377
deployability_index: t.Optional[DeployabilityIndex] = None,
378378
expand: t.Iterable[str] = tuple(),
379-
optimize: bool = True,
379+
needs_optimization: bool = True,
380380
runtime_stage: RuntimeStage = RuntimeStage.LOADING,
381381
**kwargs: t.Any,
382382
) -> t.Optional[exp.Query]:
@@ -393,7 +393,8 @@ def render(
393393
expand: Expand referenced models as subqueries. This is used to bypass backfills when running queries
394394
that depend on materialized tables. Model definitions are inlined and can thus be run end to
395395
end on the fly.
396-
optimize: Whether to optimize the query.
396+
needs_optimization: Whether or not an optimization should be attempted
397+
(if passing False, it still may return a cached optimized query).
397398
runtime_stage: Indicates the current runtime stage, for example if we're still loading the project, etc.
398399
kwargs: Additional kwargs to pass to the renderer.
399400
@@ -402,7 +403,7 @@ def render(
402403
"""
403404

404405
should_cache = self._should_cache(
405-
runtime_stage, start, end, execution_time, not optimize, *kwargs.values()
406+
runtime_stage, start, end, execution_time, *kwargs.values()
406407
)
407408

408409
if should_cache and self._optimized_cache:
@@ -417,7 +418,7 @@ def render(
417418
table_mapping=table_mapping,
418419
deployability_index=deployability_index,
419420
runtime_stage=runtime_stage,
420-
normalize_identifiers=optimize,
421+
normalize_identifiers=needs_optimization,
421422
**kwargs,
422423
)
423424
except ParsetimeAdapterCallError:
@@ -439,7 +440,7 @@ def render(
439440
)
440441
raise
441442

442-
if optimize:
443+
if needs_optimization:
443444
deps = d.find_tables(
444445
query, default_catalog=self._default_catalog, dialect=self._dialect
445446
)
@@ -449,7 +450,7 @@ def render(
449450
if should_cache:
450451
self._optimized_cache = query
451452

452-
if optimize:
453+
if needs_optimization:
453454
query = self._resolve_tables(
454455
query,
455456
snapshots=snapshots,

sqlmesh/core/snapshot/cache.py

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,12 @@
33
import typing as t
44

55
from pathlib import Path
6-
from sqlmesh.core.model.cache import OptimizedQueryCache
6+
from sqlmesh.core.model.cache import (
7+
OptimizedQueryCache,
8+
optimized_query_cache_pool,
9+
load_optimized_query_cache,
10+
)
11+
from sqlmesh.core import constants as c
712
from sqlmesh.core.snapshot.definition import Snapshot, SnapshotId
813
from sqlmesh.utils.cache import FileCache
914

@@ -31,12 +36,10 @@ def get_or_load(
3136
"""
3237
snapshots = {}
3338
cache_hits: t.Set[SnapshotId] = set()
39+
3440
for s_id in snapshot_ids:
3541
snapshot = self._snapshot_cache.get(self._entry_name(s_id))
3642
if snapshot:
37-
if snapshot.is_model:
38-
self._optimized_query_cache.with_optimized_query(snapshot.model)
39-
self._update_node_hash_cache(snapshot)
4043
snapshot.intervals = []
4144
snapshot.dev_intervals = []
4245
snapshots[s_id] = snapshot
@@ -46,18 +49,43 @@ def get_or_load(
4649
if snapshot_ids_to_load:
4750
loaded_snapshots = loader(snapshot_ids_to_load)
4851
for snapshot in loaded_snapshots:
49-
self._update_node_hash_cache(snapshot)
50-
self.put(snapshot)
5152
snapshots[snapshot.snapshot_id] = snapshot
5253

54+
if c.MAX_FORK_WORKERS != 1:
55+
with optimized_query_cache_pool(self._optimized_query_cache) as executor:
56+
for key, entry_name in executor.map(
57+
load_optimized_query_cache,
58+
(
59+
(snapshot.model, s_id)
60+
for s_id, snapshot in snapshots.items()
61+
if snapshot.is_model
62+
),
63+
):
64+
if entry_name:
65+
self._optimized_query_cache.with_optimized_query(
66+
snapshots[key].model, entry_name
67+
)
68+
69+
for snapshot in snapshots.values():
70+
self._update_node_hash_cache(snapshot)
71+
72+
if snapshot.is_model and c.MAX_FORK_WORKERS == 1:
73+
self._optimized_query_cache.with_optimized_query(snapshot.model)
74+
75+
self.put(snapshot)
76+
5377
return snapshots, cache_hits
5478

5579
def put(self, snapshot: Snapshot) -> None:
80+
entry_name = self._entry_name(snapshot.snapshot_id)
81+
82+
if self._snapshot_cache.exists(entry_name):
83+
return
84+
5685
if snapshot.is_model:
57-
self._optimized_query_cache.put(snapshot.model)
5886
# make sure we preload full_depends_on
5987
snapshot.model.full_depends_on
60-
self._snapshot_cache.put(self._entry_name(snapshot.snapshot_id), value=snapshot)
88+
self._snapshot_cache.put(entry_name, value=snapshot)
6189

6290
def clear(self) -> None:
6391
self._snapshot_cache.clear()

tests/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from sqlmesh.core import constants as c
12
from sqlmesh.core.analytics import disable_analytics
23

4+
c.MAX_FORK_WORKERS = 1
35
disable_analytics()

0 commit comments

Comments
 (0)