Feat: Add the ability to specify a model end date (#2287)

erindru · web-flow · commit 969c7b317a29 · 2024-03-18T14:20:25.000-07:00
diff --git a/docs/reference/model_configuration.md b/docs/reference/model_configuration.md
@@ -20,6 +20,7 @@ Configuration options for SQLMesh model properties. Supported by all model kinds
 | `cron`             | The cron expression specifying how often the model should be refreshed. (Default: `@daily`)                                                                                                                                                                                                                                      |        str        |    N     |
 | `interval_unit`    | The temporal granularity of the model's data intervals. Supported values: `year`, `month`, `day`, `hour`, `half_hour`, `quarter_hour`, `five_minute`. (Default: inferred from `cron`)                                                                                                                                            |        str        |    N     |
 | `start`            | The date/time that determines the earliest date interval that should be processed by a model. Can be a datetime string, epoch time in milliseconds, or a relative datetime such as `1 year ago`.                                                                                                                                 |    str \| int     |    N     |
+| `end`              | The date/time that determines the latest date interval that should be processed by a model. Can be a datetime string, epoch time in milliseconds, or a relative datetime such as `1 year ago`.                                                                                                                                   |    str \| int     |    N     |
 | `batch_size`       | The maximum number of intervals that can be evaluated in a single backfill task. If this is `None`, all intervals will be processed as part of a single task. If this is set, a model's backfill will be chunked such that each individual task only contains jobs with the maximum of `batch_size` intervals. (Default: `None`) |        int        |    N     |
 | `grains`           | The column(s) whose combination uniquely identifies each row in the model                                                                                                                                                                                                                                                        | str \| array[str] |    N     |
 | `references`       | The model column(s) used to join to other models' grains                                                                                                                                                                                                                                                                         | str \| array[str] |    N     |
@@ -43,6 +44,7 @@ The SQLMesh project-level `model_defaults` key supports the following options, d
 - cron
 - owner
 - start
+- end
 - batch_size
 - storage_format
 
diff --git a/sqlmesh/core/model/definition.py b/sqlmesh/core/model/definition.py
@@ -101,6 +101,8 @@ class _Model(ModelMeta, frozen=True):
         start: The earliest date that the model will be backfilled for. If this is None,
             then the date is inferred by taking the most recent start date of its ancestors.
             The start date can be a static datetime or a relative datetime like "1 year ago"
+        end: The date that the model will be backfilled up until. Follows the same syntax as 'start'.
+            Omit it if there is no end date.
         batch_size: The maximum number of incremental intervals that can be run per backfill job. If this is None,
             then backfilling this model will do all of history in one job. If this is set, a model's backfill
             will be chunked such that each individual job will only contain jobs with max `batch_size` intervals.
diff --git a/sqlmesh/core/node.py b/sqlmesh/core/node.py
@@ -9,7 +9,7 @@
 from sqlglot import exp
 
 from sqlmesh.utils.cron import CroniterCache
-from sqlmesh.utils.date import TimeLike, to_datetime
+from sqlmesh.utils.date import TimeLike, to_datetime, validate_date_range
 from sqlmesh.utils.errors import ConfigError
 from sqlmesh.utils.pydantic import (
     PydanticModel,
@@ -172,6 +172,8 @@ class _Node(PydanticModel):
         start: The earliest date that the node will be executed for. If this is None,
             then the date is inferred by taking the most recent start date of its ancestors.
             The start date can be a static datetime or a relative datetime like "1 year ago"
+        end: The latest date that the node will be executed for. If this is None,
+            the end date supplied by the scheduler will be used.
         cron: A cron string specifying how often the node should be run, leveraging the
             [croniter](https://github.com/kiorky/croniter) library.
         interval_unit: The duration of an interval for the node. By default, it is computed from the cron expression.
@@ -185,6 +187,7 @@ class _Node(PydanticModel):
     description: t.Optional[str] = None
     owner: t.Optional[str] = None
     start: t.Optional[TimeLike] = None
+    end: t.Optional[TimeLike] = None
     cron: str = "@daily"
     interval_unit_: t.Optional[IntervalUnit] = Field(alias="interval_unit", default=None)
     tags: t.List[str] = []
@@ -207,7 +210,7 @@ def _name_validator(cls, v: t.Any) -> t.Optional[str]:
             return v.meta["sql"]
         return str(v)
 
-    @field_validator("start", mode="before")
+    @field_validator("start", "end", mode="before")
     @classmethod
     def _date_validator(cls, v: t.Any) -> t.Optional[TimeLike]:
         if isinstance(v, exp.Expression):
@@ -255,6 +258,7 @@ def _node_root_validator(cls, values: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]:
                 raise ConfigError(
                     f"Interval unit of '{interval_unit}' is larger than cron period of '{cron}'"
                 )
+        validate_date_range(values.get("start"), values.get("end"))
         return values
 
     @property
diff --git a/sqlmesh/core/snapshot/definition.py b/sqlmesh/core/snapshot/definition.py
@@ -1482,12 +1482,17 @@ def missing_intervals(
             snapshot.intervals = snapshot.intervals.copy()
             snapshot.remove_interval(interval, execution_time)
 
+        missing_interval_end_date = snapshot_end_date
+        node_end_date = snapshot.node.end
+        if node_end_date and (to_datetime(node_end_date) < to_datetime(snapshot_end_date)):
+            missing_interval_end_date = node_end_date
+
         intervals = snapshot.missing_intervals(
             max(
                 to_datetime(snapshot_start_date),
                 to_datetime(start_date(snapshot, snapshots, cache, relative_to=snapshot_end_date)),
             ),
-            snapshot_end_date,
+            missing_interval_end_date,
             execution_time=execution_time,
             deployability_index=deployability_index,
             ignore_cron=ignore_cron,
diff --git a/sqlmesh/schedulers/airflow/dag_generator.py b/sqlmesh/schedulers/airflow/dag_generator.py
@@ -108,10 +108,15 @@ def _create_cadence_dag_for_snapshot(
                 f"Can't create a cadence DAG for the paused snapshot {snapshot.snapshot_id}"
             )
 
+        end_date = None
+        if snapshot.node.end:
+            end_date = pendulum.instance(to_datetime(snapshot.node.end))
+
         with DAG(
             dag_id=dag_id,
             schedule_interval=snapshot.node.cron,
             start_date=pendulum.instance(to_datetime(snapshot.unpaused_ts)),
+            end_date=end_date,
             max_active_runs=1,
             catchup=True,
             is_paused_upon_creation=False,
diff --git a/tests/core/test_model.py b/tests/core/test_model.py
@@ -3244,3 +3244,43 @@ def my_model(context, **kwargs):
 
     assert m.default_catalog == "catalog"
     assert m.depends_on == {'"catalog"."other"."table"'}
+
+
+def test_end_date():
+    expressions = d.parse(
+        f"""
+        MODEL (
+            name db.table,
+            kind INCREMENTAL_BY_TIME_RANGE (
+                time_column ts,
+            ),
+            start '2023-01-01',
+            end '2023-06-01'
+        );
+
+        SELECT 1::int AS a, 2::int AS b, now::timestamp as ts
+        """
+    )
+    model = load_sql_based_model(expressions)
+
+    assert model.start == "2023-01-01"
+    assert model.end == "2023-06-01"
+    assert model.interval_unit == IntervalUnit.DAY
+
+    with pytest.raises(ConfigError, match=".*Start date.+can't be greater than end date.*"):
+        load_sql_based_model(
+            d.parse(
+                f"""
+            MODEL (
+                name db.table,
+                kind INCREMENTAL_BY_TIME_RANGE (
+                    time_column ts,
+                ),
+                start '2024-01-01',
+                end '2023-06-01'
+            );
+
+            SELECT 1::int AS a, 2::int AS b, now::timestamp as ts
+            """
+            )
+        )
diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
@@ -10,6 +10,7 @@
     IncrementalByUniqueKeyKind,
     TimeColumn,
 )
+from sqlmesh.core.node import IntervalUnit
 from sqlmesh.core.scheduler import Scheduler, compute_interval_params
 from sqlmesh.core.snapshot import Snapshot, SnapshotEvaluator
 from sqlmesh.utils.date import to_datetime
@@ -234,3 +235,41 @@ def test_circuit_breaker(scheduler: Scheduler):
             "2022-01-30",
             circuit_breaker=lambda: True,
         )
+
+
+def test_intervals_with_end_date_on_model(mocker: MockerFixture, make_snapshot):
+    snapshot: Snapshot = make_snapshot(
+        SqlModel(
+            name="name",
+            kind=IncrementalByTimeRangeKind(time_column="ds", batch_size=1),
+            interval_unit=IntervalUnit.DAY,
+            start="2023-01-01",
+            end="2023-01-31",
+            query=parse_one("SELECT ds FROM parent.tbl"),
+        )
+    )
+
+    snapshot_evaluator = SnapshotEvaluator(adapter=mocker.MagicMock(), ddl_concurrent_tasks=1)
+    scheduler = Scheduler(
+        snapshots=[snapshot],
+        snapshot_evaluator=snapshot_evaluator,
+        state_sync=mocker.MagicMock(),
+        max_workers=2,
+        default_catalog=None,
+    )
+
+    # generate for 1 year to show that the returned batches should only cover
+    # the range defined on the model itself
+    batches = scheduler.batches(start="2023-01-01", end="2024-01-01")[snapshot]
+
+    assert len(batches) == 31  # days in Jan 2023
+    assert batches[0] == (to_datetime("2023-01-01"), to_datetime("2023-01-02"))
+    assert batches[-1] == (to_datetime("2023-01-31"), to_datetime("2023-02-01"))
+
+    # generate for less than 1 month to ensure that the scheduler end date
+    # takes precedence when it is earlier than the model end date
+    batches = scheduler.batches(start="2023-01-01", end="2023-01-10")[snapshot]
+
+    assert len(batches) == 10
+    assert batches[0] == (to_datetime("2023-01-01"), to_datetime("2023-01-02"))
+    assert batches[-1] == (to_datetime("2023-01-10"), to_datetime("2023-01-11"))