From eeade9695f5080138b155229a90ed08029f84043 Mon Sep 17 00:00:00 2001 From: Marko Lekic Date: Sun, 30 Nov 2025 12:52:56 +0100 Subject: [PATCH 1/3] Added first version of packages --- docs/Packages.md | 434 ++++++++++++ docs/examples/Packages_Demo.md | 345 +++++++++ examples/packages_demo/.gitignore | 2 + examples/packages_demo/main_project/Makefile | 80 +++ examples/packages_demo/main_project/README.md | 27 + .../packages_demo/main_project/env.dev_duckdb | 6 + .../main_project/models/README.md | 9 + .../marts/mart_users_from_git_package.ff.sql | 42 ++ .../marts/mart_users_from_package.ff.sql | 38 + .../packages_demo/main_project/packages.yml | 16 + .../packages_demo/main_project/profiles.yml | 6 + .../packages_demo/main_project/project.yml | 34 + .../main_project/seeds/README.md | 12 + .../main_project/seeds/seed_users.csv | 5 + .../packages_demo/main_project/sources.yml | 8 + .../main_project/tests/unit/README.md | 6 + .../shared_package/models/README.md | 14 + .../models/macros/shared_utils.sql | 6 + .../models/staging/users_base.ff.sql | 36 + .../packages_demo/shared_package/project.yml | 14 + .../models/macros/git_shared_utils.sql | 6 + .../models/staging/users_base_git.ff.sql | 38 + .../shared_package_git_remote/package.yml | 7 + pyproject.toml | 2 +- src/fastflowtransform/cli/__init__.py | 2 + src/fastflowtransform/cli/deps_cmd.py | 66 ++ src/fastflowtransform/config/packages.py | 135 ++++ src/fastflowtransform/core.py | 197 ++++- src/fastflowtransform/packages.py | 670 ++++++++++++++++++ uv.lock | 2 +- 30 files changed, 2239 insertions(+), 26 deletions(-) create mode 100644 docs/Packages.md create mode 100644 docs/examples/Packages_Demo.md create mode 100644 examples/packages_demo/.gitignore create mode 100644 examples/packages_demo/main_project/Makefile create mode 100644 examples/packages_demo/main_project/README.md create mode 100644 examples/packages_demo/main_project/env.dev_duckdb create mode 100644 examples/packages_demo/main_project/models/README.md 
create mode 100644 examples/packages_demo/main_project/models/marts/mart_users_from_git_package.ff.sql create mode 100644 examples/packages_demo/main_project/models/marts/mart_users_from_package.ff.sql create mode 100644 examples/packages_demo/main_project/packages.yml create mode 100644 examples/packages_demo/main_project/profiles.yml create mode 100644 examples/packages_demo/main_project/project.yml create mode 100644 examples/packages_demo/main_project/seeds/README.md create mode 100644 examples/packages_demo/main_project/seeds/seed_users.csv create mode 100644 examples/packages_demo/main_project/sources.yml create mode 100644 examples/packages_demo/main_project/tests/unit/README.md create mode 100644 examples/packages_demo/shared_package/models/README.md create mode 100644 examples/packages_demo/shared_package/models/macros/shared_utils.sql create mode 100644 examples/packages_demo/shared_package/models/staging/users_base.ff.sql create mode 100644 examples/packages_demo/shared_package/project.yml create mode 100644 examples/packages_demo/shared_package_git_remote/models/macros/git_shared_utils.sql create mode 100644 examples/packages_demo/shared_package_git_remote/models/staging/users_base_git.ff.sql create mode 100644 examples/packages_demo/shared_package_git_remote/package.yml create mode 100644 src/fastflowtransform/cli/deps_cmd.py create mode 100644 src/fastflowtransform/config/packages.py create mode 100644 src/fastflowtransform/packages.py diff --git a/docs/Packages.md b/docs/Packages.md new file mode 100644 index 0000000..e9936b3 --- /dev/null +++ b/docs/Packages.md @@ -0,0 +1,434 @@ +# Packages + +FastFlowTransform **packages** let you reuse models and macros across projects. + +A *package* is just another FFT project (or mini-project) whose `models/` you want to treat as if they were part of your current project. You wire it in via a `packages.yml` file in your project root. + +Typical use cases: + +* A shared **staging layer** (e.g. 
CRM / ERP cleaning) used by multiple teams. +* A central **macro library** (casting helpers, email parsing, date tricks). +* A “starter kit” of **canonical marts** that downstream projects can add on top of. + +--- + +## High-level behavior + +When you declare packages in `packages.yml`: + +1. FFT loads your **main project** as usual. +2. For each entry in `packages.yml`, FFT: + + * resolves the path on disk, + * reads that package’s `project.yml` (if present), + * loads its `models/` and macros. +3. All package models and macros are registered into the **same namespace** as your own. + +From inside your project you can: + +* `ref('users_base.ff')` even if `users_base.ff.sql` physically lives in `../shared_package/models/…`. +* Use macros defined under `shared_package/models/macros/*.sql` in your own models. + +> There is no special syntax for package references; once loaded, package models look like any other model. + +--- + +## 1. Minimal setup + +### 1.1. Create a reusable package + +A package looks like a regular FFT project, but you mainly care about its `models/` and macros. + +```text +shared_package/ + project.yml + models/ + macros/ + shared_utils.sql + staging/ + users_base.ff.sql +``` + +Example `project.yml` in the package: + +```yaml +name: shared_package +version: "0.1" +models_dir: models + +# (Optional) tests/docs/etc. are allowed here but are not special in the consumer. 
+vars: {} +tests: [] +``` + +Example macro (`models/macros/shared_utils.sql`): + +```jinja +{%- macro email_domain(expr) -%} + lower(regexp_replace({{ expr }}, '^.*@', '')) +{%- endmacro -%} +``` + +Example staging model (`models/staging/users_base.ff.sql`): + +```jinja +{{ config( + materialized='view', + tags=['shared:staging', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery'], +) }} + +with raw_users as ( + select + cast(id as integer) as user_id, + lower(email) as email, + cast(signup_date as date) as signup_date + from {{ source('crm', 'users') }} +) +select + user_id, + email, + {{ email_domain("email") }} as email_domain, + signup_date +from raw_users; +``` + +This package expects the **consumer project** to define `source('crm','users')`. + +--- + +### 1.2. Declare the package in your project + +In your main project: + +```text +my_project/ + project.yml + packages.yml ← new + models/ + seeds/ + … +``` + +Create `packages.yml`: + +```yaml +packages: + - name: shared_package + path: "../shared_package" + models_dir: "models" +``` + +* `name` + Logical name for the package (used for logs/diagnostics). Does *not* change how you `ref()` models. +* `path` + Filesystem location of the package folder, resolved **relative to the directory containing `packages.yml`**. +* `models_dir` (optional) + Subdirectory containing the package’s models. Defaults to `models` if omitted. + +--- + +### 1.3. 
Use package models in your project + +Now, in `my_project/models/marts/mart_users_from_package.ff.sql`: + +```jinja +{{ config( + materialized='table', + tags=['example:packages_demo', 'scope:mart', 'engine:duckdb'], +) }} + +with base as ( + select + email_domain, + signup_date + from {{ ref('users_base.ff') }} -- defined in the package +) +select + email_domain, + count(*) as user_count, + min(signup_date) as first_signup, + max(signup_date) as last_signup +from base +group by email_domain +order by email_domain; +``` + +Run as usual: + +```bash +fft seed . --env dev_duckdb +fft run . --env dev_duckdb +fft dag . --env dev_duckdb --html +``` + +The DAG will show: + +```text +crm.users (source) → users_base.ff (from package) → mart_users_from_package.ff (local) +``` + +--- + +## 2. `packages.yml` – configuration reference + +`packages.yml` must live in the **project root**, next to `project.yml`: + +```text +my_project/ + project.yml + packages.yml + models/ + … +``` + +Structure: + +```yaml +packages: + - name: # required + path: # required, relative or absolute path to the package root + models_dir: # optional, defaults to "models" +``` + +Notes: + +* `path` is resolved relative to `packages.yml`’s directory: + + * `../shared_package` → sibling folder + * `vendor/my_pkg` → subfolder +* `models_dir` allows you to keep a different structure in the package: + + * Example: `models_dir: "src/models"`. + +### Multiple packages + +You can declare multiple packages: + +```yaml +packages: + - name: shared_staging + path: "../shared_staging" + + - name: analytics_macros + path: "../analytics_macros" + models_dir: "macros_only" +``` + +All models/macros from all packages are loaded into the same project. + +--- + +## 3. What gets loaded (and what doesn’t) + +Currently, packages are focused on **models and macros**. 
+ +When FFT loads a package, it will: + +* Read the package’s `project.yml` (if present) for: + + * `name`, `version` (for metadata), + * `models_dir` (overridden by `packages.yml` if provided). +* Load: + + * SQL models (`*.ff.sql`) from the package’s `models_dir`. + * Python models (`*.ff.py`) from the package’s `models_dir`. + * SQL macros under `models_dir/macros/` (standard Jinja macro files). + * Python render-time helpers/macros if your core exposes them from the package (same mechanism as the main project). + +And it will **not**: + +* Load or execute the package’s `profiles.yml` – the consumer project’s profiles are always used. +* Automatically register package **seeds** or **sources**; those stay local to the consumer. +* Automatically run the package’s DQ tests; only tests declared in the **consumer project’s** `project.yml` are executed on `fft test`. + +> In practice, package models often still refer to `source('…')` or `ref('…')`. +> The *consumer* project is responsible for: +> +> * defining sources in its own `sources.yml`, and +> * wiring any extra seeds needed. + +--- + +## 4. Name resolution & conflicts + +### 4.1. Model names + +Once loaded, a package model is just a regular model in the registry: + +* It has a **logical name** (e.g. `users_base.ff`). +* Its file path and package association are recorded as metadata. + +Rules: + +* `ref('<model_name>')` has **no package prefix**. You always use the bare model name. +* Model names must be **globally unique** across: + + * your main project, + * all packages. + +If two models with the same name are found (e.g. `users_base.ff` in both main and package), FFT raises a clear error during project loading. You must rename or decide which one you want. + +### 4.2. Macros + +Macros from packages are injected into the **same Jinja environment** as your own macros: + +* Name collisions are possible. +* If two macros share the same name, whichever is registered last will “win”.
+ +Best practice: + +* Prefix shared macros with a **package-ish** prefix (e.g. `shared_email_domain`), or +* Group them in macro files you explicitly `{% import 'macros/shared_utils.sql' as shared %}` and then call `shared.email_domain()`. + +--- + +## 5. DAGs, caching, and manifests + +Once packages are loaded, the pipeline behaves like a single large project. + +### 5.1. DAG & docs + +* `fft dag` sees package models as part of the DAG. +* The generated HTML docs show: + + * nodes for package models, + * nodes for local models, + * edges between them. + +Package models typically carry an extra metadata field (`package_name`) used in the catalog/manifest; you can inspect `.fastflowtransform/target/manifest.json` if you want to differentiate them programmatically. + +### 5.2. Caching and fingerprints + +Build caching (`--cache`) treats package models like any other: + +* Fingerprints include: + + * SQL/Python source from the **package** file, + * environment vars, + * upstream dependencies, etc. +* If a package model’s code changes, its fingerprint changes, and: + + * that model will rebuild on the next run, + * downstream models (local or from other packages) will also rebuild if needed. + +### 5.3. Tests and selectors + +Selectors (`--select`, `--exclude`) are agnostic to package vs. local: + +* You can tag package models with `tags: ['shared:staging']` and run: + + ```bash + fft run . --env dev_duckdb --select tag:shared:staging + ``` + +* You can define DQ tests in your **main project**’s `project.yml` targeting package tables: + + ```yaml + tests: + - type: not_null + table: users_base + column: email + tags: [example:packages_demo] + ``` + +--- + +## 6. Best practices + +### 6.1. Keep packages stable and versioned + +Treat a shared package like a library: + +* Maintain a `version` in `project.yml`. +* Avoid backwards-incompatible changes without coordination: + + * e.g. dropping columns or changing semantics in shared staging models. 
+* Consider tagging or branching in Git to coordinate upgrades across consumers. + +(There’s no built-in package registry yet. For `path` packages you control which commit you point to via Git; git-based entries in `packages.yml` can additionally pin a `ref` and declare a `version` constraint.) + +### 6.2. Package responsibility + +A good rule of thumb: + +* **Package:** “What does *user* mean for us?” — common cleaning, typing, normalization, derivations (e.g. `email_domain`, `customer_segment`). +* **Consumer project:** “What do we need for *this* product/report?” — marts, joins across domains, project-specific logic. + +This keeps packages focused and low-churn. + +### 6.3. Avoid tight coupling to local schemas + +Shared packages shouldn’t depend on highly project-specific schemas or seeds. Instead: + +* Use `source('domain', 'table')` with generic names (“crm.users”, “billing.invoices”). +* Document in the package README what sources it expects. +* Let each consumer wire those sources to its concrete tables via its own `sources.yml`. + +### 6.4. Tag everything + +Give package models clear tags: + +```jinja +{{ config( + tags=[ + 'pkg:shared_package', + 'scope:staging', + 'engine:duckdb', + 'engine:postgres', + ], +) }} +``` + +Then consumers can: + +* include only `tag:pkg:shared_package` in some runs, +* or exclude them via `--exclude tag:pkg:shared_package` if they want to run only local marts. + +--- + +## 7. Common pitfalls & how to avoid them + +**❌ Package model fails: `source('crm','users')` not found** + +* You’re using a package model that references a source your main project hasn’t declared. +* Fix: add a matching `sources.yml` entry in your main project: + + ```yaml + version: 2 + sources: + - name: crm + tables: + - name: users + identifier: seed_users + ``` + +--- + +**❌ Duplicate model name between project and package** + +* You have `models/staging/users_base.ff.sql` locally **and** in the package. +* Fix: rename one of them, or drop the local one if you want to fully delegate to the package.
+ +--- + +**❌ Macro name collision** + +* Same macro name in package and project; behavior seems “random”. +* Fix: rename macros or use explicit `{% import %}` and call macros with a namespace alias. + +--- + +## Summary + +Packages let you: + +* Factor out **shared staging** and **macro libraries**. +* Reuse them across many projects via a simple `packages.yml`. +* Keep execution, caching, DAGs, and tests working as if everything were one project. + +The mental model is: + +> “My project + all packages = one big FastFlowTransform project +> where some models just happen to live in other directories.” + +As your internal ecosystem grows, you can introduce multiple packages (per domain, per team, per capability) and let downstream projects compose them like building blocks. diff --git a/docs/examples/Packages_Demo.md b/docs/examples/Packages_Demo.md new file mode 100644 index 0000000..ded14c2 --- /dev/null +++ b/docs/examples/Packages_Demo.md @@ -0,0 +1,345 @@ +# Packages Demo + +The **packages demo** shows how to split FastFlowTransform logic into a **reusable package** +and a **consumer project** that imports it via `packages.yml`. + +It answers: + +- How do I **share staging models** and macros across multiple projects? +- How does `packages.yml` work? +- How do I `ref()` a model that physically lives in another directory/tree? + +Use this as a template for building your own internal “FFT packages” repo. + +--- + +## Layout + +The example lives under: + +```text +examples/packages_demo/ + shared_package/ # reusable package + main_project/ # normal FFT project that consumes the package +``` + +### `shared_package` – reusable code + +This folder behaves like a **mini FFT project**: + +```text +shared_package/ + project.yml + models/ + README.md + macros/ + shared_utils.sql + staging/ + users_base.ff.sql +``` + +It includes: + +* `project.yml` – minimal config so FFT knows how to load its models. +* `models/macros/shared_utils.sql` – shared SQL macros (e.g.
`email_domain(expr)`). +* `models/staging/users_base.ff.sql` – a reusable staging model that: + + * reads `source('crm', 'users')` + * normalizes emails + * derives `email_domain`. + +You **do not** call `fft run shared_package` directly in this demo. +Instead, `shared_package` is loaded by the consumer project via `packages.yml`. + +### `main_project` – normal FFT project + +```text +main_project/ + env.dev_duckdb + Makefile + README.md + profiles.yml + project.yml + packages.yml + sources.yml + models/ + README.md + marts/ + mart_users_from_package.ff.sql + seeds/ + README.md + seed_users.csv + tests/ + unit/ + README.md +``` + +This is a regular project: + +* `profiles.yml` – DuckDB connection profile (`dev_duckdb`). +* `env.dev_duckdb` – points DuckDB at `.local/packages_demo.duckdb`. +* `seeds/seed_users.csv` + `sources.yml` – define `source('crm','users')`. +* `packages.yml` – declares the dependency on `../shared_package`. +* `models/marts/mart_users_from_package.ff.sql` – a local mart that does: + + ```jinja + from {{ ref('users_base.ff') }} + ``` + + where `users_base.ff` is defined in the **package**, not in `main_project`. + +--- + +## Key concepts + +### 1. Declaring packages – `packages.yml` + +In `main_project/packages.yml`: + +```yaml +packages: + - name: shared_package + path: "../shared_package" + models_dir: "models" +``` + +* `name` – logical package name (for logging / internal bookkeeping). +* `path` – where to find the package directory, relative to `packages.yml`. +* `models_dir` – where to look for models inside the package (defaults to `models` if omitted). + +At load time, the core: + +1. Reads `packages.yml`. +2. For each entry, loads its `project.yml` (if present) and its `models/`. +3. Registers all models/macros from the package **into the same namespace** as local models.
+ +From the perspective of `main_project`, `users_base.ff` looks just like any other model — +you can `ref('users_base.ff')` without caring that it physically lives in `../shared_package`. + +### 2. Referencing package models with `ref()` + +In `main_project/models/marts/mart_users_from_package.ff.sql`: + +```jinja +with base as ( + select + email_domain, + signup_date + from {{ ref('users_base.ff') }} +) +select + email_domain, + count(*) as user_count, + min(signup_date) as first_signup, + max(signup_date) as last_signup +from base +group by email_domain +order by email_domain; +``` + +* `users_base.ff` is defined in `shared_package/models/staging/users_base.ff.sql`. +* Because the package is registered, `ref('users_base.ff')` resolves correctly. +* The DAG includes both the package model and the local mart. + +**Important:** model names must still be globally unique. If you define a model with the same name +in both the package and the project, you’ll get a conflict (which is what you want). + +### 3. Shared macros from a package + +The package ships a simple macro file: + +```jinja +-- shared_package/models/macros/shared_utils.sql +{%- macro email_domain(expr) -%} + lower(regexp_replace({{ expr }}, '^.*@', '')) +{%- endmacro -%} +``` + +`users_base.ff` uses it: + +```jinja +select + user_id, + email, + {{ email_domain("email") }} as email_domain, + signup_date +from raw_users; +``` + +Because macros are loaded from both the main project and all packages into the same Jinja environment: + +* models in the **package** can use macros from the package, +* models in the **main project** can also use those macros if you want, subject to naming rules. 
+ +--- + +## Data flow + +The demo intentionally mirrors the basic_demo pipeline but splits staging into a package: + +```text +(main_project) seeds/seed_users.csv + │ + ├─ fft seed + ▼ + seed_users (DuckDB table via sources.yml → crm.users) + │ + ├─ shared_package/models/staging/users_base.ff.sql + │ (materialized view) + ▼ + users_base + │ + ├─ main_project/models/marts/mart_users_from_package.ff.sql + ▼ + mart_users_from_package +``` + +The DAG (after a run) will roughly show: + +```text +crm.users (source) → users_base.ff (package) → mart_users_from_package.ff (main_project) +``` + +--- + +## Running the demo + +From the repo root: + +```bash +cd examples/packages_demo/main_project +``` + +### 1. Configure DuckDB env + +```bash +set -a; source env.dev_duckdb; set +a +# This sets: +# FF_DUCKDB_PATH=.local/packages_demo.duckdb +# FFT_ACTIVE_ENV=dev_duckdb +# FF_ENGINE=duckdb +``` + +### 2. Run the full demo + +```bash +make demo ENGINE=duckdb +``` + +This will: + +1. **clean** – drop local artifacts and DuckDB file via `cleanup_env.py`. + +2. **seed** – `fft seed . --env dev_duckdb`: + + * loads `seeds/seed_users.csv` into DuckDB as `seed_users`. + +3. **run** – `fft run . --env dev_duckdb` with: + + ```bash + --select tag:example:packages_demo --select tag:engine:duckdb + ``` + + Only models tagged for this example are built: + + * `users_base.ff` (from `shared_package`) + * `mart_users_from_package.ff` (from `main_project`) + +4. **dag** – `fft dag . --env dev_duckdb --html`: + + * writes HTML docs to `main_project/site/dag/index.html`. + +5. **test** – `fft test . --env dev_duckdb`: + + * runs DQ tests from `project.yml` (`not_null`, `unique`, `greater_equal`). + +6. **artifacts** – prints paths to `manifest.json`, `run_results.json`, `catalog.json`. + +### 3.
Inspect results + +* DAG HTML: + + ```text + examples/packages_demo/main_project/site/dag/index.html + ``` + +* Artifacts: + + ```text + examples/packages_demo/main_project/.fastflowtransform/target/manifest.json + examples/packages_demo/main_project/.fastflowtransform/target/run_results.json + examples/packages_demo/main_project/.fastflowtransform/target/catalog.json + ``` + +* DuckDB file (if you want to open it manually): + + ```text + examples/packages_demo/main_project/.local/packages_demo.duckdb + ``` + +--- + +## What this demo demonstrates + +1. **Package loading** + + `packages.yml` allows you to point at **another tree of models** and macros and load them as if they were local. + +2. **Shared staging layers** + + You can move “standardized” staging code (sources, cleaning, type-casting, email normalization, etc.) + into a central `shared_package` and reuse it from multiple projects. + +3. **Consistent naming** + + Since packaged models live in the same logical namespace, you get early feedback if two projects try to + define a model with the same name. + +4. **Separation of concerns** + + * Package: stable, reused logic (e.g. `users_base`). + * Main project: business-specific marts and reporting (`mart_users_from_package`). + +--- + +## Things to try + +To understand packages better, experiment with: + +1. **Breaking the shared model** + + * Edit `shared_package/models/staging/users_base.ff.sql` (e.g. remove `email_domain`). + * Re-run `make demo`. + * Watch `mart_users_from_package` fail because the column is missing — proving the dependency goes through the package. + +2. **Adding a new shared macro** + + * Add a `country_label(expr)` macro in `shared_utils.sql`. + * Use it in a *local* model inside `main_project` to see that macros from the package are visible in the consumer. + +3. **Adding another package** + + * Create `examples/packages_demo/another_package` with its own `project.yml` and models. 
+ * Extend `main_project/packages.yml` with a second entry and confirm both package’s models appear in the DAG. + +4. **Introducing a naming conflict** + + * Define a model named `users_base.ff` inside `main_project/models/staging`. + * Reload the project; you should get a clear error about duplicate model names, which is your cue to rename or explicitly choose one. + +--- + +## Summary + +The packages demo is a minimal, concrete example of: + +* Defining a reusable FastFlowTransform **package** (`shared_package`). +* Wiring it into a **consumer project** (`main_project`) via `packages.yml`. +* Building a mart that depends on a model defined outside of the project tree. +* Running everything through the normal `fft seed`, `fft run`, `fft dag`, and `fft test` workflow. + +You can adopt the same pattern to share: + +* Standard staging layers (CRM / ERP / web analytics), +* Macro libraries (date helpers, casting utilities), +* Even entire mini-marts that represent common dimensional models across teams. diff --git a/examples/packages_demo/.gitignore b/examples/packages_demo/.gitignore new file mode 100644 index 0000000..4718b0e --- /dev/null +++ b/examples/packages_demo/.gitignore @@ -0,0 +1,2 @@ +.fastflowtransform/packages/** +packages.lock.yml \ No newline at end of file diff --git a/examples/packages_demo/main_project/Makefile b/examples/packages_demo/main_project/Makefile new file mode 100644 index 0000000..6221c81 --- /dev/null +++ b/examples/packages_demo/main_project/Makefile @@ -0,0 +1,80 @@ +.PHONY: seed run test dag show artifacts clean demo help + +# --- Configuration ----------------------------------------------------------- + +DB ?= .local/packages_demo.duckdb +PROJECT ?= . 
+UV ?= uv + +# Only DuckDB is wired for this demo +ENGINE ?= duckdb + +PROFILE_ENV = dev_duckdb +ENGINE_TAG = engine:duckdb + +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) +RUN_ENV = $(BASE_ENV) + +SELECT_FLAGS = --select tag:example:packages_demo --select tag:$(ENGINE_TAG) + +SHOW_MODEL ?= mart_users_from_package + +# Cleanup script path (relative to main_project/) +CLEAN_SCRIPT = ../../_scripts/cleanup_env.py + +CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) \ + --engine duckdb --env "$(PROFILE_ENV)" --project "$(PROJECT)" \ + --duckdb-path "$(DB)" + +# --- Targets ---------------------------------------------------------------- + +help: + @echo "FastFlowTransform Packages Demo" + @echo "Targets:" + @echo " make seed ENGINE=$(ENGINE)" + @echo " make run ENGINE=$(ENGINE)" + @echo " make dag ENGINE=$(ENGINE)" + @echo " make test ENGINE=$(ENGINE)" + @echo " make show ENGINE=$(ENGINE) SHOW_MODEL=$(SHOW_MODEL)" + @echo " make demo ENGINE=$(ENGINE)" + @echo " make clean ENGINE=$(ENGINE)" + @echo + @echo "Variables: DB=$(DB) PROJECT=$(PROJECT) UV=$(UV) ENGINE=$(ENGINE)" + +seed: + env $(BASE_ENV) $(UV) run fft seed "$(PROJECT)" --env $(PROFILE_ENV) + +run: + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) + +test: + env $(BASE_ENV) $(UV) run fft test "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) + +dag: + env $(RUN_ENV) $(UV) run fft dag "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) --html + +show: + @if [ -f "$(PROJECT)/site/dag/index.html" ]; then \ + $(OPENER) "$(PROJECT)/site/dag/index.html" 2>/dev/null || echo "Open manually at: $(PROJECT)/site/dag/index.html"; \ + else \ + echo "No HTML found: $(PROJECT)/site/dag/index.html"; \ + fi + +artifacts: + @echo + @echo "== 📦 Artifacts ==" + @echo " $(PROJECT)/.fastflowtransform/target/{manifest.json,run_results.json,catalog.json}" + @echo " DAG HTML: $(PROJECT)/site/dag/index.html" + +clean: + $(CLEAN_CMD) + +demo: clean + @echo "== 🚀 Packages Demo 
($(ENGINE)) ==" + @echo "Profile=$(PROFILE_ENV) PROJECT=$(PROJECT)" + +$(MAKE) seed ENGINE=$(ENGINE) + +$(MAKE) run ENGINE=$(ENGINE) + +$(MAKE) dag ENGINE=$(ENGINE) + +$(MAKE) test ENGINE=$(ENGINE) + +$(MAKE) artifacts + @echo "✅ Demo complete." diff --git a/examples/packages_demo/main_project/README.md b/examples/packages_demo/main_project/README.md new file mode 100644 index 0000000..021b0a1 --- /dev/null +++ b/examples/packages_demo/main_project/README.md @@ -0,0 +1,27 @@ +# packages_demo main project + +This is a normal FastFlowTransform project that **consumes** models +and macros from: + +- a **local path package** at `../shared_package` +- optionally, a **git-based package** declared in `packages.yml` + +Key pieces: + +- `packages.yml` – declares `shared_package` (local path) and shows an example + git package entry. +- `profiles.yml` – DuckDB connection profile. +- `seeds/seed_users.csv` + `sources.yml` – seed + source for CRM users. +- `models/marts/mart_users_from_package.ff.sql` – uses `ref('users_base.ff')` + where `users_base.ff` lives in the shared package. 
+ +Typical workflow: + +```bash +cd examples/packages_demo/main_project + +# Configure env (DuckDB path, engine) +set -a; source env.dev_duckdb; set +a + +# Run the full demo on DuckDB +make demo ENGINE=duckdb diff --git a/examples/packages_demo/main_project/env.dev_duckdb b/examples/packages_demo/main_project/env.dev_duckdb new file mode 100644 index 0000000..7bc24fc --- /dev/null +++ b/examples/packages_demo/main_project/env.dev_duckdb @@ -0,0 +1,6 @@ +# DuckDB profile for the packages_demo main project +FF_DUCKDB_PATH=.local/packages_demo.duckdb + +# Optional: set active env for convenience +FFT_ACTIVE_ENV=dev_duckdb +FF_ENGINE=duckdb diff --git a/examples/packages_demo/main_project/models/README.md b/examples/packages_demo/main_project/models/README.md new file mode 100644 index 0000000..2c0fbb4 --- /dev/null +++ b/examples/packages_demo/main_project/models/README.md @@ -0,0 +1,9 @@ +# Main project models + +This project keeps its own models under `models/` and consumes an additional +staging model `users_base.ff` from `../shared_package` via `packages.yml`. + +Local models: + +- `marts/mart_users_from_package.ff.sql` – aggregates over the packaged + `users_base.ff` model. 
diff --git a/examples/packages_demo/main_project/models/marts/mart_users_from_git_package.ff.sql b/examples/packages_demo/main_project/models/marts/mart_users_from_git_package.ff.sql new file mode 100644 index 0000000..221e708 --- /dev/null +++ b/examples/packages_demo/main_project/models/marts/mart_users_from_git_package.ff.sql @@ -0,0 +1,42 @@ +-- examples/packages_demo/main_project/models/marts/mart_users_from_git_package.ff.sql + +{{ config( + materialized='table', + tags=[ + 'example:packages_demo', + 'scope:mart', + 'origin:git_package', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + 'engine:bigquery' + ], +) }} + +{#- + This mart depends on the staging model defined in the *git* package: + + examples/packages_demo/shared_package_git_remote/models/staging/users_base_git.ff.sql + + It is referenced by its model name: + + ref('users_base_git.ff') +-#} + +with base as ( + select + email_domain, + signup_date, + source + from {{ ref('users_base_git.ff') }} +) + +select + email_domain, + count(*) as user_count, + min(signup_date) as first_signup, + max(signup_date) as last_signup, + max(source) as last_source_flag +from base +group by email_domain +order by email_domain; diff --git a/examples/packages_demo/main_project/models/marts/mart_users_from_package.ff.sql b/examples/packages_demo/main_project/models/marts/mart_users_from_package.ff.sql new file mode 100644 index 0000000..d1e9c03 --- /dev/null +++ b/examples/packages_demo/main_project/models/marts/mart_users_from_package.ff.sql @@ -0,0 +1,38 @@ +{{ config( + materialized='table', + tags=[ + 'example:packages_demo', + 'scope:mart', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + 'engine:bigquery' + ], +) }} + +{#- + This mart lives in the main project, but depends on a staging model + defined in the shared_package: + + ../shared_package/models/staging/users_base.ff.sql + + The relation is referenced by its model name: + + ref('users_base.ff') +-#} + +with base as 
( + select + email_domain, + signup_date + from {{ ref('users_base.ff') }} +) + +select + email_domain, + count(*) as user_count, + min(signup_date) as first_signup, + max(signup_date) as last_signup +from base +group by email_domain +order by email_domain; diff --git a/examples/packages_demo/main_project/packages.yml b/examples/packages_demo/main_project/packages.yml new file mode 100644 index 0000000..73007c8 --- /dev/null +++ b/examples/packages_demo/main_project/packages.yml @@ -0,0 +1,16 @@ +# examples/packages_demo/main_project/packages.yml +packages: + # Local path package + - name: shared_package + path: "../shared_package" + models_dir: "models" + + # Git-based package (same repo, different subdir) + - name: shared_package_git + git: "https://github.com/fftlabs/fastflowtransform.git" + subdir: "examples/packages_demo/shared_package_git_remote" + models_dir: "models" + # optional: use explicit ref / branch / tag + ref: "main" + # optional: version constraint matched against shared_package_git_remote/project.yml + version: ">=0.1.0,<0.2.0" diff --git a/examples/packages_demo/main_project/profiles.yml b/examples/packages_demo/main_project/profiles.yml new file mode 100644 index 0000000..5051e88 --- /dev/null +++ b/examples/packages_demo/main_project/profiles.yml @@ -0,0 +1,6 @@ +# Connection profile for the packages_demo main project (DuckDB only) + +dev_duckdb: + engine: duckdb + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/packages_demo.duckdb') }}" diff --git a/examples/packages_demo/main_project/project.yml b/examples/packages_demo/main_project/project.yml new file mode 100644 index 0000000..44e3b4f --- /dev/null +++ b/examples/packages_demo/main_project/project.yml @@ -0,0 +1,34 @@ +# Main project configuration for the packages demo. + +name: packages_demo_main +version: "0.1" +models_dir: models + +docs: + dag_dir: site/dag + +vars: {} + +tests: + # Sanity checks to show that models from the package + local mart build correctly. 
+ + - type: not_null + table: users_base + column: user_id + tags: [example_packages_demo] + + - type: unique + table: users_base + column: user_id + tags: [example_packages_demo] + + - type: not_null + table: mart_users_from_package + column: email_domain + tags: [example_packages_demo] + + - type: greater_equal + table: mart_users_from_package + column: user_count + threshold: 0 + tags: [example_packages_demo] diff --git a/examples/packages_demo/main_project/seeds/README.md b/examples/packages_demo/main_project/seeds/README.md new file mode 100644 index 0000000..112d503 --- /dev/null +++ b/examples/packages_demo/main_project/seeds/README.md @@ -0,0 +1,12 @@ +# Seeds directory + +`seed_users.csv` provides the CRM users source for the packages demo. +It is materialized as `seed_users` by `fft seed`, then referenced in +`sources.yml` as: + +```yaml +sources: + - name: crm + tables: + - name: users + identifier: seed_users diff --git a/examples/packages_demo/main_project/seeds/seed_users.csv b/examples/packages_demo/main_project/seeds/seed_users.csv new file mode 100644 index 0000000..825bac1 --- /dev/null +++ b/examples/packages_demo/main_project/seeds/seed_users.csv @@ -0,0 +1,5 @@ +id,email,signup_date +1,alice@example.com,2024-01-05 +2,bob@example.net,2024-02-11 +3,carol@example.org,2024-02-27 +4,dave@example.com,2024-03-03 diff --git a/examples/packages_demo/main_project/sources.yml b/examples/packages_demo/main_project/sources.yml new file mode 100644 index 0000000..7866e1e --- /dev/null +++ b/examples/packages_demo/main_project/sources.yml @@ -0,0 +1,8 @@ +version: 2 + +sources: + - name: crm + tables: + - name: users + identifier: seed_users + description: Sample users table seeded from seeds/seed_users.csv. 
diff --git a/examples/packages_demo/main_project/tests/unit/README.md b/examples/packages_demo/main_project/tests/unit/README.md new file mode 100644 index 0000000..39567da --- /dev/null +++ b/examples/packages_demo/main_project/tests/unit/README.md @@ -0,0 +1,6 @@ +# Unit tests + +You can add YAML-based unit test specs here (see docs/Config_and_Macros.md#model-unit-tests-fft-utest) +and run them with: + + fft utest . --env dev_duckdb diff --git a/examples/packages_demo/shared_package/models/README.md b/examples/packages_demo/shared_package/models/README.md new file mode 100644 index 0000000..44de956 --- /dev/null +++ b/examples/packages_demo/shared_package/models/README.md @@ -0,0 +1,14 @@ +# shared_package models + +This folder contains reusable models and macros that can be imported +into any FastFlowTransform project via `packages.yml`. + +Contents: + +- `macros/shared_utils.sql` – SQL Jinja macros (e.g. `email_domain(expr)`). +- `staging/users_base.ff.sql` – a simple staging model that normalizes users + and derives `email_domain`, intended to be referenced as: + + {{ ref('users_base.ff') }} + +from a consuming project. diff --git a/examples/packages_demo/shared_package/models/macros/shared_utils.sql b/examples/packages_demo/shared_package/models/macros/shared_utils.sql new file mode 100644 index 0000000..52d6ed9 --- /dev/null +++ b/examples/packages_demo/shared_package/models/macros/shared_utils.sql @@ -0,0 +1,6 @@ +{# Shared SQL macros for the package #} + +{%- macro email_domain(expr) -%} + -- Extract domain part from an email address in a simple, portable way. 
+ lower(regexp_replace({{ expr }}, '^.*@', '')) +{%- endmacro -%} diff --git a/examples/packages_demo/shared_package/models/staging/users_base.ff.sql b/examples/packages_demo/shared_package/models/staging/users_base.ff.sql new file mode 100644 index 0000000..9190123 --- /dev/null +++ b/examples/packages_demo/shared_package/models/staging/users_base.ff.sql @@ -0,0 +1,36 @@ +{{ config( + materialized='view', + tags=[ + 'example:packages_demo', + 'scope:staging', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + 'engine:bigquery' + ], +) }} + +{#- + users_base.ff is a reusable staging model that expects a source: + sources.yml → crm.users (identifier: seed_users) + + It: + - casts id → user_id + - normalizes email + - derives email_domain using a shared macro +-#} + +with raw_users as ( + select + cast(id as integer) as user_id, + lower(email) as email, + cast(signup_date as date) as signup_date + from {{ source('crm', 'users') }} +) + +select + user_id, + email, + {{ email_domain("email") }} as email_domain, + signup_date +from raw_users; diff --git a/examples/packages_demo/shared_package/project.yml b/examples/packages_demo/shared_package/project.yml new file mode 100644 index 0000000..1925a4f --- /dev/null +++ b/examples/packages_demo/shared_package/project.yml @@ -0,0 +1,14 @@ +# Minimal project-style config for the shared package. +# This folder is *not* run directly; it is loaded via packages.yml +# from the consumer project in ./main_project. 
+ +name: shared_package +version: "0.1" + +# Where models and macros live within the package directory +models_dir: models + +# Optional: project-level vars or tests local to the package +vars: {} + +tests: [] diff --git a/examples/packages_demo/shared_package_git_remote/models/macros/git_shared_utils.sql b/examples/packages_demo/shared_package_git_remote/models/macros/git_shared_utils.sql new file mode 100644 index 0000000..1bf039c --- /dev/null +++ b/examples/packages_demo/shared_package_git_remote/models/macros/git_shared_utils.sql @@ -0,0 +1,6 @@ +{# Macros for the git-based package #} + +{%- macro git_email_domain(expr) -%} + -- Slightly different macro than the local package + lower(regexp_replace({{ expr }}, '^.*@', '')) +{%- endmacro -%} diff --git a/examples/packages_demo/shared_package_git_remote/models/staging/users_base_git.ff.sql b/examples/packages_demo/shared_package_git_remote/models/staging/users_base_git.ff.sql new file mode 100644 index 0000000..5ee07c5 --- /dev/null +++ b/examples/packages_demo/shared_package_git_remote/models/staging/users_base_git.ff.sql @@ -0,0 +1,38 @@ +{{ config( + materialized='view', + tags=[ + 'example:packages_demo', + 'scope:staging', + 'origin:git_package', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + 'engine:bigquery' + ], +) }} + +{#- + users_base_git.ff is the git-backed variant of users_base.ff + + It: + - casts id → user_id + - normalizes email + - derives email_domain using the *git* macro git_email_domain() + - adds a literal column "source" so it’s obvious which package it came from +-#} + +with raw_users as ( + select + cast(id as integer) as user_id, + lower(email) as email, + cast(signup_date as date) as signup_date + from {{ source('crm', 'users') }} +) + +select + user_id, + email, + {{ git_email_domain("email") }} as email_domain, + signup_date, + 'git_package' as source +from raw_users; diff --git a/examples/packages_demo/shared_package_git_remote/package.yml 
b/examples/packages_demo/shared_package_git_remote/package.yml new file mode 100644 index 0000000..5d63919 --- /dev/null +++ b/examples/packages_demo/shared_package_git_remote/package.yml @@ -0,0 +1,7 @@ +name: shared_package_git +version: "0.1.0" +models_dir: models + +vars: {} + +tests: [] diff --git a/pyproject.toml b/pyproject.toml index d03ff07..cdf6cbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "fastflowtransform" -version = "0.6.10" +version = "0.6.11" description = "Python framework for SQL & Python data transformation, ETL pipelines, and dbt-style data modeling" readme = "README.md" license = { text = "Apache-2.0" } diff --git a/src/fastflowtransform/cli/__init__.py b/src/fastflowtransform/cli/__init__.py index 0b0abec..7f22b7f 100644 --- a/src/fastflowtransform/cli/__init__.py +++ b/src/fastflowtransform/cli/__init__.py @@ -18,6 +18,7 @@ ) from fastflowtransform.cli.ci_cmd import register as _register_ci from fastflowtransform.cli.dag_cmd import dag, register as _register_dag +from fastflowtransform.cli.deps_cmd import register as _register_deps from fastflowtransform.cli.docgen_cmd import docgen, register as _register_docgen from fastflowtransform.cli.docs_utils import ( _build_docs_manifest, @@ -134,6 +135,7 @@ def main( _register_snapshot(app) _register_source(app) _register_ci(app) +_register_deps(app) __all__ = [ diff --git a/src/fastflowtransform/cli/deps_cmd.py b/src/fastflowtransform/cli/deps_cmd.py new file mode 100644 index 0000000..6dbabde --- /dev/null +++ b/src/fastflowtransform/cli/deps_cmd.py @@ -0,0 +1,66 @@ +# fastflowtransform/cli/deps_cmd.py +from __future__ import annotations + +import typer + +from fastflowtransform.cli.bootstrap import _resolve_project_path +from fastflowtransform.cli.options import ProjectArg +from fastflowtransform.logging import echo +from fastflowtransform.packages import resolve_packages + + +def deps(project: ProjectArg = ".") -> None: + 
""" + Show packages configured in packages.yml and basic status checks. + + - Resolves project directory (must contain models/). + - Parses packages.yml (if present). + - For each package, resolves its base path and models_dir location. + - Prints a short report and exits with non-zero status when something is missing. + """ + proj = _resolve_project_path(project) + + try: + pkgs = resolve_packages(proj) + except Exception as exc: # pragma: no cover - config error path + raise typer.BadParameter(f"Failed to resolve packages: {exc}") from exc + + echo(f"Project: {proj}") + + if not pkgs: + echo("No packages configured (packages.yml not found or empty).") + raise typer.Exit(0) + + echo("Packages:") + missing = 0 + + for pkg in pkgs: + models_root = pkg.root / pkg.models_dir + status = "OK" + if not models_root.exists(): + status = "MISSING: models_dir not found" + missing += 1 + + echo(f" - {pkg.name} ({pkg.version})") + echo(f" kind: {pkg.source.kind}") + if pkg.source.kind == "path": + echo(f" path: {pkg.root}") + else: + echo(f" git: {pkg.source.git}") + echo(f" rev: {pkg.source.rev}") + if pkg.source.subdir: + echo(f" subdir: {pkg.source.subdir}") + echo(f" models_dir: {pkg.models_dir} -> {models_root}") + echo(f" status: {status}") + + raise typer.Exit(1 if missing else 0) + + +def register(app: typer.Typer) -> None: + app.command( + name="deps", + help="Show configured packages from packages.yml and their local status.", + )(deps) + + +__all__ = ["deps", "register"] diff --git a/src/fastflowtransform/config/packages.py b/src/fastflowtransform/config/packages.py new file mode 100644 index 0000000..85f689b --- /dev/null +++ b/src/fastflowtransform/config/packages.py @@ -0,0 +1,135 @@ +# fastflowtransform/config/packages.py +from __future__ import annotations + +from collections.abc import Mapping +from pathlib import Path +from typing import Any + +import yaml +from pydantic import BaseModel, ConfigDict, Field, model_validator + + +class PackageSpec(BaseModel): + 
""" + One entry from packages.yml, for example: + + packages: + - name: fft_utils + path: "../fft_utils" + models_dir: "models" + + Or (shorthand mapping form): + + fft_utils: "../fft_utils" + """ + + model_config = ConfigDict(extra="forbid") + + name: str + # Exactly one of `path` or `git` must be set. + path: str | None = None + git: str | None = None + + # Optional git parameters (ignored for path-based packages). + rev: str | None = None + tag: str | None = None + branch: str | None = None + subdir: str | None = None + + # Where models live inside the package root (default: "models"). + # This can be overridden by the package's own project.yml (models_dir), + # but packages.yml always wins if set explicitly. + models_dir: str = "models" + + # Optional constraint for the package's manifest version (semver expression). + # Example: ">=1.0.0,<2.0.0" + version: str | None = None + + @model_validator(mode="after") + def _validate_source(self) -> PackageSpec: + """ + Ensure that exactly one of `path` or `git` is set. + Older configs that only have `path` remain valid. + """ + has_path = bool(self.path) + has_git = bool(self.git) + if has_path == has_git: + raise ValueError( + f"Package '{self.name}': exactly one of 'path' or 'git' must be set " + "in packages.yml." + ) + return self + + +class PackagesConfig(BaseModel): + """ + Top-level representation of packages.yml. + + We accept two shapes: + + 1) Explicit: + + packages: + - name: fft_utils + path: "../fft_utils" + models_dir: "models" + + 2) Shorthand mapping: + + fft_utils: "../fft_utils" + other_pkg: + path: "../other" + models_dir: "dbt_models" + """ + + model_config = ConfigDict(extra="forbid") + + packages: list[PackageSpec] = Field(default_factory=list) + + +def _normalize_raw_packages(raw: Any) -> dict[str, Any]: + """ + Normalize the various accepted YAML shapes into: + + {"packages": [ {name, path?|git?, models_dir?, ...}, ... 
]} + """ + if raw is None: + return {"packages": []} + + # Case 1: already a list -> treat as `packages: [...]` + if isinstance(raw, list): + return {"packages": raw} + + # Case 2: mapping with explicit 'packages' key + if isinstance(raw, Mapping): + if "packages" in raw: + return {"packages": raw["packages"] or []} + + # Case 3: shorthand mapping name -> path or dict + pkgs: list[dict[str, Any]] = [] + for name, cfg in raw.items(): + if isinstance(cfg, str): + # shorthand "pkg: ../path" + pkgs.append({"name": str(name), "path": cfg}) + elif isinstance(cfg, Mapping): + d = dict(cfg) + d.setdefault("name", str(name)) + pkgs.append(d) + return {"packages": pkgs} + + raise TypeError("packages.yml must be a list or a mapping") + + +def load_packages_config(project_dir: Path) -> PackagesConfig: + """ + Read packages.yml under `project_dir` and return a strict PackagesConfig. + + If the file does not exist, we return an empty config (no packages). + """ + cfg_path = project_dir / "packages.yml" + if not cfg_path.exists(): + return PackagesConfig() + + raw = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {} + norm = _normalize_raw_packages(raw) + return PackagesConfig.model_validate(norm) diff --git a/src/fastflowtransform/core.py b/src/fastflowtransform/core.py index 19be198..e80c7cb 100644 --- a/src/fastflowtransform/core.py +++ b/src/fastflowtransform/core.py @@ -21,6 +21,7 @@ from fastflowtransform import storage from fastflowtransform.config.models import validate_model_meta_strict +from fastflowtransform.config.packages import PackageSpec, load_packages_config from fastflowtransform.config.project import HookSpec, parse_project_yaml_config from fastflowtransform.config.sources import load_sources_config from fastflowtransform.errors import ( @@ -29,6 +30,7 @@ ModuleLoadError, ) from fastflowtransform.logging import get_logger +from fastflowtransform.packages import ResolvedPackage, resolve_packages def _validate_py_model_signature(func: Callable, deps: 
list[str], *, path: Path, name: str) -> None: @@ -132,6 +134,8 @@ class Node: path: Path deps: list[str] = field(default_factory=list) meta: dict[str, Any] = field(default_factory=dict) + # which package this node came from (None = main project) + package: str | None = None class Registry: @@ -148,6 +152,13 @@ def __init__(self): self.active_engine: str | None = None self.incremental_models: dict[str, dict[str, Any]] = {} + # package manager state + # - self.packages: raw specs from packages.yml (config layer) + # - self.resolved_packages: git/path + manifest + version/deps + self.packages: list[PackageSpec] = [] + self.resolved_packages: list[ResolvedPackage] = [] + self.package_model_roots: list[Path] = [] + # global hooks from project.yml self.on_run_start_hooks: list[HookSpec] = [] self.on_run_end_hooks: list[HookSpec] = [] @@ -282,26 +293,107 @@ def _should_register_for_engine(self, meta: Mapping[str, Any], *, path: Path) -> ) return current in allowed + def _resolve_package_base_path(self, project_dir: Path, pkg: PackageSpec) -> Path: + """ + BACKCOMPAT helper for path-based packages - still used by some tests + and external callers. For git-based packages this is not used. + """ + base = Path(pkg.path or "") + if not base.is_absolute(): + base = (project_dir / base).resolve() + return base + + def _load_packages_yaml(self, project_dir: Path) -> None: + """ + Load packages.yml into: + - self.packages (raw config specs) + - self.resolved_packages (git/path + manifest + versions/deps) + - self.package_model_roots (actual model roots to load from) + + Missing / empty packages.yml → no packages. + """ + logger = get_logger("registry") + + # Config layer (kept for backwards-compatibility / inspection). + try: + cfg = load_packages_config(project_dir) + except Exception as exc: + raise ValueError(f"Failed to parse packages.yml: {exc}") from exc + + self.packages = list(cfg.packages or []) + + # Runtime resolver: git clone, manifest loading, version/deps validation. 
+ try: + self.resolved_packages = resolve_packages(project_dir, cfg) + except Exception as exc: + # Surface as a clear configuration error + raise ValueError(f"Failed to resolve packages: {exc}") from exc + + # Compute model roots from resolved packages (and warn if missing). + self.package_model_roots = [] + for pkg in self.resolved_packages: + models_root = pkg.root / pkg.models_dir + if not models_root.exists(): + logger.warning( + "Package '%s': models_dir '%s' not found at %s; package will be ignored.", + pkg.name, + pkg.models_dir, + models_root, + ) + continue + if not models_root.is_dir(): + logger.warning( + ( + "Package '%s': models_dir '%s' is not " + "a directory at %s; package will be ignored." + ), + pkg.name, + pkg.models_dir, + models_root, + ) + continue + self.package_model_roots.append(models_root) + def load_project(self, project_dir: Path) -> None: """Load a FastFlowTransform project from the given directory.""" self._reset_registry_state() self.project_dir = project_dir models_dir = project_dir / "models" - self._init_jinja_env(models_dir) - # macros first, because models may use them + # 1) packages.yml: resolve package model roots + self._load_packages_yaml(project_dir) + + # 2) Jinja env with multi-root loader (packages + project) + all_model_roots: list[Path] = [] + if self.package_model_roots: + all_model_roots.extend(self.package_model_roots) + all_model_roots.append(models_dir) + self._init_jinja_env(all_model_roots) + + # 3) macros: packages first, then project (project can override) + for pkg_root in self.package_model_roots: + self._load_macros(pkg_root) + self._load_py_macros(pkg_root) self._load_macros(models_dir) self._load_py_macros(models_dir) + # 4) project.yml + sources.yml self._load_sources_yaml(project_dir) self._load_project_yaml(project_dir) - # discover models + # 5) Discover models: packages first, then project + for pkg in self.resolved_packages: + models_root = pkg.root / pkg.models_dir + if not models_root.is_dir(): 
+ continue + self._discover_sql_models(models_root, package=pkg.name) + self._discover_python_models(models_root, package=pkg.name) + self._discover_sql_models(models_dir) self._discover_python_models(models_dir) - # final validation + # 6) Validate deps self._validate_dependencies() def _reset_registry_state(self) -> None: @@ -315,6 +407,11 @@ def _reset_registry_state(self) -> None: self.macros.clear() self.incremental_models = {} + # packages + self.packages = [] + self.resolved_packages = [] + self.package_model_roots = [] + # reset storage maps storage.set_model_storage({}) storage.set_seed_storage({}) @@ -325,19 +422,22 @@ def _reset_registry_state(self) -> None: self.before_model_hooks = [] self.after_model_hooks = [] - def _init_jinja_env(self, models_dir: Path) -> None: + def _init_jinja_env(self, models_dirs: Path | list[Path]) -> None: """Initialize the Jinja environment for this project.""" + if isinstance(models_dirs, Path): + search_paths = [str(models_dirs)] + else: + search_paths = [str(p) for p in models_dirs] + self.env = Environment( - loader=FileSystemLoader(str(models_dir)), + loader=FileSystemLoader(search_paths), undefined=StrictUndefined, autoescape=False, trim_blocks=True, lstrip_blocks=True, ) - # ---- Make project vars & helpers available in Jinja ---- - # Note: these callables close over `self`, so they always read the - # latest self.cli_vars / self.project_vars even after project.yml loads. 
+ # --- Jinja helpers: var(), engine(), env() --- def _var(key: str, default: Any | None = None) -> Any: # CLI --vars override project vars if isinstance(self.cli_vars, dict) and key in self.cli_vars: @@ -362,7 +462,7 @@ def _env(name: str, default: Any | None = None) -> Any: self.env.filters["var"] = _var self.env.filters["env"] = _env - # SQL literal helper for models *and* hooks + # Export sql_literal as filter as well self.env.filters["sql_literal"] = sql_literal def _load_sources_yaml(self, project_dir: Path) -> None: @@ -429,7 +529,7 @@ def _load_project_yaml(self, project_dir: Path) -> None: self.before_model_hooks = [] self.after_model_hooks = [] - def _discover_sql_models(self, models_dir: Path) -> None: + def _discover_sql_models(self, models_dir: Path, *, package: str | None = None) -> None: """Scan *.ff.sql files, parse config, validate meta, and register nodes.""" for path in models_dir.rglob("*.ff.sql"): name = path.stem @@ -484,9 +584,16 @@ def _discover_sql_models(self, models_dir: Path) -> None: if not self._should_register_for_engine(meta, path=path): continue - self._add_node_or_fail(name, "sql", path, deps, meta=meta) + self._add_node_or_fail( + name, + "sql", + path, + deps, + meta=meta, + package=package, + ) - def _discover_python_models(self, models_dir: Path) -> None: + def _discover_python_models(self, models_dir: Path, *, package: str | None = None) -> None: """Scan *.ff.py files, import them, validate meta, and register decorated callables.""" for path in models_dir.rglob("*.ff.py"): # Import the module so decorators can register functions @@ -565,7 +672,14 @@ def _discover_python_models(self, models_dir: Path) -> None: meta = cfg.model_dump(exclude_none=True) # Register node - self._add_node_or_fail(name, kind, path, deps, meta=meta) + self._add_node_or_fail( + name, + kind, + path, + deps, + meta=meta, + package=package, + ) # Required-columns spec (for executors) stays as before req = getattr(func, "__ff_require__", None) @@ -653,18 
+767,53 @@ def _load_py_module(self, path: Path) -> types.ModuleType: return mod def _add_node_or_fail( - self, name: str, kind: str, path: Path, deps: list[str], *, meta: dict[str, Any] + self, + name: str, + kind: str, + path: Path, + deps: list[str], + *, + meta: dict[str, Any], + package: str | None = None, ) -> None: - if name in self.nodes: - other = self.nodes[name].path - raise ModuleLoadError( - "Duplicate model name detected:\n" - f"• alredy registered: {other}\n" - f"• new model: {path}\n" - "Hint: Rename one of the models (file name = node name)" - "or use @model(name='…') for Python." + """ + Register a new DAG node. + + Conflict resolution: + - If both existing and new nodes are from the main project (package=None): + raise an error (as before). + - If at least one of them comes from a package: + last wins, with a warning. + """ + logger = get_logger("registry") + existing = self.nodes.get(name) + if existing: + if existing.package is None and package is None: + other = existing.path + raise ModuleLoadError( + "Duplicate model name detected:\n" + f"• already registered: {other}\n" + f"• new model: {path}\n" + "Hint: Rename one of the models or use @model(name='…') for Python.", + ) + + logger.warning( + "Model name conflict for '%s': %s (package=%s) overrides %s (package=%s).", + name, + path, + package or "", + existing.path, + existing.package or "", ) - self.nodes[name] = Node(name=name, kind=kind, path=path, deps=deps, meta=meta) + + self.nodes[name] = Node( + name=name, + kind=kind, + path=path, + deps=deps, + meta=meta, + package=package, + ) def _scan_sql_deps(self, path: Path) -> list[str]: txt = path.read_text(encoding="utf-8") diff --git a/src/fastflowtransform/packages.py b/src/fastflowtransform/packages.py new file mode 100644 index 0000000..21ac195 --- /dev/null +++ b/src/fastflowtransform/packages.py @@ -0,0 +1,670 @@ +# fastflowtransform/packages.py +from __future__ import annotations + +import re +import subprocess +from 
@dataclass
class LockedSource:
    """
    Pinned source information for one package, as written to packages.lock.yml.
    """

    kind: str  # "path" | "git"
    path: str | None = None
    git: str | None = None
    rev: str | None = None
    subdir: str | None = None

    def to_mapping(self) -> dict[str, Any]:
        """Return a plain dict holding ``kind`` plus every optional field that is not None."""
        optional_fields = (
            ("path", self.path),
            ("git", self.git),
            ("rev", self.rev),
            ("subdir", self.subdir),
        )
        mapping: dict[str, Any] = {"kind": self.kind}
        mapping.update((key, value) for key, value in optional_fields if value is not None)
        return mapping
def resolve_packages(
    project_dir: Path,
    cfg: PackagesConfig | None = None,
) -> list[ResolvedPackage]:
    """
    Resolve all packages declared in packages.yml for the given project.

    For every spec the package source is materialized (a local path, or a git
    checkout below ``<project_dir>/.fastflowtransform/packages``), its
    manifest is loaded and the following is validated:

    * manifest.name matches spec.name
    * manifest.fft_version (if declared) is satisfied by FFT_VERSION
    * spec.version (if declared) is satisfied by manifest.version
    * package names are unique
    * inter-package dependencies are satisfied by the resolved set

    Afterwards packages.lock.yml is written with the pinned sources.

    Args:
        project_dir: Project root; expanded and resolved before use.
        cfg: Already parsed packages.yml. When ``None`` it is loaded from
            ``project_dir``.

    Returns:
        The resolved packages sorted by name, or ``[]`` when packages.yml is
        missing or declares no packages.

    Raises:
        RuntimeError: on a package name mismatch, an unsatisfied FFT or
            package version constraint, or a duplicate package name.
    """
    project_dir = Path(project_dir).expanduser().resolve()

    if cfg is None:
        cfg = load_packages_config(project_dir)

    specs: list[PackageSpec] = list(cfg.packages or [])
    if not specs:
        return []

    cache_dir = project_dir / ".fastflowtransform" / "packages"
    cache_dir.mkdir(parents=True, exist_ok=True)

    manifests_by_name: dict[str, PackageManifest] = {}
    resolved_by_name: dict[str, ResolvedPackage] = {}

    for spec in specs:
        root = _materialize_package_source(project_dir, cache_dir, spec)
        manifest = _load_package_manifest(root)

        # Name check: spec.name must match manifest.name
        if spec.name and manifest.name != spec.name:
            raise RuntimeError(
                f"Package name mismatch for spec '{spec.name}': "
                f"manifest reports '{manifest.name}' in {root / 'project.yml'}."
            )

        # Check FFT core compatibility if declared in manifest
        if manifest.fft_version and not version_satisfies(FFT_VERSION, manifest.fft_version):
            raise RuntimeError(
                f"Package '{manifest.name}' ({manifest.version}) "
                f"requires FFT version '{manifest.fft_version}', "
                f"but running '{FFT_VERSION}'."
            )

        # Check that the spec's own version constraint (if any) is satisfied
        if spec.version and not version_satisfies(manifest.version, spec.version):
            raise RuntimeError(
                f"Package '{manifest.name}' has version {manifest.version} "
                f"but spec requires '{spec.version}'."
            )

        if manifest.name in manifests_by_name:
            other = manifests_by_name[manifest.name]
            raise RuntimeError(
                f"Duplicate package name '{manifest.name}' loaded from:\n"
                f" - {other.root}\n"
                f" - {root}\n"
                "Package names must be globally unique."
            )

        manifests_by_name[manifest.name] = manifest

        # Resolve models_dir. packages.yml wins only when models_dir was set
        # there explicitly: PackageSpec defaults the field to "models", so a
        # plain `spec.models_dir or manifest.models_dir` would never reach the
        # manifest's own value.
        explicitly_set = "models_dir" in getattr(spec, "model_fields_set", ())
        if explicitly_set and spec.models_dir:
            models_dir = spec.models_dir
        else:
            models_dir = manifest.models_dir or spec.models_dir or "models"

        locked_src = _lock_source_for_spec(spec, root)
        resolved_by_name[manifest.name] = ResolvedPackage(
            name=manifest.name,
            version=manifest.version,
            root=root,
            models_dir=models_dir,
            source=locked_src,
            manifest=manifest,
        )

    # Validate dependencies (only across the set of resolved packages)
    _validate_package_dependencies(resolved_by_name)

    # Write lock file. Intended as best-effort.
    # NOTE(review): confirm _write_lock_file handles its own I/O errors; this
    # call does not guard against them.
    _write_lock_file(project_dir, list(resolved_by_name.values()))

    # Return in deterministic order
    return sorted(resolved_by_name.values(), key=lambda p: p.name)
version (required) + - fft_version (optional) + - dependencies (optional) + - models_dir (optional override; if absent, we default to 'models') + """ + path = root / "project.yml" + if not path.exists(): + raise RuntimeError(f"Package root {root} has no project.yml") + + with path.open("r", encoding="utf8") as f: + data = yaml.safe_load(f) or {} + + name = str(data.get("name") or "").strip() + if not name: + raise RuntimeError(f"{path}: missing 'name' field") + + version = str(data.get("version") or "").strip() + if not version: + raise RuntimeError(f"{path}: missing 'version' field") + + fft_version = data.get("fft_version") + if fft_version is not None: + fft_version = str(fft_version).strip() or None + + models_dir = data.get("models_dir") + if models_dir is not None: + models_dir = str(models_dir).strip() or None + + deps_data = data.get("dependencies") or [] + deps: list[PackageDependency] = [] + if deps_data: + if not isinstance(deps_data, list): + raise RuntimeError(f"{path}: 'dependencies' must be a list if present.") + for d in deps_data: + if not isinstance(d, Mapping): + raise RuntimeError(f"{path}: dependency entries must be mappings, got {d!r}") + dep_name = str(d.get("name") or "").strip() + if not dep_name: + raise RuntimeError(f"{path}: dependency entry missing 'name'") + vc = d.get("version") + opt = bool(d.get("optional", False)) + deps.append( + PackageDependency( + name=dep_name, + version_constraint=str(vc) if vc else None, + optional=opt, + raw=d, + ) + ) + + return PackageManifest( + name=name, + version=version, + fft_version=fft_version, + dependencies=deps, + models_dir=models_dir, + raw=data, + root=root, + ) + + +def _materialize_package_source( + project_dir: Path, + cache_dir: Path, + spec: PackageSpec, +) -> Path: + """ + Turn a PackageSpec into a concrete directory on disk. 
+ + - path packages → project_dir / path (must exist) + - git packages → cloned/updated repo in cache_dir, then optional subdir + """ + if spec.path is not None: + base = Path(spec.path) + if not base.is_absolute(): + base = (project_dir / base).resolve() + if not base.exists(): + raise RuntimeError(f"Package '{spec.name}': path not found: {base}") + if not base.is_dir(): + raise RuntimeError(f"Package '{spec.name}': path is not a directory: {base}") + log.debug("Using path package '%s' at %s", spec.name, base) + return base + + # git package + if not spec.git: + raise RuntimeError( + f"Package '{spec.name}' must specify either 'path' or 'git' in packages.yml." + ) + + repo_root = _ensure_git_repo(cache_dir, spec) + root = (repo_root / spec.subdir).resolve() if spec.subdir else repo_root + + if not root.exists(): + raise RuntimeError( + f"Package '{spec.name}': subdir '{spec.subdir}' within repo does not exist " + f"(root: {repo_root})" + ) + if not root.is_dir(): + raise RuntimeError( + f"Package '{spec.name}': subdir '{spec.subdir}' is not a directory (root: {repo_root})" + ) + + log.debug("Using git package '%s' at %s", spec.name, root) + return root + + +def _ensure_git_repo(cache_dir: Path, spec: PackageSpec) -> Path: + """ + Ensure we have a local clone for the given git package in cache_dir and + return the checked-out directory (HEAD at rev/tag/branch). + """ + assert spec.git + git_root = cache_dir / "git" + git_root.mkdir(parents=True, exist_ok=True) + + repo_slug = _slug_git_url(spec.git, spec.name) + repo_dir = git_root / repo_slug / "repo" + repo_dir.parent.mkdir(parents=True, exist_ok=True) + + if not repo_dir.exists(): + echo(f"Cloning package '{spec.name}' from {spec.git} ...") + _run_git(["clone", "--no-tags", "--quiet", spec.git, str(repo_dir)]) + else: + # best-effort fetch; ignore errors (offline etc.) 
+ try: + _run_git(["-C", str(repo_dir), "fetch", "--all", "--quiet"]) + except Exception as exc: # pragma: no cover + log.debug("Git fetch failed for %s: %s", spec.git, exc) + + ref = spec.rev or spec.tag or spec.branch or "HEAD" + echo(f"Checking out {spec.name}@{ref} ...") + _run_git(["-C", str(repo_dir), "checkout", "--quiet", ref]) + + return repo_dir + + +def _run_git(args: list[str]) -> None: + try: + subprocess.run(["git", *args], check=True, capture_output=True) + except FileNotFoundError as exc: # pragma: no cover + raise RuntimeError( + "git executable not found. Git-based packages require git to be installed and on PATH." + ) from exc + except subprocess.CalledProcessError as exc: + raise RuntimeError( + f"git command failed: git {' '.join(args)}\n" + f"stdout:\n{exc.stdout.decode(errors='ignore')}\n" + f"stderr:\n{exc.stderr.decode(errors='ignore')}" + ) from exc + + +def _slug_git_url(url: str, name: str) -> str: + base = f"{name}@{url}" + return re.sub(r"[^A-Za-z0-9_.-]", "_", base) + + +def _lock_source_for_spec(spec: PackageSpec, root: Path) -> LockedSource: + if spec.path is not None: + return LockedSource( + kind="path", + path=str(root), + ) + + # git - try to figure out the concrete commit SHA at HEAD + rev = spec.rev + if spec.git: + try: + result = subprocess.run( + ["git", "-C", str(root), "rev-parse", "HEAD"], check=True, capture_output=True + ) + head = result.stdout.decode().strip() + if head: + rev = head + except Exception: # pragma: no cover - best-effort + pass + + return LockedSource( + kind="git", + git=spec.git, + rev=rev, + subdir=spec.subdir, + ) + + +# --------------------------------------------------------------------------- +# Dependency validation +# --------------------------------------------------------------------------- + + +def _validate_package_dependencies(pkgs: Mapping[str, ResolvedPackage]) -> None: + """ + Enforce that package-level dependencies are satisfied by the resolved set. 
+ + There is exactly one version per package name in the current project. + Dependencies simply assert that: + + - a package with `name` exists, and + - its manifest.version satisfies the declared version constraint. + """ + for pkg in pkgs.values(): + for dep in pkg.manifest.dependencies: + target = pkgs.get(dep.name) + if not target: + if dep.optional: + log.debug( + "Optional dependency '%s' of package '%s' not present; skipping.", + dep.name, + pkg.name, + ) + continue + raise RuntimeError( + f"Package '{pkg.name}' depends on '{dep.name}', " + "but no package with that name is declared in packages.yml." + ) + + if dep.version_constraint and not version_satisfies( + target.version, dep.version_constraint + ): + raise RuntimeError( + f"Package '{pkg.name}' requires '{dep.name}' " + f"with version '{dep.version_constraint}', " + f"but resolved version is '{target.version}'." + ) + + +# --------------------------------------------------------------------------- +# Lock file IO +# --------------------------------------------------------------------------- + + +def _write_lock_file(project_dir: Path, packages: list[ResolvedPackage]) -> None: + lock_path = project_dir / "packages.lock.yml" + + entries = [ + LockEntry( + name=pkg.name, + version=pkg.version, + source=pkg.source, + ) + for pkg in packages + ] + + lock = LockFile( + fft_version=FFT_VERSION, + entries=entries, + ) + + try: + with lock_path.open("w", encoding="utf8") as f: + yaml.safe_dump(lock.to_mapping(), f, sort_keys=False) + log.debug("Wrote packages.lock.yml with %d entries", len(entries)) + except Exception as exc: # pragma: no cover + log.warning("Failed to write packages.lock.yml: %s", exc) + + +# --------------------------------------------------------------------------- +# Very small semver helper (x.y.z only, with optional -suffix) +# --------------------------------------------------------------------------- + + +_SEMVER_RE = re.compile(r"^(\d+)\.(\d+)\.(\d+)(?:[-+].*)?$") + + +def 
parse_version(v: str) -> tuple[int, int, int]: + """ + Parse a very simple semver string 'MAJOR.MINOR.PATCH'. + + We ignore pre-release / build metadata; they are treated as equal. + + Raises ValueError if we cannot parse. + """ + m = _SEMVER_RE.match(v.strip()) + if not m: + raise ValueError(f"Invalid semantic version (expected 'x.y.z'): {v!r}") + return int(m.group(1)), int(m.group(2)), int(m.group(3)) + + +def compare_versions(a: str, b: str) -> int: + """ + Compare two version strings. + + <0: a < b + 0: a == b + >0: a > b + """ + a_t = parse_version(a) + b_t = parse_version(b) + if a_t < b_t: + return -1 + if a_t > b_t: + return 1 + return 0 + + +def _expand_caret(v: str) -> str: + """ + ^1.2.3 → >=1.2.3,<2.0.0 + ^0.3.0 → >=0.3.0,<0.4.0 + ^0.0.4 → >=0.0.4,<0.0.5 + """ + major, minor, patch = parse_version(v) + if major > 0: + upper = f"{major + 1}.0.0" + elif minor > 0: + upper = f"0.{minor + 1}.0" + else: + upper = f"0.0.{patch + 1}" + return f">={v},<{upper}" + + +def _expand_tilde(v: str) -> str: + """ + ~1.2.3 → >=1.2.3,<1.3.0 + """ + major, minor, patch = parse_version(v) + upper = f"{major}.{minor + 1}.0" + norm = f"{major}.{minor}.{patch}" + return f">={norm},<{upper}" + + +def _parse_constraints(expr: str) -> list[tuple[str, str]]: + """ + Parse a constraint expression into (op, version) pairs. + + Supported forms (combined with commas or spaces): + + "1.2.3" -> ==1.2.3 + ">=1.2.0,<2.0.0" + ">1.0.0 <=2.0.0" + "^1.2.3" + "~1.4.0" + + Returns a list of (op, version), where op ∈ { "==", "!=", ">", "<", ">=", "<=" }. 
+ """ + expr = expr.strip() + if not expr: + return [] + + # Expand ^ and ~ first (they return comma-joined ranges) + if expr.startswith("^"): + expr = _expand_caret(expr[1:].strip()) + elif expr.startswith("~"): + expr = _expand_tilde(expr[1:].strip()) + + parts = re.split(r"[,\s]+", expr) + out: list[tuple[str, str]] = [] + for p in parts: + _p = p.strip() + if not _p: + continue + m = re.match(r"^(>=|<=|==|!=|>|<)?\s*(\d+\.\d+\.\d+)$", _p) + if not m: + # bare version "1.2.3" means "==1.2.3" + if _SEMVER_RE.match(p): + op = "==" + v = _p + else: + raise ValueError(f"Invalid version constraint token: {_p!r}") + else: + op = m.group(1) or "==" + v = m.group(2) + out.append((op, v)) + return out + + +def version_satisfies(actual: str, constraint: str | None) -> bool: + """ + Return True iff a version string 'actual' satisfies a constraint expression. + + Empty / None constraint always returns True. + """ + if not constraint: + return True + checks = _parse_constraints(constraint) + for op, target in checks: + cmp = compare_versions(actual, target) + if op == "==": + if cmp != 0: + return False + elif op == "!=": + if cmp == 0: + return False + elif op == ">": + if cmp <= 0: + return False + elif op == "<": + if cmp >= 0: + return False + elif op == ">=": + if cmp < 0: + return False + elif op == "<=": + if cmp > 0: + return False + else: # pragma: no cover + raise ValueError(f"Unknown operator in version constraint: {op!r}") + return True diff --git a/uv.lock b/uv.lock index dfa5e7a..31076da 100644 --- a/uv.lock +++ b/uv.lock @@ -733,7 +733,7 @@ wheels = [ [[package]] name = "fastflowtransform" -version = "0.6.10" +version = "0.6.11" source = { editable = "." 
} dependencies = [ { name = "duckdb" }, From a523f90bab3c9e6293a4d7a327be646521bbce06 Mon Sep 17 00:00:00 2001 From: Marko Lekic Date: Sun, 30 Nov 2025 13:04:56 +0100 Subject: [PATCH 2/3] Fixed project.yml in packages_demo --- .../packages_demo/main_project/packages.yml | 2 +- .../{package.yml => project.yml} | 0 src/fastflowtransform/config/packages.py | 29 +++++++++++++++++-- 3 files changed, 28 insertions(+), 3 deletions(-) rename examples/packages_demo/shared_package_git_remote/{package.yml => project.yml} (100%) diff --git a/examples/packages_demo/main_project/packages.yml b/examples/packages_demo/main_project/packages.yml index 73007c8..79ca5df 100644 --- a/examples/packages_demo/main_project/packages.yml +++ b/examples/packages_demo/main_project/packages.yml @@ -11,6 +11,6 @@ packages: subdir: "examples/packages_demo/shared_package_git_remote" models_dir: "models" # optional: use explicit ref / branch / tag - ref: "main" + ref: "v0.6.11" # optional: version constraint matched against shared_package_git_remote/project.yml version: ">=0.1.0,<0.2.0" diff --git a/examples/packages_demo/shared_package_git_remote/package.yml b/examples/packages_demo/shared_package_git_remote/project.yml similarity index 100% rename from examples/packages_demo/shared_package_git_remote/package.yml rename to examples/packages_demo/shared_package_git_remote/project.yml diff --git a/src/fastflowtransform/config/packages.py b/src/fastflowtransform/config/packages.py index 85f689b..8849acc 100644 --- a/src/fastflowtransform/config/packages.py +++ b/src/fastflowtransform/config/packages.py @@ -1,4 +1,3 @@ -# fastflowtransform/config/packages.py from __future__ import annotations from collections.abc import Mapping @@ -21,16 +20,33 @@ class PackageSpec(BaseModel): Or (shorthand mapping form): fft_utils: "../fft_utils" + + For git-based packages: + + - name: shared_package_git + git: "https://github.com/org/repo.git" + subdir: "path/inside/repo" + # one of the revision selectors below is 
optional: + # - ref: "main" (generic alias, mapped to `rev`) + # - rev: "abc1234" (commit SHA) + # - tag: "v1.2.3" + # - branch: "main" + # models_dir: "models" # optional, default "models" """ model_config = ConfigDict(extra="forbid") name: str + # Exactly one of `path` or `git` must be set. path: str | None = None git: str | None = None # Optional git parameters (ignored for path-based packages). + # + # "ref" is a user-facing alias (branch/tag/commit); internally we map it to `rev` + # if no more-specific selector (rev/tag/branch) is provided. + ref: str | None = None rev: str | None = None tag: str | None = None branch: str | None = None @@ -49,15 +65,24 @@ class PackageSpec(BaseModel): def _validate_source(self) -> PackageSpec: """ Ensure that exactly one of `path` or `git` is set. - Older configs that only have `path` remain valid. + Also treat `ref` as a generic alias for `rev` when no other + more-specific selector (rev/tag/branch) is given. """ has_path = bool(self.path) has_git = bool(self.git) + if has_path == has_git: + # Either both set or both unset → error. raise ValueError( f"Package '{self.name}': exactly one of 'path' or 'git' must be set " "in packages.yml." ) + + # If user provided a generic `ref` but no explicit rev/tag/branch, + # map it to `rev` so downstream resolver can just look at rev/tag/branch. 
+ if self.ref and not (self.rev or self.tag or self.branch): + self.rev = self.ref + return self From 97e71e49f14691a3abac8fcbb6cbe121b1c481d5 Mon Sep 17 00:00:00 2001 From: Marko Lekic Date: Sun, 30 Nov 2025 14:41:57 +0100 Subject: [PATCH 3/3] Updated packages feature + package demo --- docs/Packages.md | 615 ++++++++++++------ docs/examples/Packages_Demo.md | 436 +++++++++---- examples/.env.dev_postgres | 3 - .../site/dag/events_base.ff.html | 161 ----- .../site/dag/fct_events_py_incremental.html | 148 ----- .../site/dag/fct_events_sql_inline.ff.html | 148 ----- .../site/dag/fct_events_sql_yaml.ff.html | 148 ----- examples/incremental_demo/site/dag/index.html | 326 ---------- examples/packages_demo/.gitignore | 2 - .../{env.dev_duckdb => .env.dev_duckdb} | 0 examples/packages_demo/main_project/Makefile | 4 + .../main_project/packages.lock.yml | 14 + .../packages_demo/main_project/packages.yml | 2 +- mkdocs.yml | 2 + src/fastflowtransform/cli/deps_cmd.py | 28 +- src/fastflowtransform/packages.py | 104 ++- tests/integration/examples/config.py | 8 + tests/unit/cli/test_deps | 81 +++ ...onfig_hook.py => test_config_hook_unit.py} | 0 .../unit/config/test_packages_config_unit.py | 116 ++++ tests/unit/test_packages_unit.py | 346 ++++++++++ 21 files changed, 1433 insertions(+), 1259 deletions(-) delete mode 100644 examples/.env.dev_postgres delete mode 100644 examples/incremental_demo/site/dag/events_base.ff.html delete mode 100644 examples/incremental_demo/site/dag/fct_events_py_incremental.html delete mode 100644 examples/incremental_demo/site/dag/fct_events_sql_inline.ff.html delete mode 100644 examples/incremental_demo/site/dag/fct_events_sql_yaml.ff.html delete mode 100644 examples/incremental_demo/site/dag/index.html delete mode 100644 examples/packages_demo/.gitignore rename examples/packages_demo/main_project/{env.dev_duckdb => .env.dev_duckdb} (100%) create mode 100644 examples/packages_demo/main_project/packages.lock.yml create mode 100644 
tests/unit/cli/test_deps rename tests/unit/config/{test_config_hook.py => test_config_hook_unit.py} (100%) create mode 100644 tests/unit/config/test_packages_config_unit.py create mode 100644 tests/unit/test_packages_unit.py diff --git a/docs/Packages.md b/docs/Packages.md index e9936b3..3d026a6 100644 --- a/docs/Packages.md +++ b/docs/Packages.md @@ -10,34 +10,60 @@ Typical use cases: * A central **macro library** (casting helpers, email parsing, date tricks). * A “starter kit” of **canonical marts** that downstream projects can add on top of. +Packages can come from: + +* A **local path** on disk. +* A **git repository** (with optional branch/tag/commit + subdir). + --- -## High-level behavior +## 1. High-level behavior When you declare packages in `packages.yml`: 1. FFT loads your **main project** as usual. -2. For each entry in `packages.yml`, FFT: - * resolves the path on disk, - * reads that package’s `project.yml` (if present), - * loads its `models/` and macros. -3. All package models and macros are registered into the **same namespace** as your own. +2. It runs the **package resolver**: + + * For each entry in `packages.yml` it: + + * locates the package: + + * local: resolves the `path` on disk. + * git: clones/fetches a repo into `.fastflowtransform/packages` and checks out the requested ref. + * reads that package’s **`project.yml`** (the package manifest): + + * `name`, `version`, optional `fft_version`, optional `dependencies`, optional `models_dir`. + * validates: + + * manifest `name` matches the `name` from `packages.yml`. + * `fft_version` (if present) is compatible with the running FFT version. + * the spec’s `version` constraint (if present) is satisfied by `manifest.version`. + * package dependencies (if declared) are satisfied by other packages. + * Writes a `packages.lock.yml` with **pinned** sources (paths / git commit SHAs). + +3. 
For each **resolved package**, FFT: + + * decides which directory to treat as its `models_dir`: + + * `packages.yml:models_dir` overrides `project.yml:models_dir`, default `"models"`. + * loads SQL / Python models and macros from that directory. + * registers them into the **same namespace** as your own models. From inside your project you can: -* `ref('users_base.ff')` even if `users_base.ff.sql` physically lives in `../shared_package/models/…`. -* Use macros defined under `shared_package/models/macros/*.sql` in your own models. +* `ref('users_base.ff')` even if `users_base.ff.sql` physically lives in a package folder. +* Use macros defined in `models/macros/*.sql` inside a package. > There is no special syntax for package references; once loaded, package models look like any other model. --- -## 1. Minimal setup +## 2. Minimal setup -### 1.1. Create a reusable package +### 2.1. Create a reusable package -A package looks like a regular FFT project, but you mainly care about its `models/` and macros. +A package is structured like a normal FFT project, but consumers mainly care about its `models/` and macros. ```text shared_package/ @@ -49,21 +75,30 @@ shared_package/ users_base.ff.sql ``` -Example `project.yml` in the package: +Example `project.yml` in the **package**: ```yaml name: shared_package version: "0.1" + +# Where this package’s models live relative to the package root. +# This can be overridden by packages.yml in the consumer project. models_dir: models -# (Optional) tests/docs/etc. are allowed here but are not special in the consumer. -vars: {} -tests: [] +# Optional: constrain which FFT core versions can use this package. +# If omitted, any FFT version is allowed. +fft_version: ">=0.6.0,<0.7.0" + +# Optional: dependencies on other packages (by name) if you compose packages. +# These are validated against the set of packages declared in the consumer’s packages.yml. 
+dependencies: [] ``` Example macro (`models/macros/shared_utils.sql`): ```jinja +{# Shared SQL macros for the package #} + {%- macro email_domain(expr) -%} lower(regexp_replace({{ expr }}, '^.*@', '')) {%- endmacro -%} @@ -74,21 +109,29 @@ Example staging model (`models/staging/users_base.ff.sql`): ```jinja {{ config( materialized='view', - tags=['shared:staging', 'engine:duckdb', 'engine:postgres', 'engine:databricks_spark', 'engine:bigquery'], + tags=[ + 'pkg:shared_package', + 'scope:staging', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + 'engine:bigquery', + ], ) }} with raw_users as ( - select - cast(id as integer) as user_id, - lower(email) as email, - cast(signup_date as date) as signup_date - from {{ source('crm', 'users') }} + select + cast(id as integer) as user_id, + lower(email) as email, + cast(signup_date as date) as signup_date + from {{ source('crm', 'users') }} ) + select - user_id, - email, - {{ email_domain("email") }} as email_domain, - signup_date + user_id, + email, + {{ email_domain("email") }} as email_domain, + signup_date from raw_users; ``` @@ -96,7 +139,7 @@ This package expects the **consumer project** to define `source('crm','users')`. --- -### 1.2. Declare the package in your project +### 2.2. Declare the package in your project In your main project: @@ -114,20 +157,20 @@ Create `packages.yml`: ```yaml packages: - name: shared_package - path: "../shared_package" - models_dir: "models" + path: "../shared_package" # resolved relative to this file + models_dir: "models" # optional; defaults to "models" ``` * `name` - Logical name for the package (used for logs/diagnostics). Does *not* change how you `ref()` models. + Logical name for the package, taken from the package’s own `project.yml`. This must match the manifest; it’s used for logs, diagnostics, and dependency checks. * `path` - Filesystem location of the package folder, resolved **relative to the directory containing `packages.yml`**. 
+ Filesystem location of the **package root**, resolved relative to the directory containing `packages.yml`. * `models_dir` (optional) - Subdirectory containing the package’s models. Defaults to `models` if omitted. + Subdirectory within the package root that contains the package’s models. Defaults to `models`. If both `project.yml:models_dir` and `packages.yml:models_dir` are set, **`packages.yml` wins**. --- -### 1.3. Use package models in your project +### 2.3. Use package models in your project Now, in `my_project/models/marts/mart_users_from_package.ff.sql`: @@ -138,16 +181,17 @@ Now, in `my_project/models/marts/mart_users_from_package.ff.sql`: ) }} with base as ( - select - email_domain, - signup_date - from {{ ref('users_base.ff') }} -- defined in the package + select + email_domain, + signup_date + from {{ ref('users_base.ff') }} -- defined in the shared_package ) + select - email_domain, - count(*) as user_count, - min(signup_date) as first_signup, - max(signup_date) as last_signup + email_domain, + count(*) as user_count, + min(signup_date) as first_signup, + max(signup_date) as last_signup from base group by email_domain order by email_domain; @@ -161,7 +205,7 @@ fft run . --env dev_duckdb fft dag . --env dev_duckdb --html ``` -The DAG will show: +The DAG will show something like: ```text crm.users (source) → users_base.ff (from package) → mart_users_from_package.ff (local) @@ -169,7 +213,7 @@ crm.users (source) → users_base.ff (from package) → mart_users_from_package. --- -## 2. `packages.yml` – configuration reference +## 3. `packages.yml` – configuration reference `packages.yml` must live in the **project root**, next to `project.yml`: @@ -181,26 +225,122 @@ my_project/ … ``` -Structure: +Top-level structure: + +```yaml +packages: + - name: ... + ... +``` + +You can declare both **path-based** and **git-based** packages. Exactly one of `path` or `git` must be set per package. + +### 3.1. 
Path packages + +```yaml +packages: + - name: shared_package + path: "../shared_package" # relative or absolute + models_dir: "models" # optional + # optional semver constraint on the package’s manifest version: + version: ">=0.1.0,<0.2.0" +``` + +Fields: + +* `name` (required) + Must match `project.yml:name` inside the package root. +* `path` (required for path packages) + Relative or absolute path to the package root. Resolved relative to `packages.yml`. +* `models_dir` (optional) + Models directory inside the package root. Default `"models"`. +* `version` (optional) + Semver constraint for the package’s `project.yml:version`. See [4.2 Version constraints](#42-version-constraints). + +### 3.2. Git packages ```yaml packages: - - name: # required - path: # required, relative or absolute path to the package root - models_dir: # optional, defaults to "models" + - name: shared_package_git + git: "https://github.com/fftlabs/fastflowtransform.git" + + # Directory inside the repo that contains the package + subdir: "examples/packages_demo/shared_package_git_remote" + + # Optional ref selectors (only one needs to be set; see notes below) + ref: "main" # generic alias (branch / tag / commit) + # rev: "abc1234" # explicit commit SHA + # tag: "v0.6.11" # tag name + # branch: "main" # branch name + + models_dir: "models" + + # Optional semver constraint on the package's manifest version + version: ">=0.1.0,<0.2.0" ``` -Notes: +Fields: + +* `name` (required) + Must match `project.yml:name` in the package subdir. +* `git` (required for git packages) + Git URL (HTTPS or SSH, depending on your environment). +* `subdir` (optional but recommended) + Path inside the repo that should be treated as the package root (relative to the repo root). If omitted, the repo root itself is the package root. +* `ref` (optional) + Generic *user-facing* selector (branch, tag, or commit). 
If you don’t specify a more precise field (`rev` / `tag` / `branch`), `ref` is mapped internally to `rev` and passed directly to `git checkout`. +* `rev` / `tag` / `branch` (optional) + More explicit selectors, used in preference to `ref` if set. +* `models_dir` (optional) + Models directory inside the `subdir` root (default `"models"`). +* `version` (optional) + Semver constraint for the package’s `project.yml:version`. + +Resolution rules: + +* FFT clones/fetches git packages into: + + ```text + .fastflowtransform/packages/git/<slug>/repo + ``` + + where `<slug>` encodes the package name and git URL. +* For each package, FFT: -* `path` is resolved relative to `packages.yml`’s directory: + * clones the repo (if missing), + * attempts a `git fetch --all` (best effort) if it already exists, + * runs `git checkout <ref>` using: - * `../shared_package` → sibling folder - * `vendor/my_pkg` → subfolder -* `models_dir` allows you to keep a different structure in the package: + * `rev` or `tag` or `branch` (first non-empty), + * or `HEAD` if none are provided. - * Example: `models_dir: "src/models"`. +If Git commands fail, you get targeted error messages: -### Multiple packages +* Missing git binary → “git executable not found…” +* Auth issues → “authentication error…” +* Wrong repo / URL → “repository not found…” +* Bad ref / branch / tag → “requested ref/branch/tag does not exist…” + +--- + +### 3.3. Shorthand mapping form + +For local packages you can use a shorter mapping form: + +```yaml +# Equivalent to packages: [ { name: shared_package, path: ../shared_package } ] + +shared_package: "../shared_package" +other_pkg: + path: "../other" + models_dir: "dbt_models" +``` + +Internally this is normalized to the explicit `packages:` list. + +--- + +### 3.4. 
Multiple packages You can declare multiple packages: @@ -210,225 +350,332 @@ packages: path: "../shared_staging" - name: analytics_macros - path: "../analytics_macros" - models_dir: "macros_only" + git: "https://github.com/my-org/analytics-macros.git" + subdir: "packages/sql_macros" + models_dir: "models" ``` -All models/macros from all packages are loaded into the same project. +All models/macros from all packages are loaded into the same logical project. --- -## 3. What gets loaded (and what doesn’t) +## 4. Manifests, versions & dependencies -Currently, packages are focused on **models and macros**. +### 4.1. Package manifests (`project.yml` inside the package) -When FFT loads a package, it will: +Every package has its own `project.yml` at the package root (or package `subdir` for git packages): -* Read the package’s `project.yml` (if present) for: +```yaml +name: shared_package +version: "0.1.0" +models_dir: "models" # optional; may be overridden by packages.yml +fft_version: ">=0.6.0,<0.7.0" # optional + +dependencies: + - name: other_shared_pkg + version: ">=1.0.0,<2.0.0" + optional: false +``` + +FFT uses this manifest for: + +* `name` + Must match the `name` from `packages.yml`. +* `version` + Compared to the spec’s `version` constraint, if provided. +* `fft_version` (optional) + Semver constraint against the running FFT version. If your package only supports certain FFT versions, set this. If the constraint is not satisfied, resolution fails with a clear error. +* `models_dir` (optional) + Default path for models within the package root; overridden by `packages.yml:models_dir` if set. +* `dependencies` (optional) + A list of other **packages** (by name) this package expects to be present in the same project. + + Each dependency entry may include: + + * `name` – required, another package’s name. + * `version` – optional semver constraint on that package’s `project.yml:version`. 
+ * `optional` – if `true`, missing dependency is allowed; otherwise it is an error. + +Resolution validates that: + +* Every non-optional dependency `name` is present in the set of packages declared in the **consumer’s** `packages.yml`. +* If a `version` constraint is given for a dependency, the resolved dependency’s version satisfies it. + +--- + +### 4.2. Version constraints + +Package specs and dependencies support a tiny semver subset. Version strings must be in `MAJOR.MINOR.PATCH` form (e.g. `1.2.3`). + +Supported constraint forms: + +* Bare version: + + ```text + "1.2.3" # equivalent to "==1.2.3" + ``` + +* Comparators (can be combined with commas or spaces): + + ```text + ">=1.2.0,<2.0.0" + ">1.0.0 <=2.0.0" + ``` - * `name`, `version` (for metadata), - * `models_dir` (overridden by `packages.yml` if provided). -* Load: +* `^` (caret) ranges: - * SQL models (`*.ff.sql`) from the package’s `models_dir`. - * Python models (`*.ff.py`) from the package’s `models_dir`. - * SQL macros under `models_dir/macros/` (standard Jinja macro files). - * Python render-time helpers/macros if your core exposes them from the package (same mechanism as the main project). + ```text + "^1.2.3" # >=1.2.3,<2.0.0 + "^0.3.0" # >=0.3.0,<0.4.0 + "^0.0.4" # >=0.0.4,<0.0.5 + ``` + +* `~` (tilde) ranges: + + ```text + "~1.2.3" # >=1.2.3,<1.3.0 + ``` -And it will **not**: +The resolver checks: -* Load or execute the package’s `profiles.yml` – the consumer project’s profiles are always used. -* Automatically register package **seeds** or **sources**; those stay local to the consumer. -* Automatically run the package’s DQ tests; only tests declared in the **consumer project’s** `project.yml` are executed on `fft test`. +* Consumers → packages: `packages.yml:version` vs package’s `project.yml:version`. +* Package → package: `dependencies[].version` vs the dependent package’s version. +* Package → FFT core: `project.yml:fft_version` vs the running FFT version. 
-> In practice, package models often still refer to `source('…')` or `ref('…')`. -> The *consumer* project is responsible for: -> -> * defining sources in its own `sources.yml`, and -> * wiring any extra seeds needed. +If a constraint fails, you get a clear runtime error showing which package and which constraint failed. --- -## 4. Name resolution & conflicts +## 5. What gets loaded (and what doesn’t) -### 4.1. Model names +When FFT loads a package, it will: + +**Loads:** + +* `project.yml` manifest (for name, version, fft_version, dependencies, models_dir). +* SQL models: `*.ff.sql` under the resolved `models_dir`. +* Python models: `*.ff.py` under `models_dir`. +* SQL macros: under `models_dir/macros/` (e.g. `macros/shared_utils.sql`). +* Python helpers/macros: under `models_dir/macros_py/` (same mechanism as the main project). + +**Does NOT load / run automatically:** + +* `profiles.yml` from the package — the consumer project’s profiles are always used. +* Seeds / sources defined in the package — these are still local to the consumer project. +* Tests declared in the package’s `project.yml` — only tests in the **consumer project’s** `project.yml` are run on `fft test`. + +> In practice, package models still call `source('…')` and `ref('…')`. The **consumer project** is responsible for defining sources / seeds / additional models. + +--- -Once loaded, a package model is just a regular model in the registry: +## 6. Name resolution & conflicts -* It has a **logical name** (e.g. `users_base.ff`). -* Its file path and package association are recorded as metadata. +### 6.1. Model names + +Once loaded, a package model is just a regular model: + +* It has a logical name (e.g. `users_base.ff`). +* It is registered in the same global registry as your local models. Rules: -* `ref('<model_name>')` has **no package prefix**. You always use the bare model name. +* `ref('<model_name>')` never has a package prefix. You always use the model name alone. 
* Model names must be **globally unique** across: * your main project, - * all packages. + * all loaded packages. -If two models with the same name are found (e.g. `users_base.ff` in both main and package), FFT raises a clear error during project loading. You must rename or decide which one you want. +If two models share a name (e.g. `users_base.ff` in both main and package), FFT will fail loading with a clear “Duplicate model name” error. You must rename or delete one of them. -### 4.2. Macros +### 6.2. Macros -Macros from packages are injected into the **same Jinja environment** as your own macros: +Macros from packages and local macros all end up in the same Jinja environment. * Name collisions are possible. -* If two macros share the same name, whichever is registered last will “win”. +* “Last one wins” — whichever macro is registered last overrides earlier ones. Best practice: -* Prefix shared macros with a **package-ish** prefix (e.g. `shared_email_domain`), or -* Group them in macro files you explicitly `{% import 'macros/shared_utils.sql' as shared %}` and then call `shared.email_domain()`. +* Prefix macro names with a package-ish prefix: `shared_email_domain`, etc. +* Or use explicit `{% import 'macros/shared_utils.sql' as shared %}` and call `shared.email_domain()` from consumer models. --- -## 5. DAGs, caching, and manifests +## 7. Lock file: `packages.lock.yml` + +After successful resolution, FFT writes a `packages.lock.yml` next to `packages.yml`: + +```yaml +fft_version: "0.6.11" +packages: + - name: shared_package + version: "0.1.0" + source: + kind: path + path: "/absolute/path/to/shared_package" + + - name: shared_package_git + version: "0.1.0" + source: + kind: git + git: "https://github.com/fftlabs/fastflowtransform.git" + rev: "abc1234deadbeef..." # resolved commit SHA + subdir: "examples/packages_demo/shared_package_git_remote" +``` -Once packages are loaded, the pipeline behaves like a single large project. +Today the lock file is: -### 5.1. 
DAG & docs +* **Written** after each successful resolution. +* Useful for diagnostics, reproducibility, CI logs, etc. -* `fft dag` sees package models as part of the DAG. -* The generated HTML docs show: +(Resolution is still driven by `packages.yml`; the lockfile does not yet drive resolution itself.) - * nodes for package models, - * nodes for local models, - * edges between them. +--- -Package models typically carry an extra metadata field (`package_name`) used in the catalog/manifest; you can inspect `.fastflowtransform/target/manifest.json` if you want to differentiate them programmatically. +## 8. CLI: `fft deps` -### 5.2. Caching and fingerprints +The `deps` command inspects packages for your project and shows their resolved status: -Build caching (`--cache`) treats package models like any other: +```bash +fft deps . +``` -* Fingerprints include: +Behavior: - * SQL/Python source from the **package** file, - * environment vars, - * upstream dependencies, etc. -* If a package model’s code changes, its fingerprint changes, and: +* Resolves the project directory. - * that model will rebuild on the next run, - * downstream models (local or from other packages) will also rebuild if needed. +* Runs the **full** package resolver (same as `fft run` would): -### 5.3. Tests and selectors + * locates local path packages, + * clones/fetches git packages, + * loads `project.yml` manifests, + * validates version constraints and dependencies, + * writes `packages.lock.yml`. -Selectors (`--select`, `--exclude`) are agnostic to package vs. local: +* Prints a small report for each package: -* You can tag package models with `tags: ['shared:staging']` and run: + ```text + Project: /path/to/my_project + Packages: + - shared_package (0.1.0) + kind: path + path: /abs/path/to/shared_package + models_dir: models -> /abs/path/to/shared_package/models + status: OK - ```bash - fft run . 
--env dev_duckdb --select tag:shared:staging + - shared_package_git (0.1.0) + kind: git + git: https://github.com/fftlabs/fastflowtransform.git + rev: abc1234deadbeef... + subdir: examples/packages_demo/shared_package_git_remote + models_dir: models -> /abs/.../repo/examples/packages_demo/shared_package_git_remote/models + status: OK ``` -* You can define DQ tests in your **main project**’s `project.yml` targeting package tables: +* Exits with **non-zero** status if any package’s `models_dir` is missing or invalid. - ```yaml - tests: - - type: not_null - table: users_base - column: email - tags: [example:packages_demo] - ``` +This is the easiest way to debug: + +* git connectivity / credentials, +* bad refs (`tag` / `branch` / `rev`), +* missing `project.yml` in a package, +* version constraint mismatches, +* missing `models_dir` directories. --- -## 6. Best practices +## 9. DAGs, caching, selectors -### 6.1. Keep packages stable and versioned +Once packages are resolved, FFT essentially treats: -Treat a shared package like a library: +> **“main project + all packages” as one large logical project.** -* Maintain a `version` in `project.yml`. -* Avoid backwards-incompatible changes without coordination: +### 9.1. DAG & docs - * e.g. dropping columns or changing semantics in shared staging models. -* Consider tagging or branching in Git to coordinate upgrades across consumers. +* `fft dag` and the generated HTML docs include package models and edges between them and your local models. +* You can inspect `.fastflowtransform/target/manifest.json` if you need to distinguish package vs local models programmatically (nodes carry metadata like their originating package). -(There’s no built-in package registry or version pinning yet; you control which commit of the package you point to via Git + `path`.) +### 9.2. Caching -### 6.2. 
Package responsibility +Build caching behaves the same for package models as for local ones: -A good rule of thumb: +* Fingerprints incorporate: -* **Package:** “What does *user* mean for us?” — common cleaning, typing, normalization, derivations (e.g. `email_domain`, `customer_segment`). -* **Consumer project:** “What do we need for *this* product/report?” — marts, joins across domains, project-specific logic. + * SQL/Python source of the package model, + * upstream dependencies, + * environment, etc. +* Changing a package model’s code changes its fingerprint and invalidates cache for that model and its downstream dependents. -This keeps packages focused and low-churn. +### 9.3. Selectors & tests -### 6.3. Avoid tight coupling to local schemas +Selectors (`--select`, `--exclude`) are package-agnostic: -Shared packages shouldn’t depend on highly project-specific schemas or seeds. Instead: +* You can tag package models: -* Use `source('domain', 'table')` with generic names (“crm.users”, “billing.invoices”). -* Document in the package README what sources it expects. -* Let each consumer wire those sources to its concrete tables via its own `sources.yml`. + ```jinja + {{ config( + tags=['pkg:shared_package', 'scope:staging'], + ) }} + ``` -### 6.4. Tag everything +* Then: -Give package models clear tags: + ```bash + fft run . --env dev_duckdb --select tag:pkg:shared_package + ``` -```jinja -{{ config( - tags=[ - 'pkg:shared_package', - 'scope:staging', - 'engine:duckdb', - 'engine:postgres', - ], -) }} -``` +You can define tests in your **main project** for tables produced by package models: -Then consumers can: +```yaml +tests: + - type: not_null + table: users_base + column: email + tags: [example_packages_demo] +``` -* include only `tag:pkg:shared_package` in some runs, -* or exclude them via `--exclude tag:pkg:shared_package` if they want to run only local marts. +Only the tests defined in the **consumer** project’s `project.yml` are executed on `fft test`. 
--- -## 7. Common pitfalls & how to avoid them +## 10. Best practices & pitfalls -**❌ Package model fails: `source('crm','users')` not found** +### 10.1. Treat packages like libraries -* You’re using a package model that references a source your main project hasn’t declared. -* Fix: add a matching `sources.yml` entry in your main project: +* Always set a `version` in the package’s `project.yml`. +* Use tags/releases/branches on the git repo for meaningful versions. +* Use `packages.yml:version` constraints to avoid accidental breaking upgrades. - ```yaml - version: 2 - sources: - - name: crm - tables: - - name: users - identifier: seed_users - ``` +### 10.2. Keep responsibilities clear ---- +* **Package:** shared semantics (cleaning, typing, derived fields), stable over time. +* **Consumer project:** product/report-specific marts and joins. -**❌ Duplicate model name between project and package** +### 10.3. Avoid tight coupling to specific schemas -* You have `models/staging/users_base.ff.sql` locally **and** in the package. -* Fix: rename one of them, or drop the local one if you want to fully delegate to the package. +* Use generic `source('domain','table')` names in packages. +* Document expected sources in the package README. +* Let each consumer wire those to their actual tables via their own `sources.yml`. ---- +### 10.4. Tag and namespace thoughtfully -**❌ Macro name collision** +* Tag package models with a `pkg:`-prefixed tag (e.g. `pkg:shared_package`) to make them easy to select/exclude. +* Use macro namespaces or prefixes to reduce collisions. -* Same macro name in package and project; behavior seems “random”. -* Fix: rename macros or use explicit `{% import %}` and call macros with a namespace alias. +### 10.5. Common errors ---- - -## Summary +* **`Package root … has no project.yml`** + Your package directory (or git `subdir`) is wrong. Point `path`/`subdir` at the folder that actually contains the package’s `project.yml`. 
-Packages let you: +* **Git errors about authentication or unknown revision** + Check your git URL/credentials and branch/tag/commit. `fft deps` will show the raw git error stderr to help you debug. -* Factor out **shared staging** and **macro libraries**. -* Reuse them across many projects via a simple `packages.yml`. -* Keep execution, caching, DAGs, and tests working as if everything were one project. +* **Version mismatch errors** + Align: -The mental model is: + * the package’s `version` and your `packages.yml:version`, + * the package’s `fft_version` and your installed FFT version. -> “My project + all packages = one big FastFlowTransform project -> where some models just happen to live in other directories.” +--- -As your internal ecosystem grows, you can introduce multiple packages (per domain, per team, per capability) and let downstream projects compose them like building blocks. +**In short:** packages let you compose FFT projects like libraries, with both local and git-backed sources, basic versioning, and a resolver + lockfile that make behavior explicit and debuggable. diff --git a/docs/examples/Packages_Demo.md b/docs/examples/Packages_Demo.md index ded14c2..b79efd3 100644 --- a/docs/examples/Packages_Demo.md +++ b/docs/examples/Packages_Demo.md @@ -1,13 +1,18 @@ # Packages Demo -The **packages demo** shows how to split FastFlowTransform logic into a **reusable package** -and a **consumer project** that imports it via `packages.yml`. +The **packages demo** shows how to split FastFlowTransform logic into: + +* a **reusable local package** (path-based), and +* a **git-backed package**, + +and a **consumer project** that imports both via `packages.yml`. It answers: -- How do I **share staging models** and macros across multiple projects? -- How does `packages.yml` work? -- How do I `ref()` a model that physically lives in another directory/tree? +* How do I **share staging models** and macros across multiple projects? 
+* How does `packages.yml` work for **local path** and **git** sources? +* How do I `ref()` a model that physically lives in another directory/tree? +* How do I see which exact versions / commits are being used? Use this as a template for building your own internal “FFT packages” repo. @@ -19,13 +24,14 @@ The example lives under: ```text examples/packages_demo/ - shared_package/ # reusable package - main_project/ # normal FFT project that consumes the package -```` + shared_package/ # reusable local package (path-based) + shared_package_git_remote/ # git-style package (lives in this repo for the demo) + main_project/ # normal FFT project that consumes both +``` -### `shared_package` – reusable code +### 1. `shared_package` – local reusable code -This folder behaves like a **mini FFT project**: +This folder behaves like a **mini FFT project** that’s consumed via a local path: ```text shared_package/ @@ -38,24 +44,59 @@ shared_package/ users_base.ff.sql ``` -It includes: +Key pieces: -* `project.yml` – minimal config so FFT knows how to load its models. +* `project.yml` – minimal manifest so FFT knows: + + * the package `name` / `version`, + * where its models live (`models_dir`). * `models/macros/shared_utils.sql` – shared SQL macros (e.g. `email_domain(expr)`). * `models/staging/users_base.ff.sql` – a reusable staging model that: - * reads `source('crm', 'users')` - * normalizes emails - * derives `email_domain`. + * reads `source('crm', 'users')`, + * normalizes emails, + * derives `email_domain` using the shared macro. + +In this demo you **do not** run `fft` inside `shared_package/` directly. +It’s pulled in by `main_project` via `packages.yml`. + +--- + +### 2. 
`shared_package_git_remote` – git-backed reusable code + +For the git example we use a second package: + +```text +shared_package_git_remote/ + project.yml + models/ + README.md + macros/ + shared_utils_git.sql + staging/ + users_base_git.ff.sql +``` + +Conceptually this folder represents a **separate git repo**. In the demo it lives in the main FFT repo so that: -You **do not** call `fft run shared_package` directly in this demo. -Instead, `shared_package` is loaded by the consumer project via `packages.yml`. +* `main_project` can point to the **GitHub URL** of this repo, +* and specify `subdir: examples/packages_demo/shared_package_git_remote` to use it as a git-based package. -### `main_project` – normal FFT project +Key ideas: + +* It has its own `project.yml` with a different `name` (e.g. `shared_package_git`) and `version`. +* It has its own models / macros, distinct from `shared_package`, so you can see clearly which package a model came from. +* For a real deployment, you would typically push this folder into a dedicated repo and update the `git:` URL in `packages.yml` accordingly. + +Again, you don’t run `fft` directly here; it’s consumed by `main_project` as a git package. + +--- + +### 3. `main_project` – consumer FFT project ```text main_project/ - .env.dev_duckdb + env.dev_duckdb Makefile README.md profiles.yml @@ -66,6 +107,7 @@ main_project/ README.md marts/ mart_users_from_package.ff.sql + mart_users_from_git_package.ff.sql seeds/ README.md seed_users.csv @@ -74,58 +116,109 @@ main_project/ README.md ``` -This is a regular project: +This is a regular FFT project that: -* `profiles.yml` – DuckDB connection profile (`dev_duckdb`). -* `.env.dev_duckdb` – points DuckDB at `.local/packages_demo.duckdb`. -* `seeds/seed_users.csv` + `sources.yml` – define `source('crm','users')`. -* `packages.yml` – declares the dependency on `../shared_package`. 
-* `models/marts/mart_users_from_package.ff.sql` – a local mart that does: +* owns the **engine configuration** (`profiles.yml`, `env.dev_duckdb`), +* defines **seeds** and **sources** for `crm.users`, +* declares **package dependencies** in `packages.yml`, +* defines **local marts** that depend on package models. - ```jinja - from {{ ref('users_base.ff') }} - ``` +Key pieces: - where `users_base.ff` is defined in the **package**, not in `main_project`. +* `profiles.yml` – DuckDB profile (`dev_duckdb`). +* `env.dev_duckdb` – sets `FF_DUCKDB_PATH`, `FFT_ACTIVE_ENV`, `FF_ENGINE`. +* `seeds/seed_users.csv` + `sources.yml` – define the `crm.users` source. +* `packages.yml` – declares both the local and git-backed packages. +* `models/marts/mart_users_from_package.ff.sql` – mart using the local package. +* `models/marts/mart_users_from_git_package.ff.sql` – mart using the git-backed package. --- -## Key concepts +## Declaring packages – `packages.yml` -### 1. Declaring packages – `packages.yml` - -In `main_project/packages.yml`: +In `main_project/packages.yml` you wire in both packages: ```yaml packages: + # Local path package - name: shared_package path: "../shared_package" models_dir: "models" + # optional: constrain acceptable versions from shared_package/project.yml + version: ">=0.1.0,<0.2.0" + + # Git-based package (same repo, different subdir for the demo) + - name: shared_package_git + git: "https://github.com/fftlabs/fastflowtransform.git" + + # Directory inside the repo that contains the package root + subdir: "examples/packages_demo/shared_package_git_remote" + + # Optional ref selectors (you typically choose *one* of these) + ref: "main" + # rev: "abc1234" # explicit commit SHA + # tag: "v0.6.11" # tag name + # branch: "main" # branch name + + models_dir: "models" + + # optional: version constraint matched against shared_package_git_remote/project.yml + version: ">=0.1.0,<0.2.0" ``` -* `name` – logical package name (for logging / internal bookkeeping). 
-* `path` – where to find the package directory, relative to `packages.yml`. -* `models_dir` – where to look for models inside the package (defaults to `models` if omitted). +Notes: + +* For **path packages**, set `path` and (optionally) `models_dir`. +* For **git packages**, set: + + * `git` – repo URL, + * `subdir` – where the package lives inside the repo, + * **one** of `ref` / `rev` / `tag` / `branch` (or nothing = `HEAD`), + * `models_dir` if needed, + * optional `version` constraint. +* Internally, `ref` is treated as a generic alias and mapped to `rev` if you don’t provide `rev` / `tag` / `branch`. At load time, the core: 1. Reads `packages.yml`. -2. For each entry, loads its `project.yml` (if present) and its `models/`. -3. Registers all models/macros from the package **into the same namespace** as local models. +2. For each package: + + * materializes its source: + + * path: resolve `path` relative to `packages.yml`, + * git: clone/fetch repo into `.fastflowtransform/packages/git/...` and `git checkout` the requested ref. + * loads its `project.yml` manifest: + + * checks `name` and `version`, + * validates `fft_version` (if present) against the running FFT core, + * validates `version` constraints from `packages.yml`, + * validates inter-package `dependencies` (from the package manifest). + * decides which `models_dir` to use (packages.yml overrides manifest). +3. Loads models and macros from each package and registers them into the same namespace as local models. -From the perspective of `main_project`, `users_base.ff` looks just like any other model — -you can `ref('users_base.ff')` without caring that it physically lives in `../shared_package`. +--- + +## Using package models with `ref()` -### 2. Referencing package models with `ref()` +### 1. 
Local path package In `main_project/models/marts/mart_users_from_package.ff.sql`: ```jinja +{{ config( + materialized='table', + tags=[ + 'example:packages_demo', + 'scope:mart', + 'engine:duckdb', + ], +) }} + with base as ( select email_domain, signup_date - from {{ ref('users_base.ff') }} + from {{ ref('users_base.ff') }} -- defined in shared_package ) select email_domain, @@ -137,16 +230,55 @@ group by email_domain order by email_domain; ``` -* `users_base.ff` is defined in `shared_package/models/staging/users_base.ff.sql`. -* Because the package is registered, `ref('users_base.ff')` resolves correctly. -* The DAG includes both the package model and the local mart. +* `users_base.ff` physically lives in `shared_package/models/staging/users_base.ff.sql`. +* From `main_project` you just `ref('users_base.ff')` — no package prefix. -**Important:** model names must still be globally unique. If you define a model with the same name -in both the package and the project, you’ll get a conflict (which is what you want). +### 2. Git-backed package -### 3. Shared macros from a package +In `main_project/models/marts/mart_users_from_git_package.ff.sql`: -The package ships a simple macro file: +```jinja +{{ config( + materialized='table', + tags=[ + 'example:packages_demo', + 'scope:mart', + 'engine:duckdb', + ], +) }} + +with base as ( + select + email_domain, + signup_date + from {{ ref('users_base_git.ff') }} -- defined in shared_package_git_remote +) +select + email_domain, + count(*) as user_count, + min(signup_date) as first_signup, + max(signup_date) as last_signup +from base +group by email_domain +order by email_domain; +``` + +* `users_base_git.ff` comes from the **git-backed** package (`shared_package_git_remote`). +* Again, the model name is **global**: `ref('users_base_git.ff')` doesn’t mention the package. 
+ +**Important:** model names are globally unique across: + +* main project +* all path packages +* all git packages + +If two models use the same name, FFT fails with a clear “duplicate model name” error at load time. + +--- + +## Shared macros from packages + +The local package ships a macro file, for example: ```jinja -- shared_package/models/macros/shared_utils.sql @@ -155,37 +287,46 @@ The package ships a simple macro file: {%- endmacro -%} ``` -`users_base.ff` uses it: +`shared_package/models/staging/users_base.ff.sql` uses it: ```jinja select - user_id, - email, - {{ email_domain("email") }} as email_domain, - signup_date + user_id, + email, + {{ email_domain("email") }} as email_domain, + signup_date from raw_users; ``` -Because macros are loaded from both the main project and all packages into the same Jinja environment: +Similarly, the git-backed package can provide its own macros (e.g. in `shared_utils_git.sql`). + +Because macros from all packages and the main project share a single Jinja environment: + +* package models can use package macros, +* main project models can also use package macros, +* name collisions are possible, and “last macro wins”. + +Best practice: -* models in the **package** can use macros from the package, -* models in the **main project** can also use those macros if you want, subject to naming rules. +* use tags like `pkg:shared_package`, `pkg:shared_package_git`, and/or +* namespace macros with `{% import 'macros/shared_utils.sql' as shared %}`. --- ## Data flow -The demo intentionally mirrors the basic_demo pipeline but splits staging into a package: +The demo mirrors the basic_demo pipeline while splitting staging into packages. 
+ +**Local package:** ```text (main_project) seeds/seed_users.csv │ ├─ fft seed ▼ - seed_users (DuckDB table via sources.yml → crm.users) + seed_users (via sources.yml → crm.users) │ ├─ shared_package/models/staging/users_base.ff.sql - │ (materialized view) ▼ users_base │ @@ -194,12 +335,22 @@ The demo intentionally mirrors the basic_demo pipeline but splits staging into a mart_users_from_package ``` -The DAG (after a run) will roughly show: +**Git package:** ```text -crm.users (source) → users_base.ff (package) → mart_users_from_package.ff (main_project) +seed_users (same table) + │ + ├─ shared_package_git_remote/models/staging/users_base_git.ff.sql + ▼ + users_base_git + │ + ├─ main_project/models/marts/mart_users_from_git_package.ff.sql + ▼ + mart_users_from_git_package ``` +The DAG will show both paths, with nodes annotated by their originating files, but **no special syntax** for package references. + --- ## Running the demo @@ -213,49 +364,68 @@ cd examples/packages_demo/main_project ### 1. Configure DuckDB env ```bash -set -a; source .env.dev_duckdb; set +a -# This sets: +set -a; source env.dev_duckdb; set +a +# Sets: # FF_DUCKDB_PATH=.local/packages_demo.duckdb # FFT_ACTIVE_ENV=dev_duckdb # FF_ENGINE=duckdb ``` -### 2. Run the full demo +### 2. Inspect dependencies (local + git) + +You can check the package resolver and git clone behavior with: ```bash -make demo ENGINE=duckdb +fft deps . ``` -This will: +You should see output similar to: -1. **clean** – drop local artifacts and DuckDB file via `cleanup_env.py`. +```text +Project: /.../examples/packages_demo/main_project +Packages: + - shared_package (0.1.0) + kind: path + path: /.../examples/packages_demo/shared_package + models_dir: models -> /.../shared_package/models + status: OK + + - shared_package_git (0.1.0) + kind: git + git: https://github.com/fftlabs/fastflowtransform.git + rev: abc1234deadbeef... 
# resolved commit SHA + subdir: examples/packages_demo/shared_package_git_remote + models_dir: models -> /.../repo/examples/packages_demo/shared_package_git_remote/models + status: OK +``` -2. **seed** – `fft seed . --env dev_duckdb`: +If anything goes wrong (missing git, auth problems, bad ref, missing `project.yml`, missing `models_dir`, …) this command will fail with a targeted error message. - * loads `seeds/seed_users.csv` into DuckDB as `seed_users`. +### 3. Run the full demo -3. **run** – `fft run . --env dev_duckdb` with: +```bash +make demo ENGINE=duckdb +``` - ```bash - --select tag:example:packages_demo --select tag:engine:duckdb - ``` +This will: - Only models tagged for this example are built: +1. **clean** – drop local artifacts and DuckDB file via `_scripts/cleanup_env.py`. +2. **seed** – `fft seed . --env dev_duckdb`: - * `users_base.ff` (from `shared_package`) - * `mart_users_from_package.ff` (from `main_project`) + * loads `seeds/seed_users.csv` as `seed_users`. +3. **run** – `fft run . --env dev_duckdb` with example tags: + * builds `users_base.ff` and `users_base_git.ff` from the two packages, + * builds `mart_users_from_package` and `mart_users_from_git_package` in the main project. 4. **dag** – `fft dag . --env dev_duckdb --html`: - * writes HTML docs to `main_project/site/dag/index.html`. - + * writes DAG HTML to `site/dag/index.html`. 5. **test** – `fft test . --env dev_duckdb`: - * runs DQ tests from `project.yml` (`not_null`, `unique`, `greater_equal`). + * runs DQ tests from `project.yml` that reference both package and local tables. +6. **artifacts** – prints locations of `manifest.json`, `catalog.json`, `run_results.json`. -6. **artifacts** – prints paths to `manifest.json`, `run_results.json`, `catalog.json`. - -### 3. Inspect results +### 4. 
Inspect results * DAG HTML: @@ -267,79 +437,105 @@ This will: ```text examples/packages_demo/main_project/.fastflowtransform/target/manifest.json - examples/packages_demo/main_project/.fastflowtransform/target/run_results.json examples/packages_demo/main_project/.fastflowtransform/target/catalog.json + examples/packages_demo/main_project/.fastflowtransform/target/run_results.json ``` -* DuckDB file (if you want to open it manually): +* DuckDB file: ```text examples/packages_demo/main_project/.local/packages_demo.duckdb ``` +* Package cache (including git clones): + + ```text + examples/packages_demo/main_project/.fastflowtransform/packages/ + git/ + shared_package_git_https___github.com_fftlabs_fastflowtransform.git/ + repo/ # git checkout used for the package + ``` + --- ## What this demo demonstrates -1. **Package loading** +1. **Path-based packages** + + - `shared_package` is resolved via a local path. + - Models/macros from `../shared_package` appear as if local to `main_project`. + +2. **Git-based packages** - `packages.yml` allows you to point at **another tree of models** and macros and load them as if they were local. + - `shared_package_git` is resolved by: + - cloning a remote repo, + - checking out a specific ref/branch/tag/commit, + - using a `subdir` as the package root. + - The exact commit hash is written to `packages.lock.yml`. -2. **Shared staging layers** +3. **Versioning and constraints** - You can move “standardized” staging code (sources, cleaning, type-casting, email normalization, etc.) - into a central `shared_package` and reuse it from multiple projects. + - Package manifests (`project.yml`) expose `name` / `version` / `fft_version`. + - `packages.yml` can constrain: + - which **package versions** are allowed, + - which **FFT core versions** a package supports (via `fft_version`). -3. **Consistent naming** +4. 
**Diagnostics via `fft deps`** - Since packaged models live in the same logical namespace, you get early feedback if two projects try to - define a model with the same name. + - Single command to see: + - path vs git packages, + - clones and refs, + - resolved models_dir, + - basic validity (missing dirs, bad refs, etc.). -4. **Separation of concerns** +5. **Separation of concerns** - * Package: stable, reused logic (e.g. `users_base`). - * Main project: business-specific marts and reporting (`mart_users_from_package`). + - Packages: reusable staging and macro logic. + - Consumer project: seeds, sources, engine config, marts. --- ## Things to try -To understand packages better, experiment with: +1. **Change the git ref** -1. **Breaking the shared model** + - In `packages.yml`, switch `ref: "main"` to a tag or SHA. + - Run `fft deps .` and see which commit is used. + - Re-run `make demo` and confirm behavior is still consistent. - * Edit `shared_package/models/staging/users_base.ff.sql` (e.g. remove `email_domain`). - * Re-run `make demo`. - * Watch `mart_users_from_package` fail because the column is missing — proving the dependency goes through the package. +2. **Break the git package** -2. **Adding a new shared macro** + - Temporarily set `branch: "this-does-not-exist"`. + - Run `fft deps .` and observe the “requested ref/branch/tag does not exist” error. + - This is how you’d debug mis-typed branch/tag names in real life. - * Add a `country_label(expr)` macro in `shared_utils.sql`. - * Use it in a *local* model inside `main_project` to see that macros from the package are visible in the consumer. +3. **Introduce a version mismatch** -3. **Adding another package** + - Change `shared_package_git_remote/project.yml:version` to something incompatible with your `packages.yml:version` constraint. + - Run `fft deps .` and see the “spec requires … but resolved version is …” error. + - This is how you enforce “only use 0.1.x of this package” across projects. 
- * Create `examples/packages_demo/another_package` with its own `project.yml` and models. - * Extend `main_project/packages.yml` with a second entry and confirm both package’s models appear in the DAG. +4. **Add more packages** -4. **Introducing a naming conflict** - - * Define a model named `users_base.ff` inside `main_project/models/staging`. - * Reload the project; you should get a clear error about duplicate model names, which is your cue to rename or explicitly choose one. + - Create another mini-package under `examples/packages_demo/another_package`. + - Declare it in `main_project/packages.yml`. + - Use its models in a new mart and watch them appear in the DAG. --- ## Summary -The packages demo is a minimal, concrete example of: +The updated `packages_demo` shows: + +- How to **consume a local package** via `path` in `packages.yml`. +- How to **consume a git-based package** via `git` + `subdir` + `ref` and validate versions. +- How to use `fft deps` and `packages.lock.yml` to see exactly what code you’re running. +- How package models and macros integrate into your project as if everything lived in one tree. -* Defining a reusable FastFlowTransform **package** (`shared_package`). -* Wiring it into a **consumer project** (`main_project`) via `packages.yml`. -* Building a mart that depends on a model defined outside of the project tree. -* Running everything through the normal `fft seed`, `fft run`, `fft dag`, and `fft test` workflow. +Use this pattern to gradually factor out: -You can adopt the same pattern to share: +- shared staging layers (CRM, billing, web analytics), +- macro / utility libraries, +- and sharable marts— -* Standard staging layers (CRM / ERP / web analytics), -* Macro libraries (date helpers, casting utilities), -* Even entire mini-marts that represent common dimensional models across teams. +all while keeping local projects thin, focused, and easy to reason about. 
\ No newline at end of file diff --git a/examples/.env.dev_postgres b/examples/.env.dev_postgres deleted file mode 100644 index 8923ecc..0000000 --- a/examples/.env.dev_postgres +++ /dev/null @@ -1,3 +0,0 @@ -# Postgres profile for the hooks demo (replace with your own connection string) -FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432 -FF_PG_SCHEMA=hooks_demo diff --git a/examples/incremental_demo/site/dag/events_base.ff.html b/examples/incremental_demo/site/dag/events_base.ff.html deleted file mode 100644 index f3a9097..0000000 --- a/examples/incremental_demo/site/dag/events_base.ff.html +++ /dev/null @@ -1,161 +0,0 @@ - - - - - - events_base.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- events_base.ff - table -

-
Model Detail • FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
events_base
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/incremental_demo/models/common/events_base.ff.sql - -
- -
Dependencies
-
- - - -
- -
Sources
-
- - - -
- - -
Referenced by
- - -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/incremental_demo/site/dag/fct_events_py_incremental.html b/examples/incremental_demo/site/dag/fct_events_py_incremental.html deleted file mode 100644 index 3653808..0000000 --- a/examples/incremental_demo/site/dag/fct_events_py_incremental.html +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - fct_events_py_incremental – FastFlowTransform - - - -

← Back to overview

- -
-
-

- fct_events_py_incremental - table -

-
Model Detail • FastFlowTransform
-
- python -
- -
- -
-

Metadata

-
-
Materialized
-
table
- -
Relation
-
fct_events_py_incremental
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/incremental_demo/models/engines/databricks_spark/fct_events_py_incremental.ff.py - -
- -
Dependencies
-
- - - -
- -
Sources
-
- - No source() refs - -
- - -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/incremental_demo/site/dag/fct_events_sql_inline.ff.html b/examples/incremental_demo/site/dag/fct_events_sql_inline.ff.html deleted file mode 100644 index 92dc088..0000000 --- a/examples/incremental_demo/site/dag/fct_events_sql_inline.ff.html +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - fct_events_sql_inline.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- fct_events_sql_inline.ff - incremental -

-
Model Detail • FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
incremental
- -
Relation
-
fct_events_sql_inline
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/incremental_demo/models/common/fct_events_sql_inline.ff.sql - -
- -
Dependencies
-
- - - -
- -
Sources
-
- - No source() refs - -
- - -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/incremental_demo/site/dag/fct_events_sql_yaml.ff.html b/examples/incremental_demo/site/dag/fct_events_sql_yaml.ff.html deleted file mode 100644 index 4846cb1..0000000 --- a/examples/incremental_demo/site/dag/fct_events_sql_yaml.ff.html +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - fct_events_sql_yaml.ff – FastFlowTransform - - - -

← Back to overview

- -
-
-

- fct_events_sql_yaml.ff - incremental -

-
Model Detail • FastFlowTransform
-
- sql -
- -
- -
-

Metadata

-
-
Materialized
-
incremental
- -
Relation
-
fct_events_sql_yaml
- -
Path
-
- /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/incremental_demo/models/common/fct_events_sql_yaml.ff.sql - -
- -
Dependencies
-
- - - -
- -
Sources
-
- - No source() refs - -
- - -
-
- - - - - - -
- - - - \ No newline at end of file diff --git a/examples/incremental_demo/site/dag/index.html b/examples/incremental_demo/site/dag/index.html deleted file mode 100644 index e189cee..0000000 --- a/examples/incremental_demo/site/dag/index.html +++ /dev/null @@ -1,326 +0,0 @@ - - - - - - FastFlowTransform - DAG & Mini Docs - - - - - - - -
-
-

FastFlowTransform - DAG & Mini Docs

-
Mermaid renders automatically (light/dark)
-
-
- - -
-
- -
-
-

DAG

-
- SQL - Python - - Materialization: - - table - - view - - ephemeral - - incremental - - snapshot - -
-
flowchart TD - classDef sql fill:#e8f1ff,stroke:#5b8def,color:#0a1f44; - classDef py fill:#e9fbf1,stroke:#2bb673,color:#0b2e1f; - classDef source fill:#fef3c7,stroke:#f59e0b,color:#78350f; - events_base_ff["events_base.ff
(events_base)"] - class events_base_ff sql; - click events_base_ff "events_base.ff.html" "View model" - fct_events_py_incremental("fct_events_py_incremental
(fct_events_py_incremental)") - class fct_events_py_incremental py; - click fct_events_py_incremental "fct_events_py_incremental.html" "View model" - fct_events_sql_inline_ff["fct_events_sql_inline.ff
(fct_events_sql_inline)"] - class fct_events_sql_inline_ff sql; - click fct_events_sql_inline_ff "fct_events_sql_inline.ff.html" "View model" - fct_events_sql_yaml_ff["fct_events_sql_yaml.ff
(fct_events_sql_yaml)"] - class fct_events_sql_yaml_ff sql; - click fct_events_sql_yaml_ff "fct_events_sql_yaml.ff.html" "View model" - src_raw_events[["raw.events"]] - class src_raw_events source; - click src_raw_events "source_raw.events.html" "View source" - events_base_ff --> fct_events_sql_inline_ff - src_raw_events --> events_base_ff - events_base_ff --> fct_events_sql_yaml_ff - events_base_ff --> fct_events_py_incremental -
-
- - - -
-

Macros

- -

No macros found.

- -
- -
-

Sources

- - - - - - - - - - - - - - - - - - - - -
NameRelationFreshnessConsumers
- raw.events - - seed_events - -
- - no warn - - - no error - -
-
- - 1 model - -
- -
-
- - - - \ No newline at end of file diff --git a/examples/packages_demo/.gitignore b/examples/packages_demo/.gitignore deleted file mode 100644 index 4718b0e..0000000 --- a/examples/packages_demo/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -.fastflowtransform/packages/** -packages.lock.yml \ No newline at end of file diff --git a/examples/packages_demo/main_project/env.dev_duckdb b/examples/packages_demo/main_project/.env.dev_duckdb similarity index 100% rename from examples/packages_demo/main_project/env.dev_duckdb rename to examples/packages_demo/main_project/.env.dev_duckdb diff --git a/examples/packages_demo/main_project/Makefile b/examples/packages_demo/main_project/Makefile index 6221c81..837d2b4 100644 --- a/examples/packages_demo/main_project/Makefile +++ b/examples/packages_demo/main_project/Makefile @@ -53,6 +53,9 @@ test: dag: env $(RUN_ENV) $(UV) run fft dag "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) --html +deps: + env $(BASE_ENV) $(UV) run fft deps "$(PROJECT)" + show: @if [ -f "$(PROJECT)/site/dag/index.html" ]; then \ $(OPENER) "$(PROJECT)/site/dag/index.html" 2>/dev/null || echo "Open manually at: $(PROJECT)/site/dag/index.html"; \ @@ -76,5 +79,6 @@ demo: clean +$(MAKE) run ENGINE=$(ENGINE) +$(MAKE) dag ENGINE=$(ENGINE) +$(MAKE) test ENGINE=$(ENGINE) + +$(MAKE) deps ENGINE=$(ENGINE) +$(MAKE) artifacts @echo "✅ Demo complete." 
diff --git a/examples/packages_demo/main_project/packages.lock.yml b/examples/packages_demo/main_project/packages.lock.yml new file mode 100644 index 0000000..a427f14 --- /dev/null +++ b/examples/packages_demo/main_project/packages.lock.yml @@ -0,0 +1,14 @@ +fft_version: 0.6.11 +packages: +- name: shared_package + version: '0.1' + source: + kind: path + path: /Users/markolekic/Dev/FlowForge/fastflowtransform/examples/packages_demo/shared_package +- name: shared_package_git + version: 0.1.0 + source: + kind: git + git: https://github.com/fftlabs/fastflowtransform.git + rev: a523f90bab3c9e6293a4d7a327be646521bbce06 + subdir: examples/packages_demo/shared_package_git_remote diff --git a/examples/packages_demo/main_project/packages.yml b/examples/packages_demo/main_project/packages.yml index 79ca5df..11a099c 100644 --- a/examples/packages_demo/main_project/packages.yml +++ b/examples/packages_demo/main_project/packages.yml @@ -11,6 +11,6 @@ packages: subdir: "examples/packages_demo/shared_package_git_remote" models_dir: "models" # optional: use explicit ref / branch / tag - ref: "v0.6.11" + branch: "main" # optional: version constraint matched against shared_package_git_remote/project.yml version: ">=0.1.0,<0.2.0" diff --git a/mkdocs.yml b/mkdocs.yml index b716da8..8cb8f0d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -41,6 +41,7 @@ nav: - CI Checks & Change-Aware Runs: CI_Check.md - Cost Monitoring: Cost_Monitoring.md - Hooks: Hooks.md + - Packages: Packages.md - Troubleshooting: Troubleshooting.md - Examples: - Basic Demo: examples/Basic_Demo.md @@ -54,6 +55,7 @@ nav: - Local Engine Setup: examples/Local_Engine_Setup.md - Snapshot Demo: examples/Snapshot_Demo.md - Hooks Demo: examples/Hooks_Demo.md + - Packages Demo: examples/Packages_Demo.md - API Reference: reference/ - Contributing: Contributing.md - License: License.md diff --git a/src/fastflowtransform/cli/deps_cmd.py b/src/fastflowtransform/cli/deps_cmd.py index 6dbabde..0b21a4b 100644 --- 
a/src/fastflowtransform/cli/deps_cmd.py +++ b/src/fastflowtransform/cli/deps_cmd.py @@ -11,18 +11,29 @@ def deps(project: ProjectArg = ".") -> None: """ - Show packages configured in packages.yml and basic status checks. - - - Resolves project directory (must contain models/). - - Parses packages.yml (if present). - - For each package, resolves its base path and models_dir location. - - Prints a short report and exits with non-zero status when something is missing. + Inspect packages declared in packages.yml and show their resolved status. + + For the given project it will: + + - Resolve the project directory. + - Run the full package resolver (path + git packages): + * locate or clone/fetch each package + * load its project.yml manifest (name/version/etc.) + * enforce version / FFT compatibility / inter-package deps + * write packages.lock.yml with pinned sources + - For each resolved package, print: + * name + version + * source kind (path | git) and concrete location + * models_dir and resolved models root + - Exit with non-zero status if any package's models_dir is missing. """ proj = _resolve_project_path(project) try: pkgs = resolve_packages(proj) - except Exception as exc: # pragma: no cover - config error path + except Exception as exc: # pragma: no cover - resolution error path + # Keep this as a single, clear error line; resolve_packages already + # does step-by-step validation (git, refs, manifest, versions, etc.). 
raise typer.BadParameter(f"Failed to resolve packages: {exc}") from exc echo(f"Project: {proj}") @@ -53,13 +64,14 @@ def deps(project: ProjectArg = ".") -> None: echo(f" models_dir: {pkg.models_dir} -> {models_root}") echo(f" status: {status}") + # Non-zero exit if any package is structurally broken raise typer.Exit(1 if missing else 0) def register(app: typer.Typer) -> None: app.command( name="deps", - help="Show configured packages from packages.yml and their local status.", + help="Show resolved packages (path/git) from packages.yml and their local status.", )(deps) diff --git a/src/fastflowtransform/packages.py b/src/fastflowtransform/packages.py index 21ac195..f75a8bd 100644 --- a/src/fastflowtransform/packages.py +++ b/src/fastflowtransform/packages.py @@ -380,7 +380,15 @@ def _materialize_package_source( def _ensure_git_repo(cache_dir: Path, spec: PackageSpec) -> Path: """ Ensure we have a local clone for the given git package in cache_dir and - return the checked-out directory (HEAD at rev/tag/branch). + return the checked-out directory. + + Semantics: + + - path packages: handled in _materialize_package_source (not here) + - git + rev: pinned to specific commit (no auto-upgrade) + - git + tag: pinned to tag (no auto-upgrade, aside from tag being moved) + - git + branch: tracks origin/<branch> on each run (auto-upgrade) + - git with none of rev/tag/branch: just use HEAD (whatever the repo's default is) """ assert spec.git git_root = cache_dir / "git" @@ -394,31 +402,107 @@ def _ensure_git_repo(cache_dir: Path, spec: PackageSpec) -> Path: echo(f"Cloning package '{spec.name}' from {spec.git} ...") _run_git(["clone", "--no-tags", "--quiet", spec.git, str(repo_dir)]) else: - # best-effort fetch; ignore errors (offline etc.) + # Always try to update remotes; failures are non-fatal (offline etc.). 
try: - _run_git(["-C", str(repo_dir), "fetch", "--all", "--quiet"]) + _run_git( + [ + "-C", + str(repo_dir), + "fetch", + "--all", + "--tags", + "--prune", + "--quiet", + ] + ) except Exception as exc: # pragma: no cover log.debug("Git fetch failed for %s: %s", spec.git, exc) - ref = spec.rev or spec.tag or spec.branch or "HEAD" - echo(f"Checking out {spec.name}@{ref} ...") - _run_git(["-C", str(repo_dir), "checkout", "--quiet", ref]) + # Decide which selector to use. + # NOTE: PackageSpec already maps `ref` → `rev` if no rev/tag/branch is set. + if spec.rev: + # Pinned commit (or generic ref treated as rev): no auto-upgrade. + ref = spec.rev + echo(f"Checking out {spec.name}@{ref} (pinned rev) ...") + _run_git(["-C", str(repo_dir), "checkout", "--quiet", ref]) + + elif spec.tag: + # Pinned tag: we assume tags are stable; we don't auto-reset anything. + ref = spec.tag + echo(f"Checking out {spec.name}@{ref} (tag) ...") + _run_git(["-C", str(repo_dir), "checkout", "--quiet", ref]) + + elif spec.branch: + # Moving branch: make local track origin/<branch> and reset to it. + branch = spec.branch + echo(f"Checking out {spec.name}@{branch} (tracking origin/{branch}) ...") + + # Create or update local branch to follow origin/<branch> + # -B: create or reset branch to start-point + _run_git( + [ + "-C", + str(repo_dir), + "checkout", + "--quiet", + "-B", + branch, + f"origin/{branch}", + ] + ) + # Force working tree to that commit (avoid local drift) + _run_git( + [ + "-C", + str(repo_dir), + "reset", + "--hard", + f"origin/{branch}", + ] + ) + + else: + # No explicit selector: just ensure we are on whatever HEAD currently is. 
+ echo(f"Checking out {spec.name}@HEAD ...") + _run_git(["-C", str(repo_dir), "checkout", "--quiet", "HEAD"]) return repo_dir def _run_git(args: list[str]) -> None: try: - subprocess.run(["git", *args], check=True, capture_output=True) + # text=True so stdout/stderr are already str, not bytes + subprocess.run( + ["git", *args], + check=True, + capture_output=True, + text=True, + ) except FileNotFoundError as exc: # pragma: no cover raise RuntimeError( "git executable not found. Git-based packages require git to be installed and on PATH." ) from exc except subprocess.CalledProcessError as exc: + stdout = (exc.stdout or "").strip() + stderr = (exc.stderr or "").strip() + + cmd_str = "git " + " ".join(args) + + # Very rough classification, but enough for common cases + if "Authentication failed" in stderr or "Permission denied" in stderr: + raise RuntimeError(f"{cmd_str} failed: authentication error.\n{stderr}") from exc + + if "Repository not found" in stderr: + raise RuntimeError(f"{cmd_str} failed: repository not found.\n{stderr}") from exc + + if "did not match any file(s) known to git" in stderr or "unknown revision" in stderr: + raise RuntimeError( + f"{cmd_str} failed: requested ref/branch/tag does not exist.\n{stderr}" + ) from exc + + # Fallback: show full stdout/stderr raise RuntimeError( - f"git command failed: git {' '.join(args)}\n" - f"stdout:\n{exc.stdout.decode(errors='ignore')}\n" - f"stderr:\n{exc.stderr.decode(errors='ignore')}" + f"git command failed: {cmd_str}\nstdout:\n{stdout}\nstderr:\n{stderr}" ) from exc diff --git a/tests/integration/examples/config.py b/tests/integration/examples/config.py index c272dd0..238ee64 100644 --- a/tests/integration/examples/config.py +++ b/tests/integration/examples/config.py @@ -164,6 +164,14 @@ class ExampleConfig: "bigframes": "dev_bigquery_bigframes", }, ), + # ExampleConfig( + # name="packages_demo", + # path=ROOT / "examples" / "packages_demo", + # make_target="demo", + # env_by_engine={ + # "duckdb": 
"dev_duckdb", + # }, + # ), ExampleConfig( name="snapshot_demo", path=ROOT / "examples" / "snapshot_demo", diff --git a/tests/unit/cli/test_deps b/tests/unit/cli/test_deps new file mode 100644 index 0000000..be297e0 --- /dev/null +++ b/tests/unit/cli/test_deps @@ -0,0 +1,81 @@ +# tests/test_cli_deps_cmd.py +from __future__ import annotations + +from pathlib import Path + +import pytest +from typer.testing import CliRunner + +from fastflowtransform.cli import deps_cmd + +runner = CliRunner() + + +def test_deps_no_packages(tmp_path: Path) -> None: + """ + If no packages.yml exists, deps should say "No packages configured" + and exit with code 0. + """ + project_dir = tmp_path / "proj" + project_dir.mkdir() + + app = _build_app() + result = runner.invoke(app, ["deps", str(project_dir)]) + + assert result.exit_code == 0 + out = result.stdout + assert "No packages configured" in out + + +def test_deps_with_path_package(tmp_path: Path) -> None: + """ + End-to-end-ish check for a single path package that has a valid models_dir. 
+ """ + project_dir = tmp_path / "proj" + project_dir.mkdir() + + # Write packages.yml in the main project + (project_dir / "packages.yml").write_text( + """ + packages: + - name: shared_package + path: "../shared_package" + models_dir: "models" + """, + encoding="utf-8", + ) + + # Create the path package that packages.yml refers to + pkg_root = tmp_path / "shared_package" + pkg_root.mkdir() + (pkg_root / "models").mkdir(parents=True, exist_ok=True) + + # Minimal project.yml so the resolver can load a manifest + (pkg_root / "project.yml").write_text( + """ + name: shared_package + version: "0.1.0" + models_dir: models + """, + encoding="utf-8", + ) + + app = _build_app() + result = runner.invoke(app, ["deps", str(project_dir)]) + + # models_dir exists, so deps should succeed + assert result.exit_code == 0 + + out = result.stdout + assert "shared_package (0.1.0)" in out + assert "kind: path" in out + assert "models_dir: models" in out + assert "status: OK" in out + + +def _build_app(): + import typer + + app = typer.Typer() + deps_cmd.register(app) + return app diff --git a/tests/unit/config/test_config_hook.py b/tests/unit/config/test_config_hook_unit.py similarity index 100% rename from tests/unit/config/test_config_hook.py rename to tests/unit/config/test_config_hook_unit.py diff --git a/tests/unit/config/test_packages_config_unit.py b/tests/unit/config/test_packages_config_unit.py new file mode 100644 index 0000000..b5906cb --- /dev/null +++ b/tests/unit/config/test_packages_config_unit.py @@ -0,0 +1,116 @@ +# tests/test_packages_config.py +from __future__ import annotations + +from pathlib import Path + +import pytest +from pydantic import ValidationError + +from fastflowtransform.config.packages import ( + PackagesConfig, + _normalize_raw_packages, + load_packages_config, +) + + +def test_normalize_raw_packages_list_form() -> None: + raw = [ + {"name": "pkg1", "path": "../pkg1"}, + {"name": "pkg2", "git": "https://example.com/repo.git"}, + ] + norm = 
_normalize_raw_packages(raw) + assert "packages" in norm + assert len(norm["packages"]) == 2 + assert norm["packages"][0]["name"] == "pkg1" + assert norm["packages"][1]["name"] == "pkg2" + + +def test_normalize_raw_packages_mapping_with_packages_key() -> None: + raw = { + "packages": [ + {"name": "pkg1", "path": "../pkg1"}, + ] + } + norm = _normalize_raw_packages(raw) + assert len(norm["packages"]) == 1 + assert norm["packages"][0]["name"] == "pkg1" + + +def test_normalize_raw_packages_shorthand_mapping() -> None: + raw = { + "pkg1": "../pkg1", + "pkg2": { + "path": "../pkg2", + "models_dir": "dbt_models", + }, + } + norm = _normalize_raw_packages(raw) + pkgs = {row["name"]: row for row in norm["packages"]} + assert pkgs["pkg1"]["path"] == "../pkg1" + assert pkgs["pkg2"]["path"] == "../pkg2" + assert pkgs["pkg2"]["models_dir"] == "dbt_models" + + +def test_packages_config_loads_path_package(tmp_path: Path) -> None: + cfg_text = """ + packages: + - name: shared_package + path: "../shared_package" + models_dir: "models" + """ + project_dir = tmp_path + (project_dir / "packages.yml").write_text(cfg_text, encoding="utf-8") + + cfg = load_packages_config(project_dir) + assert isinstance(cfg, PackagesConfig) + assert len(cfg.packages) == 1 + + spec = cfg.packages[0] + assert spec.name == "shared_package" + assert spec.path == "../shared_package" + assert spec.git is None + assert spec.models_dir == "models" + + +def test_git_package_ref_is_mapped_to_rev() -> None: + data = { + "packages": [ + { + "name": "shared_package_git", + "git": "https://example.com/repo.git", + "ref": "main", + "subdir": "pkg", + } + ] + } + norm = _normalize_raw_packages(data) + cfg = PackagesConfig.model_validate(norm) + assert len(cfg.packages) == 1 + spec = cfg.packages[0] + + # `ref` should be preserved, but also mapped to `rev` + assert spec.git == "https://example.com/repo.git" + assert spec.ref == "main" + assert spec.rev == "main" + assert spec.subdir == "pkg" + assert spec.path is None 
+ + +def test_package_spec_requires_exactly_one_of_path_or_git() -> None: + # both path and git → error + norm = { + "packages": [ + { + "name": "bad", + "path": "../pkg", + "git": "https://example.com/repo.git", + } + ] + } + with pytest.raises(ValidationError): + PackagesConfig.model_validate(norm) + + # neither path nor git → error + norm2 = {"packages": [{"name": "bad2"}]} + with pytest.raises(ValidationError): + PackagesConfig.model_validate(norm2) diff --git a/tests/unit/test_packages_unit.py b/tests/unit/test_packages_unit.py new file mode 100644 index 0000000..e49e639 --- /dev/null +++ b/tests/unit/test_packages_unit.py @@ -0,0 +1,346 @@ +# tests/test_packages_resolver.py +from __future__ import annotations + +import subprocess +from pathlib import Path +from typing import Any + +import pytest +import yaml + +from fastflowtransform import packages as pkgmod +from fastflowtransform.config.packages import PackagesConfig, PackageSpec + + +def _write_package( + root: Path, + *, + name: str = "shared_package", + version: str = "0.1.0", + models_dir: str = "models", + extra_project_fields: dict[str, Any] | None = None, +) -> None: + """ + Helper: write a minimal project.yml + models_dir for a package. 
+ """ + root.mkdir(parents=True, exist_ok=True) + (root / models_dir).mkdir(parents=True, exist_ok=True) + + data: dict[str, Any] = { + "name": name, + "version": version, + "models_dir": models_dir, + } + if extra_project_fields: + data.update(extra_project_fields) + + (root / "project.yml").write_text(yaml.safe_dump(data, sort_keys=False), encoding="utf-8") + + +def test_resolve_path_package_basic(tmp_path: Path) -> None: + project_dir = tmp_path / "proj" + project_dir.mkdir() + + pkg_root = tmp_path / "shared_package" + _write_package(pkg_root, name="shared_package", version="0.1.0") + + cfg = PackagesConfig( + packages=[ + PackageSpec( + name="shared_package", + path=str(pkg_root), + models_dir="models", + ) + ] + ) + + resolved = pkgmod.resolve_packages(project_dir, cfg) + assert len(resolved) == 1 + + rp = resolved[0] + assert rp.name == "shared_package" + assert rp.version == "0.1.0" + assert rp.root == pkg_root + assert rp.models_dir == "models" + assert rp.source.kind == "path" + assert rp.source.path == str(pkg_root) + assert rp.source.git is None + + +def test_resolve_git_package_uses_subdir_and_manifest( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + project_dir = tmp_path / "proj" + project_dir.mkdir() + + # Fake "repo" with a subdir containing a package + repo_root = tmp_path / "repo" + pkg_root = repo_root / "pkg_subdir" + _write_package( + pkg_root, + name="shared_package_git", + version="0.1.0", + models_dir="models", + ) + + # 1) monkeypatch _ensure_git_repo to avoid real git + def fake_ensure_git_repo(cache_dir: Path, spec: PackageSpec) -> Path: + # we pretend this is the cloned repo root + return repo_root + + monkeypatch.setattr(pkgmod, "_ensure_git_repo", fake_ensure_git_repo) + + # 2) monkeypatch subprocess.run used by _lock_source_for_spec (rev-parse HEAD) + def fake_run(args, check, capture_output, text=False): # type: ignore[override] + class Result: + def __init__(self) -> None: + # match real subprocess.run(..., 
capture_output=True) behavior: bytes + self.stdout = b"deadbeefcafebabe\n" + self.stderr = b"" + + return Result() + + monkeypatch.setattr(pkgmod.subprocess, "run", fake_run) + + cfg = PackagesConfig( + packages=[ + PackageSpec( + name="shared_package_git", + git="https://example.com/repo.git", + subdir="pkg_subdir", + models_dir="models", + version=">=0.1.0,<0.2.0", + ref="main", + ) + ] + ) + + resolved = pkgmod.resolve_packages(project_dir, cfg) + assert len(resolved) == 1 + + rp = resolved[0] + assert rp.name == "shared_package_git" + assert rp.version == "0.1.0" + assert rp.root == pkg_root + assert rp.source.kind == "git" + assert rp.source.git == "https://example.com/repo.git" + # our fake rev-parse returns a concrete SHA; ref->rev mapping + # is done in config, then overwritten here + assert rp.source.rev == "deadbeefcafebabe" + assert rp.source.subdir == "pkg_subdir" + + +def test_resolve_packages_version_constraint_failure(tmp_path: Path) -> None: + project_dir = tmp_path / "proj" + project_dir.mkdir() + + pkg_root = tmp_path / "shared_package" + _write_package(pkg_root, name="shared_package", version="0.2.0") + + cfg = PackagesConfig( + packages=[ + PackageSpec( + name="shared_package", + path=str(pkg_root), + models_dir="models", + version=">=0.1.0,<0.2.0", # incompatible + ) + ] + ) + + with pytest.raises(RuntimeError) as excinfo: + pkgmod.resolve_packages(project_dir, cfg) + + msg = str(excinfo.value) + assert "has version 0.2.0" in msg + assert "requires '>=" in msg + + +def test_resolve_packages_fft_version_failure( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + project_dir = tmp_path / "proj" + project_dir.mkdir() + + pkg_root = tmp_path / "shared_package" + _write_package( + pkg_root, + name="shared_package", + version="0.1.0", + extra_project_fields={"fft_version": "999.9.9"}, + ) + + # Make sure FFT_VERSION is something else + monkeypatch.setattr(pkgmod, "FFT_VERSION", "0.1.0", raising=False) + + cfg = PackagesConfig( + 
packages=[ + PackageSpec( + name="shared_package", + path=str(pkg_root), + ) + ] + ) + + with pytest.raises(RuntimeError) as excinfo: + pkgmod.resolve_packages(project_dir, cfg) + + msg = str(excinfo.value) + assert "requires FFT version '999.9.9'" in msg + + +def test_package_dependency_missing_required_package(tmp_path: Path) -> None: + project_dir = tmp_path / "proj" + project_dir.mkdir() + + # Single package that declares a dependency on "other_pkg" + pkg_root = tmp_path / "pkg1" + _write_package( + pkg_root, + name="pkg1", + version="0.1.0", + extra_project_fields={ + "dependencies": [ + {"name": "other_pkg", "version": ">=0.1.0"}, + ] + }, + ) + + cfg = PackagesConfig( + packages=[ + PackageSpec( + name="pkg1", + path=str(pkg_root), + ) + ] + ) + + with pytest.raises(RuntimeError) as excinfo: + pkgmod.resolve_packages(project_dir, cfg) + + msg = str(excinfo.value) + assert "depends on 'other_pkg'" in msg + assert "no package with that name is declared" in msg + + +def test_package_dependency_optional_missing_is_ok(tmp_path: Path) -> None: + project_dir = tmp_path / "proj" + project_dir.mkdir() + + pkg_root = tmp_path / "pkg1" + _write_package( + pkg_root, + name="pkg1", + version="0.1.0", + extra_project_fields={ + "dependencies": [ + {"name": "other_pkg", "optional": True}, + ] + }, + ) + + cfg = PackagesConfig( + packages=[ + PackageSpec( + name="pkg1", + path=str(pkg_root), + ) + ] + ) + + # optional dependency, so this should succeed and simply log a debug line + resolved = pkgmod.resolve_packages(project_dir, cfg) + assert len(resolved) == 1 + assert resolved[0].name == "pkg1" + + +# -------------------- Semver helpers -------------------- + + +def test_parse_and_compare_versions() -> None: + assert pkgmod.parse_version("1.2.3") == (1, 2, 3) + assert pkgmod.compare_versions("1.2.3", "1.2.3") == 0 + assert pkgmod.compare_versions("1.2.3", "1.2.4") < 0 + assert pkgmod.compare_versions("1.3.0", "1.2.9") > 0 + + with pytest.raises(ValueError): + 
pkgmod.parse_version("not-a-version") + + +@pytest.mark.parametrize( + "actual,constraint,expected", + [ + ("1.2.3", "1.2.3", True), + ("1.2.3", "==1.2.3", True), + ("1.2.3", "!=1.2.3", False), + ("1.2.3", ">=1.2.0,<2.0.0", True), + ("1.2.3", ">=1.3.0", False), + ("0.3.1", "^0.3.0", True), + ("0.4.0", "^0.3.0", False), + ("1.2.5", "~1.2.3", True), + ("1.3.0", "~1.2.3", False), + ("1.2.3", None, True), + ("1.2.3", "", True), + ], +) +def test_version_satisfies(actual: str, constraint: str | None, expected: bool) -> None: + assert pkgmod.version_satisfies(actual, constraint) is expected + + +# -------------------- git error classification -------------------- + + +def test_run_git_authentication_error(monkeypatch: pytest.MonkeyPatch) -> None: + def fake_run(*_args, **_kwargs): + raise subprocess.CalledProcessError( + 1, + ["git"], + output="", + stderr="fatal: Authentication failed for 'https://example.com/repo.git'\n", + ) + + monkeypatch.setattr(pkgmod.subprocess, "run", fake_run) + + with pytest.raises(RuntimeError) as excinfo: + pkgmod._run_git(["clone", "https://example.com/repo.git"]) # type: ignore[attr-defined] + + msg = str(excinfo.value) + assert "authentication error" in msg.lower() + assert "Authentication failed" in msg + + +def test_run_git_repo_not_found(monkeypatch: pytest.MonkeyPatch) -> None: + def fake_run(*_args, **_kwargs): + raise subprocess.CalledProcessError( + 1, + ["git"], + output="", + stderr="fatal: Repository not found.\n", + ) + + monkeypatch.setattr(pkgmod.subprocess, "run", fake_run) + + with pytest.raises(RuntimeError) as excinfo: + pkgmod._run_git(["clone", "https://example.com/missing.git"]) # type: ignore[attr-defined] + + msg = str(excinfo.value) + assert "repository not found" in msg.lower() + + +def test_run_git_bad_ref(monkeypatch: pytest.MonkeyPatch) -> None: + def fake_run(*_args, **_kwargs): + raise subprocess.CalledProcessError( + 1, + ["git"], + output="", + stderr="error: pathspec 'nope' did not match any file(s) known 
to git\n", + ) + + monkeypatch.setattr(pkgmod.subprocess, "run", fake_run) + + with pytest.raises(RuntimeError) as excinfo: + pkgmod._run_git(["checkout", "nope"]) # type: ignore[attr-defined] + + msg = str(excinfo.value) + assert "requested ref/branch/tag does not exist" in msg + assert "did not match any file" in msg