From e28468ff61d546a9a6d8c508088d0e046074c08d Mon Sep 17 00:00:00 2001 From: Marko Lekic Date: Sun, 4 Jan 2026 17:57:29 +0100 Subject: [PATCH 1/2] Added explicit lineage to basic demo + some lineage fixes --- .../bigframes/mart_latest_signup.ff.py | 15 +++++ .../bigquery/pandas/mart_latest_signup.ff.py | 15 +++++ .../databricks_spark/mart_latest_signup.ff.py | 15 +++++ .../engines/duckdb/mart_latest_signup.ff.py | 15 +++++ .../engines/postgres/mart_latest_signup.ff.py | 15 +++++ .../mart_latest_signup.ff.py | 15 +++++ .../models/marts/mart_users_by_domain.ff.sql | 6 ++ .../models/staging/users_clean.ff.sql | 3 + examples/basic_demo/project.yml | 5 ++ src/fastflowtransform/docs.py | 63 ++++++++++++++++++- src/fastflowtransform/templates/assets/spa.js | 4 +- 11 files changed, 168 insertions(+), 3 deletions(-) diff --git a/examples/basic_demo/models/engines/bigquery/bigframes/mart_latest_signup.ff.py b/examples/basic_demo/models/engines/bigquery/bigframes/mart_latest_signup.ff.py index bdaa876..e9f23a6 100644 --- a/examples/basic_demo/models/engines/bigquery/bigframes/mart_latest_signup.ff.py +++ b/examples/basic_demo/models/engines/bigquery/bigframes/mart_latest_signup.ff.py @@ -14,6 +14,21 @@ class BFDataFrame: # pragma: no cover - placeholder for runtime type hints ... 
+__lineage__ = { + "email_domain": [ + { + "from_relation": "seed_users", + "from_column": "email", + "transformed": True, + "confidence": "annotated", + } + ], + "latest_signup_date": [ + {"from_relation": "seed_users", "from_column": "signup_date", "confidence": "annotated"} + ], +} + + def _get_bigframes() -> Any: try: import bigframes.pandas as bpd_mod diff --git a/examples/basic_demo/models/engines/bigquery/pandas/mart_latest_signup.ff.py b/examples/basic_demo/models/engines/bigquery/pandas/mart_latest_signup.ff.py index 263aba9..a56e41a 100644 --- a/examples/basic_demo/models/engines/bigquery/pandas/mart_latest_signup.ff.py +++ b/examples/basic_demo/models/engines/bigquery/pandas/mart_latest_signup.ff.py @@ -3,6 +3,21 @@ from fastflowtransform import engine_model +__lineage__ = { + "email_domain": [ + { + "from_relation": "seed_users", + "from_column": "email", + "transformed": True, + "confidence": "annotated", + } + ], + "latest_signup_date": [ + {"from_relation": "seed_users", "from_column": "signup_date", "confidence": "annotated"} + ], +} + + @engine_model( env_match={ "FF_ENGINE": "bigquery", diff --git a/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py b/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py index 84ec4cb..428f0b0 100644 --- a/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py +++ b/examples/basic_demo/models/engines/databricks_spark/mart_latest_signup.ff.py @@ -12,6 +12,21 @@ F = Any +__lineage__ = { + "email_domain": [ + { + "from_relation": "seed_users", + "from_column": "email", + "transformed": True, + "confidence": "annotated", + } + ], + "latest_signup_date": [ + {"from_relation": "seed_users", "from_column": "signup_date", "confidence": "annotated"} + ], +} + + def _get_spark_utils() -> tuple[Any, Any]: try: from pyspark.sql import Window as _Window diff --git a/examples/basic_demo/models/engines/duckdb/mart_latest_signup.ff.py 
b/examples/basic_demo/models/engines/duckdb/mart_latest_signup.ff.py index d33277e..4087ed2 100644 --- a/examples/basic_demo/models/engines/duckdb/mart_latest_signup.ff.py +++ b/examples/basic_demo/models/engines/duckdb/mart_latest_signup.ff.py @@ -3,6 +3,21 @@ from fastflowtransform import engine_model +__lineage__ = { + "email_domain": [ + { + "from_relation": "seed_users", + "from_column": "email", + "transformed": True, + "confidence": "annotated", + } + ], + "latest_signup_date": [ + {"from_relation": "seed_users", "from_column": "signup_date", "confidence": "annotated"} + ], +} + + @engine_model( only="duckdb", name="mart_latest_signup", diff --git a/examples/basic_demo/models/engines/postgres/mart_latest_signup.ff.py b/examples/basic_demo/models/engines/postgres/mart_latest_signup.ff.py index f465fd0..c3c1e30 100644 --- a/examples/basic_demo/models/engines/postgres/mart_latest_signup.ff.py +++ b/examples/basic_demo/models/engines/postgres/mart_latest_signup.ff.py @@ -3,6 +3,21 @@ from fastflowtransform import engine_model +__lineage__ = { + "email_domain": [ + { + "from_relation": "seed_users", + "from_column": "email", + "transformed": True, + "confidence": "annotated", + } + ], + "latest_signup_date": [ + {"from_relation": "seed_users", "from_column": "signup_date", "confidence": "annotated"} + ], +} + + @engine_model( only="postgres", name="mart_latest_signup", diff --git a/examples/basic_demo/models/engines/snowflake_snowpark/mart_latest_signup.ff.py b/examples/basic_demo/models/engines/snowflake_snowpark/mart_latest_signup.ff.py index 19e6538..aa01728 100644 --- a/examples/basic_demo/models/engines/snowflake_snowpark/mart_latest_signup.ff.py +++ b/examples/basic_demo/models/engines/snowflake_snowpark/mart_latest_signup.ff.py @@ -14,6 +14,21 @@ WindowSpec = Any +__lineage__ = { + "email_domain": [ + { + "from_relation": "seed_users", + "from_column": "email", + "transformed": True, + "confidence": "annotated", + } + ], + "latest_signup_date": [ + 
{"from_relation": "seed_users", "from_column": "signup_date", "confidence": "annotated"} + ], +} + + def _get_snowpark_utils() -> tuple[Any, Any]: try: from snowflake.snowpark import functions as _F diff --git a/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql b/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql index 5a04f6f..158034c 100644 --- a/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql +++ b/examples/basic_demo/models/marts/mart_users_by_domain.ff.sql @@ -11,6 +11,12 @@ ], ) }} +/* lineage: + user_count <- {{ ref('users_clean.ff') }}.user_id xform + first_signup <- {{ ref('users_clean.ff') }}.signup_date xform + last_signup <- {{ ref('users_clean.ff') }}.signup_date xform +*/ + with base as ( select email_domain, diff --git a/examples/basic_demo/models/staging/users_clean.ff.sql b/examples/basic_demo/models/staging/users_clean.ff.sql index 26f255a..338276f 100644 --- a/examples/basic_demo/models/staging/users_clean.ff.sql +++ b/examples/basic_demo/models/staging/users_clean.ff.sql @@ -11,6 +11,9 @@ ], ) }} +-- lineage: email_domain <- {{ source('crm','users') }}.email xform +-- lineage: signup_date <- {{ source('crm','users') }}.signup_date + with raw_users as ( select cast(id as integer) as user_id, diff --git a/examples/basic_demo/project.yml b/examples/basic_demo/project.yml index 5756004..d8388a6 100644 --- a/examples/basic_demo/project.yml +++ b/examples/basic_demo/project.yml @@ -16,6 +16,11 @@ docs: description: "Lowercased domain extracted from email." mart_users_by_domain.ff: description: "Aggregates signup counts per email domain." + lineage: + user_count: + transformed: true + from: + - { table: users_clean, column: user_id } # Project-level variables accessible via {{ var('key') }} inside models. 
# Example: diff --git a/src/fastflowtransform/docs.py b/src/fastflowtransform/docs.py index f6eee7b..02cb1ab 100644 --- a/src/fastflowtransform/docs.py +++ b/src/fastflowtransform/docs.py @@ -832,6 +832,63 @@ def _clean_lineage(lin: dict[str, list[dict[str, Any]]]) -> dict[str, list[dict[ return out +def _build_lineage_ref_map(models: list[ModelDoc], sources: list[SourceDoc]) -> dict[str, str]: + """ + Build a mapping of "friendly"/logical relation names -> fully qualified relation strings + that exist in the docs manifest (so the SPA can link them). + + Examples it tries to cover: + - model name: "users_clean.ff" -> ".users_clean" + - model name without ".ff": "users_clean" -> ".users_clean" + - relation tail: "users_clean" -> ".users_clean" + - source key: "crm.users" -> ".seed_users" + - relation tail: "seed_users" -> ".seed_users" + """ + ref_map: dict[str, str] = {} + + def add(alias: str | None, target_rel: str | None) -> None: + a = str(alias or "").strip() + t = str(target_rel or "").strip() + if not a or not t: + return + # preserve first writer to reduce accidental collisions + ref_map.setdefault(a, t) + ref_map.setdefault(a.lower(), t) + ref_map.setdefault(a.upper(), t) + + def add_relation_tails(rel: str, target_rel: str) -> None: + rel_s = str(rel or "").strip() + if not rel_s: + return + # Add last identifier and last two identifiers (schema.table) as aliases + # (works for db.schema.table too) + parts = [p for p in rel_s.replace("`", "").replace('"', "").split(".") if p] + if not parts: + return + add(parts[-1], target_rel) + if len(parts) >= 2: + add(".".join(parts[-2:]), target_rel) + + # Models + for m in models or []: + add(m.name, m.relation) + if m.name.endswith(".ff"): + add(m.name[:-3], m.relation) # "users_clean.ff" -> "users_clean" + else: + add(m.name + ".ff", m.relation) # "users_clean" -> "users_clean.ff" (best-effort) + add(m.relation, m.relation) + add_relation_tails(m.relation, m.relation) + + # Sources + for s in sources or []: + 
add(f"{s.source_name}.{s.table_name}", s.relation) + add(s.table_name, s.relation) + add(s.relation, s.relation) + add_relation_tails(s.relation, s.relation) + + return ref_map + + def _infer_and_attach_lineage( models: list[ModelDoc], executor: Any | None, @@ -840,6 +897,7 @@ def _infer_and_attach_lineage( *, with_schema: bool, rendered_sql_by_model: dict[str, str] | None = None, + lineage_ref_map: dict[str, str] | None = None, ) -> None: """Best-effort Lineage ermitteln (SQL/Python) und auf Columns mappen.""" for m in models: @@ -870,7 +928,7 @@ def _infer_and_attach_lineage( elif m.kind == "python": func = getattr(REGISTRY, "py_funcs", {}).get(m.name) src = m.python_source or (inspect.getsource(func) if callable(func) else "") - inferred = infer_py_lineage(src) + inferred = infer_py_lineage(src, ref_map=lineage_ref_map) _mark_lineage_confidence(inferred, "inferred") except Exception: inferred = {} @@ -1076,6 +1134,8 @@ def render_site( cols_by_table = _collect_columns(executor) if (executor and with_schema) else {} _apply_descriptions_to_models(models, docs_meta, cols_by_table, with_schema=with_schema) + lineage_ref_map = _build_lineage_ref_map(models, sources) + _infer_and_attach_lineage( models, executor, @@ -1083,6 +1143,7 @@ def render_site( cols_by_table, with_schema=with_schema, rendered_sql_by_model=(rendered_sql_by_model or None), + lineage_ref_map=lineage_ref_map, ) used_by = _reverse_deps(nodes) diff --git a/src/fastflowtransform/templates/assets/spa.js b/src/fastflowtransform/templates/assets/spa.js index 91ee60c..d450757 100644 --- a/src/fastflowtransform/templates/assets/spa.js +++ b/src/fastflowtransform/templates/assets/spa.js @@ -2759,7 +2759,7 @@ function renderLineage(state, items) { renderRelationColRef(state, it.from_relation, it.from_column), " ", renderConfPill(conf), - it.transformed ? el("span", { class: "pillSmall" }, "XFORM") : "" + it.transformed ? 
el("span", { class: "pillSmall" }, "TRANSFORMED") : "" ) ); } @@ -3037,7 +3037,7 @@ function buildColumnsCard(state, m, colFromRoute) { renderRelationColRef(state, e.from_relation, e.from_column), " ", renderConfPill(e.confidence), - e.transformed ? el("span", { class: "pillSmall" }, "XFORM") : "" + e.transformed ? el("span", { class: "pillSmall" }, "TRANSFORMED") : "" ) ) ) From c1f555f65b2f919137b276fe3744deb232040669 Mon Sep 17 00:00:00 2001 From: Marko Lekic Date: Sun, 4 Jan 2026 19:44:20 +0100 Subject: [PATCH 2/2] Added documentation for autodocs --- docs/Auto_Docs.md | 390 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 357 insertions(+), 33 deletions(-) diff --git a/docs/Auto_Docs.md b/docs/Auto_Docs.md index 34914e4..e425e2a 100644 --- a/docs/Auto_Docs.md +++ b/docs/Auto_Docs.md @@ -1,69 +1,393 @@ -# Auto-Docs & Lineage +# Auto-Docs Portal — User Guide -FastFlowTransform can generate a lightweight documentation site (DAG + model detail pages) plus an optional JSON manifest for external tooling. +This guide explains how to **generate**, **run**, and **use** the FastFlowTransform Auto-Docs portal, plus what you need to define (and where) to unlock features like **lineage**, **coverage**, **contracts**, and **source freshness**. -## Commands +--- + +## What you get + +Auto-Docs generates a **static** documentation website (a single-page app) from your project: + +- **Overview dashboard** (graph + coverage + shortcuts) +- **Model pages** (tabs: Overview / Columns / Lineage / Code / Meta — plus Contract when present) +- **Source pages** (freshness + who uses the source) +- **Macro browser** (search + linkable routes) +- Optional: **run/test health strip** if you publish runtime artifacts + +Everything is rendered from a generated JSON file: + +- `site/docs/assets/docs_manifest.json` (path depends on `--out`) + +--- + +## Generate the docs site + +### One-liner (recommended) + +```bash +fft docgen . 
--env dev --out site/docs --emit-json site/docs/docs_manifest.json +``` + +This generates the SPA and also writes a manifest you can use for CI checks or custom tooling. + +### Classic (DAG-only) ```bash -# Classic fft dag . --env dev --html +``` -# Convenience wrapper (loads schema + descriptions + lineage, can emit JSON) -fft docgen . --env dev --out site/docs --emit-json site/docs/docs_manifest.json +### Open automatically in your browser + +```bash +fft docgen . --env dev --out site/docs --open-source ``` -Add `--open-source` if you want the default browser to open the rendered `index.html` immediately. +--- + +## Start the local “dev server” (to view the SPA) + +The generated portal is static HTML/JS that loads JSON via `fetch()`. Many browsers block `fetch()` from `file://`, so it’s best to serve it locally. + +From the output directory: + +```bash +cd site/docs +python -m http.server 8000 +``` + +Then open: + +- `http://localhost:8000` + +> Any static server works; `python -m http.server` is just the simplest default. + +### Typical dev loop + +1. Run `fft docgen ...` to regenerate docs after changes. +2. Refresh the browser tab (or hard refresh if assets are cached). 
+ +--- + +## Where to put documentation (quick map) + +| What you want to document | Where to define it | Notes | +|---|---|---| +| Model description | `project.yml` → `docs.models.<model>.description` | Plain text | +| Column descriptions | `project.yml` → `docs.models.<model>.columns.<column>` | Plain text | +| Rich model writeup | `docs/models/<model>.md` | Overrides YAML description | +| Rich column writeup | `docs/columns/<relation>/<column>.md` | Overrides YAML column description for that relation | +| Column lineage hints | `project.yml` → `docs.models.<model>.lineage` | See examples below | +| Contract (expected schema) | `project.yml` → `docs.models.<model>.contract` or front matter in `docs/models/<model>.md` | Enables Contract tab/badges | +| Source freshness | `sources.yml` → `freshness:` block | Shows on source pages | + +**Priority (when multiple exist):** +1. Markdown overrides +2. YAML docs (`project.yml`) +3. Empty + +--- + +## How to use the portal + +### Global search (keyboard-first) + +- Press `/` (or your UI’s shortcut, often **Ctrl+K**) to focus global search +- Type to search across: + - model name, relation, descriptions, columns + - sources + - macros +- Use **↑/↓** to move selection +- Press **Enter** to navigate +- Press **Esc** to clear/close + +### Sidebar navigation + persistent state + +The sidebar is organized into collapsible sections (typically **Models / Sources / Macros**). + +The portal remembers your UI state in the browser: +- collapsed sections +- last visited page +- your current search/filter text + +So when you reopen the docs, you land where you left off. + +--- + +## Model pages (what you’ll find) + +Model pages are structured into tabs so you can scan quickly and then drill in. 
+ +### Overview tab + +Typical content: +- model description +- upstream dependencies (**depends on**) and downstream consumers (**used by**) +- docs coverage badges (model described, columns documented) +- a mini upstream/downstream panel (graph or list) + +### Columns tab + +A “docs-first” column table designed to make gaps obvious: + +- sort by column name / dtype / nullability / documented state +- quick filter +- highlight **undocumented** columns +- show dtype + nullability clearly +- expand a row to view the full description and lineage details + +> If schema introspection is enabled for your engine, dtype/nullability can be shown automatically. If it’s not available, you’ll still see any descriptions you provided in YAML/Markdown. + +### Lineage tab + +Shows where columns come from (best effort), based on what you define (see [Defining lineage](#defining-lineage)) plus any lineage the generator can infer. + +You’ll typically see: +- upstream relation + upstream column +- a “transformed” flag where appropriate +- a confidence indicator (e.g., annotated vs inferred), if your build emits it + +### Code tab + +Depending on model type: +- **SQL models**: raw SQL, and optionally rendered/compiled SQL (if enabled) +- **Python models**: source, signature, docstring (if extracted) + +### Meta tab + +A structured panel with: +- relation (database/schema/identifier) +- materialization +- file path +- tags/owners/domains (if present in your metadata) +- custom meta/config + +--- + +## Source pages + +A source page typically includes: +- description +- relation +- which models consume it +- freshness configuration, when present (loaded_at + warn/error thresholds) +- a clear status badge: **configured** vs **missing freshness** + +--- + +## Macro browser + +The macros section provides: +- searchable list of macros +- path/kind shown (so you can find the source) +- linkable macro detail routes (if enabled) + +--- + +## Overview dashboard + +The landing page is designed as 
a “project dashboard,” not just a DAG: + +- interactive DAG navigation +- coverage summary (“how documented are we?”) +- shortcuts like: + - undocumented models + - high-impact models (high fan-out) + - python models +- optionally: “newest/changed” models if your generator can surface change metadata -## Descriptions +--- -Descriptions can be provided in YAML (`project.yml`) and/or Markdown files. Markdown has higher priority. +## Defining lineage -YAML in `project.yml`: +Lineage only appears if you provide lineage metadata (and/or if your build infers it). The simplest supported approach is to define lineage in `project.yml` under the model. + +### Minimal lineage example (YAML) ```yaml docs: models: - users.ff: - description: "Raw users table imported from CRM." - columns: - id: "Primary key." - email: "User email address." users_enriched: description: "Adds gmail flag." columns: is_gmail: "True if email ends with @gmail.com" + + lineage: + is_gmail: + from: [{ table: users, column: email }] + transformed: true ``` -Markdown overrides YAML when present: +What this does in the UI: +- On `users_enriched`, column `is_gmail` will show lineage pointing to `users.email` +- `transformed: true` tells the UI this column is derived (not a direct passthrough) +### Lineage for multiple upstream inputs + +```yaml +docs: + models: + orders_enriched: + lineage: + user_segment: + from: + - { table: users, column: segment } + - { table: crm_users, column: lifecycle_stage } + transformed: true ``` -/docs/models/.md -/docs/columns//.md + +### Tips for lineage that “works well” in the portal + +- Use upstream identifiers that match how the portal labels relations (table/model names you use consistently). +- Prefer concrete upstream columns rather than “whole table” lineage. +- Mark `transformed: true` for derived columns (casts, concatenations, bucketing, joins, etc.). 
+ +> If lineage is missing for a column, the portal will still display the column normally—lineage is additive and should never block docs generation. + +--- + +## Documenting models and columns + +### YAML: fast and lightweight + +`project.yml`: + +```yaml +docs: + models: + users_enriched: + description: "Downstream-ready users dimension." + columns: + id: "Primary key." + email: "Normalized email address." ``` -Optional front matter is ignored for now (title/tags may be used later). +### Markdown: richer writeups + +Model Markdown: + +- `docs/models/users_enriched.md` + +Column Markdown: + +- `docs/columns/analytics.users_enriched/email.md` (example relation folder) + +Use Markdown when you want: +- longer explanations +- usage notes +- examples +- “gotchas” for downstream consumers + +--- -## Column Lineage +## Contracts (expected schema) -- SQL models: expressions like `col`, `alias AS out`, `upper(u.email) AS email_upper)` are parsed; `u` must come from a `FROM ... AS u` clause that resolves to a relation. Functions mark lineage as *transformed*. -- Python (pandas) models: simple patterns like `rename`, `out["x"] = df["y"]`, `assign(x=...)` are recognized. 
-- Override hints in YAML when the heuristic is insufficient: +If you define a contract for a model, the portal can show: +- **Contracted** badge +- a dedicated **Contract** section listing expected columns/types/nullability/constraints +- optional contract-vs-warehouse comparison (drift detection) when schema info is available + +### Contract example (YAML) ```yaml docs: models: - mart_orders_enriched: - lineage: - email_upper: - from: [{ table: users, column: email }] - transformed: true + users_enriched: + contract: + enforced: true + columns: + id: { dtype: int, nullable: false, constraints: [unique] } + email: { dtype: text, nullable: false } ``` -## JSON Manifest +### Contract drift (when schema is available) + +If your build includes discovered schema (dtype/nullability), the UI can show a status badge like: +- Verified +- Drift detected +- Schema unavailable + +--- + +## Filters, deep links, and sharing + +### Faceted filtering + +You can filter models by: +- kind (sql/python) +- materialized +- folder/path prefix +- tags and owners (if emitted) + +### Shareable links with state + +The portal can encode state in the URL, so sharing a link preserves: +- active filters +- selected graph node +- search text +- selected tab + +Use the portal’s **Copy link** action (if present) to grab an exact “deep link” to your current view. + +--- + +## Coverage and “what’s missing?” + +Coverage indicators help you answer: +- Do we have a model description? +- How many columns have descriptions? + +Typical UI elements: +- “Model described” badge +- “X/Y columns documented” +- landing page shortcuts for: + - undocumented models + - models with lots of undocumented columns + +--- + +## Optional operational strip (health) + +If your build outputs runtime artifacts (run results, tests), the portal can display: +- last run status + duration +- test summaries and failures +- optional unit test results + +This is intended to be lightweight and opt-in: docs remain usable without it. 
+ +--- + +## FAQ / troubleshooting + +### I opened `index.html` directly and the page is blank + +Serve the folder instead of using `file://`: + +```bash +cd site/docs +python -m http.server 8000 +``` + +### My Columns tab is empty or missing types/nullability + +Schema display depends on engine support and whether schema introspection is enabled in your build. If you don’t have schema introspection for your engine, you can still: +- document columns in YAML/Markdown +- provide lineage in YAML + +### Lineage isn’t showing + +Check: +- you added `docs.models.<model>.lineage` in `project.yml` +- the column name under `lineage:` matches the output column name you want to annotate +- you regenerated docs (`fft docgen ...`) + +--- -The optional manifest (via `--emit-json`) includes models, relations, descriptions, columns (with nullable/dtype), and lineage per column—useful for custom doc portals or CI checks. +## Appendix: recommended minimum documentation standard -## Notes +If you want a simple policy that yields a strong portal experience: -- Schema introspection currently supports DuckDB and Postgres. For other engines, the Columns card may be empty. -- Lineage is optional; when uncertain, entries fall back to “unknown” and never fail doc generation. +1. Every model has a one-sentence description. +2. Every model documents at least its “public interface” columns. +3. High-impact models add lineage for the top 5–20 columns consumers rely on. +4. Critical models define contracts (even if not enforced yet). +5. Sources define freshness thresholds where operationally relevant.