diff --git a/.prettierrc b/.prettierrc new file mode 100644 index 0000000..4093290 --- /dev/null +++ b/.prettierrc @@ -0,0 +1,6 @@ +{ + "trailingComma": "all", + "printWidth": 120, + "tabWidth": 2, + "endOfLine": "auto" +} diff --git a/apps/demo/src/App.tsx b/apps/demo/src/App.tsx index 91b5f0a..2f8d046 100644 --- a/apps/demo/src/App.tsx +++ b/apps/demo/src/App.tsx @@ -3,11 +3,10 @@ import { GitBranch, Github } from "lucide-react"; import { Card, CardContent, CardHeader, CardTitle } from "@meta-sql/ui"; import { SQLEditor } from "./components/editor"; import { LineageGraph } from "./components/lineage/LineageGraph"; -import type { ColumnLineageDatasetFacet } from "@meta-sql/open-lineage"; -import type { Schema } from "@meta-sql/lineage"; +import type { getExtendedLineage, Schema } from "@meta-sql/lineage"; // Use the actual return type from getLineage -type LineageResult = ColumnLineageDatasetFacet["fields"]; +type LineageResult = ReturnType; // Default schema matching the sample queries const defaultSchema: Schema = { @@ -15,15 +14,7 @@ const defaultSchema: Schema = { tables: [ { name: "product_sales", - columns: [ - "id", - "product_id", - "quantity_sold", - "unit_price", - "sale_date", - "store_id", - "discount_percentage", - ], + columns: ["id", "product_id", "quantity_sold", "unit_price", "sale_date", "store_id", "discount_percentage"], }, { name: "stores", @@ -64,9 +55,7 @@ export default function App() {
-

- @meta-sql/lineage -

+

@meta-sql/lineage

Interactive Demo - SQL Column Lineage Analysis Package @@ -96,24 +85,16 @@ export default function App() { SQL Editor - + {/* Right Panel - Lineage Graph */} - + - + Data Lineage Graph diff --git a/apps/demo/src/components/editor/SQLEditor.tsx b/apps/demo/src/components/editor/SQLEditor.tsx index 83d3d41..dceeac1 100644 --- a/apps/demo/src/components/editor/SQLEditor.tsx +++ b/apps/demo/src/components/editor/SQLEditor.tsx @@ -12,21 +12,14 @@ import { PopoverContent, PopoverTrigger, } from "@meta-sql/ui"; -import { - FileText, - AlertCircle, - CheckCircle, - CheckIcon, - ChevronsUpDown, -} from "lucide-react"; -import { getLineage, type Schema } from "@meta-sql/lineage"; -import type { ColumnLineageDatasetFacet } from "@meta-sql/open-lineage"; +import { FileText, AlertCircle, CheckCircle, CheckIcon, ChevronsUpDown } from "lucide-react"; +import { getExtendedLineage, type Schema } from "@meta-sql/lineage"; import { Parser } from "node-sql-parser"; import { sampleQueries, type SupportedDialect } from "./sampleQueries.js"; import { cn } from "@meta-sql/ui/lib/utils"; // Use the actual return type from getLineage -type LineageResult = ColumnLineageDatasetFacet["fields"]; +type LineageResult = ReturnType; const dialectOptions = [ { @@ -56,20 +49,12 @@ const dialectOptions = [ ] satisfies Array<{ value: SupportedDialect; label: string }>; interface SQLEditorProps { - onQueryParsed: ( - lineageResult: LineageResult, - query: string, - dialect: string - ) => void; + onQueryParsed: (lineageResult: LineageResult, query: string, dialect: string) => void; schema?: Schema; className?: string; } -export const SQLEditor: React.FC = ({ - onQueryParsed, - schema, - className = "", -}) => { +export const SQLEditor: React.FC = ({ onQueryParsed, schema, className = "" }) => { const [query, setQuery] = useState(""); const [dialect, setDialect] = useState("mysql"); const [open, setOpen] = useState(false); @@ -96,7 +81,7 @@ export const SQLEditor: React.FC = ({ }; // Get column lineage 
and update the graph - const lineageResult = getLineage(firstStatement, lineageSchema); + const lineageResult = getExtendedLineage(firstStatement, lineageSchema); onQueryParsed(lineageResult, query, dialect); @@ -104,9 +89,7 @@ export const SQLEditor: React.FC = ({ } else { setValidationResult({ isValid: false, - errors: [ - "Only SELECT statements are supported for lineage analysis", - ], + errors: ["Only SELECT statements are supported for lineage analysis"], }); } } else { @@ -120,9 +103,7 @@ export const SQLEditor: React.FC = ({ setValidationResult({ isValid: false, - errors: [ - error instanceof Error ? error.message : "Unknown parsing error", - ], + errors: [error instanceof Error ? error.message : "Unknown parsing error"], }); } } else { @@ -152,10 +133,7 @@ export const SQLEditor: React.FC = ({ aria-expanded={open} className="w-[200px] justify-between" > - {dialect - ? dialectOptions.find((option) => option.value === dialect) - ?.label - : "Select dialect..."} + {dialect ? dialectOptions.find((option) => option.value === dialect)?.label : "Select dialect..."} @@ -175,12 +153,7 @@ export const SQLEditor: React.FC = ({ }} > {option.label} @@ -248,11 +221,7 @@ export const SQLEditor: React.FC = ({
{validationResult.errors.map((error, index) => ( - + {error} ))} diff --git a/bun.lock b/bun.lock index 6dc8603..5bbaa80 100644 --- a/bun.lock +++ b/bun.lock @@ -1,5 +1,6 @@ { "lockfileVersion": 1, + "configVersion": 0, "workspaces": { "": { "name": "meta-sql", diff --git a/packages/lineage/README.md b/packages/lineage/README.md index 3388ebd..3240d54 100644 --- a/packages/lineage/README.md +++ b/packages/lineage/README.md @@ -1,18 +1,25 @@ # @meta-sql/lineage -A TypeScript library for extracting column-level lineage from SQL queries, implementing the [OpenLineage Column Lineage Dataset Facet specification](https://openlineage.io/docs/spec/facets/dataset-facets/column_lineage_facet/). +A TypeScript library for extracting column-level and dataset-level lineage from SQL queries, implementing the [OpenLineage Column Lineage Dataset Facet specification](https://openlineage.io/docs/spec/facets/dataset-facets/column_lineage_facet/). > ⚠️ **Experimental**: This library is currently in active development and may undergo significant changes. APIs, interfaces, and functionality may change without notice in future versions. Use with caution in production environments. ## Overview -This library analyzes SQL SELECT statements to generate detailed column-level lineage information, tracking how data flows from input columns to output columns through various transformations like joins, aggregations, filters, and CTEs (Common Table Expressions). 
+This library analyzes SQL SELECT statements to generate detailed lineage information: + +- **Field-level lineage**: Tracks how data flows from input columns to output columns through transformations +- **Dataset-level lineage**: Tracks columns that indirectly affect the entire result set (JOINs, filters, grouping, sorting, window functions) ## Features - ✅ **Column-level lineage extraction** from SQL SELECT statements +- ✅ **Dataset-level indirect lineage** for columns affecting the entire result - ✅ **CTE (Common Table Expression) support** with nested lineage tracking -- ✅ **Direct transformations** (IDENTITY) +- ✅ **Direct transformations** (IDENTITY, TRANSFORMATION, AGGREGATION) +- ✅ **Indirect transformations** (JOIN, FILTER, GROUP_BY, SORT, WINDOW, CONDITION) +- ✅ **Window function support** (PARTITION BY, ORDER BY in OVER clauses) +- ✅ **Masking detection** for privacy-preserving transformations - ✅ **Schema-aware parsing** with table and column validation - ✅ **OpenLineage specification compliance** for interoperability - ✅ **TypeScript-first** with comprehensive type definitions @@ -27,6 +34,8 @@ bun add @meta-sql/lineage node-sql-parser ## Quick Start +### Basic Field-Level Lineage + ```typescript import { getLineage } from "@meta-sql/lineage"; import { Parser } from "node-sql-parser"; @@ -40,91 +49,126 @@ const schema = { }; const lineage = getLineage(ast, schema); -console.log(lineage); -// Output: +// Returns field-level lineage only +``` + +### Extended Lineage (Field + Dataset Level) + +```typescript +import { getExtendedLineage } from "@meta-sql/lineage"; +import { Parser, type Select } from "node-sql-parser"; + +const parser = new Parser(); +const sql = ` + SELECT u.name, COUNT(o.id) as order_count + FROM users u + JOIN orders o ON u.id = o.user_id + WHERE u.status = 'active' + GROUP BY u.name + ORDER BY order_count DESC +`; +const ast = parser.astify(sql, { database: "trino" }) as Select; + +const schema = { + namespace: "my_database", + tables: [ + { name: 
"users", columns: ["id", "name", "status"] }, + { name: "orders", columns: ["id", "user_id", "total"] }, + ], +}; + +const result = getExtendedLineage(ast, schema); + +// result.fields - Field-level lineage (which columns flow into output columns) // { -// id: { -// inputFields: [{ -// namespace: "my_database", -// name: "users", -// field: "id", -// transformations: [{ type: "DIRECT", subtype: "IDENTITY" }] -// }] -// }, -// name: { -// inputFields: [{ -// namespace: "my_database", -// name: "users", -// field: "name", -// transformations: [{ type: "DIRECT", subtype: "IDENTITY" }] -// }] -// } +// name: { inputFields: [{ field: "name", name: "users", ... }] }, +// order_count: { inputFields: [{ field: "id", name: "orders", transformations: [AGGREGATION] }] } // } + +// result.dataset - Dataset-level lineage (columns that indirectly affect the result) +// [ +// { field: "id", name: "users", transformations: [{ type: "INDIRECT", subtype: "JOIN" }] }, +// { field: "user_id", name: "orders", transformations: [{ type: "INDIRECT", subtype: "JOIN" }] }, +// { field: "status", name: "users", transformations: [{ type: "INDIRECT", subtype: "FILTER" }] }, +// { field: "name", name: "users", transformations: [{ type: "INDIRECT", subtype: "GROUP_BY" }] } +// ] ``` +## Transformation Types + +### Direct Transformations (Field-Level) + +| Subtype | Description | Example | +|---------|-------------|---------| +| `IDENTITY` | Column passed through unchanged | `SELECT id FROM users` | +| `TRANSFORMATION` | Column modified by function/expression | `SELECT UPPER(name)`, `SELECT price * qty` | +| `AGGREGATION` | Column aggregated | `SELECT SUM(amount)`, `SELECT COUNT(id)` | + +### Indirect Transformations (Dataset-Level) + +| Subtype | Description | Example | +|---------|-------------|---------| +| `JOIN` | Columns used in JOIN conditions | `ON u.id = o.user_id` | +| `FILTER` | Columns used in WHERE/HAVING | `WHERE status = 'active'` | +| `GROUP_BY` | Columns used in GROUP BY | `GROUP 
BY department` | +| `SORT` | Columns used in ORDER BY | `ORDER BY created_at` | +| `WINDOW` | Columns in OVER clause | `OVER (PARTITION BY dept ORDER BY salary)` | +| `CONDITION` | Columns in CASE WHEN conditions | `CASE WHEN status = 'x' THEN ...` | + ## Supported SQL Features ### ✅ Currently Supported - Basic SELECT statements +- `SELECT *` and `table.*` expansion (requires schema) - Column aliases (`SELECT id as user_id`) -- Common Table Expressions (CTEs) +- Common Table Expressions (CTEs) with lineage propagation - Nested subqueries -- Simple column references +- JOIN `ON` conditions (tracked as dataset-level `JOIN` lineage) +- WHERE and HAVING clauses +- GROUP BY with aggregations +- ORDER BY sorting +- Window functions (`ROW_NUMBER`, `RANK`, `SUM OVER`, etc.) +- CASE WHEN expressions +- CAST and type conversions +- Mathematical operations (`SELECT price * quantity`) +- String functions (`SELECT UPPER(name)`) +- Date functions (`SELECT DATE_TRUNC('month', created_at)`) +- Masking functions (`MD5`, `SHA256`, `HASH`, `MASK`, `ANONYMIZE`, etc.) 
+- **Set operations** (`UNION`, `UNION ALL`, `INTERSECT`, `EXCEPT`) +- **All JOIN types** (`INNER`, `LEFT`, `RIGHT`, `FULL OUTER`, `CROSS`) -## Roadmap - -Our development roadmap aligns with the OpenLineage Column Lineage Dataset Facet specification: - -### 🚧 Phase 1: Enhanced Transformations - -- ✅ **DIRECT/TRANSFORMATION** support for computed columns - - ✅ Mathematical operations (`SELECT price * quantity`) - - ✅ String functions (`SELECT UPPER(name)`) - - ✅ Date functions (`SELECT DATE_ADD(created_at, INTERVAL 1 DAY)`) -- ✅ **DIRECT/AGGREGATION** support for aggregation functions - - ✅ Basic aggregations (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`) -- ✅ **Masking detection** for privacy-preserving transformations - - ✅ Hash functions (`SELECT MD5(email)`) - - ✅ Anonymization functions (`SELECT ANONYMIZE(ssn)`) - -### 🔄 Phase 2: Indirect Lineage +### 🔄 In Progress -- [ ] **INDIRECT/JOIN** lineage tracking - - Track columns used in JOIN conditions - - Multi-table relationship mapping -- [ ] **INDIRECT/FILTER** for WHERE clause dependencies - - Identify filtering columns that affect output -- [ ] **INDIRECT/GROUP_BY** for grouping dependencies - - Track GROUP BY columns impact on aggregations -- [ ] **INDIRECT/SORT** for ORDER BY clause tracking +- More complex recursive CTE patterns -### 📊 Phase 3: Advanced SQL Features +### 📋 Planned -- [ ] **INDIRECT/WINDOW** for window function dependencies -- [ ] **INDIRECT/CONDITION** for CASE WHEN and IF statements -- [ ] **Complex JOIN types** (LEFT, RIGHT, FULL OUTER) -- [ ] **UNION and INTERSECT** operations -- ✅ **Recursive CTEs** support +- Multi-statement support (DDL operations) +- Additional SQL dialect optimizations -### 🔧 Phase 4: Enhanced Analysis +## API Reference -- [ ] **Dataset-level lineage** for operations affecting entire datasets -- [ ] **Multi-statement support** (DDL operations) -- ✅ **Multiple SQL dialect support** (PostgreSQL, MySQL, BigQuery, Snowflake) +### `getLineage(select, schema)` -## API Reference 
+Extracts field-level column lineage from a SQL SELECT AST. -### `getLineage(select: Select, schema: Schema): ColumnLineageDatasetFacet["fields"]` +```typescript +function getLineage(select: Select, schema: Schema): ColumnLineageDatasetFacet["fields"]; +``` -Extracts column lineage from a SQL SELECT AST. +### `getExtendedLineage(select, schema)` -**Parameters:** +Extracts both field-level and dataset-level lineage. -- `select`: Parsed SQL SELECT statement from node-sql-parser -- `schema`: Schema definition with table and column information +```typescript +function getExtendedLineage(select: Select, schema: Schema): ExtendedLineageResult; -**Returns:** Column lineage mapping conforming to OpenLineage specification +interface ExtendedLineageResult { + fields: ColumnLineageDatasetFacet["fields"]; // Field-level lineage + dataset?: InputField[]; // Dataset-level indirect lineage +} +``` ### Types @@ -138,8 +182,35 @@ type Table = { name: string; columns: string[]; }; + +type Transformation = { + type: "DIRECT" | "INDIRECT"; + subtype: "IDENTITY" | "TRANSFORMATION" | "AGGREGATION" | "JOIN" | "FILTER" | "GROUP_BY" | "SORT" | "WINDOW" | "CONDITION"; + masking: boolean; +}; ``` +## Roadmap + +### ✅ Completed + +- Field-level lineage with DIRECT transformations +- Dataset-level lineage with INDIRECT transformations +- Window function support +- CTE lineage propagation +- Masking detection +- Set operations (UNION, INTERSECT, EXCEPT) +- All JOIN types (INNER, LEFT, RIGHT, FULL OUTER, CROSS) + +### 🔄 In Progress + +- More complex recursive CTE patterns + +### 📋 Planned + +- Multi-statement support (DDL operations) +- Additional SQL dialect optimizations + ## License MIT License - see [LICENSE](../../LICENSE) for details. 
diff --git a/packages/lineage/src/hashset.ts b/packages/lineage/src/hashset.ts index 9f9aec7..6e03de3 100644 --- a/packages/lineage/src/hashset.ts +++ b/packages/lineage/src/hashset.ts @@ -5,10 +5,7 @@ export class HashSet implements Set { constructor(hasher?: (value: T) => string) { this.hasher = hasher || ((value: T) => JSON.stringify(value)); } - forEach( - callbackfn: (value: T, value2: T, set: Set) => void, - thisArg?: unknown - ): void { + forEach(callbackfn: (value: T, value2: T, set: Set) => void, thisArg?: unknown): void { this.map.forEach((value) => { callbackfn.call(thisArg, value, value, this); }); @@ -63,4 +60,15 @@ export class HashSet implements Set { } return intersection; } + + union(other: HashSet): HashSet { + const union = new HashSet(this.hasher); + for (const value of this) { + union.add(value); + } + for (const value of other) { + union.add(value); + } + return union; + } } diff --git a/packages/lineage/src/index.ts b/packages/lineage/src/index.ts index 3ba63b6..7e121aa 100644 --- a/packages/lineage/src/index.ts +++ b/packages/lineage/src/index.ts @@ -13,6 +13,12 @@ import { AggrFunc, Function as AstFunction, With, + Case, + Interval, + Cast, + type AsWindowSpec, + NamedWindowExpr, + ExprList, } from "node-sql-parser"; import { HashSet } from "./hashset"; @@ -31,8 +37,13 @@ const MASKING_FUNCTIONS = new Set([ "MURMUR3", "SPOOKY_HASH_V2_32", "SPOOKY_HASH_V2_64", + "HASH", + "ANONYMIZE", + "MASK", + "REDACT", ]); +// Direct transformation constants export const DIRECT_TRANSFORMATION: Transformation = { type: "DIRECT", subtype: "TRANSFORMATION", @@ -51,41 +62,95 @@ export const DIRECT_AGGREGATION: Transformation = { masking: false, }; -function mergeTransformations( - parent: Transformation | undefined, - child: Transformation -): Transformation { +// Indirect transformation constants +export const INDIRECT_JOIN: Transformation = { + type: "INDIRECT", + subtype: "JOIN", + masking: false, +}; + +export const INDIRECT_FILTER: Transformation = { + 
type: "INDIRECT", + subtype: "FILTER", + masking: false, +}; + +export const INDIRECT_GROUP_BY: Transformation = { + type: "INDIRECT", + subtype: "GROUP_BY", + masking: false, +}; + +export const INDIRECT_SORT: Transformation = { + type: "INDIRECT", + subtype: "SORT", + masking: false, +}; + +export const INDIRECT_WINDOW: Transformation = { + type: "INDIRECT", + subtype: "WINDOW", + masking: false, +}; + +export const INDIRECT_CONDITION: Transformation = { + type: "INDIRECT", + subtype: "CONDITION", + masking: false, +}; + +/** + * Merges two transformations, combining their properties based on precedence rules. + * + * Precedence rules: + * - If types differ (DIRECT vs INDIRECT), keeps the INDIRECT one, whether it is the parent or the child + * - For DIRECT types: AGGREGATION > TRANSFORMATION > IDENTITY + * - For INDIRECT types: prefers the child (more recent context) + * - Masking is OR'd together (if either is masked, result is masked) + * + * @param parent - The parent/outer transformation (may be undefined) + * @param child - The child/inner transformation to merge + * @returns The merged transformation with combined properties + */ +function mergeTransformations(parent: Transformation | undefined, child: Transformation): Transformation { if (!parent) { return child; } - if (child.type !== "DIRECT" || parent.type !== "DIRECT") { - throw new Error("Indirect transformations not supported yet"); + // If types differ, prefer the more specific one + // INDIRECT is generally more specific than DIRECT for the same column + if (parent.type !== child.type) { + let leading: Transformation = child.type === "INDIRECT" ? 
child : parent; + return { ...leading, masking: parent.masking || child.masking }; } - let leading: Transformation; + if (child.type === "DIRECT" && parent.type === "DIRECT") { + let leading: Transformation; - // agg > transformation > identity + // agg > transformation > identity + if (parent.subtype === "AGGREGATION") { + leading = parent; + } else if (child.subtype === "AGGREGATION") { + leading = child; + } else if (parent.subtype === "TRANSFORMATION") { + leading = parent; + } else { + leading = child; + } - if (parent.subtype === "AGGREGATION") { - leading = parent; - } else if (child.subtype === "AGGREGATION") { - leading = child; - } else if (parent.subtype === "TRANSFORMATION") { - leading = parent; - } else { - leading = child; + return { ...leading, masking: parent.masking || child.masking }; } - return { ...leading, masking: parent.masking || child.masking }; + // For INDIRECT transformations, prefer the child (more recent context) + return { ...child, masking: parent.masking || child.masking }; } +const transformationHasher = (value: Transformation): string => + `${value.type}-${value.subtype}-${value.masking ? "MASKED" : "UNMASKED"}`; + class TransformationSet extends HashSet { constructor(values?: readonly Transformation[]) { - super( - (value: Transformation) => - `${value.type}-${value.subtype}-${value.masking ? "MASKED" : "UNMASKED"}` - ); + super((value: Transformation) => transformationHasher(value)); if (values) { values.forEach((value) => this.add(value)); @@ -93,20 +158,32 @@ class TransformationSet extends HashSet { } } +/** + * Unified column lineage result that contains both direct and indirect transformations + * per column reference. Used internally to collect all transformations for columns. 
+ */ +type ColumnTransformations = Record; + export type Column = { name: string; }; export type Table = { - name: string; + name: string; // Format: tableName or schemaName.tableName columns: string[]; }; -export type Schema = { +export type Namespace = { namespace: string; - tables: Table[]; + tables?: Table[]; + defaultSchema?: string; }; +/** + * @deprecated Use Namespace instead + */ +export type Schema = Namespace; + export type InputColumn = { name: string; table?: string; @@ -116,9 +193,7 @@ export type SelectWithAlias = Select & { as?: string | null; }; -export function isColumn( - selectColumn: Select["columns"][number] -): selectColumn is AstColumn { +function isColumn(selectColumn: Select["columns"][number]): selectColumn is AstColumn { return ( typeof selectColumn === "object" && selectColumn !== null && @@ -129,12 +204,40 @@ export function isColumn( ); } +/** + * Checks if a column expression is a star (wildcard) expression like `*` or `table.*`. + */ +function isStar(column: AstColumn): boolean { + if (column.expr.type !== "column_ref") return false; + const colRef = column.expr as ColumnRefItem; + return colRef.column === "*" || (typeof colRef.column === "object" && colRef.column?.expr?.value === "*"); +} + +/** + * Extracts the table qualifier from a star expression. + * @returns The table alias/name (e.g., "u" from "u.*"), or null for plain "*" + * @calls isStar - To verify the column is a star expression + */ +function getStarTableQualifier(column: AstColumn): string | null { + if (!isStar(column)) return null; + const colRef = column.expr as ColumnRefItem; + if (!colRef.table) return null; + return typeof colRef.table === "string" ? colRef.table : (colRef.table as { type: string; value: string }).value; +} + +/** + * Formats a column reference into a string identifier. 
+ * @returns Formatted string like "table.column" or just "column" if no table qualifier + * @calls getInputColumnName - To extract the column name from the reference + */ export function formatInputColumnName(column: ColumnRefItem): string { - return `${column.table ? `${column.table}.` : ""}${getInputColumnName( - column - )}`; + return `${column.table ? `${column.table}.` : ""}${getInputColumnName(column)}`; } +/** + * Parses a formatted column name string back into its components. + * @returns InputColumn object with name and optional table properties + */ export function parseInputColumnName(column: string): InputColumn { const parts = column.split("."); const name = parts.pop() || ""; @@ -143,6 +246,47 @@ export function parseInputColumnName(column: string): InputColumn { return { name, table }; } +/** + * Parses a fully qualified table name into schema and table components. + * @returns Object with schema (empty string if not specified) and table name + */ +export function parseTableName(tableName: string): { schema: string; table: string } { + const parts = tableName.split("."); + if (parts.length === 1) { + return { schema: "", table: parts[0]! }; + } + return { schema: parts[0]!, table: parts.slice(1).join(".") }; +} + +/** + * Checks if an AST table reference matches a schema table definition. + * Handles schema resolution including default schema fallback. 
+ * @returns True if the AST table matches the schema table + * + * @calls parseTableName - To parse the schema table name into components + */ +function astTableMatchesSchemaTable(astTable: BaseFrom, schemaTableName: string, defaultSchema?: string): boolean { + const parsed = parseTableName(schemaTableName); + const astDb = astTable.db; + const effectiveAstSchema = astDb || defaultSchema; + + // Compare schema (or default schema if not specified) + if ( + (parsed.schema && !defaultSchema && !astDb) || + (parsed.schema && effectiveAstSchema && parsed.schema !== effectiveAstSchema) + ) { + return false; + } + + // Compare table name + return parsed.table === astTable.table; +} + +/** + * Extracts the column name from a ColumnRefItem AST node. + * Handles both simple string columns and complex expression columns. + * @returns The column name string, or null if it cannot be extracted + */ export function getInputColumnName(column: ColumnRefItem): string | null { return typeof column.column === "string" ? column.column @@ -151,6 +295,16 @@ export function getInputColumnName(column: ColumnRefItem): string | null { : null; } +/** + * Determines the output column name for a SELECT column. + * Uses the alias if present, otherwise extracts from the column reference. + * @returns The output column name (alias or original name), or null if undetermined + * @calls getInputColumnName - When no alias is present and expr is a column_ref + * @example + * // For "SELECT id AS user_id" returns "user_id" + * // For "SELECT id" returns "id" + * // For "SELECT 1 + 1" returns null (no determinable name) + */ export function getOutputColumnName(column: AstColumn): string | null { if (column.as) { return typeof column.as === "string" ? 
column.as : column.as.value; @@ -161,41 +315,181 @@ export function getOutputColumnName(column: AstColumn): string | null { return null; } -export function getDirectTransformationsFromExprValue( +/** + * Recursively extracts all column references from any SQL expression. + * This is the unified function for finding all columns referenced in expressions. + * @returns Array of ColumnRefItem objects found in the expression + * @calls extractColumnRefs - Recursively for nested expressions + */ +export function extractColumnRefs(expr: ExpressionValue | null | undefined): ColumnRefItem[] { + if (!expr) return []; + + const refs: ColumnRefItem[] = []; + + switch (expr.type) { + case "column_ref": + refs.push(expr as ColumnRefItem); + break; + + case "binary_expr": { + const binary = expr as Binary; + refs.push(...extractColumnRefs(binary.left)); + refs.push(...extractColumnRefs(binary.right)); + break; + } + + case "aggr_func": { + const aggr = expr as AggrFunc; + if (aggr.args?.expr) { + refs.push(...extractColumnRefs(aggr.args.expr)); + } + break; + } + + case "function": { + const func = expr as AstFunction; + if (func.args?.value) { + refs.push(...extractColumnRefs(func.args)); + } + break; + } + + case "case": { + const caseExpr = expr as Case; + if (caseExpr.args) { + for (const arg of caseExpr.args) { + if (arg.type === "when" && arg.cond) { + refs.push(...extractColumnRefs(arg.cond)); + } + if (arg.result) { + refs.push(...extractColumnRefs(arg.result)); + } + } + } + break; + } + + case "interval": { + const interval = expr as Interval; + if (interval.expr) { + refs.push(...extractColumnRefs(interval.expr)); + } + break; + } + + case "cast": { + const cast = expr as Cast; + if (cast.expr) { + refs.push(...extractColumnRefs(cast.expr)); + } + break; + } + + case "expr_list": { + const cast = expr as ExprList; + if (cast.value) { + for (const subExpr of cast.value) { + refs.push(...extractColumnRefs(subExpr)); + } + } + break; + } + + default: + 
console.warn("UNHANDLED EXPR TYPE IN EXTRACT COLUMN REFS:", expr.type, expr); + break; + } + + return refs; +} + +/** + * Type for OVER clause in window functions. + * Uses the library's AsWindowSpec type which matches the actual parser output. + */ +type OverClause = NamedWindowExpr & { + type: "window"; + as_window_specification: AsWindowSpec; +}; + +/** + * Extracts PARTITION BY and ORDER BY expressions from an OVER clause. + * @returns Array of expressions from PARTITION BY and ORDER BY clauses + */ +function extractWindowExpressionsFromOver(over: OverClause): ExpressionValue[] { + const expressions: ExpressionValue[] = []; + + // Handle string reference (named window) + if (typeof over.as_window_specification === "string") { + // Named window reference - no direct expressions to extract + return expressions; + } + + const windowSpec = over.as_window_specification.window_specification; + if (windowSpec) { + if (windowSpec.partitionby) { + expressions.push( + ...windowSpec.partitionby.flatMap((item) => (Array.isArray(item.expr) ? item.expr : [item.expr])), + ); + } + if (windowSpec.orderby) { + expressions.push(...windowSpec.orderby.map((item) => item.expr)); + } + } + + return expressions; +} + +/** + * Core unified function for extracting column transformations from any SQL expression. + * Returns a map of column names to their transformation sets. 
+ * + * This function handles: + * - column_ref: Returns DIRECT/IDENTITY transformation + * - binary_expr: Returns DIRECT/TRANSFORMATION for both operands + * - aggr_func: Returns DIRECT/AGGREGATION for arguments (masked when the name is in MASKING_AGG_FUNCTIONS), INDIRECT/WINDOW for OVER-clause columns + * - function: Returns DIRECT/TRANSFORMATION for arguments (masked when the name is in MASKING_FUNCTIONS), INDIRECT/WINDOW for OVER-clause columns + * - case: Returns INDIRECT/CONDITION for conditions, DIRECT/IDENTITY for results + * - cast/interval: Returns DIRECT/TRANSFORMATION + * + * @param expr - The expression to extract transformations from + * @param parentTransformation - Optional parent transformation to merge with child transformations + * @returns Record keyed by column name (e.g., "table.column") whose values are TransformationSets; empty for unhandled expression types + * + * @calls formatInputColumnName - To format column references as keys + * @calls mergeTransformations - To combine parent and child transformations + * @calls extractWindowExpressionsFromOver - For window function OVER clauses + * @calls extractColumnsWithUniformTransformation - For CASE condition columns + * @calls extractTransformationsFromExpr - Recursively for nested expressions + */ +function extractTransformationsFromExpr( expr: ExpressionValue, - parentTransformation?: Transformation -): Record { + parentTransformation?: Transformation, +): ColumnTransformations { switch (expr.type) { case "column_ref": { const inputColumnName = formatInputColumnName(expr as ColumnRefItem); return inputColumnName ? 
{ - [inputColumnName]: new TransformationSet([ - mergeTransformations(parentTransformation, DIRECT_IDENTITY), - ]), + [inputColumnName]: new TransformationSet([mergeTransformations(parentTransformation, DIRECT_IDENTITY)]), } : {}; } + case "binary_expr": { const { left, right } = expr as Binary; - const merged: Record = {}; + const merged: ColumnTransformations = {}; Object.entries( - getDirectTransformationsFromExprValue( - left, - mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION) - ) + extractTransformationsFromExpr(left, mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION)), ).forEach(([key, value]) => { merged[key] = value; }); Object.entries( - getDirectTransformationsFromExprValue( - right, - mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION) - ) + extractTransformationsFromExpr(right, mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION)), ).forEach(([key, value]) => { const prev = merged[key]; @@ -208,49 +502,449 @@ export function getDirectTransformationsFromExprValue( return merged; } + case "aggr_func": { const aggExpr = expr as AggrFunc; - return getDirectTransformationsFromExprValue( - aggExpr.args.expr, - mergeTransformations(parentTransformation, { - ...DIRECT_AGGREGATION, - masking: MASKING_AGG_FUNCTIONS.has(aggExpr.name), - }) - ); + const merged: ColumnTransformations = {}; + + // Extract lineage from aggregate function arguments + if (aggExpr.args?.expr) { + const argTransformations = extractTransformationsFromExpr( + aggExpr.args.expr, + mergeTransformations(parentTransformation, { + ...DIRECT_AGGREGATION, + masking: MASKING_AGG_FUNCTIONS.has(aggExpr.name), + }), + ); + Object.entries(argTransformations).forEach(([key, value]) => { + merged[key] = merged[key] ? 
merged[key].union(value) : value; + }); + } + + // For window functions (aggr_func with OVER clause), also extract columns from PARTITION BY/ORDER BY + if ("over" in aggExpr && aggExpr.over) { + const windowExprs = extractWindowExpressionsFromOver(aggExpr.over as OverClause); + for (const windowExpr of windowExprs) { + const windowTransformations = extractTransformationsFromExpr( + windowExpr, + mergeTransformations(parentTransformation, INDIRECT_WINDOW), + ); + Object.entries(windowTransformations).forEach(([key, value]) => { + merged[key] = merged[key] ? merged[key].union(value) : value; + }); + } + } + + return merged; } + case "function": { const funcExpr = expr as AstFunction; + const merged: ColumnTransformations = {}; + + // Extract lineage from function arguments + if (funcExpr.args?.value) { + for (const arg of funcExpr.args.value) { + const maskingExpr = funcExpr.name.name.at(-1)?.value.toUpperCase(); + const argTransformations = extractTransformationsFromExpr( + arg, + mergeTransformations(parentTransformation, { + ...DIRECT_TRANSFORMATION, + masking: !!maskingExpr && MASKING_FUNCTIONS.has(maskingExpr), + }), + ); + Object.entries(argTransformations).forEach(([key, value]) => { + merged[key] = merged[key] ? 
merged[key].union(value) : value; + }); + } + } - return ( - funcExpr.args?.value.reduce( - (acc, arg) => { - const argTransformations = getDirectTransformationsFromExprValue( - arg, - mergeTransformations(parentTransformation, { - ...DIRECT_TRANSFORMATION, - masking: - funcExpr.name.name.length > 0 && - MASKING_FUNCTIONS.has(funcExpr.name.name.at(-1)!.value), - }) - ); + // For window functions (function with OVER clause like RANK(), ROW_NUMBER()), + // extract columns from PARTITION BY/ORDER BY since these functions have no arguments + if ("over" in funcExpr && funcExpr.over) { + const windowExprs = extractWindowExpressionsFromOver(funcExpr.over as OverClause); + for (const windowExpr of windowExprs) { + const windowTransformations = extractTransformationsFromExpr( + windowExpr, + mergeTransformations(parentTransformation, INDIRECT_WINDOW), + ); + Object.entries(windowTransformations).forEach(([key, value]) => { + merged[key] = merged[key] ? merged[key].union(value) : value; + }); + } + } + + return merged; + } - Object.entries(argTransformations).forEach(([key, value]) => { - acc[key] = acc[key] ? acc[key].intersection(value) : value; + case "case": { + const caseExpr = expr as Case; + const merged: ColumnTransformations = {}; + + if (caseExpr.args) { + for (const arg of caseExpr.args) { + // Condition columns get INDIRECT/CONDITION (per-column indirect transformation) + if (arg.type === "when") { + const condTransformations = extractColumnsWithUniformTransformation(arg.cond, INDIRECT_CONDITION); + Object.entries(condTransformations).forEach(([key, value]) => { + merged[key] = merged[key] ? merged[key].union(value) : value; }); + } - return acc; - }, - {} as Record - ) ?? 
{} - ); + // Result columns get DIRECT/IDENTITY (value is taken from CASE result) + if (arg.result) { + const resultTransformations = extractTransformationsFromExpr( + arg.result, + mergeTransformations(parentTransformation, DIRECT_IDENTITY), + ); + Object.entries(resultTransformations).forEach(([key, value]) => { + merged[key] = merged[key] ? merged[key].union(value) : value; + }); + } + } + } + + return merged; } + + case "cast": { + const castExpr = expr as Cast; + if (castExpr.expr) { + return extractTransformationsFromExpr( + castExpr.expr, + mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION), + ); + } + return {}; + } + + case "interval": { + const intervalExpr = expr as Interval; + if (intervalExpr.expr) { + return extractTransformationsFromExpr( + intervalExpr.expr, + mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION), + ); + } + return {}; + } + default: return {}; } } -export function getTableExpressionsFromSelect(select: Select): { +/** + * Extracts column references and applies a uniform transformation type to all. + * Simpler than extractTransformationsFromExpr - doesn't analyze expression structure. + * + * Used for dataset-level indirect transformations where all columns in an expression + * receive the same transformation type (e.g., all columns in WHERE get FILTER). 
+ * @returns Map of column names to TransformationSet containing the single transformation + * + * @calls extractColumnRefs - To find all column references in the expression + * @calls formatInputColumnName - To format column references as keys + */ +function extractColumnsWithUniformTransformation( + expr: ExpressionValue | null | undefined, + transformation: Transformation, +): ColumnTransformations { + if (!expr) return {}; + + const columnRefs = extractColumnRefs(expr); + const result: ColumnTransformations = {}; + + for (const ref of columnRefs) { + const columnName = formatInputColumnName(ref); + if (columnName) { + result[columnName] = new TransformationSet([transformation]); + } + } + + return result; +} + +/** + * Resolves a column reference to an InputField by finding the matching table in namespace. + * This is the core helper for converting AST column refs to OpenLineage InputField format. + * @returns InputField object if table found, null otherwise + * + * @calls getInputColumnName - To extract the column name + * @calls astTableMatchesSchemaTable - To match AST table to namespace table + */ +function resolveColumnRefToInputField( + ref: ColumnRefItem, + regularTables: BaseFrom[], + namespace: Namespace, + transformation: Transformation, +): InputField | null { + const columnName = getInputColumnName(ref); + const tableName = ref.table; + + if (!columnName) return null; + if (!namespace.tables) return null; + + const table = regularTables.find( + (t) => + (!tableName || tableName === t.table || tableName === t.as) && + namespace.tables!.some( + (s) => astTableMatchesSchemaTable(t, s.name, namespace.defaultSchema) && s.columns.includes(columnName), + ), + ); + + if (!table) return null; + + const schemaTable = namespace.tables.find((s) => astTableMatchesSchemaTable(table, s.name, namespace.defaultSchema)); + if (!schemaTable) return null; + + return { + namespace: namespace.namespace, + name: schemaTable.name, + field: columnName, + transformations: 
[transformation], + }; +} + +/** + * Extracts InputFields from all column references in an expression. + * Used by dataset-level lineage extraction (WHERE, HAVING, GROUP BY, ORDER BY, etc.). + * @returns Array of InputField objects for columns that could be resolved + * + * @calls extractColumnRefs - To find all column references + * @calls resolveColumnRefToInputField - To convert each ref to InputField + */ +function extractInputFieldsFromExpression( + expr: ExpressionValue | null | undefined, + regularTables: BaseFrom[], + namespace: Namespace, + transformation: Transformation, +): InputField[] { + if (!expr) return []; + + const columnRefs = extractColumnRefs(expr); + const inputFields: InputField[] = []; + + for (const ref of columnRefs) { + const inputField = resolveColumnRefToInputField(ref, regularTables, namespace, transformation); + if (inputField) { + inputFields.push(inputField); + } + } + + return inputFields; +} + +/** + * Extracts JOIN lineage from the FROM clause (ON and USING conditions). + * All columns in JOIN conditions receive INDIRECT/JOIN transformation. + * @returns Array of InputFields for columns used in JOIN conditions + * + * @calls getTableExpressionsFromSelect - To get regular tables from FROM + * @calls extractInputFieldsFromExpression - For ON clause columns + * @calls astTableMatchesSchemaTable - For USING clause table matching + */ +function getJoinLineage(select: Select, namespace: Namespace): InputField[] { + if (!select.from) return []; + if (!namespace.tables) return []; + + const fromItems = Array.isArray(select.from) ? 
select.from : [select.from]; + const { regularTables } = getTableExpressionsFromSelect(select); + const inputFields: InputField[] = []; + + for (const item of fromItems) { + // Handle ON clause + if ("on" in item && item.on) { + inputFields.push(...extractInputFieldsFromExpression(item.on, regularTables, namespace, INDIRECT_JOIN)); + } + + // Handle USING clause - columns exist in multiple tables + if ("using" in item && Array.isArray(item.using)) { + for (const usingCol of item.using) { + // Find tables that match the FROM clause and have this column + for (const schemaTable of namespace.tables) { + const matchingFromTable = regularTables.find((t) => + astTableMatchesSchemaTable(t, schemaTable.name, namespace.defaultSchema), + ); + if (matchingFromTable && schemaTable.columns.includes(usingCol)) { + inputFields.push({ + namespace: namespace.namespace, + name: schemaTable.name, + field: usingCol, + transformations: [INDIRECT_JOIN], + }); + } + } + } + } + } + + return inputFields; +} + +/** + * Extracts WHERE clause lineage. + * All columns in WHERE conditions receive INDIRECT/FILTER transformation. + * @returns Array of InputFields for columns used in WHERE clause + * + * @calls getTableExpressionsFromSelect - To get regular tables from FROM + * @calls extractInputFieldsFromExpression - To extract columns with FILTER transformation + */ +function getFilterLineage(select: Select, namespace: Namespace): InputField[] { + if (!select.where) return []; + + const { regularTables } = getTableExpressionsFromSelect(select); + return extractInputFieldsFromExpression(select.where, regularTables, namespace, INDIRECT_FILTER); +} + +/** + * Extracts GROUP BY clause lineage. + * All columns in GROUP BY receive INDIRECT/GROUP_BY transformation. 
+ * @returns Array of InputFields for columns used in GROUP BY clause + * + * @calls normalizeGroupByItems - To handle different GROUP BY AST formats + * @calls getTableExpressionsFromSelect - To get regular tables from FROM + * @calls extractInputFieldsFromExpression - To extract columns with GROUP_BY transformation + */ +function getGroupByLineage(select: Select, namespace: Namespace): InputField[] { + if (!select.groupby) return []; + + // Normalize GROUP BY to array format + const groupByItems = normalizeGroupByItems(select.groupby); + const { regularTables } = getTableExpressionsFromSelect(select); + + return groupByItems.flatMap((expr) => + extractInputFieldsFromExpression(expr, regularTables, namespace, INDIRECT_GROUP_BY), + ); +} + +/** + * Normalizes GROUP BY clause to a consistent array format. + * Handles different AST representations from various SQL parsers. + * @returns Array of ExpressionValue for each GROUP BY item + */ +function normalizeGroupByItems(groupby: Select["groupby"]): ExpressionValue[] { + if (Array.isArray(groupby)) { + return groupby; + } + if (typeof groupby === "object" && groupby && "columns" in groupby && Array.isArray(groupby.columns)) { + return groupby.columns; + } + return [groupby as unknown as ExpressionValue]; +} + +/** + * Builds a map of output column aliases to their source expressions. + * Used to resolve ORDER BY alias references to their underlying columns. 
+ * @returns Map where keys are output aliases, values are the source expressions + * + * @calls isColumn - To filter valid columns + * @calls getOutputColumnName - To get the alias/output name + */ +function buildAliasToExpressionMap(select: Select): Map { + const aliasMap = new Map(); + + if (!select.columns || typeof select.columns === "string") { + return aliasMap; + } + + for (const col of select.columns) { + if (!isColumn(col)) continue; + + const outputName = getOutputColumnName(col); + if (outputName && col.expr) { + aliasMap.set(outputName, col.expr); + } + } + + return aliasMap; +} + +/** + * Resolves an ORDER BY expression to its underlying column reference. + * If the expression is an alias (unqualified column_ref matching an alias), returns the aliased expression. + * @returns The resolved expression (original if not an alias, or the aliased expression) + * + * @calls getInputColumnName - To extract column name from column_ref + */ +function resolveOrderByExpression(expr: ExpressionValue, aliasMap: Map): ExpressionValue { + // If it's a column_ref, check if it's an alias + if (expr.type === "column_ref") { + const colRef = expr as ColumnRefItem; + const columnName = getInputColumnName(colRef); + + // Only resolve if there's no table qualifier (aliases don't have table qualifiers) + if (columnName && !colRef.table && aliasMap.has(columnName)) { + return aliasMap.get(columnName)!; + } + } + + return expr; +} + +/** + * Extracts ORDER BY clause lineage with alias resolution. + * All columns in ORDER BY receive INDIRECT/SORT transformation. + * Resolves aliases to their underlying column expressions. 
+ * @returns Array of InputFields for columns used in ORDER BY clause + * + * @calls getTableExpressionsFromSelect - To get regular tables from FROM + * @calls buildAliasToExpressionMap - To resolve aliases + * @calls resolveOrderByExpression - To resolve each ORDER BY item + * @calls extractInputFieldsFromExpression - To extract columns with SORT transformation + */ +function getOrderByLineage(select: Select, namespace: Namespace): InputField[] { + if (!select.orderby) return []; + + const orderByItems = Array.isArray(select.orderby) ? select.orderby : [select.orderby]; + const { regularTables } = getTableExpressionsFromSelect(select); + + // Build alias map to resolve ORDER BY alias references + const aliasMap = buildAliasToExpressionMap(select); + + const inputFields: InputField[] = []; + + for (const item of orderByItems) { + const expr = ("expr" in item ? item.expr : item) as ExpressionValue; + + // Resolve the expression - if it's an alias, get the underlying expression + const resolvedExpr = resolveOrderByExpression(expr, aliasMap); + + // Extract input fields from the resolved expression + inputFields.push(...extractInputFieldsFromExpression(resolvedExpr, regularTables, namespace, INDIRECT_SORT)); + } + + return inputFields; +} + +/** + * Extracts HAVING clause lineage. + * All columns in HAVING conditions receive INDIRECT/FILTER transformation. 
+ * @returns Array of InputFields for columns used in HAVING clause + * + * @calls getTableExpressionsFromSelect - To get regular tables from FROM + * @calls extractInputFieldsFromExpression - To extract columns with FILTER transformation + */ +function getHavingLineage(select: Select, namespace: Namespace): InputField[] { + if (!select.having) return []; + + const { regularTables } = getTableExpressionsFromSelect(select); + return extractInputFieldsFromExpression( + select.having as unknown as ExpressionValue, + regularTables, + namespace, + INDIRECT_FILTER, + ); +} + +/** + * Extracts and categorizes table expressions from a SELECT statement. + * Separates regular tables (physical tables) from select tables (CTEs, subqueries). + * @returns Object with: + * - regularTables: Physical tables from namespace + * - selectTables: CTEs and subqueries (as SelectWithAlias) + */ +function getTableExpressionsFromSelect(select: Select): { regularTables: BaseFrom[]; selectTables: SelectWithAlias[]; } { @@ -277,9 +971,9 @@ export function getTableExpressionsFromSelect(select: Select): { if (select.from) { const fromItems = Array.isArray(select.from) ? select.from : [select.from]; - fromItems.forEach((item) => { + for (const item of fromItems) { if ("table" in item) { - // might mention with statemnt in our select + // might mention with statement in our select const matchingWith = withByNames.get(item.table); if (matchingWith) { @@ -297,16 +991,20 @@ export function getTableExpressionsFromSelect(select: Select): { with: previousWiths, // propagate previous withs }); } - }); + } } return { regularTables, selectTables }; } -export function mergeTransformationSet( - parent: TransformationSet, - child: TransformationSet -): TransformationSet { +/** + * Merges two TransformationSets by combining each parent transformation with each child. + * Creates a Cartesian product of transformations, merging each pair. 
+ * @returns New TransformationSet with all merged combinations + * + * @calls mergeTransformations - To merge each parent-child pair + */ +function mergeTransformationSet(parent: TransformationSet, child: TransformationSet): TransformationSet { const merged = new TransformationSet(); parent.forEach((tp) => { @@ -318,27 +1016,161 @@ export function mergeTransformationSet( return merged; } +/** + * Expands a star (wildcard) column into individual column entries. + * Handles both "*" (all tables) and "table.*" (specific table) patterns. + * @returns Array of AstColumn entries for each expanded column + * + * @calls isStar - To verify it's a star column + * @calls getStarTableQualifier - To get table qualifier if present + * @calls getTableExpressionsFromSelect - To get tables from FROM clause + * @calls astTableMatchesSchemaTable - To match tables to namespace + * @calls expandStarColumn - Recursively for nested star expressions in subqueries + * @calls getOutputColumnName - To get column names from subquery columns + */ +function expandStarColumn(column: AstColumn, select: Select, namespace: Namespace): AstColumn[] { + if (!isStar(column)) return [column]; + if (!namespace.tables) return [column]; + + const tableQualifier = getStarTableQualifier(column); + const { regularTables, selectTables } = getTableExpressionsFromSelect(select); + const expandedColumns: AstColumn[] = []; + + // Process regular tables (from namespace) + for (const fromTable of regularTables) { + // If there's a table qualifier, skip tables that don't match + if (tableQualifier && tableQualifier !== fromTable.table && tableQualifier !== fromTable.as) { + continue; + } + + const schemaTable = namespace.tables!.find((t) => + astTableMatchesSchemaTable(fromTable, t.name, namespace.defaultSchema), + ); + if (!schemaTable) continue; + + for (const colName of schemaTable.columns) { + expandedColumns.push({ + expr: { + type: "column_ref", + table: fromTable.as ?? 
fromTable.table, + column: colName, + } as ExpressionValue, + as: colName, + }); + } + } + + // Process subquery/CTE tables + for (const selectTable of selectTables) { + // If there's a table qualifier, skip tables that don't match + if (tableQualifier && tableQualifier !== selectTable.as) { + continue; + } + + // Get columns from the subquery/CTE + if (selectTable.columns && typeof selectTable.columns !== "string") { + for (const subCol of selectTable.columns) { + if (!isColumn(subCol)) continue; + + // Handle star in subquery recursively + if (isStar(subCol)) { + const expandedSubCols = expandStarColumn(subCol, selectTable, namespace); + for (const expandedSubCol of expandedSubCols) { + const outputName = getOutputColumnName(expandedSubCol); + if (outputName) { + expandedColumns.push({ + expr: { + type: "column_ref", + table: selectTable.as ?? null, + column: outputName, + } as unknown as ExpressionValue, + as: outputName, + }); + } + } + } else { + const outputName = getOutputColumnName(subCol); + if (outputName) { + expandedColumns.push({ + expr: { + type: "column_ref", + table: selectTable.as ?? null, + column: outputName, + } as unknown as ExpressionValue, + as: outputName, + }); + } + } + } + } + } + + return expandedColumns; +} + +/** + * Type guard to check if a SELECT has set operations (UNION, INTERSECT, EXCEPT). + * @returns True if the SELECT has a set_op property with a value + */ +function hasSetOperation(select: Select): select is Select { + return "set_op" in select && select.set_op != null; +} + +/** + * Collects all SELECT statements in a set operation chain. + * Follows the _next chain for UNION/INTERSECT/EXCEPT operations. 
+ * @returns Array of SELECT statements, first element is the base select + * + * @calls hasSetOperation - To check if there are more SELECTs in the chain + */ +function getSetOperationSelects(select: Select): Select[] { + const selects: Select[] = [select]; + + if (hasSetOperation(select)) { + let current: Select | undefined | null = select._next; + while (current) { + selects.push(current); + current = hasSetOperation(current) ? current._next : null; + } + } + + return selects; +} + +/** + * Computes field-level lineage for a single output column. + * Traces the column back to its source columns in the namespace tables. + * + * Process: + * 1. Extracts transformations from the column expression + * 2. Merges with any parent transformations (for nested queries) + * 3. Resolves each column reference to InputField via regular tables or recursion into CTEs/subqueries + * @returns Array of InputField objects representing source columns with transformations + * + * @calls extractTransformationsFromExpr - To get column transformations from expression + * @calls mergeTransformationSet - To combine with parent transformations + * @calls getTableExpressionsFromSelect - To separate regular tables from CTEs/subqueries + * @calls parseInputColumnName - To parse column identifiers + * @calls astTableMatchesSchemaTable - To match columns to namespace tables + * @calls getColumnLineage - Recursively for columns from CTEs/subqueries + */ export function getColumnLineage( select: Select, - schema: Schema, + namespace: Namespace, column: AstColumn, - transformations?: TransformationSet + transformations?: TransformationSet, ): InputField[] { - let transformationsByColumns = getDirectTransformationsFromExprValue( - column.expr - ); + let transformationsByColumns = extractTransformationsFromExpr(column.expr); if (transformations) { transformationsByColumns = Object.entries(transformationsByColumns).reduce( (acc, [columnName, childTransformations]) => { - acc[columnName] = 
mergeTransformationSet( - transformations, - childTransformations - ); - - return acc; + return { + ...acc, + [columnName]: mergeTransformationSet(transformations, childTransformations), + }; }, - {} as Record + {} as Record, ); } @@ -346,26 +1178,26 @@ export function getColumnLineage( const inputFields: InputField[] = []; - for (const [inputColumnName, transformations] of Object.entries( - transformationsByColumns - )) { + if (!namespace.tables) return inputFields; + + for (const [inputColumnName, transformations] of Object.entries(transformationsByColumns)) { const inputColumn = parseInputColumnName(inputColumnName); const table = regularTables.find( (t) => - (!inputColumn.table || - inputColumn.table === t.table || - inputColumn.table === t.as) && - schema.tables.some( - (s) => - s.name === t.table && s.columns.some((c) => c === inputColumn.name) - ) + (!inputColumn.table || inputColumn.table === t.table || inputColumn.table === t.as) && + namespace.tables!.some( + (s) => astTableMatchesSchemaTable(t, s.name, namespace.defaultSchema) && s.columns.includes(inputColumn.name), + ), ); if (table) { + const schemaTable = namespace.tables.find((s) => + astTableMatchesSchemaTable(table, s.name, namespace.defaultSchema), + ); inputFields.push({ - namespace: schema.namespace, - name: table.table, + namespace: namespace.namespace, + name: schemaTable!.name, field: inputColumn.name, transformations: Array.from(transformations), }); @@ -375,9 +1207,7 @@ export function getColumnLineage( continue; } - const matchingColumn = selectTable.columns.find( - (c) => getOutputColumnName(c) === inputColumn.name - ); + const matchingColumn = selectTable.columns.find((c) => getOutputColumnName(c) === inputColumn.name); let nextColumn: AstColumn; @@ -386,7 +1216,7 @@ export function getColumnLineage( } else { nextColumn = column; - // stop propogating table of column as it is only in the context of the select + // stop propagating table of column as it is only in the context of the 
select if (nextColumn.expr.type === "column_ref") { const expr = nextColumn.expr as ColumnRefItem; @@ -394,9 +1224,7 @@ export function getColumnLineage( } } - inputFields.push( - ...getColumnLineage(selectTable, schema, nextColumn, transformations) - ); + inputFields.push(...getColumnLineage(selectTable, namespace, nextColumn, transformations)); } } } @@ -404,28 +1232,197 @@ export function getColumnLineage( return inputFields; } -export function getLineage( - select: Select, - schema: Schema -): ColumnLineageDatasetFacet["fields"] { - let unkownCount = 0; +/** + * Extracts dataset-level indirect lineage for a single SELECT statement. + * Collects all columns that affect the entire result set through indirect transformations. + * @returns Array of InputFields for all indirect lineage columns + * + * @calls getJoinLineage, getFilterLineage, getGroupByLineage, getOrderByLineage, + * getWindowLineage, getHavingLineage - To collect each type of indirect lineage + * @calls getTableExpressionsFromSelect - To find CTEs/subqueries + * @calls getDatasetLineage - Recursively for CTEs/subqueries + */ +function getDatasetLineageForSingleSelect(select: Select, namespace: Namespace): InputField[] { + const allIndirectFields: InputField[] = []; + + // Collect all indirect lineage from the outermost SELECT + allIndirectFields.push(...getJoinLineage(select, namespace)); + allIndirectFields.push(...getFilterLineage(select, namespace)); + allIndirectFields.push(...getGroupByLineage(select, namespace)); + allIndirectFields.push(...getOrderByLineage(select, namespace)); + allIndirectFields.push(...getHavingLineage(select, namespace)); + + // Recursively collect dataset lineage from CTEs and subqueries + const { selectTables } = getTableExpressionsFromSelect(select); + for (const selectTable of selectTables) { + allIndirectFields.push(...getDatasetLineage(selectTable, namespace)); + } - return select.columns.reduce((acc, column) => { - if (!isColumn(column)) { - return acc; + return 
allIndirectFields; +} + +/** + * Computes all dataset-level indirect lineage for a SELECT, including set operations. + * Returns columns that affect the entire result set (not mapped to specific output columns). + * @returns Deduplicated array of InputFields for dataset-level lineage + * + * @calls getSetOperationSelects - To collect all SELECTs in set operation chain + * @calls getDatasetLineageForSingleSelect - To get lineage for each SELECT + */ +export function getDatasetLineage(select: Select, namespace: Namespace): InputField[] { + const allIndirectFields: InputField[] = []; + + // Handle set operations (UNION, INTERSECT, EXCEPT) + const setOpSelects = getSetOperationSelects(select); + setOpSelects.forEach((setOpSelect) => { + allIndirectFields.push(...getDatasetLineageForSingleSelect(setOpSelect, namespace)); + }); + + // Deduplicate by creating a map keyed by namespace.table.field.type.subtype + const deduped = new Map(); + for (const field of allIndirectFields) { + const transformation = field.transformations?.[0]; + const key = `${field.namespace}.${field.name}.${field.field}.${transformation?.type}.${transformation?.subtype}`; + if (!deduped.has(key)) { + deduped.set(key, field); } + } - let outputFieldName = getOutputColumnName(column); + return Array.from(deduped.values()); +} + +/** + * Computes field-level lineage for a single SELECT (without set operations). + * Maps each output column to its source columns with transformations. 
+ * @returns Object mapping output column names to their FieldLineage (inputFields array) + * + * @calls isColumn - To filter valid columns + * @calls isStar - To detect wildcard columns + * @calls expandStarColumn - To expand * into individual columns + * @calls getOutputColumnName - To determine output column name + * @calls getColumnLineage - To compute lineage for each column + */ +function getLineageForSingleSelect(select: Select, namespace: Namespace): ColumnLineageDatasetFacet["fields"] { + let unknownCount = 0; + + // Handle the case where columns is the string "*" (entire result is star) + if (typeof select.columns === "string" && select.columns === "*") { + return {}; + } + + return select.columns.reduce( + (acc, column) => { + if (!isColumn(column)) { + return acc; + } + + // Expand star columns into individual columns + if (isStar(column)) { + const expandedColumns = expandStarColumn(column, select, namespace); + for (const expandedCol of expandedColumns) { + let outputFieldName = getOutputColumnName(expandedCol); + if (!outputFieldName) { + outputFieldName = `unknown_${unknownCount++}`; + } + acc[outputFieldName] = { inputFields: getColumnLineage(select, namespace, expandedCol) }; + } - if (!outputFieldName) { - outputFieldName = `unknown_${unkownCount++}`; + return acc; + } + + let outputFieldName = getOutputColumnName(column); + + if (!outputFieldName) { + outputFieldName = `unknown_${unknownCount++}`; + } + + acc[outputFieldName] = { inputFields: getColumnLineage(select, namespace, column) }; + return acc; + }, + {} as ColumnLineageDatasetFacet["fields"], + ); +} + +/** + * Merges and deduplicates InputField arrays. + * Used when combining lineage from multiple sources (e.g., UNION branches). 
+ * @returns Combined array with duplicates removed + * + * @calls transformationHasher - To create unique keys for transformations + */ +function mergeInputFields(existing: InputField[], incoming: InputField[]): InputField[] { + const hashset = new HashSet((value: InputField) => { + const transformationsString = + value.transformations?.map((t) => transformationHasher(t as Transformation)).join("-") ?? ""; + return `${value.namespace}-${value.name}-${value.field}-${transformationsString}`; + }); + existing.forEach((field) => hashset.add(field)); + incoming.forEach((field) => hashset.add(field)); + + return [...hashset.values()]; +} + +/** + * Main field-level lineage extraction function. + * Returns a map of output columns to their source columns with transformations. + * + * Handles set operations (UNION, INTERSECT, EXCEPT) by: + * 1. Using the first SELECT's column names as output names + * 2. Merging lineage from subsequent SELECTs by column position + * @returns Object mapping output column names to FieldLineage objects + * + * @calls getSetOperationSelects - To collect all SELECTs in chain + * @calls getLineageForSingleSelect - To compute lineage for each SELECT + * @calls mergeInputFields - To combine lineage from set operation branches + */ +export function getLineage(select: Select, namespace: Namespace): ColumnLineageDatasetFacet["fields"] { + // Get all SELECT statements in the set operation chain + const setOpSelects = getSetOperationSelects(select); + + // Get lineage from the first SELECT (determines output column names) + const baseLineage = getLineageForSingleSelect(setOpSelects[0]!, namespace); + + // If no set operations, return base lineage + if (setOpSelects.length === 1) { + return baseLineage; + } + + // Merge lineages from subsequent SELECTs in the set operation + // Output columns are matched by position, not name + const baseColumns = Object.keys(baseLineage); + + for (let i = 1; i < setOpSelects.length; i++) { + const nextSelect = 
setOpSelects[i]!; + const nextLineage = getLineageForSingleSelect(nextSelect, namespace); + const nextColumns = Object.keys(nextLineage); + + // Match columns by position and merge input fields + for (let j = 0; j < baseColumns.length && j < nextColumns.length; j++) { + const baseCol = baseColumns[j]!; + const nextCol = nextColumns[j]!; + + if (baseLineage[baseCol] && nextLineage[nextCol]) { + baseLineage[baseCol]!.inputFields = mergeInputFields( + baseLineage[baseCol]!.inputFields, + nextLineage[nextCol]!.inputFields, + ); + } } + } - return { - ...acc, - [outputFieldName]: { - inputFields: getColumnLineage(select, schema, column), - }, - }; - }, {}); + return baseLineage; +} + +/** + * Extended lineage extraction returning both field-level and dataset-level lineage. + * Follows the OpenLineage ColumnLineageDatasetFacet specification. + */ +export function getExtendedLineage( + select: Select, + namespace: Namespace, +): Pick { + return { + fields: getLineage(select, namespace), + dataset: getDatasetLineage(select, namespace), + }; } diff --git a/packages/lineage/test/extendedLineage.test.ts b/packages/lineage/test/extendedLineage.test.ts new file mode 100644 index 0000000..4918b32 --- /dev/null +++ b/packages/lineage/test/extendedLineage.test.ts @@ -0,0 +1,3068 @@ +import { describe, test, expect } from "bun:test"; +import { Parser } from "node-sql-parser"; +import type { AST, Select } from "node-sql-parser"; +import { + getExtendedLineage, + type Namespace, + type Table, + INDIRECT_JOIN, + INDIRECT_FILTER, + INDIRECT_GROUP_BY, + INDIRECT_SORT, + INDIRECT_WINDOW, + INDIRECT_CONDITION, + DIRECT_IDENTITY, + DIRECT_TRANSFORMATION, + DIRECT_AGGREGATION, +} from "../src/index.js"; + +const DEFAULT_SCHEMA = "public"; + +const parser = new Parser(); + +function createNamespace(namespace: string, tables: Table[], defaultSchema: string = DEFAULT_SCHEMA): Namespace { + return { namespace, tables, defaultSchema }; +} + +function createTable(name: string, columns: string[]): 
Table { + return { name, columns }; +} + +function parseSQL(sql: string, database: "trino" | "postgresql" = "trino"): AST { + const result = parser.astify(sql, { database }); + const ast = Array.isArray(result) ? result[0] : result; + if (!ast) throw new Error("Failed to parse SQL"); + return ast; +} + +function parseSQLPostgres(sql: string): AST { + const result = parser.astify(sql, { database: "postgresql" }); + const ast = Array.isArray(result) ? result[0] : result; + if (!ast) throw new Error("Failed to parse SQL"); + return ast; +} + +// ============================================================================= +// EXACT ASSERTION HELPERS +// ============================================================================= + +/** Sort input fields for consistent comparison */ +function sortInputFields(fields: ReturnType["fields"]) { + const sorted: typeof fields = {}; + for (const [key, value] of Object.entries(fields)) { + sorted[key] = { + inputFields: [...value.inputFields].sort((a, b) => { + const aKey = `${a.namespace}.${a.name}.${a.field}`; + const bKey = `${b.namespace}.${b.name}.${b.field}`; + return aKey.localeCompare(bKey); + }), + }; + } + return sorted; +} + +/** Sort dataset fields for consistent comparison */ +function sortDataset(dataset: ReturnType["dataset"]) { + if (!dataset) return []; + return [...dataset].sort((a, b) => { + const aKey = `${a.namespace}.${a.name}.${a.field}.${a.transformations?.[0]?.subtype}`; + const bKey = `${b.namespace}.${b.name}.${b.field}.${b.transformations?.[0]?.subtype}`; + return aKey.localeCompare(bKey); + }); +} + +// ============================================================================= +// SECTION 1: FIELD-LEVEL LINEAGE - DIRECT TRANSFORMATIONS +// ============================================================================= + +describe("Field-Level Lineage: DIRECT/IDENTITY", () => { + test("single column select", () => { + const sql = `SELECT id FROM users`; + const ast = parseSQL(sql); + const schema = 
createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + id: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + }); + expect(result.dataset).toEqual([]); + }); + + test("multiple columns select", () => { + const sql = `SELECT id, name, email FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "email"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + id: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + email: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "email", transformations: [DIRECT_IDENTITY] }, + ], + }, + }); + expect(result.dataset).toEqual([]); + }); + + test("column with alias", () => { + const sql = `SELECT id as user_id, name as user_name FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + user_id: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + user_name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }); + }); + + test("table-qualified column", () => { + const sql = `SELECT u.id, u.name FROM users u`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", 
[createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + id: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }); + }); +}); + +describe("Field-Level Lineage: DIRECT/TRANSFORMATION", () => { + test("function transformation - UPPER", () => { + const sql = `SELECT UPPER(name) as upper_name FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["name"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + upper_name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_TRANSFORMATION] }, + ], + }, + }); + }); + + test("function transformation - CONCAT", () => { + const sql = `SELECT CONCAT(first_name, last_name) as full_name FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["first_name", "last_name"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + full_name: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.users`, + field: "first_name", + transformations: [DIRECT_TRANSFORMATION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.users`, + field: "last_name", + transformations: [DIRECT_TRANSFORMATION], + }, + ], + }, + }), + ); + }); + + test("arithmetic transformation - addition", () => { + const sql = `SELECT price + tax as total FROM products`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.products`, ["price", "tax"])]); + const 
result = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + total: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "price", + transformations: [DIRECT_TRANSFORMATION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "tax", + transformations: [DIRECT_TRANSFORMATION], + }, + ], + }, + }), + ); + }); + + test("arithmetic transformation - multiplication", () => { + const sql = `SELECT quantity * unit_price as line_total FROM order_items`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.order_items`, ["quantity", "unit_price"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + line_total: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [DIRECT_TRANSFORMATION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "unit_price", + transformations: [DIRECT_TRANSFORMATION], + }, + ], + }, + }), + ); + }); + + test("CAST transformation", () => { + const sql = `SELECT CAST(price AS VARCHAR) as price_str FROM products`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.products`, ["price"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + price_str: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "price", + transformations: [DIRECT_TRANSFORMATION], + }, + ], + }, + }); + }); + + test("nested function transformation", () => { + const sql = `SELECT LOWER(TRIM(name)) as clean_name FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["name"])]); + const result = getExtendedLineage(ast as Select, schema); + + 
expect(result.fields).toEqual({ + clean_name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_TRANSFORMATION] }, + ], + }, + }); + }); +}); + +describe("Field-Level Lineage: DIRECT/AGGREGATION", () => { + test("SUM aggregation", () => { + const sql = `SELECT SUM(amount) as total FROM transactions`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.transactions`, ["amount"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + total: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.transactions`, + field: "amount", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + }); + }); + + test("AVG aggregation", () => { + const sql = `SELECT AVG(salary) as avg_salary FROM employees`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["salary"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + avg_salary: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "salary", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + }); + }); + + test("MIN aggregation", () => { + const sql = `SELECT MIN(price) as min_price FROM products`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.products`, ["price"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + min_price: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "price", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + }); + }); + + test("MAX aggregation", () => { + const sql = `SELECT MAX(created_at) as latest FROM events`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.events`, 
["created_at"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + latest: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.events`, + field: "created_at", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + }); + }); + + test("COUNT with column - has masking", () => { + const sql = `SELECT COUNT(id) as count FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + count: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.users`, + field: "id", + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + }, + ], + }, + }); + }); + + test("COUNT DISTINCT - has masking", () => { + const sql = `SELECT COUNT(DISTINCT user_id) as unique_users FROM orders`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + unique_users: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, + field: "user_id", + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + }, + ], + }, + }); + }); + + test("aggregation with expression inside", () => { + const sql = `SELECT SUM(quantity * price) as revenue FROM order_items`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.order_items`, ["quantity", "price"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + revenue: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "price", + transformations: [DIRECT_AGGREGATION], + }, + { + namespace: "ns", + name: 
`${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + }), + ); + }); +}); + +describe("Field-Level Lineage: Masking Functions", () => { + test("MD5 masking", () => { + const sql = `SELECT MD5(email) as hashed_email FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["email"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + hashed_email: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.users`, + field: "email", + transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + }, + ], + }, + }); + }); + + test("SHA256 masking", () => { + const sql = `SELECT SHA256(ssn) as hashed_ssn FROM employees`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["ssn"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + hashed_ssn: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "ssn", + transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + }, + ], + }, + }); + }); + + test("MASK function", () => { + const sql = `SELECT MASK(phone) as masked_phone FROM contacts`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.contacts`, ["phone"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + masked_phone: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.contacts`, + field: "phone", + transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + }, + ], + }, + }); + }); +}); + +describe("Field-Level Lineage: CASE Expressions", () => { + test("simple CASE WHEN", () => { + const sql = ` + SELECT + CASE WHEN status = 'active' THEN 'Active' ELSE 'Inactive' 
END as status_label + FROM users + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["status"])]); + const result = getExtendedLineage(ast as Select, schema); + + // CASE WHEN condition column gets INDIRECT/CONDITION + expect(result.fields.status_label).toBeDefined(); + expect(result.fields.status_label?.inputFields).toEqual([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.users`, + field: "status", + transformations: [INDIRECT_CONDITION], + }, + ]); + }); + + test("CASE with column in result", () => { + const sql = ` + SELECT + CASE WHEN is_premium THEN discount_rate ELSE 0 END as applied_discount + FROM customers + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.customers`, ["is_premium", "discount_rate"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + applied_discount: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.customers`, + field: "is_premium", + transformations: [INDIRECT_CONDITION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.customers`, + field: "discount_rate", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }), + ); + }); + + test("CASE with multiple conditions and results", () => { + const sql = ` + SELECT + CASE + WHEN age < 18 THEN minor_price + WHEN age < 65 THEN adult_price + ELSE senior_price + END as ticket_price + FROM visitors + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.visitors`, ["age", "minor_price", "adult_price", "senior_price"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + ticket_price: { + inputFields: [ + { + field: "age", + name: `${DEFAULT_SCHEMA}.visitors`, + namespace: "ns", + transformations: [INDIRECT_CONDITION], + }, + { + 
field: "minor_price", + name: `${DEFAULT_SCHEMA}.visitors`, + namespace: "ns", + transformations: [DIRECT_IDENTITY], + }, + { + field: "adult_price", + name: `${DEFAULT_SCHEMA}.visitors`, + namespace: "ns", + transformations: [DIRECT_IDENTITY], + }, + { + field: "senior_price", + name: `${DEFAULT_SCHEMA}.visitors`, + namespace: "ns", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }), + ); + }); +}); + +// ============================================================================= +// SECTION 2: DATASET-LEVEL LINEAGE - INDIRECT TRANSFORMATIONS +// ============================================================================= + +describe("Dataset-Level Lineage: INDIRECT/JOIN", () => { + test("simple INNER JOIN", () => { + const sql = ` + SELECT u.id, o.total + FROM users u + JOIN orders o ON u.id = o.user_id + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + ]), + ); + }); + + test("LEFT JOIN", () => { + const sql = ` + SELECT u.id, o.total + FROM users u + LEFT JOIN orders o ON u.id = o.user_id + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", 
transformations: [INDIRECT_JOIN] }, + ]), + ); + }); + + test("RIGHT JOIN", () => { + const sql = ` + SELECT u.id, o.total + FROM users u + RIGHT JOIN orders o ON u.id = o.user_id + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + ]), + ); + }); + + test("FULL OUTER JOIN", () => { + const sql = ` + SELECT u.id, o.total + FROM users u + FULL OUTER JOIN orders o ON u.id = o.user_id + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + ]), + ); + }); + + test("JOIN with compound ON condition (AND)", () => { + const sql = ` + SELECT u.id + FROM users u + JOIN orders o ON u.id = o.user_id AND u.region = o.region + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "region"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "region"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: 
[INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "region", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "region", transformations: [INDIRECT_JOIN] }, + ]), + ); + }); + + test("multiple JOINs - three tables", () => { + const sql = ` + SELECT u.id, o.total, p.name + FROM users u + JOIN orders o ON u.id = o.user_id + JOIN products p ON o.product_id = p.id + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "product_id", "total"]), + createTable(`${DEFAULT_SCHEMA}.products`, ["id", "name"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "product_id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "id", transformations: [INDIRECT_JOIN] }, + ]), + ); + }); + + test("CROSS JOIN - no dataset lineage", () => { + const sql = ` + SELECT u.id, p.name + FROM users u + CROSS JOIN products p + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id"]), + createTable(`${DEFAULT_SCHEMA}.products`, ["name"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([]); + }); + + test("self JOIN", () => { + const sql = ` + SELECT e.name, m.name as manager_name + FROM employees e + JOIN employees m ON e.manager_id = m.id + `; + const ast = parseSQL(sql); + 
const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "name", "manager_id"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "manager_id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "id", transformations: [INDIRECT_JOIN] }, + ]), + ); + }); +}); + +describe("Dataset-Level Lineage: INDIRECT/FILTER (WHERE)", () => { + test("simple WHERE equality", () => { + const sql = `SELECT id FROM users WHERE status = 'active'`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "status"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + ]); + }); + + test("WHERE with AND", () => { + const sql = `SELECT id FROM users WHERE status = 'active' AND age > 18`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "status", "age"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "age", transformations: [INDIRECT_FILTER] }, + ]), + ); + }); + + test("WHERE with OR", () => { + const sql = `SELECT id FROM users WHERE status = 'active' OR status = 'pending'`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "status"])]); + const result = getExtendedLineage(ast as Select, schema); + + // Same column referenced twice, should be deduplicated + 
expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + ]); + }); + + test("WHERE with IN clause", () => { + const sql = `SELECT id FROM users WHERE country IN ('US', 'UK', 'CA')`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "country"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_FILTER] }, + ]); + }); + + test("WHERE with IN subquery", () => { + const sql = ` + SELECT id, name + FROM users + WHERE id IN (SELECT user_id FROM orders WHERE total > 100) + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "total", transformations: [INDIRECT_FILTER] }, + ]), + ); + }); + + test("WHERE with BETWEEN", () => { + const sql = `SELECT id FROM users WHERE age BETWEEN 18 AND 65`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "age"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "age", transformations: [INDIRECT_FILTER] }, + ]); + }); + + test("WHERE with LIKE", () => { + const sql = `SELECT id FROM users WHERE name LIKE 'John%'`; + const ast = 
parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [INDIRECT_FILTER] }, + ]); + }); + + test("WHERE with IS NULL", () => { + const sql = `SELECT id FROM users WHERE email IS NULL`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "email"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "email", transformations: [INDIRECT_FILTER] }, + ]); + }); + + test("WHERE with IS NOT NULL", () => { + const sql = `SELECT id FROM users WHERE email IS NOT NULL`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "email"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "email", transformations: [INDIRECT_FILTER] }, + ]); + }); + + test("WHERE with nested complex conditions", () => { + const sql = `SELECT id FROM users WHERE (status = 'active' AND age > 18) OR (country = 'US' AND verified = true)`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "status", "age", "country", "verified"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "age", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: 
[INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "verified", transformations: [INDIRECT_FILTER] }, + ]), + ); + }); +}); + +describe("Dataset-Level Lineage: INDIRECT/GROUP_BY", () => { + test("simple GROUP BY single column", () => { + const sql = `SELECT country, COUNT(*) FROM users GROUP BY country`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["country"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_GROUP_BY] }, + ]); + }); + + test("GROUP BY multiple columns", () => { + const sql = `SELECT country, city, COUNT(*) FROM users GROUP BY country, city`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["country", "city"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_GROUP_BY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "city", transformations: [INDIRECT_GROUP_BY] }, + ]), + ); + }); + + test("GROUP BY with expression", () => { + const sql = `SELECT DATE_TRUNC('month', created_at) as month, COUNT(*) FROM events GROUP BY DATE_TRUNC('month', created_at)`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.events`, ["id", "created_at"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.events`, field: "created_at", transformations: [INDIRECT_GROUP_BY] }, + ]); + }); +}); + +describe("Dataset-Level Lineage: INDIRECT/SORT (ORDER BY)", () => { + test("simple ORDER BY single column", () => { + const sql = `SELECT id, name FROM 
users ORDER BY created_at`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "created_at"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "created_at", transformations: [INDIRECT_SORT] }, + ]); + }); + + test("ORDER BY multiple columns", () => { + const sql = `SELECT id, name FROM users ORDER BY country ASC, name DESC`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "country"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_SORT] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [INDIRECT_SORT] }, + ]), + ); + }); + + test("ORDER BY alias resolves to base columns", () => { + const sql = `SELECT quantity * price as total FROM order_items ORDER BY total`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.order_items`, ["quantity", "price"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "price", transformations: [INDIRECT_SORT] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "quantity", transformations: [INDIRECT_SORT] }, + ]), + ); + }); + + test("ORDER BY with NULLS LAST", () => { + const sql = `SELECT id, name FROM users ORDER BY email NULLS LAST`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "email"])]); + const result = getExtendedLineage(ast as Select, schema); + + 
expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "email", transformations: [INDIRECT_SORT] }, + ]); + }); +}); + +describe("Dataset-Level Lineage: INDIRECT/FILTER (HAVING)", () => { + test("HAVING with aggregation column reference", () => { + const sql = ` + SELECT department, SUM(salary) as total + FROM employees + GROUP BY department + HAVING SUM(salary) > 100000 + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["department", "salary"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "department", + transformations: [INDIRECT_GROUP_BY], + }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [INDIRECT_FILTER] }, + ]), + ); + }); + + test("HAVING with multiple conditions", () => { + const sql = ` + SELECT department, AVG(age), COUNT(id) + FROM employees + GROUP BY department + HAVING AVG(age) > 30 AND COUNT(id) > 5 + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "department", "age"])]); + const result = getExtendedLineage(ast as Select, schema); + + const filterLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "FILTER"); + expect(filterLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "age", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "id", + transformations: [INDIRECT_FILTER], + }); + }); +}); + +// ============================================================================= +// SECTION 3: COMBINED CLAUSES +// ============================================================================= + +describe("Combined 
Clauses: JOIN + WHERE", () => { + test("JOIN with WHERE on both tables", () => { + const sql = ` + SELECT u.id, o.total + FROM users u + JOIN orders o ON u.id = o.user_id + WHERE u.status = 'active' AND o.total > 100 + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "status"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "total", transformations: [INDIRECT_FILTER] }, + ]), + ); + }); +}); + +describe("Combined Clauses: WHERE + GROUP BY + HAVING", () => { + test("full aggregation query", () => { + const sql = ` + SELECT department, COUNT(*) as cnt, AVG(salary) as avg_sal + FROM employees + WHERE status = 'active' + GROUP BY department + HAVING COUNT(*) > 5 + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "department", "salary", "status"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "status", transformations: [INDIRECT_FILTER] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "department", + transformations: [INDIRECT_GROUP_BY], + }, + // HAVING COUNT(*) doesn't add field lineage since COUNT(*) doesn't reference a column + ]), + ); + }); +}); + +describe("Combined Clauses: GROUP BY + ORDER BY", () => { + 
test("aggregation with sorting", () => { + const sql = ` + SELECT country, COUNT(*) as cnt + FROM users + GROUP BY country + ORDER BY cnt DESC + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["country"])]); + const result = getExtendedLineage(ast as Select, schema); + + // ORDER BY cnt references alias, which resolves to COUNT(*) - no additional lineage + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_GROUP_BY] }, + ]); + }); +}); + +describe("Combined Clauses: WINDOW + WHERE + ORDER BY", () => { + test("window function with filter and sort", () => { + const sql = ` + SELECT + id, + SUM(amount) OVER (PARTITION BY category ORDER BY created_at) as running_total + FROM transactions + WHERE status = 'completed' + ORDER BY created_at + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.transactions`, ["id", "amount", "category", "created_at", "status"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.transactions`, + field: "status", + transformations: [INDIRECT_FILTER], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.transactions`, + field: "created_at", + transformations: [INDIRECT_SORT], + }, + ]), + ); + }); +}); + +// ============================================================================= +// SECTION 4: CTEs (WITH clause) +// ============================================================================= + +describe("CTEs: Basic WITH clause", () => { + test("simple CTE propagates field lineage", () => { + const sql = ` + WITH active AS ( + SELECT id, name FROM users WHERE status = 'active' + ) + SELECT id, name FROM active + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", 
[createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "status"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + id: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }); + + // Dataset lineage should include the WHERE from the CTE + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + ]); + }); + + test("CTE with aggregation", () => { + const sql = ` + WITH summary AS ( + SELECT department, SUM(salary) as total_salary + FROM employees + GROUP BY department + ) + SELECT department, total_salary FROM summary + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["department", "salary"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + department: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "department", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + total_salary: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "salary", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + }); + + // GROUP BY from CTE should be in dataset lineage + expect(result.dataset).toEqual([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "department", + transformations: [INDIRECT_GROUP_BY], + }, + ]); + }); +}); + +describe("CTEs: Multiple CTEs", () => { + test("two CTEs with JOIN", () => { + const sql = ` + WITH + users_cte AS ( + SELECT id, name FROM users WHERE status = 'active' + ), + orders_cte AS ( + SELECT user_id, SUM(total) as total_spent FROM orders GROUP BY user_id + ) + SELECT 
u.name, o.total_spent + FROM users_cte u + JOIN orders_cte o ON u.id = o.user_id + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "status"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + total_spent: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "total", transformations: [DIRECT_AGGREGATION] }, + ], + }, + }); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_GROUP_BY] }, + ]), + ); + }); +}); + +describe("CTEs: Nested transformations through CTEs", () => { + test("transformation propagation through nested CTEs", () => { + const sql = ` + WITH + base AS ( + SELECT id, quantity * price as revenue FROM sales WHERE sale_date >= '2024-01-01' + ), + summary AS ( + SELECT SUM(revenue) as total_revenue FROM base + ) + SELECT total_revenue FROM summary + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.sales`, ["id", "quantity", "price", "sale_date"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + // total_revenue -> SUM(revenue) -> quantity * price + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + total_revenue: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "price", transformations: [DIRECT_AGGREGATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + 
}), + ); + + // Dataset lineage includes WHERE from base CTE + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "sale_date", transformations: [INDIRECT_FILTER] }, + ]); + }); +}); + +// ============================================================================= +// SECTION 5: SUBQUERIES +// ============================================================================= + +describe("Subqueries: FROM clause subquery", () => { + test("simple subquery in FROM", () => { + const sql = ` + SELECT sub.country, sub.cnt + FROM ( + SELECT country, COUNT(*) as cnt + FROM users + WHERE status = 'active' + GROUP BY country + ) sub + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["country", "status"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields.country?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [DIRECT_IDENTITY] }, + ]); + + // Dataset lineage from subquery + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_GROUP_BY] }, + ]), + ); + }); +}); + +// ============================================================================= +// SECTION 6: SET OPERATIONS (UNION, INTERSECT, EXCEPT) +// ============================================================================= + +describe("Set Operations: UNION", () => { + test("simple UNION combines lineage from both queries", () => { + const sql = ` + SELECT id, name FROM users + UNION + SELECT id, name FROM customers + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name", "email"]), + createTable("customers", ["id", "name", "address"]), + 
]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + name: { + inputFields: [ + { namespace: "postgres", name: "users", field: "name", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); + }); + + test("UNION with WHERE clauses combines field and dataset lineage", () => { + const sql = ` + SELECT id, name FROM users WHERE status = 'active' + UNION + SELECT id, name FROM customers WHERE verified = true + `; + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "status"]), + createTable(`${DEFAULT_SCHEMA}.customers`, ["id", "name", "verified"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + // Field lineage combines both sources + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.customers`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.customers`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); + + // Dataset lineage includes filters from both + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: 
`${DEFAULT_SCHEMA}.customers`, field: "verified", transformations: [INDIRECT_FILTER] }, + ]), + ); + }); + + test("UNION ALL combines lineage from both queries", () => { + const sql = ` + SELECT id FROM users + UNION ALL + SELECT id FROM orders + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "product"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "orders", field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); + }); + + test("UNION ALL with GROUP BY on both sides", () => { + const sql = ` + SELECT department FROM employees GROUP BY department + UNION ALL + SELECT department FROM contractors GROUP BY department + `; + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "department"]), + createTable(`${DEFAULT_SCHEMA}.contractors`, ["id", "department"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "department", + transformations: [INDIRECT_GROUP_BY], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.contractors`, + field: "department", + transformations: [INDIRECT_GROUP_BY], + }, + ]), + ); + }); + + test("chained UNION combines lineage from all queries", () => { + const sql = ` + SELECT id, name FROM users + UNION + SELECT id, name FROM customers + UNION + SELECT id, name FROM vendors + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name"]), + createTable("customers", ["id", "name"]), 
+ createTable("vendors", ["id", "name"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "vendors", field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + name: { + inputFields: [ + { namespace: "postgres", name: "users", field: "name", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "name", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "vendors", field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); + }); + + test("triple UNION with WHERE clauses", () => { + const sql = ` + SELECT id FROM users WHERE region = 'US' + UNION + SELECT id FROM customers WHERE region = 'EU' + UNION + SELECT id FROM vendors WHERE region = 'APAC' + `; + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "region"]), + createTable(`${DEFAULT_SCHEMA}.customers`, ["id", "region"]), + createTable(`${DEFAULT_SCHEMA}.vendors`, ["id", "region"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "region", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.customers`, field: "region", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.vendors`, field: "region", transformations: [INDIRECT_FILTER] }, + ]), + ); + }); + + test("UNION with aliases preserves first SELECT column names", () => { + const sql = ` + SELECT id AS user_id, name AS full_name FROM users + UNION + SELECT customer_id, 
customer_name FROM customers + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name"]), + createTable("customers", ["customer_id", "customer_name"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + // Output columns should be named according to the first SELECT + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + user_id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "customer_id", transformations: [DIRECT_IDENTITY] }, + ], + }, + full_name: { + inputFields: [ + { namespace: "postgres", name: "users", field: "name", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "customer_name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); + }); + + test("UNION with transformations", () => { + const sql = ` + SELECT UPPER(name) AS name FROM users + UNION + SELECT LOWER(name) AS name FROM customers + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name"]), + createTable("customers", ["id", "name"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + name: { + inputFields: [ + { namespace: "postgres", name: "users", field: "name", transformations: [DIRECT_TRANSFORMATION] }, + { namespace: "postgres", name: "customers", field: "name", transformations: [DIRECT_TRANSFORMATION] }, + ], + }, + }), + ); + }); + + test("UNION with aggregation", () => { + const sql = ` + SELECT SUM(amount) AS total FROM sales + UNION + SELECT SUM(amount) AS total FROM refunds + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("sales", ["id", "amount"]), + createTable("refunds", ["id", "amount"]), + ]); + + 
const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + total: { + inputFields: [ + { namespace: "postgres", name: "sales", field: "amount", transformations: [DIRECT_AGGREGATION] }, + { namespace: "postgres", name: "refunds", field: "amount", transformations: [DIRECT_AGGREGATION] }, + ], + }, + }), + ); + }); + + test("UNION with different column expressions", () => { + const sql = ` + SELECT id, first_name || ' ' || last_name AS full_name FROM users + UNION + SELECT id, company_name FROM customers + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "first_name", "last_name"]), + createTable("customers", ["id", "company_name"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + full_name: { + inputFields: [ + { namespace: "postgres", name: "users", field: "first_name", transformations: [DIRECT_TRANSFORMATION] }, + { namespace: "postgres", name: "users", field: "last_name", transformations: [DIRECT_TRANSFORMATION] }, + { namespace: "postgres", name: "customers", field: "company_name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); + }); + + test("UNION with subqueries", () => { + const sql = ` + SELECT id FROM (SELECT id FROM users WHERE active = true) AS active_users + UNION + SELECT id FROM (SELECT id FROM customers WHERE verified = true) AS verified_customers + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "active"]), + createTable("customers", ["id", "verified"]), + ]); + + const lineage = getExtendedLineage(ast as Select, 
schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "id", transformations: [DIRECT_IDENTITY] }, + // TODO - add support for dataset lineage from subquery WHERE clauses + // { namespace: "postgres", name: "users", field: "active", transformations: [INDIRECT_FILTER] }, + // { namespace: "postgres", name: "customers", field: "verified", transformations: [INDIRECT_FILTER] }, + ], + }, + }), + ); + }); + + test("UNION deduplicates identical input fields", () => { + const sql = ` + SELECT id FROM users + UNION + SELECT id FROM users + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [createTable("users", ["id", "name"])]); + + const lineage = getExtendedLineage(ast as Select, schema); + + // Same table appears in both SELECTs, but should be deduplicated + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "users", + namespace: "postgres", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); +}); + +describe("Set Operations: INTERSECT", () => { + test("simple INTERSECT combines lineage from both queries", () => { + const sql = ` + SELECT id FROM users + INTERSECT + SELECT id FROM premium_users + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name"]), + createTable("premium_users", ["id", "tier"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "premium_users", field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); + }); + + test("INTERSECT with ORDER BY on both 
sides", () => { + const sql = ` + SELECT id FROM active_users ORDER BY created_at + INTERSECT + SELECT id FROM premium_users ORDER BY upgraded_at + `; + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.active_users`, ["id", "created_at"]), + createTable(`${DEFAULT_SCHEMA}.premium_users`, ["id", "upgraded_at"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.active_users`, + field: "created_at", + transformations: [INDIRECT_SORT], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.premium_users`, + field: "upgraded_at", + transformations: [INDIRECT_SORT], + }, + ]), + ); + }); +}); + +describe("Set Operations: EXCEPT", () => { + test("simple EXCEPT combines lineage from both queries", () => { + const sql = ` + SELECT id FROM users + EXCEPT + SELECT id FROM banned_users + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name"]), + createTable("banned_users", ["id", "reason"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "banned_users", field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); + }); + + test("EXCEPT with WHERE on both sides", () => { + const sql = ` + SELECT id FROM users WHERE active = true + EXCEPT + SELECT id FROM banned_users WHERE ban_date > '2024-01-01' + `; + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "active"]), + createTable(`${DEFAULT_SCHEMA}.banned_users`, ["id", "ban_date"]), + ]); + const result = getExtendedLineage(ast as 
Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "active", transformations: [INDIRECT_FILTER] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.banned_users`, + field: "ban_date", + transformations: [INDIRECT_FILTER], + }, + ]), + ); + }); +}); + +// ============================================================================= +// SECTION 7: STAR (*) EXPANSION +// ============================================================================= + +describe("Star Expansion", () => { + test("SELECT * expands to all columns", () => { + const sql = `SELECT * FROM users`; + const ast = parseSQL(sql); + const usersTable = createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "email"]); + const schema = createNamespace("ns", [usersTable]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual( + usersTable.columns.reduce( + (acc, col) => { + return { + ...acc, + [col]: { + inputFields: [{ namespace: "ns", name: usersTable.name, field: col, transformations: [DIRECT_IDENTITY] }], + }, + }; + }, + {} as Record, + ), + ); + }); + + test("table.* with multiple tables", () => { + const sql = `SELECT u.*, o.total FROM users u JOIN orders o ON u.id = o.user_id`; + const ast = parseSQL(sql); + const usersTable = createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "email"]); + const schema = createNamespace("ns", [usersTable, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + ...usersTable.columns.reduce( + (acc, col) => { + return { + ...acc, + [col]: { + inputFields: [{ namespace: "ns", name: usersTable.name, field: col, transformations: [DIRECT_IDENTITY] }], + }, + }; + }, + {} as Record, + ), + total: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "total", transformations: [DIRECT_IDENTITY] }, + ], + 
}, + }); + }); +}); + +// ============================================================================= +// SECTION 8: EDGE CASES +// ============================================================================= + +describe("Edge Cases", () => { + test("same column in multiple contexts", () => { + const sql = ` + SELECT status, COUNT(*) as cnt + FROM users + WHERE status != 'deleted' + GROUP BY status + ORDER BY status + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["status"])]); + const result = getExtendedLineage(ast as Select, schema); + + // Field lineage + expect(result.fields).toEqual({ + status: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [DIRECT_IDENTITY] }, + ], + }, + cnt: { + inputFields: [], + }, + }); + + // Dataset lineage should have all three subtypes + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_GROUP_BY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_SORT] }, + ]), + ); + }); + + test("column name collision from different tables", () => { + const sql = ` + SELECT u.name as user_name, p.name as product_name + FROM users u + JOIN products p ON u.favorite_product = p.id + WHERE u.name LIKE 'A%' AND p.name LIKE 'Widget%' + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["name", "favorite_product"]), + createTable(`${DEFAULT_SCHEMA}.products`, ["id", "name"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + user_name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: 
[DIRECT_IDENTITY] }, + ], + }, + product_name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "id", transformations: [INDIRECT_JOIN] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.users`, + field: "favorite_product", + transformations: [INDIRECT_JOIN], + }, + ]), + ); + }); + + test("deduplication of repeated column in same clause", () => { + const sql = `SELECT id FROM users WHERE status = 'active' AND status != 'banned'`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "status"])]); + const result = getExtendedLineage(ast as Select, schema); + + // status appears twice but should be deduplicated + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + ]); + }); + + test("empty dataset lineage when no indirect clauses", () => { + const sql = `SELECT id, UPPER(name) as upper_name FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([]); + }); +}); + +// ============================================================================= +// SECTION 9: COMPREHENSIVE MEGA-QUERIES +// ============================================================================= + +describe("Comprehensive: Everything Together", () => { + test("mega query with all features", () => { + const sql = ` + WITH + 
filtered_sales AS ( + SELECT + product_id, + store_id, + quantity, + unit_price, + quantity * unit_price as line_total + FROM sales + WHERE sale_date >= '2024-01-01' + AND status = 'completed' + ), + store_totals AS ( + SELECT + store_id, + SUM(line_total) as total_revenue, + COUNT(product_id) as product_count, + AVG(unit_price) as avg_price + FROM filtered_sales + GROUP BY store_id + HAVING SUM(line_total) > 1000 + ) + SELECT + s.name as store_name, + s.region, + st.total_revenue, + st.product_count, + st.avg_price, + CASE + WHEN st.total_revenue > 100000 THEN 'Premium' + WHEN st.total_revenue > 50000 THEN 'Standard' + ELSE 'Basic' + END as tier, + RANK() OVER (PARTITION BY s.region ORDER BY st.total_revenue DESC) as region_rank, + MD5(s.name) as store_hash + FROM store_totals st + JOIN stores s ON st.store_id = s.id + WHERE s.active = true + ORDER BY s.region, st.total_revenue DESC + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.sales`, [ + "id", + "product_id", + "store_id", + "quantity", + "unit_price", + "sale_date", + "status", + ]), + createTable(`${DEFAULT_SCHEMA}.stores`, ["id", "name", "region", "active"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + // ========== FIELD-LEVEL LINEAGE ========== + + expect(result.fields).toEqual({ + store_name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + region: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "region", transformations: [DIRECT_IDENTITY] }, + ], + }, + total_revenue: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "unit_price", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + product_count: { + inputFields: [ + { + namespace: 
"ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "product_id", + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + }, + ], + }, + avg_price: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "unit_price", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + tier: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "quantity", + transformations: [INDIRECT_CONDITION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "unit_price", + transformations: [INDIRECT_CONDITION], + }, + ], + }, + region_rank: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "region", transformations: [INDIRECT_WINDOW] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "quantity", + transformations: [INDIRECT_WINDOW], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "unit_price", + transformations: [INDIRECT_WINDOW], + }, + ], + }, + store_hash: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.stores`, + field: "name", + transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + }, + ], + }, + }); + + // ========== DATASET-LEVEL LINEAGE ========== + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + // FILTER from filtered_sales CTE (WHERE sale_date >= ... AND status = ...) 
+ { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "sale_date", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "status", transformations: [INDIRECT_FILTER] }, + // FILTER from main query (WHERE s.active = true) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "active", transformations: [INDIRECT_FILTER] }, + // FILTER from store_totals CTE (HAVING SUM(line_total) > 1000) + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "quantity", transformations: [INDIRECT_FILTER] }, + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "unit_price", transformations: [INDIRECT_FILTER] }, + // JOIN from main query (st.store_id = s.id) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "id", transformations: [INDIRECT_JOIN] }, + // GROUP BY from store_totals CTE (GROUP BY store_id) + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "store_id", transformations: [INDIRECT_GROUP_BY] }, + // SORT from main query (ORDER BY s.region, st.total_revenue DESC) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "region", transformations: [INDIRECT_SORT] }, + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "quantity", transformations: [INDIRECT_SORT] }, + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "unit_price", transformations: [INDIRECT_SORT] }, + ]), + ); + + // Verify we have all 8 output fields + expect(Object.keys(result.fields).length).toBe(8); + }); + + test("e-commerce analytics mega query", () => { + const sql = ` + SELECT + c.name as category_name, + p.name as product_name, + SUM(oi.quantity) as total_qty, + SUM(oi.quantity * oi.price) as revenue, + AVG(oi.price) as avg_price, + COUNT(DISTINCT o.customer_id) as unique_customers, + ROW_NUMBER() OVER (PARTITION BY c.id ORDER BY SUM(oi.quantity * oi.price) DESC) as category_rank + FROM categories c + JOIN products p ON c.id = p.category_id + JOIN order_items oi ON p.id = 
oi.product_id + JOIN orders o ON oi.order_id = o.id + WHERE o.status = 'completed' + AND o.created_at >= '2024-01-01' + AND p.active = true + GROUP BY c.id, c.name, p.id, p.name + HAVING SUM(oi.quantity) > 10 + ORDER BY c.name, revenue DESC + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.categories`, ["id", "name"]), + createTable(`${DEFAULT_SCHEMA}.products`, ["id", "name", "category_id", "active"]), + createTable(`${DEFAULT_SCHEMA}.order_items`, ["id", "order_id", "product_id", "quantity", "price"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["id", "customer_id", "status", "created_at"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + // ========== FIELD-LEVEL LINEAGE ========== + + // Verify we have all 7 output fields + expect(Object.keys(result.fields).length).toBe(7); + + expect(result.fields.category_name?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.categories`, field: "name", transformations: [DIRECT_IDENTITY] }, + ]); + + expect(result.fields.product_name?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [DIRECT_IDENTITY] }, + ]); + + expect(result.fields.total_qty?.inputFields).toEqual([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }, + ]); + + expect(sortInputFields({ revenue: result.fields.revenue! 
})).toEqual( + sortInputFields({ + revenue: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "price", + transformations: [DIRECT_AGGREGATION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + }), + ); + + expect(result.fields.avg_price?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "price", transformations: [DIRECT_AGGREGATION] }, + ]); + + expect(result.fields.unique_customers?.inputFields).toEqual([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, + field: "customer_id", + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + }, + ]); + + expect(sortInputFields({ category_rank: result.fields.category_rank! })).toEqual( + sortInputFields({ + category_rank: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.categories`, field: "id", transformations: [INDIRECT_WINDOW] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "price", + transformations: [INDIRECT_WINDOW], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [INDIRECT_WINDOW], + }, + ], + }, + }), + ); + + // ========== DATASET-LEVEL LINEAGE ========== + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + // JOIN lineage - 3 joins with 2 columns each = 6 total + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, + field: "id", + transformations: [INDIRECT_JOIN], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "category_id", + transformations: [INDIRECT_JOIN], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "id", + transformations: [INDIRECT_JOIN], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "product_id", + transformations: [INDIRECT_JOIN], + }, + { + namespace: "ns", + name: 
`${DEFAULT_SCHEMA}.order_items`, + field: "order_id", + transformations: [INDIRECT_JOIN], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, + field: "id", + transformations: [INDIRECT_JOIN], + }, + // FILTER lineage - status, created_at, active + HAVING quantity + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, + field: "status", + transformations: [INDIRECT_FILTER], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, + field: "created_at", + transformations: [INDIRECT_FILTER], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "active", + transformations: [INDIRECT_FILTER], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [INDIRECT_FILTER], + }, + // GROUP BY lineage - c.id, c.name, p.id, p.name + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, + field: "id", + transformations: [INDIRECT_GROUP_BY], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, + field: "name", + transformations: [INDIRECT_GROUP_BY], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "id", + transformations: [INDIRECT_GROUP_BY], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "name", + transformations: [INDIRECT_GROUP_BY], + }, + // SORT lineage - c.name, revenue (quantity * price) + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, + field: "name", + transformations: [INDIRECT_SORT], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "price", + transformations: [INDIRECT_SORT], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [INDIRECT_SORT], + }, + ]), + ); + }); + + test("HR analytics mega query with complex CTEs and CASE", () => { + const sql = ` + WITH + active_employees AS ( + SELECT + id, + department_id, + salary, + hire_date, + performance_score + FROM employees + WHERE status = 'active' AND 
terminated_at IS NULL + ), + dept_stats AS ( + SELECT + department_id, + COUNT(id) as headcount, + SUM(salary) as total_compensation, + AVG(salary) as avg_salary, + MIN(hire_date) as oldest_hire, + AVG(performance_score) as avg_performance + FROM active_employees + GROUP BY department_id + HAVING COUNT(id) >= 3 + ) + SELECT + d.name as department_name, + d.location, + ds.headcount, + ds.total_compensation, + ds.avg_salary, + ds.avg_performance, + CASE + WHEN ds.avg_performance >= 4.5 THEN 'Exceptional' + WHEN ds.avg_performance >= 3.5 THEN 'Good' + WHEN ds.avg_performance >= 2.5 THEN 'Average' + ELSE 'Needs Improvement' + END as performance_tier, + DENSE_RANK() OVER (ORDER BY ds.total_compensation DESC) as compensation_rank, + ROW_NUMBER() OVER (PARTITION BY d.location ORDER BY ds.headcount DESC) as location_rank, + SHA256(d.name) as dept_hash + FROM dept_stats ds + JOIN departments d ON ds.department_id = d.id + WHERE d.active = true + ORDER BY d.location, ds.total_compensation DESC + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.employees`, [ + "id", + "department_id", + "salary", + "hire_date", + "performance_score", + "status", + "terminated_at", + ]), + createTable(`${DEFAULT_SCHEMA}.departments`, ["id", "name", "location", "active"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + // ========== FIELD-LEVEL LINEAGE ========== + + expect(result.fields).toEqual({ + department_name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + location: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.departments`, + field: "location", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + headcount: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "id", + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + }, + 
], + }, + total_compensation: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "salary", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + avg_salary: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "salary", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + avg_performance: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "performance_score", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + performance_tier: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "performance_score", + transformations: [INDIRECT_CONDITION], + }, + ], + }, + compensation_rank: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "salary", + transformations: [INDIRECT_WINDOW], + }, + ], + }, + location_rank: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.departments`, + field: "location", + transformations: [INDIRECT_WINDOW], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "id", + transformations: [{ type: "INDIRECT", subtype: "WINDOW", masking: true }], + }, + ], + }, + dept_hash: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.departments`, + field: "name", + transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + }, + ], + }, + }); + + // ========== DATASET-LEVEL LINEAGE ========== + + // FILTER from active_employees CTE + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + // FILTER from active_employees CTE (WHERE status = 'active' AND terminated_at IS NULL) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "status", transformations: [INDIRECT_FILTER] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "terminated_at", + transformations: [INDIRECT_FILTER], + }, + // FILTER from main query (WHERE d.active = true) + { 
namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "active", transformations: [INDIRECT_FILTER] }, + // FILTER from dept_stats CTE (HAVING COUNT(id) >= 3) TODO + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "id", transformations: [INDIRECT_FILTER] }, + // JOIN from main query (ds.department_id = d.id) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "id", transformations: [INDIRECT_JOIN] }, + // GROUP BY from dept_stats CTE (GROUP BY department_id) TODO + // { + // namespace: "ns", + // name: `${DEFAULT_SCHEMA}.employees`, + // field: "department_id", + // transformations: [INDIRECT_GROUP_BY], + // }, + // SORT from main query (ORDER BY d.location, ds.total_compensation DESC) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "location", transformations: [INDIRECT_SORT] }, + // TODO - fix + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [INDIRECT_SORT] }, + ]), + ); + }); + + test("UNION with CTEs and window functions mega query", () => { + const sql = ` + WITH + us_sales AS ( + SELECT + product_id, + SUM(amount) as total_amount, + COUNT(*) as sale_count + FROM sales + WHERE region = 'US' AND sale_date >= '2024-01-01' + GROUP BY product_id + ), + eu_sales AS ( + SELECT + product_id, + SUM(amount) as total_amount, + COUNT(*) as sale_count + FROM sales + WHERE region = 'EU' AND sale_date >= '2024-01-01' + GROUP BY product_id + ) + SELECT + 'US' as region, + p.name as product_name, + us.total_amount, + us.sale_count, + RANK() OVER (ORDER BY us.total_amount DESC) as revenue_rank + FROM us_sales us + JOIN products p ON us.product_id = p.id + WHERE p.active = true + + UNION ALL + + SELECT + 'EU' as region, + p.name as product_name, + eu.total_amount, + eu.sale_count, + RANK() OVER (ORDER BY eu.total_amount DESC) as revenue_rank + FROM eu_sales eu + JOIN products p ON eu.product_id = p.id + WHERE p.active = true + `; + const ast = parseSQL(sql, "postgresql"); + 
const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.sales`, ["id", "product_id", "amount", "region", "sale_date"]), + createTable(`${DEFAULT_SCHEMA}.products`, ["id", "name", "active"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + // Field lineage - product_name comes from products.name + // Both UNION parts join the same products table, so we get one unique entry per field + // (the mergeInputFields deduplicates by full field identity including transformations) + expect(result.fields).toEqual({ + region: { + inputFields: [], + }, + product_name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + total_amount: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "amount", transformations: [DIRECT_AGGREGATION] }, + ], + }, + sale_count: { + inputFields: [], + }, + revenue_rank: { + inputFields: [], + }, + }); + + // Dataset lineage from both CTEs and both UNION parts + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + // FILTER from both CTEs: WHERE region = '...' AND sale_date >= '...' 
+ { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "region", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "sale_date", transformations: [INDIRECT_FILTER] }, + // FILTER from both outer queries: WHERE p.active = true + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "active", transformations: [INDIRECT_FILTER] }, + // GROUP BY from both CTEs (deduplicated since same table.field) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "product_id", transformations: [INDIRECT_GROUP_BY] }, + // JOIN from both UNION parts + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "id", transformations: [INDIRECT_JOIN] }, + ]), + ); + }); +}); + +// ============================================================================= +// SECTION 10: SCHEMA HANDLING (DEFAULT & MULTI-SCHEMA SUPPORT) +// ============================================================================= + +describe("Default Schema Handling", () => { + test("matches table with default schema", () => { + const sql = `SELECT id, name FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name"])], "public"); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + id: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }); + }); + + test("schema-qualified table name", () => { + const sql = `SELECT u.id FROM analytics.users u WHERE u.status = 'active'`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable("analytics.users", ["id", "status"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + id: { + 
inputFields: [{ namespace: "ns", name: "analytics.users", field: "id", transformations: [DIRECT_IDENTITY] }], + }, + }); + + expect(result.dataset).toEqual([ + { namespace: "ns", name: "analytics.users", field: "status", transformations: [INDIRECT_FILTER] }, + ]); + }); +}); + +describe("Multi-Schema Support", () => { + test("select from table with explicit schema", () => { + const sql = `SELECT id, name FROM myschema.users`; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [ + createTable("myschema.users", ["id", "name", "email"]), + createTable("otherschema.users", ["id", "username"]), + ]); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("select from table with default schema", () => { + const sql = `SELECT id, name FROM users`; + const ast = parseSQL(sql); + const namespace = createNamespace( + "trino", + [createTable("myschema.users", ["id", "name", "email"]), createTable("otherschema.users", ["id", "username"])], + "myschema", // default schema + ); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("join across different schemas", () => { + const sql = ` + SELECT + u.id, + u.name, + o.total + FROM myschema.users u + JOIN orders_schema.orders o ON u.id = o.user_id + `; + const ast = parseSQL(sql); + const 
namespace = createNamespace("trino", [ + createTable("myschema.users", ["id", "name"]), + createTable("orders_schema.orders", ["id", "user_id", "total"]), + ]); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + total: { + inputFields: [ + { + name: "orders_schema.orders", + namespace: "trino", + field: "total", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("mix explicit and default schema tables", () => { + const sql = ` + SELECT + u.id, + u.name, + o.total + FROM users u + JOIN orders_schema.orders o ON u.id = o.user_id + `; + const ast = parseSQL(sql); + const namespace = createNamespace( + "trino", + [createTable("myschema.users", ["id", "name"]), createTable("orders_schema.orders", ["id", "user_id", "total"])], + "myschema", // default schema + ); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + total: { + inputFields: [ + { + name: "orders_schema.orders", + namespace: "trino", + field: "total", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("same table name in different schemas", () => { + const sql = ` + SELECT + u1.id as user1_id, + u2.id as user2_id + FROM schema1.users u1 + JOIN schema2.users u2 ON u1.id = u2.id + `; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [ + 
createTable("schema1.users", ["id", "name"]), + createTable("schema2.users", ["id", "username"]), + ]); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + user1_id: { + inputFields: [ + { + name: "schema1.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + user2_id: { + inputFields: [ + { + name: "schema2.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("CTE with schema-qualified tables", () => { + const sql = ` + WITH active_users AS ( + SELECT id, name FROM myschema.users WHERE status = 'active' + ) + SELECT + au.id, + au.name, + o.total + FROM active_users au + JOIN orders_schema.orders o ON au.id = o.user_id + `; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [ + createTable("myschema.users", ["id", "name", "status"]), + createTable("orders_schema.orders", ["id", "user_id", "total"]), + ]); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + total: { + inputFields: [ + { + name: "orders_schema.orders", + namespace: "trino", + field: "total", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("select * from schema-qualified table", () => { + const sql = `SELECT * FROM myschema.users`; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [createTable("myschema.users", ["id", "name", "email"])]); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: 
"id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + email: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "email", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); +}); diff --git a/packages/lineage/test/index.test.ts b/packages/lineage/test/index.test.ts deleted file mode 100644 index 2f4cb26..0000000 --- a/packages/lineage/test/index.test.ts +++ /dev/null @@ -1,801 +0,0 @@ -import { describe, test, expect } from "bun:test"; -import { Parser } from "node-sql-parser"; -import type { AST, Select } from "node-sql-parser"; -import { - getLineage, - type Schema, - type Table, - DIRECT_AGGREGATION, - DIRECT_IDENTITY, - DIRECT_TRANSFORMATION, -} from "../src/index.js"; - -const parser = new Parser(); - -// Helper function to create schemas -function createSchema(namespace: string, tables: Table[]): Schema { - return { namespace, tables }; -} - -function createTable(name: string, columns: string[]): Table { - return { name, columns }; -} - -// Helper to ensure we get a single AST -function parseSQL(sql: string): AST { - const result = parser.astify(sql, { database: "trino" }); - const ast = Array.isArray(result) ? 
result[0] : result; - - if (!ast) { - throw new Error("Failed to parse SQL"); - } - - return ast; -} - -describe("Select Lineage", () => { - test("select from cte", () => { - const sql = ` - WITH u AS ( - SELECT - id, - name - FROM users - ) - SELECT - id, - name as wow - FROM u - `; - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name", "email"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - wow: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select from cte with *", () => { - const sql = ` - WITH u AS ( - SELECT * FROM users - ) - SELECT - id, - name as wow - FROM (SELECT * FROM u) AS t`; - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name", "email"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - wow: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select from multiple ctes", () => { - const sql = ` - WITH active_users AS ( - SELECT - id, - name, - email - FROM users - WHERE status = 'active' - ), - user_orders AS ( - SELECT - user_id, - COUNT(user_id) as order_count, - SUM(total) as total_spent - FROM orders - GROUP BY user_id - ), - enriched_users AS ( - SELECT - au.id, - au.name, - au.email, - COALESCE(uo.order_count, 0) as order_count, - COALESCE(uo.total_spent, 0) as total_spent - FROM active_users au - LEFT JOIN user_orders uo ON au.id = uo.user_id - ) - SELECT - id, - name as 
full_name, - order_count, - total_spent * 1.1 as total_with_tax - FROM enriched_users - WHERE order_count > 0 - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name", "email", "status"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - full_name: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - order_count: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "user_id", - transformations: [ - { type: "DIRECT", subtype: "AGGREGATION", masking: true }, - ], - }, - ], - }, - total_with_tax: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "total", - transformations: [DIRECT_AGGREGATION], - }, - ], - }, - }); - }); - - test("product sales analysis with multiple ctes", () => { - const sql = `-- Product sales analysis with store information using CTEs -WITH filtered_sales AS ( - SELECT - product_id, - store_id, - quantity_sold, - unit_price, - discount_percentage - FROM product_sales - WHERE sale_date >= '2023-01-01' -), -store_sales_summary AS ( - SELECT - fs.product_id, - fs.store_id, - SUM(fs.quantity_sold) as total_quantity, - AVG(fs.unit_price) as avg_price, - SUM(fs.quantity_sold * fs.unit_price * (1 - fs.discount_percentage/100)) as net_revenue - FROM filtered_sales fs - GROUP BY fs.product_id, fs.store_id -), -final_report AS ( - SELECT - sss.product_id, - s.store_name, - s.region, - sss.total_quantity, - sss.avg_price, - sss.net_revenue - FROM store_sales_summary sss - JOIN stores s ON sss.store_id = s.id -) -SELECT - product_id, - store_name, - region, - total_quantity, - avg_price, - net_revenue -FROM final_report -ORDER BY net_revenue DESC`; - - 
const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("product_sales", [ - "product_id", - "store_id", - "quantity_sold", - "unit_price", - "discount_percentage", - "sale_date", - ]), - createTable("stores", ["id", "store_name", "region"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - product_id: { - inputFields: [ - { - name: "product_sales", - namespace: "trino", - field: "product_id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - store_name: { - inputFields: [ - { - name: "stores", - namespace: "trino", - field: "store_name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - region: { - inputFields: [ - { - name: "stores", - namespace: "trino", - field: "region", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - total_quantity: { - inputFields: [ - { - name: "product_sales", - namespace: "trino", - field: "quantity_sold", - transformations: [DIRECT_AGGREGATION], - }, - ], - }, - avg_price: { - inputFields: [ - { - name: "product_sales", - namespace: "trino", - field: "unit_price", - transformations: [DIRECT_AGGREGATION], - }, - ], - }, - net_revenue: { - inputFields: [ - { - name: "product_sales", - namespace: "trino", - field: "quantity_sold", - transformations: [DIRECT_AGGREGATION], - }, - { - name: "product_sales", - namespace: "trino", - field: "unit_price", - transformations: [DIRECT_AGGREGATION], - }, - { - name: "product_sales", - namespace: "trino", - field: "discount_percentage", - transformations: [DIRECT_AGGREGATION], - }, - ], - }, - }); - }); - - test("select with lots of aliases", () => { - const sql = ` - WITH u AS ( - SELECT - id as i, - name as n - FROM users - ) - SELECT - i as id, - n as wow - FROM u - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name", "email"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: 
"users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - wow: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select with group by", () => { - const sql = `SELECT country, count(city) as city_count - FROM cities - GROUP BY country`; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("cities", ["country", "city"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - country: { - inputFields: [ - { - name: "cities", - namespace: "trino", - field: "country", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - city_count: { - inputFields: [ - { - name: "cities", - namespace: "trino", - field: "city", - transformations: [ - { type: "DIRECT", subtype: "AGGREGATION", masking: true }, - ], - }, - ], - }, - }); - }); - - test("select with binary expression", () => { - const sql = `SELECT id, name, id + 1 as next_id - FROM users`; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - next_id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - }); - }); - - test("select same column different tables", () => { - const sql = `SELECT u.id, o.id as order_id - FROM users u - JOIN orders o ON u.id = o.user_id`; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name", "email"]), - createTable("orders", 
["id", "user_id", "total"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - order_id: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select with function transformation", () => { - const sql = `SELECT - id, - UPPER(name) as upper_name, - LENGTH(email) as email_length, - CONCAT(name, email) as name_email - FROM users`; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name", "email"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - upper_name: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - email_length: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "email", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - name_email: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "users", - namespace: "trino", - field: "email", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - }); - }); - - test("select with arithmetic operations", () => { - const sql = `SELECT - id, - price + tax as total_price, - quantity * price as line_total, - (price + tax) * quantity as grand_total, - price - discount as discounted_price - FROM orders`; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("orders", ["id", "price", "tax", "quantity", "discount"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - 
expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - total_price: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "price", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "tax", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - line_total: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "quantity", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "price", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - grand_total: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "price", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "tax", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "quantity", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - discounted_price: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "price", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "discount", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - }); - }); - - test("select with complex nested arithmetic", () => { - const sql = `SELECT - id, - ((price + tax) * quantity) / discount as complex_calc, - price % 10 as price_remainder - FROM orders`; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("orders", ["id", "price", "tax", "quantity", "discount"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - complex_calc: { - inputFields: [ - { - name: "orders", - namespace: 
"trino", - field: "price", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "tax", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "quantity", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "discount", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - price_remainder: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "price", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - }); - }); - - test("select with mixed aggregation and arithmetic", () => { - const sql = `SELECT - country, - SUM(population) as total_population, - AVG(area) * 2 as double_avg_area, - COUNT(city) + 1 as city_count_plus_one - FROM cities - GROUP BY country`; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("cities", ["country", "city", "population", "area"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - country: { - inputFields: [ - { - name: "cities", - namespace: "trino", - field: "country", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - total_population: { - inputFields: [ - { - name: "cities", - namespace: "trino", - field: "population", - transformations: [{ ...DIRECT_AGGREGATION }], - }, - ], - }, - double_avg_area: { - inputFields: [ - { - name: "cities", - namespace: "trino", - field: "area", - transformations: [DIRECT_AGGREGATION], - }, - ], - }, - city_count_plus_one: { - inputFields: [ - { - name: "cities", - namespace: "trino", - field: "city", - transformations: [{ ...DIRECT_AGGREGATION, masking: true }], - }, - ], - }, - }); - }); -});