From 1c8b6e483f5a3167e8a96ee4a75762507ec93487 Mon Sep 17 00:00:00 2001 From: its-elad <59926027+its-elad@users.noreply.github.com> Date: Thu, 29 Jan 2026 10:25:06 +0200 Subject: [PATCH 01/10] feat: added indirect lineage via the new extendedLineage function --- .prettierrc | 6 + apps/demo/src/App.tsx | 35 +- apps/demo/src/components/editor/SQLEditor.tsx | 53 +- bun.lock | 1 + packages/lineage/src/hashset.ts | 16 +- packages/lineage/src/index.ts | 730 ++++++++++++++++-- packages/lineage/test/index.test.ts | 44 +- packages/lineage/test/indirect.test.ts | 358 +++++++++ 8 files changed, 1059 insertions(+), 184 deletions(-) create mode 100644 .prettierrc create mode 100644 packages/lineage/test/indirect.test.ts diff --git a/.prettierrc b/.prettierrc new file mode 100644 index 0000000..4093290 --- /dev/null +++ b/.prettierrc @@ -0,0 +1,6 @@ +{ + "trailingComma": "all", + "printWidth": 120, + "tabWidth": 2, + "endOfLine": "auto" +} diff --git a/apps/demo/src/App.tsx b/apps/demo/src/App.tsx index 91b5f0a..f632899 100644 --- a/apps/demo/src/App.tsx +++ b/apps/demo/src/App.tsx @@ -1,13 +1,12 @@ -import { useState, useCallback } from "react"; +import { useState, useCallback, useEffect } from "react"; import { GitBranch, Github } from "lucide-react"; import { Card, CardContent, CardHeader, CardTitle } from "@meta-sql/ui"; import { SQLEditor } from "./components/editor"; import { LineageGraph } from "./components/lineage/LineageGraph"; -import type { ColumnLineageDatasetFacet } from "@meta-sql/open-lineage"; -import type { Schema } from "@meta-sql/lineage"; +import type { getExtendedLineage, Schema } from "@meta-sql/lineage"; // Use the actual return type from getLineage -type LineageResult = ColumnLineageDatasetFacet["fields"]; +type LineageResult = ReturnType; // Default schema matching the sample queries const defaultSchema: Schema = { @@ -15,15 +14,7 @@ const defaultSchema: Schema = { tables: [ { name: "product_sales", - columns: [ - "id", - "product_id", - "quantity_sold", - "unit_price", - "sale_date", - "store_id", - "discount_percentage", - ], + columns: ["id", "product_id", "quantity_sold", "unit_price", "sale_date", "store_id", "discount_percentage"], }, { name: "stores", @@ -56,6 +47,8 @@ export default function App() { setLineageData(lineageResult); }, []); + useEffect(() => console.log(lineageData), [lineageData]); + return (
{/* Header */} @@ -64,9 +57,7 @@ export default function App() {
-

- @meta-sql/lineage -

+

@meta-sql/lineage

Interactive Demo - SQL Column Lineage Analysis Package @@ -96,22 +87,14 @@ export default function App() { SQL Editor - + {/* Right Panel - Lineage Graph */} - + Data Lineage Graph diff --git a/apps/demo/src/components/editor/SQLEditor.tsx b/apps/demo/src/components/editor/SQLEditor.tsx index 83d3d41..dceeac1 100644 --- a/apps/demo/src/components/editor/SQLEditor.tsx +++ b/apps/demo/src/components/editor/SQLEditor.tsx @@ -12,21 +12,14 @@ import { PopoverContent, PopoverTrigger, } from "@meta-sql/ui"; -import { - FileText, - AlertCircle, - CheckCircle, - CheckIcon, - ChevronsUpDown, -} from "lucide-react"; -import { getLineage, type Schema } from "@meta-sql/lineage"; -import type { ColumnLineageDatasetFacet } from "@meta-sql/open-lineage"; +import { FileText, AlertCircle, CheckCircle, CheckIcon, ChevronsUpDown } from "lucide-react"; +import { getExtendedLineage, type Schema } from "@meta-sql/lineage"; import { Parser } from "node-sql-parser"; import { sampleQueries, type SupportedDialect } from "./sampleQueries.js"; import { cn } from "@meta-sql/ui/lib/utils"; // Use the actual return type from getLineage -type LineageResult = ColumnLineageDatasetFacet["fields"]; +type LineageResult = ReturnType; const dialectOptions = [ { @@ -56,20 +49,12 @@ const dialectOptions = [ ] satisfies Array<{ value: SupportedDialect; label: string }>; interface SQLEditorProps { - onQueryParsed: ( - lineageResult: LineageResult, - query: string, - dialect: string - ) => void; + onQueryParsed: (lineageResult: LineageResult, query: string, dialect: string) => void; schema?: Schema; className?: string; } -export const SQLEditor: React.FC = ({ - onQueryParsed, - schema, - className = "", -}) => { +export const SQLEditor: React.FC = ({ onQueryParsed, schema, className = "" }) => { const [query, setQuery] = useState(""); const [dialect, setDialect] = useState("mysql"); const [open, setOpen] = useState(false); @@ -96,7 +81,7 @@ export const SQLEditor: React.FC = ({ }; // Get column lineage and update the graph - const lineageResult = getLineage(firstStatement, lineageSchema); + const lineageResult = getExtendedLineage(firstStatement, lineageSchema); onQueryParsed(lineageResult, query, dialect); @@ -104,9 +89,7 @@ export const SQLEditor: React.FC = ({ } else { setValidationResult({ isValid: false, - errors: [ - "Only SELECT statements are supported for lineage analysis", - ], + errors: ["Only SELECT statements are supported for lineage analysis"], }); } } else { @@ -120,9 +103,7 @@ export const SQLEditor: React.FC = ({ setValidationResult({ isValid: false, - errors: [ - error instanceof Error ? error.message : "Unknown parsing error", - ], + errors: [error instanceof Error ? error.message : "Unknown parsing error"], }); } } else { @@ -152,10 +133,7 @@ export const SQLEditor: React.FC = ({ aria-expanded={open} className="w-[200px] justify-between" > - {dialect - ? dialectOptions.find((option) => option.value === dialect) - ?.label - : "Select dialect..."} + {dialect ? dialectOptions.find((option) => option.value === dialect)?.label : "Select dialect..."} @@ -175,12 +153,7 @@ export const SQLEditor: React.FC = ({ }} > {option.label} @@ -248,11 +221,7 @@ export const SQLEditor: React.FC = ({
{validationResult.errors.map((error, index) => ( - + {error} ))} diff --git a/bun.lock b/bun.lock index 6dc8603..5bbaa80 100644 --- a/bun.lock +++ b/bun.lock @@ -1,5 +1,6 @@ { "lockfileVersion": 1, + "configVersion": 0, "workspaces": { "": { "name": "meta-sql", diff --git a/packages/lineage/src/hashset.ts b/packages/lineage/src/hashset.ts index 9f9aec7..13b9605 100644 --- a/packages/lineage/src/hashset.ts +++ b/packages/lineage/src/hashset.ts @@ -5,10 +5,7 @@ export class HashSet implements Set { constructor(hasher?: (value: T) => string) { this.hasher = hasher || ((value: T) => JSON.stringify(value)); } - forEach( - callbackfn: (value: T, value2: T, set: Set) => void, - thisArg?: unknown - ): void { + forEach(callbackfn: (value: T, value2: T, set: Set) => void, thisArg?: unknown): void { this.map.forEach((value) => { callbackfn.call(thisArg, value, value, this); }); @@ -63,4 +60,15 @@ export class HashSet implements Set { } return intersection; } + + union(other: Set): HashSet { + const union = new HashSet(this.hasher); + for (const value of this) { + union.add(value); + } + for (const value of other) { + union.add(value); + } + return union; + } } diff --git a/packages/lineage/src/index.ts b/packages/lineage/src/index.ts index 3ba63b6..a1ff9e3 100644 --- a/packages/lineage/src/index.ts +++ b/packages/lineage/src/index.ts @@ -2,6 +2,8 @@ import { type ColumnLineageDatasetFacet, type InputField, type Transformation as _Transformation, + type TransformationType, + type TransformationSubtype, } from "@meta-sql/open-lineage"; import { Select, @@ -13,6 +15,9 @@ import { AggrFunc, Function as AstFunction, With, + Case, + Interval, + Cast, } from "node-sql-parser"; import { HashSet } from "./hashset"; @@ -31,8 +36,13 @@ const MASKING_FUNCTIONS = new Set([ "MURMUR3", "SPOOKY_HASH_V2_32", "SPOOKY_HASH_V2_64", + "HASH", + "ANONYMIZE", + "MASK", + "REDACT", ]); +// Direct transformation constants export const DIRECT_TRANSFORMATION: Transformation = { type: "DIRECT", subtype: "TRANSFORMATION", @@ -51,41 +61,87 @@ export const DIRECT_AGGREGATION: Transformation = { masking: false, }; -function mergeTransformations( - parent: Transformation | undefined, - child: Transformation +// Indirect transformation constants +export const INDIRECT_JOIN: Transformation = { + type: "INDIRECT", + subtype: "JOIN", + masking: false, +}; + +export const INDIRECT_FILTER: Transformation = { + type: "INDIRECT", + subtype: "FILTER", + masking: false, +}; + +export const INDIRECT_GROUP_BY: Transformation = { + type: "INDIRECT", + subtype: "GROUP_BY", + masking: false, +}; + +export const INDIRECT_SORT: Transformation = { + type: "INDIRECT", + subtype: "SORT", + masking: false, +}; + +export const INDIRECT_WINDOW: Transformation = { + type: "INDIRECT", + subtype: "WINDOW", + masking: false, +}; + +export const INDIRECT_CONDITION: Transformation = { + type: "INDIRECT", + subtype: "CONDITION", + masking: false, +}; + +function createTransformation( + type: TransformationType, + subtype: TransformationSubtype, + masking: boolean = false, ): Transformation { + return { type, subtype, masking }; +} + +function mergeTransformations(parent: Transformation | undefined, child: Transformation): Transformation { if (!parent) { return child; } - if (child.type !== "DIRECT" || parent.type !== "DIRECT") { - throw new Error("Indirect transformations not supported yet"); + // If types differ, prefer the more specific one + // INDIRECT is generally more specific than DIRECT for the same column + if (parent.type !== child.type) { + // Keep the child transformation but merge masking + return { ...child, masking: parent.masking || child.masking }; } - let leading: Transformation; + if (child.type === "DIRECT" && parent.type === "DIRECT") { + let leading: Transformation; - // agg > transformation > identity + // agg > transformation > identity + if (parent.subtype === "AGGREGATION") { + leading = parent; + } else if (child.subtype === "AGGREGATION") { + leading = child; + } else if (parent.subtype === "TRANSFORMATION") { + leading = parent; + } else { + leading = child; + } - if (parent.subtype === "AGGREGATION") { - leading = parent; - } else if (child.subtype === "AGGREGATION") { - leading = child; - } else if (parent.subtype === "TRANSFORMATION") { - leading = parent; - } else { - leading = child; + return { ...leading, masking: parent.masking || child.masking }; } - return { ...leading, masking: parent.masking || child.masking }; + // For INDIRECT transformations, prefer the child (more recent context) + return { ...child, masking: parent.masking || child.masking }; } class TransformationSet extends HashSet { constructor(values?: readonly Transformation[]) { - super( - (value: Transformation) => - `${value.type}-${value.subtype}-${value.masking ? "MASKED" : "UNMASKED"}` - ); + super((value: Transformation) => `${value.type}-${value.subtype}-${value.masking ? "MASKED" : "UNMASKED"}`); if (values) { values.forEach((value) => this.add(value)); @@ -116,9 +172,15 @@ export type SelectWithAlias = Select & { as?: string | null; }; -export function isColumn( - selectColumn: Select["columns"][number] -): selectColumn is AstColumn { +/** + * Extended lineage result that includes both field-level and dataset-level lineage + */ +export interface ExtendedLineageResult { + fields: ColumnLineageDatasetFacet["fields"]; + dataset?: InputField[]; +} + +export function isColumn(selectColumn: Select["columns"][number]): selectColumn is AstColumn { return ( typeof selectColumn === "object" && selectColumn !== null && @@ -130,9 +192,7 @@ export function isColumn( } export function formatInputColumnName(column: ColumnRefItem): string { - return `${column.table ? `${column.table}.` : ""}${getInputColumnName( - column - )}`; + return `${column.table ? `${column.table}.` : ""}${getInputColumnName(column)}`; } export function parseInputColumnName(column: string): InputColumn { @@ -161,9 +221,98 @@ export function getOutputColumnName(column: AstColumn): string | null { return null; } +/** + * Extract column references from any expression value + */ +export function extractColumnRefs(expr: ExpressionValue | null | undefined): ColumnRefItem[] { + if (!expr) return []; + + const refs: ColumnRefItem[] = []; + + // TODO - why not "exp" ? + switch (expr.type) { + case "column_ref": + refs.push(expr as ColumnRefItem); + break; + + case "binary_expr": { + const binary = expr as Binary; + refs.push(...extractColumnRefs(binary.left)); + refs.push(...extractColumnRefs(binary.right)); + break; + } + + case "aggr_func": { + const aggr = expr as AggrFunc; + if (aggr.args?.expr) { + refs.push(...extractColumnRefs(aggr.args.expr)); + } + break; + } + + case "function": { + const func = expr as AstFunction; + if (func.args?.value) { + for (const arg of func.args.value) { + refs.push(...extractColumnRefs(arg)); + } + } + break; + } + + case "case": { + const caseExpr = expr as Case; + if (caseExpr.args) { + for (const arg of caseExpr.args) { + if (arg.type === "when" && arg.cond) { + refs.push(...extractColumnRefs(arg.cond)); + } + if (arg.result) { + refs.push(...extractColumnRefs(arg.result)); + } + } + } + break; + } + + case "interval": { + const interval = expr as Interval; + if (interval.expr) { + refs.push(...extractColumnRefs(interval.expr)); + } + break; + } + + case "cast": { + const cast = expr as Cast; + if (cast.expr) { + refs.push(...extractColumnRefs(cast.expr)); + } + break; + } + + default: + // Handle nested expressions in unknown types + if (typeof expr === "object" && expr !== null) { + for (const key of Object.keys(expr)) { + const value = (expr as Record)[key]; + if (value && typeof value === "object" && "type" in value) { + refs.push(...extractColumnRefs(value as ExpressionValue)); + } + } + } + break; + } + + return refs; +} + +/** + * Get transformations from expression, now supporting CASE/IF for CONDITION subtype + */ export function getDirectTransformationsFromExprValue( expr: ExpressionValue, - parentTransformation?: Transformation + parentTransformation?: Transformation, ): Record { switch (expr.type) { case "column_ref": { @@ -171,31 +320,24 @@ export function getDirectTransformationsFromExprValue( return inputColumnName ? { - [inputColumnName]: new TransformationSet([ - mergeTransformations(parentTransformation, DIRECT_IDENTITY), - ]), + [inputColumnName]: new TransformationSet([mergeTransformations(parentTransformation, DIRECT_IDENTITY)]), } : {}; } + case "binary_expr": { const { left, right } = expr as Binary; const merged: Record = {}; Object.entries( - getDirectTransformationsFromExprValue( - left, - mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION) - ) + getDirectTransformationsFromExprValue(left, mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION)), ).forEach(([key, value]) => { merged[key] = value; }); Object.entries( - getDirectTransformationsFromExprValue( - right, - mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION) - ) + getDirectTransformationsFromExprValue(right, mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION)), ).forEach(([key, value]) => { const prev = merged[key]; @@ -208,6 +350,7 @@ export function getDirectTransformationsFromExprValue( return merged; } + case "aggr_func": { const aggExpr = expr as AggrFunc; @@ -216,9 +359,10 @@ export function getDirectTransformationsFromExprValue( mergeTransformations(parentTransformation, { ...DIRECT_AGGREGATION, masking: MASKING_AGG_FUNCTIONS.has(aggExpr.name), - }) + }), ); } + case "function": { const funcExpr = expr as AstFunction; @@ -231,8 +375,8 @@ export function getDirectTransformationsFromExprValue( ...DIRECT_TRANSFORMATION, masking: funcExpr.name.name.length > 0 && - MASKING_FUNCTIONS.has(funcExpr.name.name.at(-1)!.value), - }) + MASKING_FUNCTIONS.has(funcExpr.name.name.at(-1)!.value.toUpperCase()), + }), ); Object.entries(argTransformations).forEach(([key, value]) => { @@ -241,15 +385,422 @@ export function getDirectTransformationsFromExprValue( return acc; }, - {} as Record + {} as Record, ) ?? {} ); } + + case "case": { + const caseExpr = expr as Case; + const merged: Record = {}; + + if (caseExpr.args) { + for (const arg of caseExpr.args) { + // Condition columns get INDIRECT/CONDITION + if (arg.type === "when" && arg.cond) { + const condTransformations = getIndirectTransformationsFromExpr(arg.cond, INDIRECT_CONDITION); + Object.entries(condTransformations).forEach(([key, value]) => { + merged[key] = merged[key] ? merged[key].union(value) : value; + }); + } + + // Result columns get DIRECT/TRANSFORMATION (value is transformed through CASE) + if (arg.result) { + const resultTransformations = getDirectTransformationsFromExprValue( + arg.result, + mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION), + ); + Object.entries(resultTransformations).forEach(([key, value]) => { + merged[key] = merged[key] ? merged[key].union(value) : value; + }); + } + } + } + + return merged; + } + + case "cast": { + const castExpr = expr as Cast; + if (castExpr.expr) { + return getDirectTransformationsFromExprValue( + castExpr.expr, + mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION), + ); + } + return {}; + } + + case "interval": { + const intervalExpr = expr as Interval; + if (intervalExpr.expr) { + return getDirectTransformationsFromExprValue( + intervalExpr.expr, + mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION), + ); + } + return {}; + } + default: return {}; } } +/** + * Get indirect transformations from an expression with a specific transformation type + */ +export function getIndirectTransformationsFromExpr( + expr: ExpressionValue | null | undefined, + transformation: Transformation, +): Record { + if (!expr) return {}; + + const columnRefs = extractColumnRefs(expr); + const result: Record = {}; + + for (const ref of columnRefs) { + const columnName = formatInputColumnName(ref); + if (columnName) { + result[columnName] = new TransformationSet([transformation]); + } + } + + return result; +} + +/** + * Extract JOIN lineage from FROM clause + */ +export function getJoinLineage(select: Select, schema: Schema): InputField[] { + const inputFields: InputField[] = []; + + if (!select.from) return inputFields; + + const fromItems = Array.isArray(select.from) ? select.from : [select.from]; + const { regularTables } = getTableExpressionsFromSelect(select); + + for (const item of fromItems) { + // Check for JOIN conditions + if ("on" in item && item.on) { + const columnRefs = extractColumnRefs(item.on as ExpressionValue); + + for (const ref of columnRefs) { + const columnName = getInputColumnName(ref); + const tableName = ref.table; + + if (columnName) { + // Find the table - tableName might be an alias, so check against both table name and alias + const table = regularTables.find( + (t) => + (!tableName || tableName === t.table || tableName === t.as) && + schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), + ); + + if (table) { + const schemaTable = schema.tables.find((s) => s.name === table.table); + if (schemaTable) { + inputFields.push({ + namespace: schema.namespace, + name: schemaTable.name, + field: columnName, + transformations: [INDIRECT_JOIN], + }); + } + } + } + } + } + + // Check for USING clause + if ("using" in item && Array.isArray(item.using)) { + for (const usingCol of item.using) { + // USING columns exist in multiple tables + for (const schemaTable of schema.tables) { + if (schemaTable.columns.includes(usingCol)) { + inputFields.push({ + namespace: schema.namespace, + name: schemaTable.name, + field: usingCol, + transformations: [INDIRECT_JOIN], + }); + } + } + } + } + } + + return inputFields; +} + +/** + * Extract WHERE clause lineage (FILTER) + */ +export function getFilterLineage(select: Select, schema: Schema): InputField[] { + const inputFields: InputField[] = []; + + if (!select.where) return inputFields; + + const columnRefs = extractColumnRefs(select.where as ExpressionValue); + const { regularTables } = getTableExpressionsFromSelect(select); + + for (const ref of columnRefs) { + const columnName = getInputColumnName(ref); + const tableName = ref.table; + + if (columnName) { + // Find the table in schema + const table = regularTables.find( + (t) => + (!tableName || tableName === t.table || tableName === t.as) && + schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), + ); + + if (table) { + const schemaTable = schema.tables.find((s) => s.name === table.table); + if (schemaTable) { + inputFields.push({ + namespace: schema.namespace, + name: schemaTable.name, + field: columnName, + transformations: [INDIRECT_FILTER], + }); + } + } + } + } + + return inputFields; +} + +/** + * Extract GROUP BY lineage + */ +export function getGroupByLineage(select: Select, schema: Schema): InputField[] { + const inputFields: InputField[] = []; + + if (!select.groupby) return inputFields; + + // Handle both array format and object with columns property + let groupByItems: ExpressionValue[]; + if (Array.isArray(select.groupby)) { + groupByItems = select.groupby; + } else if ( + typeof select.groupby === "object" && + "columns" in select.groupby && + Array.isArray(select.groupby.columns) + ) { + groupByItems = select.groupby.columns; + } else { + groupByItems = [select.groupby as unknown as ExpressionValue]; + } + const { regularTables } = getTableExpressionsFromSelect(select); + + for (const item of groupByItems) { + const columnRefs = extractColumnRefs(item as ExpressionValue); + + for (const ref of columnRefs) { + const columnName = getInputColumnName(ref); + const tableName = ref.table; + + if (columnName) { + const table = regularTables.find( + (t) => + (!tableName || tableName === t.table || tableName === t.as) && + schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), + ); + + if (table) { + const schemaTable = schema.tables.find((s) => s.name === table.table); + if (schemaTable) { + inputFields.push({ + namespace: schema.namespace, + name: schemaTable.name, + field: columnName, + transformations: [INDIRECT_GROUP_BY], + }); + } + } + } + } + } + + return inputFields; +} + +/** + * Extract ORDER BY lineage (SORT) + */ +export function getOrderByLineage(select: Select, schema: Schema): InputField[] { + const inputFields: InputField[] = []; + + if (!select.orderby) return inputFields; + + const orderByItems = Array.isArray(select.orderby) ? select.orderby : [select.orderby]; + const { regularTables } = getTableExpressionsFromSelect(select); + + for (const item of orderByItems) { + const expr = "expr" in item ? item.expr : item; + const columnRefs = extractColumnRefs(expr as ExpressionValue); + + for (const ref of columnRefs) { + const columnName = getInputColumnName(ref); + const tableName = ref.table; + + if (columnName) { + const table = regularTables.find( + (t) => + (!tableName || tableName === t.table || tableName === t.as) && + schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), + ); + + if (table) { + const schemaTable = schema.tables.find((s) => s.name === table.table); + if (schemaTable) { + inputFields.push({ + namespace: schema.namespace, + name: schemaTable.name, + field: columnName, + transformations: [INDIRECT_SORT], + }); + } + } + } + } + } + + return inputFields; +} + +/** + * Extract WINDOW function lineage from SELECT columns + */ +export function getWindowLineage(select: Select, schema: Schema): InputField[] { + const inputFields: InputField[] = []; + const { regularTables } = getTableExpressionsFromSelect(select); + + if (!select.columns || (typeof select.columns === "string" && select.columns === "*")) return inputFields; + + for (const col of select.columns) { + if (!isColumn(col)) continue; + + // Check if this is a window function (has OVER clause) + const expr = col.expr; + if (expr.type === "aggr_func" && "over" in expr && (expr as AggrFunc & { over?: unknown }).over) { + const aggrFunc = expr as AggrFunc & { + over?: { + partitionby?: ExpressionValue[]; + orderby?: Array<{ expr: ExpressionValue }>; + }; + }; + + // Extract PARTITION BY columns + if (aggrFunc.over?.partitionby) { + for (const partExpr of aggrFunc.over.partitionby) { + const columnRefs = extractColumnRefs(partExpr); + for (const ref of columnRefs) { + const columnName = getInputColumnName(ref); + const tableName = ref.table; + + if (columnName) { + const table = regularTables.find( + (t) => + (!tableName || tableName === t.table || tableName === t.as) && + schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), + ); + + if (table) { + const schemaTable = schema.tables.find((s) => s.name === table.table); + if (schemaTable) { + inputFields.push({ + namespace: schema.namespace, + name: schemaTable.name, + field: columnName, + transformations: [INDIRECT_WINDOW], + }); + } + } + } + } + } + } + + // Extract ORDER BY within OVER clause + if (aggrFunc.over?.orderby) { + for (const orderItem of aggrFunc.over.orderby) { + const columnRefs = extractColumnRefs(orderItem.expr); + for (const ref of columnRefs) { + const columnName = getInputColumnName(ref); + const tableName = ref.table; + + if (columnName) { + const table = regularTables.find( + (t) => + (!tableName || tableName === t.table || tableName === t.as) && + schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), + ); + + if (table) { + const schemaTable = schema.tables.find((s) => s.name === table.table); + if (schemaTable) { + inputFields.push({ + namespace: schema.namespace, + name: schemaTable.name, + field: columnName, + transformations: [INDIRECT_WINDOW], + }); + } + } + } + } + } + } + } + } + + return inputFields; +} + +/** + * Extract HAVING clause lineage (combines FILTER with AGGREGATION context) + */ +export function getHavingLineage(select: Select, schema: Schema): InputField[] { + const inputFields: InputField[] = []; + + if (!select.having) return inputFields; + + // TODO - check type + const columnRefs = extractColumnRefs(select.having as unknown as ExpressionValue); + const { regularTables } = getTableExpressionsFromSelect(select); + + for (const ref of columnRefs) { + const columnName = getInputColumnName(ref); + const tableName = ref.table; + + if (columnName) { + const table = regularTables.find( + (t) => + (!tableName || tableName === t.table || tableName === t.as) && + schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), + ); + + if (table) { + const schemaTable = schema.tables.find((s) => s.name === table.table); + if (schemaTable) { + inputFields.push({ + namespace: schema.namespace, + name: schemaTable.name, + field: columnName, + transformations: [INDIRECT_FILTER], + }); + } + } + } + } + + return inputFields; +} + export function getTableExpressionsFromSelect(select: Select): { regularTables: BaseFrom[]; selectTables: SelectWithAlias[]; @@ -279,7 +830,7 @@ export function getTableExpressionsFromSelect(select: Select): { fromItems.forEach((item) => { if ("table" in item) { - // might mention with statemnt in our select + // might mention with statement in our select const matchingWith = withByNames.get(item.table); if (matchingWith) { @@ -303,10 +854,7 @@ export function getTableExpressionsFromSelect(select: Select): { return { regularTables, selectTables }; } -export function mergeTransformationSet( - parent: TransformationSet, - child: TransformationSet -): TransformationSet { +export function mergeTransformationSet(parent: TransformationSet, child: TransformationSet): TransformationSet { const merged = new TransformationSet(); parent.forEach((tp) => { @@ -322,23 +870,18 @@ export function getColumnLineage( select: Select, schema: Schema, column: AstColumn, - transformations?: TransformationSet + transformations?: TransformationSet, ): InputField[] { - let transformationsByColumns = getDirectTransformationsFromExprValue( - column.expr - ); + let transformationsByColumns = getDirectTransformationsFromExprValue(column.expr); if (transformations) { transformationsByColumns = Object.entries(transformationsByColumns).reduce( (acc, [columnName, childTransformations]) => { - acc[columnName] = mergeTransformationSet( - transformations, - childTransformations - ); + acc[columnName] = mergeTransformationSet(transformations, childTransformations); return acc; }, - {} as Record + {} as Record, ); } @@ -346,20 +889,13 @@ export function getColumnLineage( const inputFields: InputField[] = []; - for (const [inputColumnName, transformations] of Object.entries( - transformationsByColumns - )) { + for (const [inputColumnName, transformations] of Object.entries(transformationsByColumns)) { const inputColumn = parseInputColumnName(inputColumnName); const table = regularTables.find( (t) => - (!inputColumn.table || - inputColumn.table === t.table || - inputColumn.table === t.as) && - schema.tables.some( - (s) => - s.name === t.table && s.columns.some((c) => c === inputColumn.name) - ) + (!inputColumn.table || inputColumn.table === t.table || inputColumn.table === t.as) && + schema.tables.some((s) => s.name === t.table && s.columns.some((c) => c === inputColumn.name)), ); if (table) { @@ -375,9 +911,7 @@ export function getColumnLineage( continue; } - const matchingColumn = selectTable.columns.find( - (c) => getOutputColumnName(c) === inputColumn.name - ); + const matchingColumn = selectTable.columns.find((c) => getOutputColumnName(c) === inputColumn.name); let nextColumn: AstColumn; @@ -386,7 +920,7 @@ export function getColumnLineage( } else { nextColumn = column; - // stop propogating table of column as it is only in the context of the select + // stop propagating table of column as it is only in the context of the select if (nextColumn.expr.type === "column_ref") { const expr = nextColumn.expr as ColumnRefItem; @@ -394,9 +928,7 @@ export function getColumnLineage( } } - inputFields.push( - ...getColumnLineage(selectTable, schema, nextColumn, transformations) - ); + inputFields.push(...getColumnLineage(selectTable, schema, nextColumn, transformations)); } } } @@ -404,11 +936,38 @@ export function getColumnLineage( return inputFields; } -export function getLineage( - select: Select, - schema: Schema -): ColumnLineageDatasetFacet["fields"] { - let unkownCount = 0; +/** + * Get all dataset-level indirect lineage (columns that affect the entire result set) + */ +export function getDatasetLineage(select: Select, schema: Schema): InputField[] { + const allIndirectFields: InputField[] = []; + + // Collect all indirect lineage + allIndirectFields.push(...getJoinLineage(select, schema)); + allIndirectFields.push(...getFilterLineage(select, schema)); + allIndirectFields.push(...getGroupByLineage(select, schema)); + allIndirectFields.push(...getOrderByLineage(select, schema)); + allIndirectFields.push(...getWindowLineage(select, schema)); + allIndirectFields.push(...getHavingLineage(select, schema)); + + // Deduplicate by creating a map keyed by namespace.table.field.type.subtype + const deduped = new Map(); + for (const field of allIndirectFields) { + const transformation = field.transformations?.[0]; + const key = `${field.namespace}.${field.name}.${field.field}.${transformation?.type}.${transformation?.subtype}`; + if (!deduped.has(key)) { + deduped.set(key, field); + } + } + + return Array.from(deduped.values()); +} + +/** + * Main lineage extraction function - returns field-level lineage only (backward compatible) + */ +export function getLineage(select: Select, schema: Schema): ColumnLineageDatasetFacet["fields"] { + let unknownCount = 0; return select.columns.reduce((acc, column) => { if (!isColumn(column)) { @@ -418,7 +977,7 @@ export function getLineage( let outputFieldName = getOutputColumnName(column); if (!outputFieldName) { - outputFieldName = `unknown_${unkownCount++}`; + outputFieldName = `unknown_${unknownCount++}`; } return { @@ -429,3 +988,16 @@ export function getLineage( }; }, {}); } + +/** + * Extended lineage extraction function - returns both field-level and dataset-level lineage + */ +export function getExtendedLineage( + select: Select, + schema: Schema, +): Pick { + return { + fields: getLineage(select, schema), + dataset: getDatasetLineage(select, schema), + }; +} diff --git a/packages/lineage/test/index.test.ts b/packages/lineage/test/index.test.ts index 2f4cb26..705a305 100644 --- a/packages/lineage/test/index.test.ts +++ b/packages/lineage/test/index.test.ts @@ -48,9 +48,7 @@ describe("Select Lineage", () => { FROM u `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name", "email"]), - ]); + const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); const lineage = getLineage(ast as Select, schema); @@ -88,9 +86,7 @@ describe("Select Lineage", () => { name as wow FROM (SELECT * FROM u) AS t`; const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name", "email"]), - ]); + const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); const lineage = getLineage(ast as Select, schema); @@ -190,9 +186,7 @@ describe("Select Lineage", () => { name: "orders", namespace: "trino", field: "user_id", - transformations: [ - { type: "DIRECT", subtype: "AGGREGATION", masking: true }, - ], + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], }, ], }, @@ -358,9 +352,7 @@ ORDER BY net_revenue DESC`; `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name", "email"]), - ]); + const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); const lineage = getLineage(ast as Select, schema); @@ -394,9 +386,7 @@ ORDER BY net_revenue DESC`; GROUP BY country`; const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("cities", ["country", "city"]), - ]); + const schema = createSchema("trino", [createTable("cities", ["country", "city"])]); const lineage = getLineage(ast as Select, schema); @@ -417,9 +407,7 @@ ORDER BY net_revenue DESC`; name: "cities", namespace: "trino", field: "city", - transformations: [ - { type: "DIRECT", subtype: "AGGREGATION", masking: true }, - ], + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], }, ], }, @@ -431,9 +419,7 @@ ORDER BY net_revenue DESC`; FROM users`; const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name"]), - ]); + const schema = createSchema("trino", [createTable("users", ["id", "name"])]); const lineage = getLineage(ast as Select, schema); @@ -517,9 +503,7 @@ ORDER BY net_revenue DESC`; FROM users`; const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name", "email"]), - ]); + const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); const lineage = getLineage(ast as Select, schema); @@ -583,9 +567,7 @@ ORDER BY net_revenue DESC`; FROM orders`; const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("orders", ["id", "price", "tax", "quantity", "discount"]), - ]); + const schema = createSchema("trino", [createTable("orders", ["id", "price", "tax", "quantity", "discount"])]); const lineage = getLineage(ast as Select, schema); @@ -681,9 +663,7 @@ ORDER BY net_revenue DESC`; FROM orders`; const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("orders", ["id", "price", "tax", "quantity", "discount"]), - ]); + const schema = createSchema("trino", [createTable("orders", ["id", "price", "tax", "quantity", "discount"])]); const lineage = getLineage(ast as Select, schema); @@ -749,9 +729,7 @@ ORDER BY net_revenue DESC`; GROUP BY country`; const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("cities", ["country", "city", "population", "area"]), - ]); + const schema = createSchema("trino", [createTable("cities", ["country", "city", "population", "area"])]); const lineage = getLineage(ast as Select, schema); diff --git a/packages/lineage/test/indirect.test.ts b/packages/lineage/test/indirect.test.ts new file mode 100644 index 0000000..b279e05 --- /dev/null +++ b/packages/lineage/test/indirect.test.ts @@ -0,0 +1,358 @@ +import { describe, test, expect } from "bun:test"; +import { Parser } from "node-sql-parser"; +import type { AST, Select } from "node-sql-parser"; +import { + getExtendedLineage, + getJoinLineage, + getFilterLineage, + getGroupByLineage, + getOrderByLineage, + getWindowLineage, + getLineage, + type Schema, + type Table, + INDIRECT_JOIN, + INDIRECT_FILTER, + INDIRECT_GROUP_BY, + INDIRECT_SORT, + INDIRECT_WINDOW, + INDIRECT_CONDITION, + DIRECT_IDENTITY, + DIRECT_TRANSFORMATION, + DIRECT_AGGREGATION, +} from "../src/index.js"; + +const parser = new Parser(); + +// Helper function to create schemas +function createSchema(namespace: string, tables: Table[]): Schema { + return { namespace, tables }; +} + +function createTable(name: string, columns: string[]): Table { + return { name, columns }; +} + +// Helper to ensure we get a single AST +function parseSQL(sql: string): AST { + const result = parser.astify(sql, { database: "trino" }); + const ast = Array.isArray(result) ? result[0] : result; + + if (!ast) { + throw new Error("Failed to parse SQL"); + } + + return ast; +} + +describe("Indirect Lineage - JOIN", () => { + test("simple inner join", () => { + const sql = ` + SELECT u.id, u.name, o.total + FROM users u + JOIN orders o ON u.id = o.user_id + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name", "email"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const joinLineage = getJoinLineage(ast as Select, schema); + + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [INDIRECT_JOIN], + }); + + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "user_id", + transformations: [INDIRECT_JOIN], + }); + }); + + test("multiple joins", () => { + const sql = ` + SELECT u.name, o.total, p.name as product_name + FROM users u + JOIN orders o ON u.id = o.user_id + JOIN products p ON o.product_id = p.id + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id", "product_id", "total"]), + createTable("products", ["id", "name"]), + ]); + + const joinLineage = getJoinLineage(ast as Select, schema); + + // Should have join columns from all joins + expect(joinLineage.length).toBeGreaterThanOrEqual(4); + }); +}); + +describe("Indirect Lineage - FILTER (WHERE)", () => { + test("simple where clause", () => { + const sql = ` + SELECT id, name + FROM users + WHERE status = 'active' + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "status"])]); + + const filterLineage = getFilterLineage(ast as Select, schema); + + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "status", + transformations: [INDIRECT_FILTER], + }); + }); + + test("complex where clause with AND/OR", () => { + const sql = ` + SELECT id, name + FROM users + WHERE status = 'active' AND age > 18 OR country = 'US' + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "status", "age", "country"])]); + + const filterLineage = getFilterLineage(ast as Select, schema); + + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "status", + transformations: [INDIRECT_FILTER], + }); + + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "age", + transformations: [INDIRECT_FILTER], + }); + + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "country", + transformations: [INDIRECT_FILTER], + }); + }); +}); + +describe("Indirect Lineage - GROUP BY", () => { + test("simple group by", () => { + const sql = ` + SELECT country, COUNT(*) as user_count + FROM users + GROUP BY country + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "country"])]); + + const groupByLineage = getGroupByLineage(ast as Select, schema); + + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "country", + transformations: [INDIRECT_GROUP_BY], + }); + }); + + test("multiple group by columns", () => { + const sql = ` + SELECT country, city, COUNT(*) as user_count + FROM users + GROUP BY country, city + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "country", "city"])]); + + const groupByLineage = getGroupByLineage(ast as Select, schema); + + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "country", + transformations: [INDIRECT_GROUP_BY], + }); + + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "city", + transformations: [INDIRECT_GROUP_BY], + }); + }); +}); + +describe("Indirect Lineage - ORDER BY (SORT)", () => { + test("simple order by", () => { + const sql = ` + SELECT id, name + FROM users + ORDER BY created_at DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "created_at"])]); + + const orderByLineage = getOrderByLineage(ast as Select, schema); + + expect(orderByLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "created_at", + transformations: [INDIRECT_SORT], + }); + }); + + test("multiple order by columns", () => { + const sql = ` + SELECT id, name + FROM users + ORDER BY country ASC, created_at DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "country", "created_at"])]); + + const orderByLineage = getOrderByLineage(ast as Select, schema); + + expect(orderByLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "country", + transformations: [INDIRECT_SORT], + }); + + expect(orderByLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "created_at", + transformations: [INDIRECT_SORT], + }); + }); +}); + +describe("Extended Lineage", () => { + test("full query with all indirect types", () => { + const sql = ` + SELECT + u.country, + COUNT(u.id) as user_count, + SUM(o.total) as total_revenue + FROM users u + JOIN orders o ON u.id = o.user_id + WHERE u.status = 'active' + GROUP BY u.country + ORDER BY u.country DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name", "country", "status"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const extendedLineage = getExtendedLineage(ast as Select, schema); + + // Check field-level lineage + expect(extendedLineage.fields.country).toBeDefined(); + expect(extendedLineage.fields.user_count).toBeDefined(); + expect(extendedLineage.fields.total_revenue).toBeDefined(); + + // Check dataset-level lineage contains indirect transformations + expect(extendedLineage.dataset).toBeDefined(); + expect(extendedLineage.dataset!.length).toBeGreaterThan(0); + + // Should have JOIN lineage + const joinFields = extendedLineage.dataset!.filter((f) => f.transformations?.[0]?.subtype === "JOIN"); + expect(joinFields.length).toBeGreaterThan(0); + + // Should have FILTER lineage + const filterFields = extendedLineage.dataset!.filter((f) => f.transformations?.[0]?.subtype === "FILTER"); + expect(filterFields.length).toBeGreaterThan(0); + + // Should have GROUP_BY lineage + const groupByFields = extendedLineage.dataset!.filter((f) => f.transformations?.[0]?.subtype === "GROUP_BY"); + expect(groupByFields.length).toBeGreaterThan(0); + + // Should have SORT lineage + const sortFields = extendedLineage.dataset!.filter((f) => f.transformations?.[0]?.subtype === "SORT"); + expect(sortFields.length).toBeGreaterThan(0); + }); +}); + +describe("Direct Lineage - CASE/CONDITION", () => { + test("simple case when", () => { + const sql = ` + SELECT + id, + CASE + WHEN status = 'active' THEN 'Active User' + WHEN status = 'inactive' THEN 'Inactive User' + ELSE 'Unknown' + END as status_label + FROM users + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "status"])]); + + const lineage = getLineage(ast as Select, schema); + + // The status column should be in the lineage for status_label + expect(lineage.status_label).toBeDefined(); + expect(lineage.status_label?.inputFields.length).toBeGreaterThan(0); + + // Should have CONDITION transformation for the condition columns + const hasCondition = lineage.status_label?.inputFields.some( + (f) => + f.transformations?.some((t) => t.subtype === "CONDITION") || + f.transformations?.some((t) => t.subtype === "TRANSFORMATION"), + ); + expect(hasCondition).toBe(true); + }); + + test("case with expression in result", () => { + const sql = ` + SELECT + id, + CASE + WHEN quantity > 100 THEN price * 0.9 + ELSE price + END as final_price + FROM products + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("products", ["id", "price", "quantity"])]); + + const lineage = getLineage(ast as Select, schema); + + expect(lineage.final_price).toBeDefined(); + + // Should include both quantity (condition) and price (result) + const fields = lineage.final_price?.inputFields.map((f) => f.field); + expect(fields).toContain("price"); + expect(fields).toContain("quantity"); + }); +}); From ae4e2eadc3e44d4c6e01b80584fc66f512c48f6c Mon Sep 17 00:00:00 2001 From: its-elad <59926027+its-elad@users.noreply.github.com> Date: Thu, 29 Jan 2026 13:26:53 +0200 Subject: [PATCH 02/10] feat: fixed issues with window function parameters and handle recursive indirect relationships. --- packages/lineage/src/index.ts | 580 ++--- packages/lineage/test/extendedLineage.test.ts | 2086 +++++++++++++++++ packages/lineage/test/indirect.test.ts | 358 --- 3 files changed, 2378 insertions(+), 646 deletions(-) create mode 100644 packages/lineage/test/extendedLineage.test.ts delete mode 100644 packages/lineage/test/indirect.test.ts diff --git a/packages/lineage/src/index.ts b/packages/lineage/src/index.ts index a1ff9e3..a549be5 100644 --- a/packages/lineage/src/index.ts +++ b/packages/lineage/src/index.ts @@ -1,3 +1,7 @@ +// ============================================================================ +// Imports +// ============================================================================ + import { type ColumnLineageDatasetFacet, type InputField, @@ -21,6 +25,10 @@ import { } from "node-sql-parser"; import { HashSet } from "./hashset"; +// ============================================================================ +// Types +// ============================================================================ + type Transformation = Exclude<_Transformation, "masking"> & { masking: boolean; // output boolean only for easier testing }; @@ -42,6 +50,10 @@ const MASKING_FUNCTIONS = new Set([ "REDACT", ]); +// ============================================================================ +// Transformation Constants +// ============================================================================ + // Direct transformation constants export const DIRECT_TRANSFORMATION: Transformation = { type: "DIRECT", @@ -98,14 +110,6 @@ export const INDIRECT_CONDITION: Transformation = { masking: false, }; -function createTransformation( - type: TransformationType, - subtype: TransformationSubtype, - masking: boolean = false, -): Transformation { - return { type, subtype, masking }; -} - function mergeTransformations(parent: Transformation | undefined, child: Transformation): Transformation { if (!parent) { return child; @@ -149,6 +153,10 @@ class TransformationSet extends HashSet { } } +// ============================================================================ +// Exported Types +// ============================================================================ + export type Column = { name: string; }; @@ -180,6 +188,10 @@ export interface ExtendedLineageResult { dataset?: InputField[]; } +// ============================================================================ +// Column Name Utilities +// ============================================================================ + export function isColumn(selectColumn: Select["columns"][number]): selectColumn is AstColumn { return ( typeof selectColumn === "object" && @@ -307,8 +319,63 @@ export function extractColumnRefs(expr: ExpressionValue | null | undefined): Col return refs; } +// ============================================================================ +// Window Function Helpers (needed by both field-level and dataset-level lineage) +// ============================================================================ + +/** + * Type for OVER clause structure (shared between aggr_func and function types) + */ +type OverClause = { + // Direct structure (legacy/simple case) + partitionby?: ExpressionValue[]; + orderby?: Array<{ expr: ExpressionValue }>; + // Nested structure (Trino parser output) + as_window_specification?: { + window_specification?: { + partitionby?: Array<{ expr: ExpressionValue }>; + orderby?: Array<{ expr: ExpressionValue }>; + }; + }; +}; + /** - * Get transformations from expression, now supporting CASE/IF for CONDITION subtype + * Extract expressions from an OVER clause object (PARTITION BY and ORDER BY) + * This is a helper used by both field-level and dataset-level lineage extraction. + */ +function extractWindowExpressionsFromOver(over: OverClause): ExpressionValue[] { + const expressions: ExpressionValue[] = []; + + // Handle nested structure (Trino parser output) + const windowSpec = over.as_window_specification?.window_specification; + if (windowSpec) { + if (windowSpec.partitionby) { + expressions.push(...windowSpec.partitionby.map((item) => item.expr)); + } + if (windowSpec.orderby) { + expressions.push(...windowSpec.orderby.map((item) => item.expr)); + } + } + + // Handle direct structure (legacy/simple case) as fallback + if (expressions.length === 0) { + if (over.partitionby) { + expressions.push(...over.partitionby); + } + if (over.orderby) { + expressions.push(...over.orderby.map((item) => item.expr)); + } + } + + return expressions; +} + +// ============================================================================ +// Direct Transformation Extraction +// ============================================================================ + +/** + * Get transformations from expression, supporting CASE/IF for CONDITION subtype */ export function getDirectTransformationsFromExprValue( expr: ExpressionValue, @@ -354,40 +421,79 @@ export function getDirectTransformationsFromExprValue( case "aggr_func": { const aggExpr = expr as AggrFunc; - return getDirectTransformationsFromExprValue( - aggExpr.args.expr, - mergeTransformations(parentTransformation, { - ...DIRECT_AGGREGATION, - masking: MASKING_AGG_FUNCTIONS.has(aggExpr.name), - }), - ); + const merged: Record = {}; + + // Extract lineage from aggregate function arguments + if (aggExpr.args?.expr) { + const argTransformations = getDirectTransformationsFromExprValue( + aggExpr.args.expr, + mergeTransformations(parentTransformation, { + ...DIRECT_AGGREGATION, + masking: MASKING_AGG_FUNCTIONS.has(aggExpr.name), + }), + ); + Object.entries(argTransformations).forEach(([key, value]) => { + merged[key] = merged[key] ? merged[key].union(value) : value; + }); + } + + // For window functions (aggr_func with OVER clause), also extract columns from PARTITION BY/ORDER BY + if ("over" in aggExpr && aggExpr.over) { + const windowExprs = extractWindowExpressionsFromOver(aggExpr.over); + for (const windowExpr of windowExprs) { + const windowTransformations = getDirectTransformationsFromExprValue( + windowExpr, + mergeTransformations(parentTransformation, DIRECT_AGGREGATION), + ); + Object.entries(windowTransformations).forEach(([key, value]) => { + merged[key] = merged[key] ? merged[key].union(value) : value; + }); + } + } + + return merged; } case "function": { const funcExpr = expr as AstFunction; + const merged: Record = {}; - return ( - funcExpr.args?.value.reduce( - (acc, arg) => { - const argTransformations = getDirectTransformationsFromExprValue( - arg, - mergeTransformations(parentTransformation, { - ...DIRECT_TRANSFORMATION, - masking: - funcExpr.name.name.length > 0 && - MASKING_FUNCTIONS.has(funcExpr.name.name.at(-1)!.value.toUpperCase()), - }), - ); + // Extract lineage from function arguments + if (funcExpr.args?.value) { + for (const arg of funcExpr.args.value) { + const argTransformations = getDirectTransformationsFromExprValue( + arg, + mergeTransformations(parentTransformation, { + ...DIRECT_TRANSFORMATION, + masking: + funcExpr.name.name.length > 0 && + MASKING_FUNCTIONS.has(funcExpr.name.name.at(-1)!.value.toUpperCase()), + }), + ); + Object.entries(argTransformations).forEach(([key, value]) => { + merged[key] = merged[key] ? merged[key].union(value) : value; + }); + } + } - Object.entries(argTransformations).forEach(([key, value]) => { - acc[key] = acc[key] ? acc[key].intersection(value) : value; - }); + // For window functions (function with OVER clause like RANK(), ROW_NUMBER()), + // extract columns from PARTITION BY/ORDER BY since these functions have no arguments + if ("over" in funcExpr && funcExpr.over) { + const windowExprs = extractWindowExpressionsFromOver( + (funcExpr as AstFunction & { over: OverClause }).over, + ); + for (const windowExpr of windowExprs) { + const windowTransformations = getDirectTransformationsFromExprValue( + windowExpr, + mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION), + ); + Object.entries(windowTransformations).forEach(([key, value]) => { + merged[key] = merged[key] ? merged[key].union(value) : value; + }); + } + } - return acc; - }, - {} as Record, - ) ?? {} - ); + return merged; } case "case": { @@ -469,53 +575,108 @@ export function getIndirectTransformationsFromExpr( return result; } +// ============================================================================ +// Indirect Lineage Extraction Helpers +// ============================================================================ + /** - * Extract JOIN lineage from FROM clause + * Resolves a column reference to an InputField by finding the matching table in schema. + * This is the core helper that eliminates repetitive table lookup logic. */ -export function getJoinLineage(select: Select, schema: Schema): InputField[] { +function resolveColumnRefToInputField( + ref: ColumnRefItem, + regularTables: BaseFrom[], + schema: Schema, + transformation: Transformation, +): InputField | null { + const columnName = getInputColumnName(ref); + const tableName = ref.table; + + if (!columnName) return null; + + const table = regularTables.find( + (t) => + (!tableName || tableName === t.table || tableName === t.as) && + schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), + ); + + if (!table) return null; + + const schemaTable = schema.tables.find((s) => s.name === table.table); + if (!schemaTable) return null; + + return { + namespace: schema.namespace, + name: schemaTable.name, + field: columnName, + transformations: [transformation], + }; +} + +/** + * Extracts InputFields from column references in an expression. + * Common pattern used by WHERE, HAVING, GROUP BY, ORDER BY, etc. + */ +function extractInputFieldsFromExpression( + expr: ExpressionValue | null | undefined, + regularTables: BaseFrom[], + schema: Schema, + transformation: Transformation, +): InputField[] { + if (!expr) return []; + + const columnRefs = extractColumnRefs(expr); const inputFields: InputField[] = []; - if (!select.from) return inputFields; + for (const ref of columnRefs) { + const inputField = resolveColumnRefToInputField(ref, regularTables, schema, transformation); + if (inputField) { + inputFields.push(inputField); + } + } + + return inputFields; +} + +/** + * Extracts InputFields from multiple expressions. + */ +function extractInputFieldsFromExpressions( + expressions: (ExpressionValue | null | undefined)[], + regularTables: BaseFrom[], + schema: Schema, + transformation: Transformation, +): InputField[] { + return expressions.flatMap((expr) => + extractInputFieldsFromExpression(expr, regularTables, schema, transformation), + ); +} + +// ============================================================================ +// Clause-Specific Lineage Extractors +// ============================================================================ + +/** + * Extract JOIN lineage from FROM clause (ON and USING conditions) + */ +export function getJoinLineage(select: Select, schema: Schema): InputField[] { + if (!select.from) return []; const fromItems = Array.isArray(select.from) ? select.from : [select.from]; const { regularTables } = getTableExpressionsFromSelect(select); + const inputFields: InputField[] = []; for (const item of fromItems) { - // Check for JOIN conditions + // Handle ON clause if ("on" in item && item.on) { - const columnRefs = extractColumnRefs(item.on as ExpressionValue); - - for (const ref of columnRefs) { - const columnName = getInputColumnName(ref); - const tableName = ref.table; - - if (columnName) { - // Find the table - tableName might be an alias, so check against both table name and alias - const table = regularTables.find( - (t) => - (!tableName || tableName === t.table || tableName === t.as) && - schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), - ); - - if (table) { - const schemaTable = schema.tables.find((s) => s.name === table.table); - if (schemaTable) { - inputFields.push({ - namespace: schema.namespace, - name: schemaTable.name, - field: columnName, - transformations: [INDIRECT_JOIN], - }); - } - } - } - } + inputFields.push( + ...extractInputFieldsFromExpression(item.on as ExpressionValue, regularTables, schema, INDIRECT_JOIN), + ); } - // Check for USING clause + // Handle USING clause - columns exist in multiple tables if ("using" in item && Array.isArray(item.using)) { for (const usingCol of item.using) { - // USING columns exist in multiple tables for (const schemaTable of schema.tables) { if (schemaTable.columns.includes(usingCol)) { inputFields.push({ @@ -537,270 +698,102 @@ export function getJoinLineage(select: Select, schema: Schema): InputField[] { * Extract WHERE clause lineage (FILTER) */ export function getFilterLineage(select: Select, schema: Schema): InputField[] { - const inputFields: InputField[] = []; + if (!select.where) return []; - if (!select.where) return inputFields; - - const columnRefs = extractColumnRefs(select.where as ExpressionValue); const { regularTables } = getTableExpressionsFromSelect(select); - - for (const ref of columnRefs) { - const columnName = getInputColumnName(ref); - const tableName = ref.table; - - if (columnName) { - // Find the table in schema - const table = regularTables.find( - (t) => - (!tableName || tableName === t.table || tableName === t.as) && - schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), - ); - - if (table) { - const schemaTable = schema.tables.find((s) => s.name === table.table); - if (schemaTable) { - inputFields.push({ - namespace: schema.namespace, - name: schemaTable.name, - field: columnName, - transformations: [INDIRECT_FILTER], - }); - } - } - } - } - - return inputFields; + return extractInputFieldsFromExpression(select.where as ExpressionValue, regularTables, schema, INDIRECT_FILTER); } /** * Extract GROUP BY lineage */ export function getGroupByLineage(select: Select, schema: Schema): InputField[] { - const inputFields: InputField[] = []; + if (!select.groupby) return []; - if (!select.groupby) return inputFields; - - // Handle both array format and object with columns property - let groupByItems: ExpressionValue[]; - if (Array.isArray(select.groupby)) { - groupByItems = select.groupby; - } else if ( - typeof select.groupby === "object" && - "columns" in select.groupby && - Array.isArray(select.groupby.columns) - ) { - groupByItems = select.groupby.columns; - } else { - groupByItems = [select.groupby as unknown as ExpressionValue]; - } + // Normalize GROUP BY to array format + const groupByItems = normalizeGroupByItems(select.groupby); const { regularTables } = getTableExpressionsFromSelect(select); - for (const item of groupByItems) { - const columnRefs = extractColumnRefs(item as ExpressionValue); - - for (const ref of columnRefs) { - const columnName = getInputColumnName(ref); - const tableName = ref.table; - - if (columnName) { - const table = regularTables.find( - (t) => - (!tableName || tableName === t.table || tableName === t.as) && - schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), - ); + return extractInputFieldsFromExpressions(groupByItems, regularTables, schema, INDIRECT_GROUP_BY); +} - if (table) { - const schemaTable = schema.tables.find((s) => s.name === table.table); - if (schemaTable) { - inputFields.push({ - namespace: schema.namespace, - name: schemaTable.name, - field: columnName, - transformations: [INDIRECT_GROUP_BY], - }); - } - } - } - } +/** + * Normalize GROUP BY clause to array of ExpressionValue + */ +function normalizeGroupByItems(groupby: Select["groupby"]): ExpressionValue[] { + if (Array.isArray(groupby)) { + return groupby; } - - return inputFields; + if (typeof groupby === "object" && groupby && "columns" in groupby && Array.isArray(groupby.columns)) { + return groupby.columns; + } + return [groupby as unknown as ExpressionValue]; } /** * Extract ORDER BY lineage (SORT) */ export function getOrderByLineage(select: Select, schema: Schema): InputField[] { - const inputFields: InputField[] = []; - - if (!select.orderby) return inputFields; + if (!select.orderby) return []; const orderByItems = Array.isArray(select.orderby) ? select.orderby : [select.orderby]; const { regularTables } = getTableExpressionsFromSelect(select); - for (const item of orderByItems) { - const expr = "expr" in item ? item.expr : item; - const columnRefs = extractColumnRefs(expr as ExpressionValue); - - for (const ref of columnRefs) { - const columnName = getInputColumnName(ref); - const tableName = ref.table; - - if (columnName) { - const table = regularTables.find( - (t) => - (!tableName || tableName === t.table || tableName === t.as) && - schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), - ); - - if (table) { - const schemaTable = schema.tables.find((s) => s.name === table.table); - if (schemaTable) { - inputFields.push({ - namespace: schema.namespace, - name: schemaTable.name, - field: columnName, - transformations: [INDIRECT_SORT], - }); - } - } - } - } - } - - return inputFields; + const expressions = orderByItems.map((item) => ("expr" in item ? item.expr : item) as ExpressionValue); + return extractInputFieldsFromExpressions(expressions, regularTables, schema, INDIRECT_SORT); } /** - * Extract WINDOW function lineage from SELECT columns + * Extract WINDOW function lineage from SELECT columns (PARTITION BY and ORDER BY in OVER clause) */ export function getWindowLineage(select: Select, schema: Schema): InputField[] { - const inputFields: InputField[] = []; - const { regularTables } = getTableExpressionsFromSelect(select); + if (!select.columns || (typeof select.columns === "string" && select.columns === "*")) { + return []; + } - if (!select.columns || (typeof select.columns === "string" && select.columns === "*")) return inputFields; + const { regularTables } = getTableExpressionsFromSelect(select); + const inputFields: InputField[] = []; for (const col of select.columns) { if (!isColumn(col)) continue; - // Check if this is a window function (has OVER clause) - const expr = col.expr; - if (expr.type === "aggr_func" && "over" in expr && (expr as AggrFunc & { over?: unknown }).over) { - const aggrFunc = expr as AggrFunc & { - over?: { - partitionby?: ExpressionValue[]; - orderby?: Array<{ expr: ExpressionValue }>; - }; - }; - - // Extract PARTITION BY columns - if (aggrFunc.over?.partitionby) { - for (const partExpr of aggrFunc.over.partitionby) { - const columnRefs = extractColumnRefs(partExpr); - for (const ref of columnRefs) { - const columnName = getInputColumnName(ref); - const tableName = ref.table; - - if (columnName) { - const table = regularTables.find( - (t) => - (!tableName || tableName === t.table || tableName === t.as) && - schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), - ); - - if (table) { - const schemaTable = schema.tables.find((s) => s.name === table.table); - if (schemaTable) { - inputFields.push({ - namespace: schema.namespace, - name: schemaTable.name, - field: columnName, - transformations: [INDIRECT_WINDOW], - }); - } - } - } - } - } - } - - // Extract ORDER BY within OVER clause - if (aggrFunc.over?.orderby) { - for (const orderItem of aggrFunc.over.orderby) { - const columnRefs = extractColumnRefs(orderItem.expr); - for (const ref of columnRefs) { - const columnName = getInputColumnName(ref); - const tableName = ref.table; - - if (columnName) { - const table = regularTables.find( - (t) => - (!tableName || tableName === t.table || tableName === t.as) && - schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), - ); - - if (table) { - const schemaTable = schema.tables.find((s) => s.name === table.table); - if (schemaTable) { - inputFields.push({ - namespace: schema.namespace, - name: schemaTable.name, - field: columnName, - transformations: [INDIRECT_WINDOW], - }); - } - } - } - } - } - } - } + const windowExprs = extractWindowExpressions(col.expr); + inputFields.push(...extractInputFieldsFromExpressions(windowExprs, regularTables, schema, INDIRECT_WINDOW)); } return inputFields; } /** - * Extract HAVING clause lineage (combines FILTER with AGGREGATION context) + * Extract expressions from OVER clause in an expression (PARTITION BY and ORDER BY) + * Handles the parser structure: over.as_window_specification.window_specification.{partitionby,orderby} + * Supports both aggr_func (e.g., SUM() OVER) and function types (e.g., ROW_NUMBER() OVER) */ -export function getHavingLineage(select: Select, schema: Schema): InputField[] { - const inputFields: InputField[] = []; +function extractWindowExpressions(expr: ExpressionValue): ExpressionValue[] { + // Support both aggr_func and function types with OVER clause + if ((expr.type !== "aggr_func" && expr.type !== "function") || !("over" in expr)) return []; - if (!select.having) return inputFields; + const exprWithOver = expr as (AggrFunc | AstFunction) & { over?: OverClause }; - // TODO - check type - const columnRefs = extractColumnRefs(select.having as unknown as ExpressionValue); - const { regularTables } = getTableExpressionsFromSelect(select); - - for (const ref of columnRefs) { - const columnName = getInputColumnName(ref); - const tableName = ref.table; + if (!exprWithOver.over) return []; - if (columnName) { - const table = regularTables.find( - (t) => - (!tableName || tableName === t.table || tableName === t.as) && - schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), - ); + return extractWindowExpressionsFromOver(exprWithOver.over); +} - if (table) { - const schemaTable = schema.tables.find((s) => s.name === table.table); - if (schemaTable) { - inputFields.push({ - namespace: schema.namespace, - name: schemaTable.name, - field: columnName, - transformations: [INDIRECT_FILTER], - }); - } - } - } - } +/** + * Extract HAVING clause lineage (FILTER in aggregation context) + */ +export function getHavingLineage(select: Select, schema: Schema): InputField[] { + if (!select.having) return []; - return inputFields; + const { regularTables } = getTableExpressionsFromSelect(select); + return extractInputFieldsFromExpression(select.having as unknown as ExpressionValue, regularTables, schema, INDIRECT_FILTER); } +// ============================================================================ +// Table Expression Helpers +// ============================================================================ + export function getTableExpressionsFromSelect(select: Select): { regularTables: BaseFrom[]; selectTables: SelectWithAlias[]; @@ -866,6 +859,10 @@ export function mergeTransformationSet(parent: TransformationSet, child: Transfo return merged; } +// ============================================================================ +// Main Lineage Functions +// ============================================================================ + export function getColumnLineage( select: Select, schema: Schema, @@ -895,7 +892,7 @@ export function getColumnLineage( const table = regularTables.find( (t) => (!inputColumn.table || inputColumn.table === t.table || inputColumn.table === t.as) && - schema.tables.some((s) => s.name === t.table && s.columns.some((c) => c === inputColumn.name)), + schema.tables.some((s) => s.name === t.table && s.columns.includes(inputColumn.name)), ); if (table) { @@ -938,11 +935,12 @@ export function getColumnLineage( /** * Get all dataset-level indirect lineage (columns that affect the entire result set) + * This includes lineage from CTEs and subqueries that contribute to the final result. */ export function getDatasetLineage(select: Select, schema: Schema): InputField[] { const allIndirectFields: InputField[] = []; - // Collect all indirect lineage + // Collect all indirect lineage from the outermost SELECT allIndirectFields.push(...getJoinLineage(select, schema)); allIndirectFields.push(...getFilterLineage(select, schema)); allIndirectFields.push(...getGroupByLineage(select, schema)); @@ -950,6 +948,12 @@ export function getDatasetLineage(select: Select, schema: Schema): InputField[] allIndirectFields.push(...getWindowLineage(select, schema)); allIndirectFields.push(...getHavingLineage(select, schema)); + // Recursively collect dataset lineage from CTEs and subqueries + const { selectTables } = getTableExpressionsFromSelect(select); + for (const selectTable of selectTables) { + allIndirectFields.push(...getDatasetLineage(selectTable, schema)); + } + // Deduplicate by creating a map keyed by namespace.table.field.type.subtype const deduped = new Map(); for (const field of allIndirectFields) { diff --git a/packages/lineage/test/extendedLineage.test.ts b/packages/lineage/test/extendedLineage.test.ts new file mode 100644 index 0000000..d1d0614 --- /dev/null +++ b/packages/lineage/test/extendedLineage.test.ts @@ -0,0 +1,2086 @@ +import { describe, test, expect } from "bun:test"; +import { Parser } from "node-sql-parser"; +import type { AST, Select } from "node-sql-parser"; +import { + getExtendedLineage, + type Schema, + type Table, + INDIRECT_JOIN, + INDIRECT_FILTER, + INDIRECT_GROUP_BY, + INDIRECT_SORT, + INDIRECT_WINDOW, + DIRECT_IDENTITY, + DIRECT_TRANSFORMATION, + DIRECT_AGGREGATION, +} from "../src/index.js"; + +const parser = new Parser(); + +// Helper function to create schemas +function createSchema(namespace: string, tables: Table[]): Schema { + return { namespace, tables }; +} + +function createTable(name: string, columns: string[]): Table { + return { name, columns }; +} + +// Helper to ensure we get a single AST +function parseSQL(sql: string): AST { + const result = parser.astify(sql, { database: "trino" }); + const ast = Array.isArray(result) ? result[0] : result; + + if (!ast) { + throw new Error("Failed to parse SQL"); + } + + return ast; +} + +// Helper to find dataset lineage entries by transformation subtype +function findBySubtype(dataset: ReturnType["dataset"], subtype: string) { + return dataset?.filter((f) => f.transformations?.[0]?.subtype === subtype) ?? []; +} + +// Helper to find dataset lineage entry by field name and subtype +function findFieldBySubtype( + dataset: ReturnType["dataset"], + fieldName: string, + subtype: string, +) { + return dataset?.find((f) => f.field === fieldName && f.transformations?.[0]?.subtype === subtype); +} + +// ============================================================================= +// SIMPLE TESTS - Single Clause Scenarios +// ============================================================================= + +describe("getExtendedLineage - Simple SELECT (no indirect lineage)", () => { + test("simple select without any indirect clauses", () => { + const sql = `SELECT id, name FROM users`; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage should exist + expect(result.fields.id).toBeDefined(); + expect(result.fields.name).toBeDefined(); + expect(result.fields.id?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(result.fields.name?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + + // Dataset-level lineage should be empty + expect(result.dataset).toEqual([]); + }); + + test("select with alias", () => { + const sql = `SELECT id as user_id, name as user_name FROM users`; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name"])]); + + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields.user_id?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(result.fields.user_name?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + expect(result.dataset).toEqual([]); + }); +}); + +describe("getExtendedLineage - JOIN only", () => { + test("simple INNER JOIN", () => { + const sql = ` + SELECT u.id, u.name, o.total + FROM users u + JOIN orders o ON u.id = o.user_id + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage + expect(result.fields.id?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + + // Dataset-level lineage - JOIN + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(2); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [INDIRECT_JOIN], + }); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "user_id", + transformations: [INDIRECT_JOIN], + }); + }); + + test("LEFT JOIN", () => { + const sql = ` + SELECT u.id, o.total + FROM users u + LEFT JOIN orders o ON u.id = o.user_id + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(2); + }); + + test("RIGHT JOIN", () => { + const sql = ` + SELECT u.id, o.total + FROM users u + RIGHT JOIN orders o ON u.id = o.user_id + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(2); + }); + + test("multiple JOINs", () => { + const sql = ` + SELECT u.name, o.total, p.name as product_name + FROM users u + JOIN orders o ON u.id = o.user_id + JOIN products p ON o.product_id = p.id + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id", "product_id", "total"]), + createTable("products", ["id", "name"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(4); // u.id, o.user_id, o.product_id, p.id + }); + + test("JOIN with complex ON condition", () => { + const sql = ` + SELECT u.id, o.total + FROM users u + JOIN orders o ON u.id = o.user_id AND u.status = o.status + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "status"]), + createTable("orders", ["id", "user_id", "status", "total"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(4); // u.id, o.user_id, u.status, o.status + }); + + test("self JOIN", () => { + const sql = ` + SELECT e.name as employee, m.name as manager + FROM employees e + JOIN employees m ON e.manager_id = m.id + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("employees", ["id", "name", "manager_id"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBeGreaterThanOrEqual(2); + }); +}); + +describe("getExtendedLineage - WHERE only (FILTER)", () => { + test("simple WHERE clause", () => { + const sql = ` + SELECT id, name + FROM users + WHERE status = 'active' + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "status"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(1); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "status", + transformations: [INDIRECT_FILTER], + }); + }); + + test("WHERE with AND", () => { + const sql = ` + SELECT id, name + FROM users + WHERE status = 'active' AND age > 18 + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "status", "age"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(2); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "status", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "age", + transformations: [INDIRECT_FILTER], + }); + }); + + test("WHERE with OR", () => { + const sql = ` + SELECT id, name + FROM users + WHERE status = 'active' OR country = 'US' + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "status", "country"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(2); + }); + + test("WHERE with complex nested conditions", () => { + const sql = ` + SELECT id, name + FROM users + WHERE (status = 'active' AND age > 18) OR (country = 'US' AND verified = true) + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name", "status", "age", "country", "verified"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(4); + }); + + test("WHERE with IN clause", () => { + const sql = ` + SELECT id, name + FROM users + WHERE country IN ('US', 'UK', 'CA') + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "country"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(1); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "country", + transformations: [INDIRECT_FILTER], + }); + }); + + test("WHERE with BETWEEN", () => { + const sql = ` + SELECT id, name + FROM users + WHERE age BETWEEN 18 AND 65 + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "age"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(1); + }); + + test("WHERE with LIKE", () => { + const sql = ` + SELECT id, name + FROM users + WHERE name LIKE 'John%' + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(1); + }); + + test("WHERE with IS NULL", () => { + const sql = ` + SELECT id, name + FROM users + WHERE email IS NULL + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(1); + }); +}); + +describe("getExtendedLineage - GROUP BY only", () => { + test("simple GROUP BY", () => { + const sql = ` + SELECT country, COUNT(*) as count + FROM users + GROUP BY country + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "country"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage.length).toBe(1); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "country", + transformations: [INDIRECT_GROUP_BY], + }); + }); + + test("multiple GROUP BY columns", () => { + const sql = ` + SELECT country, city, COUNT(*) as count + FROM users + GROUP BY country, city + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "country", "city"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage.length).toBe(2); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "country", + transformations: [INDIRECT_GROUP_BY], + }); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "city", + transformations: [INDIRECT_GROUP_BY], + }); + }); + + test("GROUP BY with aggregation functions", () => { + const sql = ` + SELECT + department, + SUM(salary) as total_salary, + AVG(age) as avg_age, + MIN(hire_date) as first_hire + FROM employees + GROUP BY department + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("employees", ["id", "department", "salary", "age", "hire_date"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage should show aggregations + expect(result.fields.total_salary?.inputFields).toContainEqual({ + namespace: "trino", + name: "employees", + field: "salary", + transformations: [DIRECT_AGGREGATION], + }); + + // Dataset-level lineage should show GROUP_BY + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage.length).toBe(1); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "employees", + field: "department", + transformations: [INDIRECT_GROUP_BY], + }); + }); +}); + +describe("getExtendedLineage - ORDER BY only (SORT)", () => { + test("simple ORDER BY", () => { + const sql = ` + SELECT id, name + FROM users + ORDER BY created_at DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "created_at"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const sortLineage = findBySubtype(result.dataset, "SORT"); + expect(sortLineage.length).toBe(1); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "created_at", + transformations: [INDIRECT_SORT], + }); + }); + + test("multiple ORDER BY columns", () => { + const sql = ` + SELECT id, name + FROM users + ORDER BY country ASC, name DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "country"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const sortLineage = findBySubtype(result.dataset, "SORT"); + expect(sortLineage.length).toBe(2); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "country", + transformations: [INDIRECT_SORT], + }); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "name", + transformations: [INDIRECT_SORT], + }); + }); + + test("ORDER BY with NULLS FIRST/LAST", () => { + const sql = ` + SELECT id, name + FROM users + ORDER BY email NULLS LAST + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const sortLineage = findBySubtype(result.dataset, "SORT"); + expect(sortLineage.length).toBe(1); + }); +}); + +describe("getExtendedLineage - HAVING only", () => { + test("simple HAVING clause", () => { + const sql = ` + SELECT country, COUNT(*) as count + FROM users + GROUP BY country + HAVING COUNT(*) > 10 + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "country"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // GROUP BY lineage + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage.length).toBe(1); + + // Note: COUNT(*) doesn't reference a specific column, so HAVING may not add to dataset + }); + + test("HAVING with column reference", () => { + const sql = ` + SELECT department, SUM(salary) as total_salary + FROM employees + GROUP BY department + HAVING SUM(salary) > 100000 + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("employees", ["id", "department", "salary"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // Should have GROUP_BY + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage.length).toBe(1); + + // HAVING filters should show as FILTER + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "employees", + field: "salary", + transformations: [INDIRECT_FILTER], + }); + }); + + test("HAVING with multiple conditions", () => { + const sql = ` + SELECT department, AVG(age) as avg_age + FROM employees + GROUP BY department + HAVING AVG(age) > 30 AND COUNT(id) > 5 + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("employees", ["id", "department", "age"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBeGreaterThanOrEqual(1); + }); +}); + +describe("getExtendedLineage - WINDOW functions", () => { + test("window function with PARTITION BY and ORDER BY - full lineage captured", () => { + const sql = ` + SELECT + id, + department, + SUM(salary) OVER (PARTITION BY department ORDER BY salary DESC) as running_total + FROM employees + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("employees", ["id", "department", "salary"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage is captured correctly + expect(result.fields.id?.inputFields).toContainEqual({ + namespace: "trino", + name: "employees", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(result.fields.department?.inputFields).toContainEqual({ + namespace: "trino", + name: "employees", + field: "department", + transformations: [DIRECT_IDENTITY], + }); + // The aggregated column in window function is captured + expect(result.fields.running_total?.inputFields).toContainEqual({ + namespace: "trino", + name: "employees", + field: "salary", + transformations: [DIRECT_AGGREGATION], + }); + + // Dataset-level WINDOW lineage from PARTITION BY and ORDER BY + const windowLineage = findBySubtype(result.dataset, "WINDOW"); + expect(windowLineage.length).toBe(2); + expect(windowLineage).toContainEqual({ + namespace: "trino", + name: "employees", + field: "department", + transformations: [INDIRECT_WINDOW], + }); + expect(windowLineage).toContainEqual({ + namespace: "trino", + name: "employees", + field: "salary", + transformations: [INDIRECT_WINDOW], + }); + }); + + test("window function with filter - combined lineage", () => { + const sql = ` + SELECT + id, + SUM(amount) OVER (ORDER BY date) as running_total + FROM transactions + WHERE status = 'completed' + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("transactions", ["id", "amount", "date", "status"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // WINDOW lineage from ORDER BY in OVER clause + const windowLineage = findBySubtype(result.dataset, "WINDOW"); + expect(windowLineage.length).toBe(1); + expect(windowLineage).toContainEqual({ + namespace: "trino", + name: "transactions", + field: "date", + transformations: [INDIRECT_WINDOW], + }); + + // FILTER lineage from WHERE is captured + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "transactions", + field: "status", + transformations: [INDIRECT_FILTER], + }); + }); + + test("multiple window functions", () => { + const sql = ` + SELECT + id, + ROW_NUMBER() OVER (PARTITION BY category ORDER BY created_at) as row_num, + SUM(amount) OVER (PARTITION BY user_id ORDER BY created_at) as running_total + FROM orders + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("orders", ["id", "category", "created_at", "amount", "user_id"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const windowLineage = findBySubtype(result.dataset, "WINDOW"); + // category, created_at (from first window), user_id, created_at (from second window) + // created_at should be deduplicated + expect(windowLineage.length).toBe(3); + expect(windowLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "category", + transformations: [INDIRECT_WINDOW], + }); + expect(windowLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "user_id", + transformations: [INDIRECT_WINDOW], + }); + expect(windowLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "created_at", + transformations: [INDIRECT_WINDOW], + }); + }); +}); + +describe("getExtendedLineage - CASE expressions (CONDITION)", () => { + test("simple CASE WHEN", () => { + const sql = ` + SELECT + id, + CASE + WHEN status = 'active' THEN 'Active' + ELSE 'Inactive' + END as status_label + FROM users + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "status"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage should contain the status column with CONDITION + expect(result.fields.status_label).toBeDefined(); + const hasConditionOrTransformation = result.fields.status_label?.inputFields.some( + (f) => + f.transformations?.some((t) => t.subtype === "CONDITION") || + f.transformations?.some((t) => t.subtype === "TRANSFORMATION"), + ); + expect(hasConditionOrTransformation).toBe(true); + }); + + test("CASE with multiple conditions", () => { + const sql = ` + SELECT + id, + CASE + WHEN age < 18 THEN 'Minor' + WHEN age < 65 THEN 'Adult' + ELSE 'Senior' + END as age_group + FROM users + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "age"])]); + + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields.age_group).toBeDefined(); + expect(result.fields.age_group?.inputFields.length).toBeGreaterThan(0); + }); + + test("CASE with column in result", () => { + const sql = ` + SELECT + id, + CASE + WHEN discount_type = 'percent' THEN price * (1 - discount_value / 100) + ELSE price - discount_value + END as final_price + FROM products + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("products", ["id", "price", "discount_type", "discount_value"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const inputFields = result.fields.final_price?.inputFields.map((f) => f.field); + expect(inputFields).toContain("price"); + expect(inputFields).toContain("discount_type"); + expect(inputFields).toContain("discount_value"); + }); +}); + +// ============================================================================= +// COMPLEX TESTS - Multiple Clauses Combined +// ============================================================================= + +describe("getExtendedLineage - JOIN + WHERE", () => { + test("JOIN with WHERE filter", () => { + const sql = ` + SELECT u.id, u.name, o.total + FROM users u + JOIN orders o ON u.id = o.user_id + WHERE u.status = 'active' AND o.total > 100 + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name", "status"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // JOIN lineage + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(2); + + // FILTER lineage + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(2); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "status", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "total", + transformations: [INDIRECT_FILTER], + }); + }); +}); + +describe("getExtendedLineage - JOIN + GROUP BY", () => { + test("JOIN with GROUP BY aggregation", () => { + const sql = ` + SELECT + u.country, + COUNT(o.id) as order_count, + SUM(o.total) as total_revenue + FROM users u + JOIN orders o ON u.id = o.user_id + GROUP BY u.country + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "country"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage with aggregations + expect(result.fields.order_count?.inputFields[0]?.transformations).toContainEqual( + expect.objectContaining({ type: "DIRECT", subtype: "AGGREGATION" }), + ); + + // JOIN lineage + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(2); + + // GROUP BY lineage + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage.length).toBe(1); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "country", + transformations: [INDIRECT_GROUP_BY], + }); + }); +}); + +describe("getExtendedLineage - WHERE + GROUP BY + HAVING", () => { + test("full aggregation query", () => { + const sql = ` + SELECT + department, + COUNT(*) as employee_count, + AVG(salary) as avg_salary + FROM employees + WHERE status = 'active' + GROUP BY department + HAVING COUNT(*) > 5 + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("employees", ["id", "department", "salary", "status"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // FILTER lineage (from WHERE) + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "employees", + field: "status", + transformations: [INDIRECT_FILTER], + }); + + // GROUP BY lineage + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "employees", + field: "department", + transformations: [INDIRECT_GROUP_BY], + }); + }); +}); + +describe("getExtendedLineage - Full query with all clauses", () => { + test("comprehensive query with all indirect lineage types", () => { + const sql = ` + SELECT + u.country, + COUNT(u.id) as user_count, + SUM(o.total) as total_revenue, + ROW_NUMBER() OVER (ORDER BY SUM(o.total) DESC) as revenue_rank + FROM users u + JOIN orders o ON u.id = o.user_id + WHERE u.status = 'active' AND o.order_date >= '2024-01-01' + GROUP BY u.country + HAVING SUM(o.total) > 1000 + ORDER BY total_revenue DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "country", "status"]), + createTable("orders", ["id", "user_id", "total", "order_date"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage + expect(result.fields.country).toBeDefined(); + expect(result.fields.user_count).toBeDefined(); + expect(result.fields.total_revenue).toBeDefined(); + + // Dataset-level lineage + expect(result.dataset).toBeDefined(); + expect(result.dataset!.length).toBeGreaterThan(0); + + // JOIN lineage + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBeGreaterThan(0); + + // FILTER lineage (from WHERE) + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBeGreaterThan(0); + + // GROUP BY lineage + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage.length).toBeGreaterThan(0); + + // SORT lineage (from ORDER BY) + // Note: ORDER BY total_revenue references an alias, may not resolve to base column + }); + + test("analytics query with aggregate window functions and multiple joins", () => { + const sql = ` + SELECT + d.name as department_name, + e.name as employee_name, + e.salary, + SUM(e.salary) OVER (PARTITION BY e.department_id ORDER BY e.salary DESC) as running_salary + FROM employees e + JOIN departments d ON e.department_id = d.id + WHERE e.status = 'active' + ORDER BY d.name, e.salary DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("employees", ["id", "name", "department_id", "salary", "status"]), + createTable("departments", ["id", "name"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // JOIN lineage + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(2); + + // WINDOW lineage from PARTITION BY department_id and ORDER BY salary DESC in OVER clause + const windowLineage = findBySubtype(result.dataset, "WINDOW"); + expect(windowLineage.length).toBe(2); + expect(windowLineage).toContainEqual({ + namespace: "trino", + name: "employees", + field: "department_id", + transformations: [INDIRECT_WINDOW], + }); + expect(windowLineage).toContainEqual({ + namespace: "trino", + name: "employees", + field: "salary", + transformations: [INDIRECT_WINDOW], + }); + + // FILTER lineage + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "employees", + field: "status", + transformations: [INDIRECT_FILTER], + }); + + // SORT lineage + const sortLineage = findBySubtype(result.dataset, "SORT"); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "departments", + field: "name", + transformations: [INDIRECT_SORT], + }); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "employees", + field: "salary", + transformations: [INDIRECT_SORT], + }); + + // Field-level lineage captures the aggregation + expect(result.fields.running_salary?.inputFields).toContainEqual({ + namespace: "trino", + name: "employees", + field: "salary", + transformations: [DIRECT_AGGREGATION], + }); + }); +}); + +// ============================================================================= +// CTE (WITH clause) TESTS +// ============================================================================= + +describe("getExtendedLineage - WITH clause (CTEs)", () => { + test("simple CTE with filter - dataset lineage propagation", () => { + const sql = ` + WITH active_users AS ( + SELECT id, name, country + FROM users + WHERE status = 'active' + ) + SELECT country, COUNT(*) as count + FROM active_users + GROUP BY country + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "country", "status"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage should trace back to users table + expect(result.fields.country?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "country", + transformations: [DIRECT_IDENTITY], + }); + + // Dataset-level lineage from the CTE's WHERE clause should be propagated + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(1); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "status", + transformations: [INDIRECT_FILTER], + }); + + // GROUP BY from outer query should also be captured (but references the CTE, not direct table) + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + // country GROUP BY references active_users.country which resolves to users.country + expect(groupByLineage.length).toBe(0); // GROUP BY references CTE alias, not resolved to base table + }); + + test("multiple CTEs", () => { + const sql = ` + WITH + active_users AS ( + SELECT id, name, country FROM users WHERE status = 'active' + ), + user_orders AS ( + SELECT user_id, SUM(total) as total_spent FROM orders GROUP BY user_id + ) + SELECT + au.name, + au.country, + uo.total_spent + FROM active_users au + JOIN user_orders uo ON au.id = uo.user_id + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name", "country", "status"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // ========== Field-level lineage ========== + + // name traces back to users.name with IDENTITY + expect(result.fields.name).toBeDefined(); + expect(result.fields.name?.inputFields.length).toBe(1); + expect(result.fields.name?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + + // country traces back to users.country with IDENTITY + expect(result.fields.country).toBeDefined(); + expect(result.fields.country?.inputFields.length).toBe(1); + expect(result.fields.country?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "country", + transformations: [DIRECT_IDENTITY], + }); + + // total_spent traces back to orders.total with AGGREGATION (through SUM in CTE) + expect(result.fields.total_spent).toBeDefined(); + expect(result.fields.total_spent?.inputFields.length).toBe(1); + expect(result.fields.total_spent?.inputFields).toContainEqual({ + namespace: "trino", + name: "orders", + field: "total", + transformations: [DIRECT_AGGREGATION], + }); + + // ========== Dataset-level lineage ========== + // Dataset lineage from CTEs is now propagated to outer query + + // FILTER from active_users CTE (WHERE status = 'active') + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(1); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "status", + transformations: [INDIRECT_FILTER], + }); + + // GROUP BY from user_orders CTE (GROUP BY user_id) + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage.length).toBe(1); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "user_id", + transformations: [INDIRECT_GROUP_BY], + }); + + // Verify we have the correct number of output fields + expect(Object.keys(result.fields).length).toBe(3); + }); + + test("nested CTEs with complex transformations", () => { + const sql = ` + WITH + base_data AS ( + SELECT + product_id, + store_id, + quantity * price as revenue + FROM sales + WHERE sale_date >= '2024-01-01' + ), + store_summary AS ( + SELECT + store_id, + SUM(revenue) as total_revenue + FROM base_data + GROUP BY store_id + ) + SELECT + s.name as store_name, + ss.total_revenue + FROM store_summary ss + JOIN stores s ON ss.store_id = s.id + ORDER BY ss.total_revenue DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("sales", ["id", "product_id", "store_id", "quantity", "price", "sale_date"]), + createTable("stores", ["id", "name"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // ========== Field-level lineage ========== + + // store_name should trace back to stores.name + expect(result.fields.store_name).toBeDefined(); + expect(result.fields.store_name?.inputFields).toContainEqual({ + namespace: "trino", + name: "stores", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + + // total_revenue should trace back through CTEs to quantity and price with AGGREGATION + expect(result.fields.total_revenue).toBeDefined(); + expect(result.fields.total_revenue?.inputFields.length).toBe(2); + expect(result.fields.total_revenue?.inputFields).toContainEqual({ + namespace: "trino", + name: "sales", + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }); + expect(result.fields.total_revenue?.inputFields).toContainEqual({ + namespace: "trino", + name: "sales", + field: "price", + transformations: [DIRECT_AGGREGATION], + }); + + // ========== Dataset-level lineage ========== + + // JOIN lineage - should capture the join between store_summary and stores + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(1); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "stores", + field: "id", + transformations: [INDIRECT_JOIN], + }); + + // FILTER lineage - propagated from base_data CTE (WHERE sale_date >= '2024-01-01') + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(1); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "sales", + field: "sale_date", + transformations: [INDIRECT_FILTER], + }); + + // Note: GROUP BY in store_summary (GROUP BY store_id) references base_data.store_id + // which is a CTE column, not a direct table column. Dataset lineage extraction + // for GROUP BY/ORDER BY only resolves to direct table columns, not CTE columns. + // This is a known limitation - CTE-to-CTE indirect lineage is not resolved. + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage.length).toBe(0); + }); +}); + +// ============================================================================= +// SUBQUERY TESTS +// ============================================================================= + +describe("getExtendedLineage - Subqueries", () => { + test("subquery in FROM clause", () => { + const sql = ` + SELECT + sub.country, + sub.user_count + FROM ( + SELECT country, COUNT(*) as user_count + FROM users + WHERE status = 'active' + GROUP BY country + ) sub + ORDER BY sub.user_count DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "country", "status"])]); + + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields.country?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "country", + transformations: [DIRECT_IDENTITY], + }); + }); +}); + +// ============================================================================= +// EDGE CASES AND SPECIAL SCENARIOS +// ============================================================================= + +describe("getExtendedLineage - Edge cases", () => { + test("same column used in multiple contexts", () => { + const sql = ` + SELECT + status, + COUNT(*) as count + FROM users + WHERE status != 'deleted' + GROUP BY status + ORDER BY status + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "status"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // status appears in: + // 1. SELECT (DIRECT/IDENTITY) + // 2. WHERE (INDIRECT/FILTER) + // 3. GROUP BY (INDIRECT/GROUP_BY) + // 4. ORDER BY (INDIRECT/SORT) + + expect(result.fields.status?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "status", + transformations: [DIRECT_IDENTITY], + }); + + const filterLineage = findFieldBySubtype(result.dataset, "status", "FILTER"); + expect(filterLineage).toBeDefined(); + + const groupByLineage = findFieldBySubtype(result.dataset, "status", "GROUP_BY"); + expect(groupByLineage).toBeDefined(); + + const sortLineage = findFieldBySubtype(result.dataset, "status", "SORT"); + expect(sortLineage).toBeDefined(); + }); + + test("column from multiple tables with same name", () => { + const sql = ` + SELECT u.name as user_name, p.name as product_name + FROM users u + JOIN products p ON u.favorite_product_id = p.id + WHERE u.name LIKE 'A%' AND p.name LIKE 'Widget%' + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name", "favorite_product_id"]), + createTable("products", ["id", "name"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage should distinguish the two name columns + expect(result.fields.user_name?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + expect(result.fields.product_name?.inputFields).toContainEqual({ + namespace: "trino", + name: "products", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + + // FILTER lineage should have both name columns + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "name", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "products", + field: "name", + transformations: [INDIRECT_FILTER], + }); + }); + + test("deduplication of dataset lineage", () => { + const sql = ` + SELECT id, name + FROM users + WHERE status = 'active' AND status != 'banned' + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "status"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // Even though status appears twice in WHERE, it should be deduplicated + const filterLineage = findBySubtype(result.dataset, "FILTER"); + const statusFilters = filterLineage.filter((f) => f.field === "status"); + expect(statusFilters.length).toBe(1); + }); + + test("empty dataset lineage when no indirect clauses", () => { + const sql = `SELECT id, UPPER(name) as upper_name FROM users`; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name"])]); + + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields.id).toBeDefined(); + expect(result.fields.upper_name).toBeDefined(); + expect(result.dataset).toEqual([]); + }); + + test("transformation functions with masking", () => { + const sql = ` + SELECT + MD5(email) as email_hash, + SHA256(ssn) as ssn_hash, + MASK(phone) as masked_phone + FROM users + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "email", "ssn", "phone"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // All three should have masking: true in their transformations + expect(result.fields.email_hash?.inputFields[0]?.transformations?.[0]?.masking).toBe(true); + expect(result.fields.ssn_hash?.inputFields[0]?.transformations?.[0]?.masking).toBe(true); + expect(result.fields.masked_phone?.inputFields[0]?.transformations?.[0]?.masking).toBe(true); + }); + + test("COUNT aggregation has masking flag", () => { + const sql = ` + SELECT country, COUNT(id) as user_count + FROM users + GROUP BY country + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "country"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // COUNT should have masking: true + expect(result.fields.user_count?.inputFields[0]?.transformations).toContainEqual({ + type: "DIRECT", + subtype: "AGGREGATION", + masking: true, + }); + }); +}); + +// ============================================================================= +// REAL-WORLD COMPLEX QUERIES +// ============================================================================= + +describe("getExtendedLineage - Real-world complex queries", () => { + test("e-commerce analytics query", () => { + const sql = ` + SELECT + c.name as category_name, + p.name as product_name, + SUM(oi.quantity) as total_quantity, + SUM(oi.quantity * oi.unit_price) as total_revenue, + AVG(oi.unit_price) as avg_price + FROM categories c + JOIN products p ON c.id = p.category_id + JOIN order_items oi ON p.id = oi.product_id + JOIN orders o ON oi.order_id = o.id + WHERE o.status = 'completed' AND o.order_date >= '2024-01-01' + GROUP BY c.id, c.name, p.id, p.name + HAVING SUM(oi.quantity) > 10 + ORDER BY c.name, total_revenue DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("categories", ["id", "name"]), + createTable("products", ["id", "name", "category_id"]), + createTable("order_items", ["id", "order_id", "product_id", "quantity", "unit_price"]), + createTable("orders", ["id", "status", "order_date"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // ========== Field-level lineage ========== + + // category_name traces to categories.name + expect(result.fields.category_name).toBeDefined(); + expect(result.fields.category_name?.inputFields).toContainEqual({ + namespace: "trino", + name: "categories", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + + // product_name traces to products.name + expect(result.fields.product_name).toBeDefined(); + expect(result.fields.product_name?.inputFields).toContainEqual({ + namespace: "trino", + name: "products", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + + // total_quantity traces to order_items.quantity with AGGREGATION + expect(result.fields.total_quantity).toBeDefined(); + expect(result.fields.total_quantity?.inputFields).toContainEqual({ + namespace: "trino", + name: "order_items", + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }); + + // total_revenue traces to both quantity and unit_price with AGGREGATION + expect(result.fields.total_revenue).toBeDefined(); + expect(result.fields.total_revenue?.inputFields.length).toBe(2); + expect(result.fields.total_revenue?.inputFields).toContainEqual({ + namespace: "trino", + name: "order_items", + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }); + expect(result.fields.total_revenue?.inputFields).toContainEqual({ + namespace: "trino", + name: "order_items", + field: "unit_price", + transformations: [DIRECT_AGGREGATION], + }); + + // avg_price traces to order_items.unit_price with AGGREGATION + expect(result.fields.avg_price).toBeDefined(); + expect(result.fields.avg_price?.inputFields).toContainEqual({ + namespace: "trino", + name: "order_items", + field: "unit_price", + transformations: [DIRECT_AGGREGATION], + }); + + // ========== Dataset-level lineage ========== + + // JOIN lineage - 3 joins with 2 columns each = 6 join columns + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(6); + // First join: c.id = p.category_id + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "categories", + field: "id", + transformations: [INDIRECT_JOIN], + }); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "products", + field: "category_id", + transformations: [INDIRECT_JOIN], + }); + // Second join: p.id = oi.product_id + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "products", + field: "id", + transformations: [INDIRECT_JOIN], + }); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "order_items", + field: "product_id", + transformations: [INDIRECT_JOIN], + }); + // Third join: oi.order_id = o.id + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "order_items", + field: "order_id", + transformations: [INDIRECT_JOIN], + }); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "id", + transformations: [INDIRECT_JOIN], + }); + + // FILTER lineage - status and order_date from WHERE clause + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBeGreaterThanOrEqual(2); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "status", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "order_date", + transformations: [INDIRECT_FILTER], + }); + // HAVING also contributes to filter lineage + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "order_items", + field: "quantity", + transformations: [INDIRECT_FILTER], + }); + + // GROUP BY lineage - c.id, c.name, p.id, p.name + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage.length).toBe(4); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "categories", + field: "id", + transformations: [INDIRECT_GROUP_BY], + }); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "categories", + field: "name", + transformations: [INDIRECT_GROUP_BY], + }); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "products", + field: "id", + transformations: [INDIRECT_GROUP_BY], + }); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "products", + field: "name", + transformations: [INDIRECT_GROUP_BY], + }); + + // SORT lineage - c.name (total_revenue is alias, may not resolve) + const sortLineage = findBySubtype(result.dataset, "SORT"); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "categories", + field: "name", + transformations: [INDIRECT_SORT], + }); + }); + + test("HR analytics query with employee hierarchy", () => { + const sql = ` + WITH department_stats AS ( + SELECT + department_id, + COUNT(*) as employee_count, + AVG(salary) as avg_salary, + SUM(salary) as total_salary + FROM employees + WHERE status = 'active' + GROUP BY department_id + ) + SELECT + d.name as department_name, + ds.employee_count, + ds.avg_salary, + ds.total_salary, + CASE + WHEN ds.avg_salary > 100000 THEN 'High' + WHEN ds.avg_salary > 50000 THEN 'Medium' + ELSE 'Low' + END as salary_tier, + RANK() OVER (ORDER BY ds.total_salary DESC) as budget_rank + FROM department_stats ds + JOIN departments d ON ds.department_id = d.id + WHERE ds.employee_count >= 5 + ORDER BY budget_rank + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("employees", ["id", "department_id", "salary", "status"]), + createTable("departments", ["id", "name"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // ========== Field-level lineage ========== + + // department_name traces to departments.name + expect(result.fields.department_name).toBeDefined(); + expect(result.fields.department_name?.inputFields).toContainEqual({ + namespace: "trino", + name: "departments", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + + // employee_count comes from COUNT(*) in CTE - no specific field traced + expect(result.fields.employee_count).toBeDefined(); + // COUNT(*) doesn't reference a specific column, so inputFields may be empty + + // avg_salary traces to employees.salary with AGGREGATION + expect(result.fields.avg_salary).toBeDefined(); + expect(result.fields.avg_salary?.inputFields).toContainEqual({ + namespace: "trino", + name: "employees", + field: "salary", + transformations: [DIRECT_AGGREGATION], + }); + + // total_salary traces to employees.salary with AGGREGATION + expect(result.fields.total_salary).toBeDefined(); + expect(result.fields.total_salary?.inputFields).toContainEqual({ + namespace: "trino", + name: "employees", + field: "salary", + transformations: [DIRECT_AGGREGATION], + }); + + // salary_tier uses CASE with avg_salary in conditions - traces back to salary + expect(result.fields.salary_tier).toBeDefined(); + expect(result.fields.salary_tier?.inputFields.length).toBeGreaterThan(0); + // The salary_tier CASE statement references avg_salary which traces to employees.salary + const salaryTierFields = result.fields.salary_tier?.inputFields.map((f) => f.field); + expect(salaryTierFields).toContain("salary"); + // Should have CONDITION subtype for the WHEN clauses + const hasConditionTransformation = result.fields.salary_tier?.inputFields.some( + (f) => f.transformations?.some((t) => t.subtype === "CONDITION" || t.subtype === "AGGREGATION"), + ); + expect(hasConditionTransformation).toBe(true); + + // budget_rank from RANK() OVER (ORDER BY ds.total_salary DESC) - now tracks columns from OVER clause + expect(result.fields.budget_rank).toBeDefined(); + // RANK() has no arguments, but it should capture total_salary from ORDER BY in OVER clause + // total_salary traces back through CTE to employees.salary + expect(result.fields.budget_rank?.inputFields.length).toBeGreaterThan(0); + const budgetRankFields = result.fields.budget_rank?.inputFields.map((f) => f.field); + expect(budgetRankFields).toContain("salary"); + + // ========== Dataset-level lineage ========== + + // JOIN lineage - ds.department_id = d.id + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBeGreaterThanOrEqual(1); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "departments", + field: "id", + transformations: [INDIRECT_JOIN], + }); + + // Verify total output field count + expect(Object.keys(result.fields).length).toBe(6); + }); + + test("time-series analysis query", () => { + const sql = ` + SELECT + DATE_TRUNC('month', event_date) as month, + event_type, + COUNT(*) as event_count, + COUNT(DISTINCT user_id) as unique_users + FROM events + WHERE event_date >= '2024-01-01' + AND event_type IN ('login', 'purchase', 'view') + GROUP BY DATE_TRUNC('month', event_date), event_type + ORDER BY month, event_type + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("events", ["id", "event_date", "event_type", "user_id"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // ========== Field-level lineage ========== + + // month traces to event_date with TRANSFORMATION (DATE_TRUNC) + expect(result.fields.month).toBeDefined(); + expect(result.fields.month?.inputFields.length).toBe(1); + expect(result.fields.month?.inputFields[0]?.field).toBe("event_date"); + expect(result.fields.month?.inputFields[0]?.transformations?.[0]?.subtype).toBe("TRANSFORMATION"); + + // event_type is direct IDENTITY + expect(result.fields.event_type).toBeDefined(); + expect(result.fields.event_type?.inputFields).toContainEqual({ + namespace: "trino", + name: "events", + field: "event_type", + transformations: [DIRECT_IDENTITY], + }); + + // event_count from COUNT(*) - no specific field + expect(result.fields.event_count).toBeDefined(); + + // unique_users from COUNT(DISTINCT user_id) - traces to user_id with AGGREGATION + masking + expect(result.fields.unique_users).toBeDefined(); + expect(result.fields.unique_users?.inputFields.length).toBe(1); + expect(result.fields.unique_users?.inputFields[0]?.field).toBe("user_id"); + expect(result.fields.unique_users?.inputFields[0]?.transformations?.[0]?.subtype).toBe("AGGREGATION"); + expect(result.fields.unique_users?.inputFields[0]?.transformations?.[0]?.masking).toBe(true); + + // Verify total output field count + expect(Object.keys(result.fields).length).toBe(4); + + // ========== Dataset-level lineage ========== + + // FILTER lineage - event_date and event_type from WHERE + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(2); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "events", + field: "event_date", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "events", + field: "event_type", + transformations: [INDIRECT_FILTER], + }); + + // GROUP BY lineage - DATE_TRUNC('month', event_date) and event_type + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage.length).toBe(2); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "events", + field: "event_date", + transformations: [INDIRECT_GROUP_BY], + }); + expect(groupByLineage).toContainEqual({ + namespace: "trino", + name: "events", + field: "event_type", + transformations: [INDIRECT_GROUP_BY], + }); + + // SORT lineage - month and event_type + // Note: month is an alias that may not resolve to base column + const sortLineage = findBySubtype(result.dataset, "SORT"); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "events", + field: "event_type", + transformations: [INDIRECT_SORT], + }); + }); + + test("multi-level aggregation query", () => { + const sql = ` + WITH daily_sales AS ( + SELECT + store_id, + DATE(sale_timestamp) as sale_date, + SUM(amount) as daily_total + FROM sales + WHERE sale_timestamp >= '2024-01-01' + GROUP BY store_id, DATE(sale_timestamp) + ), + weekly_sales AS ( + SELECT + store_id, + DATE_TRUNC('week', sale_date) as week_start, + SUM(daily_total) as weekly_total, + AVG(daily_total) as daily_avg + FROM daily_sales + GROUP BY store_id, DATE_TRUNC('week', sale_date) + ) + SELECT + s.name as store_name, + s.region, + ws.week_start, + ws.weekly_total, + ws.daily_avg + FROM weekly_sales ws + JOIN stores s ON ws.store_id = s.id + ORDER BY s.region, ws.week_start + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("sales", ["id", "store_id", "sale_timestamp", "amount"]), + createTable("stores", ["id", "name", "region"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // ========== Field-level lineage ========== + + // store_name traces to stores.name + expect(result.fields.store_name).toBeDefined(); + expect(result.fields.store_name?.inputFields).toContainEqual({ + namespace: "trino", + name: "stores", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + + // region traces to stores.region + expect(result.fields.region).toBeDefined(); + expect(result.fields.region?.inputFields).toContainEqual({ + namespace: "trino", + name: "stores", + field: "region", + transformations: [DIRECT_IDENTITY], + }); + + // week_start traces through CTEs to sale_timestamp (via DATE and DATE_TRUNC transformations) + expect(result.fields.week_start).toBeDefined(); + expect(result.fields.week_start?.inputFields.length).toBe(1); + expect(result.fields.week_start?.inputFields[0]?.field).toBe("sale_timestamp"); + expect(result.fields.week_start?.inputFields[0]?.name).toBe("sales"); + // Should have TRANSFORMATION due to DATE_TRUNC/DATE functions + expect(result.fields.week_start?.inputFields[0]?.transformations?.[0]?.type).toBe("DIRECT"); + + // weekly_total traces through CTEs: SUM(SUM(amount)) -> amount with AGGREGATION + expect(result.fields.weekly_total).toBeDefined(); + expect(result.fields.weekly_total?.inputFields.length).toBe(1); + expect(result.fields.weekly_total?.inputFields).toContainEqual({ + namespace: "trino", + name: "sales", + field: "amount", + transformations: [DIRECT_AGGREGATION], + }); + + // daily_avg traces through CTEs: AVG(SUM(amount)) -> amount with AGGREGATION + expect(result.fields.daily_avg).toBeDefined(); + expect(result.fields.daily_avg?.inputFields.length).toBe(1); + expect(result.fields.daily_avg?.inputFields).toContainEqual({ + namespace: "trino", + name: "sales", + field: "amount", + transformations: [DIRECT_AGGREGATION], + }); + + // Verify total output field count + expect(Object.keys(result.fields).length).toBe(5); + + // ========== Dataset-level lineage ========== + + // JOIN lineage - ws.store_id = s.id + // Note: store_id from CTE doesn't resolve, but stores.id does + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBeGreaterThanOrEqual(1); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "stores", + field: "id", + transformations: [INDIRECT_JOIN], + }); + + // SORT lineage - s.region and ws.week_start + // s.region should resolve to stores.region + const sortLineage = findBySubtype(result.dataset, "SORT"); + expect(sortLineage.length).toBeGreaterThanOrEqual(1); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "stores", + field: "region", + transformations: [INDIRECT_SORT], + }); + }); +}); + +// ============================================================================= +// TRANSFORMATION TYPE VERIFICATION +// ============================================================================= + +describe("getExtendedLineage - Transformation type verification", () => { + test("verifies all transformation types are correct", () => { + const sql = ` + SELECT + u.country, + COUNT(u.id) as user_count, + ROW_NUMBER() OVER (ORDER BY COUNT(u.id) DESC) as rank + FROM users u + JOIN orders o ON u.id = o.user_id + WHERE u.status = 'active' + GROUP BY u.country + ORDER BY u.country + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "country", "status"]), + createTable("orders", ["id", "user_id"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Verify JOIN transformation structure + const joinLineage = findBySubtype(result.dataset, "JOIN"); + joinLineage.forEach((field) => { + expect(field.transformations?.[0]).toEqual(INDIRECT_JOIN); + }); + + // Verify FILTER transformation structure + const filterLineage = findBySubtype(result.dataset, "FILTER"); + filterLineage.forEach((field) => { + expect(field.transformations?.[0]).toEqual(INDIRECT_FILTER); + }); + + // Verify GROUP_BY transformation structure + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + groupByLineage.forEach((field) => { + expect(field.transformations?.[0]).toEqual(INDIRECT_GROUP_BY); + }); + + // Verify SORT transformation structure + const sortLineage = findBySubtype(result.dataset, "SORT"); + sortLineage.forEach((field) => { + expect(field.transformations?.[0]).toEqual(INDIRECT_SORT); + }); + }); + + test("field-level transformations for direct lineage", () => { + const sql = ` + SELECT + id, + name, + UPPER(email) as upper_email, + age + 1 as next_age, + COUNT(status) as status_count, + SUM(salary) as total_salary + FROM employees + GROUP BY id, name, email, age + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("employees", ["id", "name", "email", "age", "status", "salary"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // IDENTITY transformations + expect(result.fields.id?.inputFields[0]?.transformations).toContainEqual(DIRECT_IDENTITY); + expect(result.fields.name?.inputFields[0]?.transformations).toContainEqual(DIRECT_IDENTITY); + + // TRANSFORMATION (function) + expect(result.fields.upper_email?.inputFields[0]?.transformations).toContainEqual(DIRECT_TRANSFORMATION); + + // TRANSFORMATION (arithmetic) + expect(result.fields.next_age?.inputFields[0]?.transformations).toContainEqual(DIRECT_TRANSFORMATION); + + // AGGREGATION (with masking for COUNT) + expect(result.fields.status_count?.inputFields[0]?.transformations).toContainEqual({ + ...DIRECT_AGGREGATION, + masking: true, + }); + + // AGGREGATION (without masking for SUM) + expect(result.fields.total_salary?.inputFields[0]?.transformations).toContainEqual(DIRECT_AGGREGATION); + }); +}); diff --git a/packages/lineage/test/indirect.test.ts b/packages/lineage/test/indirect.test.ts deleted file mode 100644 index b279e05..0000000 --- a/packages/lineage/test/indirect.test.ts +++ /dev/null @@ -1,358 +0,0 @@ -import { describe, test, expect } from "bun:test"; -import { Parser } from "node-sql-parser"; -import type { AST, Select } from "node-sql-parser"; -import { - getExtendedLineage, - getJoinLineage, - getFilterLineage, - getGroupByLineage, - getOrderByLineage, - getWindowLineage, - getLineage, - type Schema, - type Table, - INDIRECT_JOIN, - INDIRECT_FILTER, - INDIRECT_GROUP_BY, - INDIRECT_SORT, - INDIRECT_WINDOW, - INDIRECT_CONDITION, - DIRECT_IDENTITY, - DIRECT_TRANSFORMATION, - DIRECT_AGGREGATION, -} from "../src/index.js"; - -const parser = new Parser(); - -// Helper function to create schemas -function createSchema(namespace: string, tables: Table[]): Schema { - return { namespace, tables }; -} - -function createTable(name: string, columns: string[]): Table { - return { name, columns }; -} - -// Helper to ensure we get a single AST -function parseSQL(sql: string): AST { - const result = parser.astify(sql, { database: "trino" }); - const ast = Array.isArray(result) ? result[0] : result; - - if (!ast) { - throw new Error("Failed to parse SQL"); - } - - return ast; -} - -describe("Indirect Lineage - JOIN", () => { - test("simple inner join", () => { - const sql = ` - SELECT u.id, u.name, o.total - FROM users u - JOIN orders o ON u.id = o.user_id - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name", "email"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - - const joinLineage = getJoinLineage(ast as Select, schema); - - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [INDIRECT_JOIN], - }); - - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "user_id", - transformations: [INDIRECT_JOIN], - }); - }); - - test("multiple joins", () => { - const sql = ` - SELECT u.name, o.total, p.name as product_name - FROM users u - JOIN orders o ON u.id = o.user_id - JOIN products p ON o.product_id = p.id - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id", "product_id", "total"]), - createTable("products", ["id", "name"]), - ]); - - const joinLineage = getJoinLineage(ast as Select, schema); - - // Should have join columns from all joins - expect(joinLineage.length).toBeGreaterThanOrEqual(4); - }); -}); - -describe("Indirect Lineage - FILTER (WHERE)", () => { - test("simple where clause", () => { - const sql = ` - SELECT id, name - FROM users - WHERE status = 'active' - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "status"])]); - - const filterLineage = getFilterLineage(ast as Select, schema); - - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "status", - transformations: [INDIRECT_FILTER], - }); - }); - - test("complex where clause with AND/OR", () => { - const sql = ` - SELECT id, name - FROM users - WHERE status = 'active' AND age > 18 OR country = 'US' - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "status", "age", "country"])]); - - const filterLineage = getFilterLineage(ast as Select, schema); - - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "status", - transformations: [INDIRECT_FILTER], - }); - - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "age", - transformations: [INDIRECT_FILTER], - }); - - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "country", - transformations: [INDIRECT_FILTER], - }); - }); -}); - -describe("Indirect Lineage - GROUP BY", () => { - test("simple group by", () => { - const sql = ` - SELECT country, COUNT(*) as user_count - FROM users - GROUP BY country - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "country"])]); - - const groupByLineage = getGroupByLineage(ast as Select, schema); - - expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "country", - transformations: [INDIRECT_GROUP_BY], - }); - }); - - test("multiple group by columns", () => { - const sql = ` - SELECT country, city, COUNT(*) as user_count - FROM users - GROUP BY country, city - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "country", "city"])]); - - const groupByLineage = getGroupByLineage(ast as Select, schema); - - expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "country", - transformations: [INDIRECT_GROUP_BY], - }); - - expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "city", - transformations: [INDIRECT_GROUP_BY], - }); - }); -}); - -describe("Indirect Lineage - ORDER BY (SORT)", () => { - test("simple order by", () => { - const sql = ` - SELECT id, name - FROM users - ORDER BY created_at DESC - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "created_at"])]); - - const orderByLineage = getOrderByLineage(ast as Select, schema); - - expect(orderByLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "created_at", - transformations: [INDIRECT_SORT], - }); - }); - - test("multiple order by columns", () => { - const sql = ` - SELECT id, name - FROM users - ORDER BY country ASC, created_at DESC - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "country", "created_at"])]); - - const orderByLineage = getOrderByLineage(ast as Select, schema); - - expect(orderByLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "country", - transformations: [INDIRECT_SORT], - }); - - expect(orderByLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "created_at", - transformations: [INDIRECT_SORT], - }); - }); -}); - -describe("Extended Lineage", () => { - test("full query with all indirect types", () => { - const sql = ` - SELECT - u.country, - COUNT(u.id) as user_count, - SUM(o.total) as total_revenue - FROM users u - JOIN orders o ON u.id = o.user_id - WHERE u.status = 'active' - GROUP BY u.country - ORDER BY u.country DESC - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [ - createTable("users", ["id", "name", "country", "status"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - - const extendedLineage = getExtendedLineage(ast as Select, schema); - - // Check field-level lineage - expect(extendedLineage.fields.country).toBeDefined(); - expect(extendedLineage.fields.user_count).toBeDefined(); - expect(extendedLineage.fields.total_revenue).toBeDefined(); - - // Check dataset-level lineage contains indirect transformations - expect(extendedLineage.dataset).toBeDefined(); - expect(extendedLineage.dataset!.length).toBeGreaterThan(0); - - // Should have JOIN lineage - const joinFields = extendedLineage.dataset!.filter((f) => f.transformations?.[0]?.subtype === "JOIN"); - expect(joinFields.length).toBeGreaterThan(0); - - // Should have FILTER lineage - const filterFields = extendedLineage.dataset!.filter((f) => f.transformations?.[0]?.subtype === "FILTER"); - expect(filterFields.length).toBeGreaterThan(0); - - // Should have GROUP_BY lineage - const groupByFields = extendedLineage.dataset!.filter((f) => f.transformations?.[0]?.subtype === "GROUP_BY"); - expect(groupByFields.length).toBeGreaterThan(0); - - // Should have SORT lineage - const sortFields = extendedLineage.dataset!.filter((f) => f.transformations?.[0]?.subtype === "SORT"); - expect(sortFields.length).toBeGreaterThan(0); - }); -}); - -describe("Direct Lineage - CASE/CONDITION", () => { - test("simple case when", () => { - const sql = ` - SELECT - id, - CASE - WHEN status = 'active' THEN 'Active User' - WHEN status = 'inactive' THEN 'Inactive User' - ELSE 'Unknown' - END as status_label - FROM users - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "status"])]); - - const lineage = getLineage(ast as Select, schema); - - // The status column should be in the lineage for status_label - expect(lineage.status_label).toBeDefined(); - expect(lineage.status_label?.inputFields.length).toBeGreaterThan(0); - - // Should have CONDITION transformation for the condition columns - const hasCondition = lineage.status_label?.inputFields.some( - (f) => - f.transformations?.some((t) => t.subtype === "CONDITION") || - f.transformations?.some((t) => t.subtype === "TRANSFORMATION"), - ); - expect(hasCondition).toBe(true); - }); - - test("case with expression in result", () => { - const sql = ` - SELECT - id, - CASE - WHEN quantity > 100 THEN price * 0.9 - ELSE price - END as final_price - FROM products - `; - - const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("products", ["id", "price", "quantity"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage.final_price).toBeDefined(); - - // Should include both quantity (condition) and price (result) - const fields = lineage.final_price?.inputFields.map((f) => f.field); - expect(fields).toContain("price"); - expect(fields).toContain("quantity"); - }); -}); From 2f239d81fbd6bb4627aeff6b3b27b2d663ee71a9 Mon Sep 17 00:00:00 2001 From: its-elad <59926027+its-elad@users.noreply.github.com> Date: Thu, 29 Jan 2026 14:09:29 +0200 Subject: [PATCH 03/10] updated readme --- packages/lineage/README.md | 204 +++++++++++++++++++++++----------- packages/lineage/src/index.ts | 2 +- 2 files changed, 138 insertions(+), 68 deletions(-) diff --git a/packages/lineage/README.md b/packages/lineage/README.md index 3388ebd..fae0af2 100644 --- a/packages/lineage/README.md +++ b/packages/lineage/README.md @@ -1,18 +1,25 @@ # @meta-sql/lineage -A TypeScript library for extracting column-level lineage from SQL queries, implementing the [OpenLineage Column Lineage Dataset Facet specification](https://openlineage.io/docs/spec/facets/dataset-facets/column_lineage_facet/). +A TypeScript library for extracting column-level and dataset-level lineage from SQL queries, implementing the [OpenLineage Column Lineage Dataset Facet specification](https://openlineage.io/docs/spec/facets/dataset-facets/column_lineage_facet/). > ⚠️ **Experimental**: This library is currently in active development and may undergo significant changes. APIs, interfaces, and functionality may change without notice in future versions. Use with caution in production environments. ## Overview -This library analyzes SQL SELECT statements to generate detailed column-level lineage information, tracking how data flows from input columns to output columns through various transformations like joins, aggregations, filters, and CTEs (Common Table Expressions). +This library analyzes SQL SELECT statements to generate detailed lineage information: + +- **Field-level lineage**: Tracks how data flows from input columns to output columns through transformations +- **Dataset-level lineage**: Tracks columns that indirectly affect the entire result set (JOINs, filters, grouping, sorting, window functions) ## Features - ✅ **Column-level lineage extraction** from SQL SELECT statements +- ✅ **Dataset-level indirect lineage** for columns affecting the entire result - ✅ **CTE (Common Table Expression) support** with nested lineage tracking -- ✅ **Direct transformations** (IDENTITY) +- ✅ **Direct transformations** (IDENTITY, TRANSFORMATION, AGGREGATION) +- ✅ **Indirect transformations** (JOIN, FILTER, GROUP_BY, SORT, WINDOW, CONDITION) +- ✅ **Window function support** (PARTITION BY, ORDER BY in OVER clauses) +- ✅ **Masking detection** for privacy-preserving transformations - ✅ **Schema-aware parsing** with table and column validation - ✅ **OpenLineage specification compliance** for interoperability - ✅ **TypeScript-first** with comprehensive type definitions @@ -27,6 +34,8 @@ bun add @meta-sql/lineage node-sql-parser ## Quick Start +### Basic Field-Level Lineage + ```typescript import { getLineage } from "@meta-sql/lineage"; import { Parser } from "node-sql-parser"; @@ -40,91 +49,126 @@ const schema = { }; const lineage = getLineage(ast, schema); -console.log(lineage); -// Output: -// { -// id: { -// inputFields: [{ -// namespace: "my_database", -// name: "users", -// field: "id", -// transformations: [{ type: "DIRECT", subtype: "IDENTITY" }] -// }] -// }, -// name: { -// inputFields: [{ -// namespace: "my_database", -// name: "users", -// field: "name", -// transformations: [{ type: "DIRECT", subtype: "IDENTITY" }] -// }] -// } -// } +// Returns field-level lineage only ``` -## Supported SQL Features +### Extended Lineage (Field + Dataset Level) -### ✅ Currently Supported +```typescript +import { getExtendedLineage } from "@meta-sql/lineage"; +import { Parser } from "node-sql-parser"; -- Basic SELECT statements -- Column aliases (`SELECT id as user_id`) -- Common Table Expressions (CTEs) -- Nested subqueries -- Simple column references +const parser = new Parser(); +const sql = ` + SELECT u.name, COUNT(o.id) as order_count + FROM users u + JOIN orders o ON u.id = o.user_id + WHERE u.status = 'active' + GROUP BY u.name + ORDER BY order_count DESC +`; +const ast = parser.astify(sql, { database: "trino" }) as Select; -## Roadmap +const schema = { + namespace: "my_database", + tables: [ + { name: "users", columns: ["id", "name", "status"] }, + { name: "orders", columns: ["id", "user_id", "total"] }, + ], +}; -Our development roadmap aligns with the OpenLineage Column Lineage Dataset Facet specification: +const result = getExtendedLineage(ast, schema); -### 🚧 Phase 1: Enhanced Transformations +// result.fields - Field-level lineage (which columns flow into output columns) +// { +// name: { inputFields: [{ field: "name", name: "users", ... }] }, +// order_count: { inputFields: [{ field: "id", name: "orders", transformations: [AGGREGATION] }] } +// } -- ✅ **DIRECT/TRANSFORMATION** support for computed columns - - ✅ Mathematical operations (`SELECT price * quantity`) - - ✅ String functions (`SELECT UPPER(name)`) - - ✅ Date functions (`SELECT DATE_ADD(created_at, INTERVAL 1 DAY)`) -- ✅ **DIRECT/AGGREGATION** support for aggregation functions - - ✅ Basic aggregations (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`) -- ✅ **Masking detection** for privacy-preserving transformations - - ✅ Hash functions (`SELECT MD5(email)`) - - ✅ Anonymization functions (`SELECT ANONYMIZE(ssn)`) +// result.dataset - Dataset-level lineage (columns that indirectly affect the result) +// [ +// { field: "id", name: "users", transformations: [{ type: "INDIRECT", subtype: "JOIN" }] }, +// { field: "user_id", name: "orders", transformations: [{ type: "INDIRECT", subtype: "JOIN" }] }, +// { field: "status", name: "users", transformations: [{ type: "INDIRECT", subtype: "FILTER" }] }, +// { field: "name", name: "users", transformations: [{ type: "INDIRECT", subtype: "GROUP_BY" }] } +// ] +``` + +## Transformation Types -### 🔄 Phase 2: Indirect Lineage +### Direct Transformations (Field-Level) -- [ ] **INDIRECT/JOIN** lineage tracking - - Track columns used in JOIN conditions - - Multi-table relationship mapping -- [ ] **INDIRECT/FILTER** for WHERE clause dependencies - - Identify filtering columns that affect output -- [ ] **INDIRECT/GROUP_BY** for grouping dependencies - - Track GROUP BY columns impact on aggregations -- [ ] **INDIRECT/SORT** for ORDER BY clause tracking +| Subtype | Description | Example | +|---------|-------------|---------| +| `IDENTITY` | Column passed through unchanged | `SELECT id FROM users` | +| `TRANSFORMATION` | Column modified by function/expression | `SELECT UPPER(name)`, `SELECT price * qty` | +| `AGGREGATION` | Column aggregated | `SELECT SUM(amount)`, `SELECT COUNT(id)` | -### 📊 Phase 3: Advanced SQL Features +### Indirect Transformations (Dataset-Level) -- [ ] **INDIRECT/WINDOW** for window function dependencies -- [ ] **INDIRECT/CONDITION** for CASE WHEN and IF statements -- [ ] **Complex JOIN types** (LEFT, RIGHT, FULL OUTER) -- [ ] **UNION and INTERSECT** operations -- ✅ **Recursive CTEs** support +| Subtype | Description | Example | +|---------|-------------|---------| +| `JOIN` | Columns used in JOIN conditions | `ON u.id = o.user_id` | +| `FILTER` | Columns used in WHERE/HAVING | `WHERE status = 'active'` | +| `GROUP_BY` | Columns used in GROUP BY | `GROUP BY department` | +| `SORT` | Columns used in ORDER BY | `ORDER BY created_at` | +| `WINDOW` | Columns in OVER clause | `OVER (PARTITION BY dept ORDER BY salary)` | +| `CONDITION` | Columns in CASE WHEN conditions | `CASE WHEN status = 'x' THEN ...` | -### 🔧 Phase 4: Enhanced Analysis +## Supported SQL Features + +### ✅ Currently Supported -- [ ] **Dataset-level lineage** for operations affecting entire datasets -- [ ] **Multi-statement support** (DDL operations) -- ✅ **Multiple SQL dialect support** (PostgreSQL, MySQL, BigQuery, Snowflake) +- Basic SELECT statements +- Column aliases (`SELECT id as user_id`) +- Common Table Expressions (CTEs) with lineage propagation +- Nested subqueries +- JOINs (INNER, LEFT, RIGHT) with ON conditions +- WHERE and HAVING clauses +- GROUP BY with aggregations +- ORDER BY sorting +- Window functions (`ROW_NUMBER`, `RANK`, `SUM OVER`, etc.) +- CASE WHEN expressions +- CAST and type conversions +- Mathematical operations (`SELECT price * quantity`) +- String functions (`SELECT UPPER(name)`) +- Date functions (`SELECT DATE_TRUNC('month', created_at)`) +- Masking functions (`MD5`, `SHA256`, `HASH`, `MASK`, `ANONYMIZE`, etc.) + +### 🔄 In Progress + +- UNION and INTERSECT operations +- More complex recursive CTE patterns + +### 📋 Planned + +- FULL OUTER JOIN support +- Multi-statement support (DDL operations) +- `select *` support +- Additional SQL dialect optimizations ## API Reference -### `getLineage(select: Select, schema: Schema): ColumnLineageDatasetFacet["fields"]` +### `getLineage(select, schema)` -Extracts column lineage from a SQL SELECT AST. +Extracts field-level column lineage from a SQL SELECT AST. -**Parameters:** +```typescript +function getLineage(select: Select, schema: Schema): ColumnLineageDatasetFacet["fields"]; +``` + +### `getExtendedLineage(select, schema)` -- `select`: Parsed SQL SELECT statement from node-sql-parser -- `schema`: Schema definition with table and column information +Extracts both field-level and dataset-level lineage. + +```typescript +function getExtendedLineage(select: Select, schema: Schema): ExtendedLineageResult; -**Returns:** Column lineage mapping conforming to OpenLineage specification +interface ExtendedLineageResult { + fields: ColumnLineageDatasetFacet["fields"]; // Field-level lineage + dataset?: InputField[]; // Dataset-level indirect lineage +} +``` ### Types @@ -138,8 +182,34 @@ type Table = { name: string; columns: string[]; }; + +type Transformation = { + type: "DIRECT" | "INDIRECT"; + subtype: "IDENTITY" | "TRANSFORMATION" | "AGGREGATION" | "JOIN" | "FILTER" | "GROUP_BY" | "SORT" | "WINDOW" | "CONDITION"; + masking: boolean; +}; ``` +## Roadmap + +### ✅ Completed + +- Field-level lineage with DIRECT transformations +- Dataset-level lineage with INDIRECT transformations +- Window function support +- CTE lineage propagation +- Masking detection + +### 🔄 In Progress + +- UNION and INTERSECT operations +- More complex recursive CTE patterns + +### 📋 Planned + +- Multi-statement support (DDL operations) +- Additional SQL dialect optimizations + ## License MIT License - see [LICENSE](../../LICENSE) for details. diff --git a/packages/lineage/src/index.ts b/packages/lineage/src/index.ts index a549be5..30d3f4f 100644 --- a/packages/lineage/src/index.ts +++ b/packages/lineage/src/index.ts @@ -514,7 +514,7 @@ export function getDirectTransformationsFromExprValue( if (arg.result) { const resultTransformations = getDirectTransformationsFromExprValue( arg.result, - mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION), + mergeTransformations(parentTransformation, DIRECT_IDENTITY), ); Object.entries(resultTransformations).forEach(([key, value]) => { merged[key] = merged[key] ? merged[key].union(value) : value; From adc75231dedbb40c4eb5843c982c58078d63c354 Mon Sep 17 00:00:00 2001 From: its-elad <59926027+its-elad@users.noreply.github.com> Date: Sun, 1 Feb 2026 12:15:42 +0200 Subject: [PATCH 04/10] feat: added star support (*), set operations support, all joins --- packages/lineage/README.md | 9 +- packages/lineage/src/index.ts | 370 ++++++++++- packages/lineage/test/extendedLineage.test.ts | 627 ++++++++++++++++++ packages/lineage/test/index.test.ts | 564 ++++++++++++++++ 4 files changed, 1533 insertions(+), 37 deletions(-) diff --git a/packages/lineage/README.md b/packages/lineage/README.md index fae0af2..3240d54 100644 --- a/packages/lineage/README.md +++ b/packages/lineage/README.md @@ -120,6 +120,7 @@ const result = getExtendedLineage(ast, schema); ### ✅ Currently Supported - Basic SELECT statements +- `SELECT *` and `table.*` expansion (requires schema) - Column aliases (`SELECT id as user_id`) - Common Table Expressions (CTEs) with lineage propagation - Nested subqueries @@ -134,17 +135,16 @@ const result = getExtendedLineage(ast, schema); - String functions (`SELECT UPPER(name)`) - Date functions (`SELECT DATE_TRUNC('month', created_at)`) - Masking functions (`MD5`, `SHA256`, `HASH`, `MASK`, `ANONYMIZE`, etc.) +- **Set operations** (`UNION`, `UNION ALL`, `INTERSECT`, `EXCEPT`) +- **All JOIN types** (`INNER`, `LEFT`, `RIGHT`, `FULL OUTER`, `CROSS`) ### 🔄 In Progress -- UNION and INTERSECT operations - More complex recursive CTE patterns ### 📋 Planned -- FULL OUTER JOIN support - Multi-statement support (DDL operations) -- `select *` support - Additional SQL dialect optimizations ## API Reference @@ -199,10 +199,11 @@ type Transformation = { - Window function support - CTE lineage propagation - Masking detection +- Set operations (UNION, INTERSECT, EXCEPT) +- All JOIN types (INNER, LEFT, RIGHT, FULL OUTER, CROSS) ### 🔄 In Progress -- UNION and INTERSECT operations - More complex recursive CTE patterns ### 📋 Planned diff --git a/packages/lineage/src/index.ts b/packages/lineage/src/index.ts index 30d3f4f..b245768 100644 --- a/packages/lineage/src/index.ts +++ b/packages/lineage/src/index.ts @@ -6,8 +6,6 @@ import { type ColumnLineageDatasetFacet, type InputField, type Transformation as _Transformation, - type TransformationType, - type TransformationSubtype, } from "@meta-sql/open-lineage"; import { Select, @@ -143,9 +141,11 @@ function mergeTransformations(parent: Transformation | undefined, child: Transfo return { ...child, masking: parent.masking || child.masking }; } +const transformationHasher = (value: Transformation): string => + `${value.type}-${value.subtype}-${value.masking ? "MASKED" : "UNMASKED"}`; class TransformationSet extends HashSet { constructor(values?: readonly Transformation[]) { - super((value: Transformation) => `${value.type}-${value.subtype}-${value.masking ? "MASKED" : "UNMASKED"}`); + super((value: Transformation) => transformationHasher(value)); if (values) { values.forEach((value) => this.add(value)); @@ -180,6 +180,19 @@ export type SelectWithAlias = Select & { as?: string | null; }; +/** + * Set operation type for UNION, INTERSECT, EXCEPT + */ +export type SetOperation = "union" | "union all" | "intersect" | "intersect all" | "except" | "except all"; + +/** + * Extended Select type that includes types for set operations (_next and set_op) + */ +export type SelectWithSetOp = Select & { + set_op?: SetOperation | null; + _next?: SelectWithSetOp | null; +}; + /** * Extended lineage result that includes both field-level and dataset-level lineage */ @@ -203,6 +216,26 @@ export function isColumn(selectColumn: Select["columns"][number]): selectColumn ); } +/** + * Check if a column expression is a star (wildcard) expression like * or table.* + */ +export function isStar(column: AstColumn): boolean { + if (column.expr.type !== "column_ref") return false; + const colRef = column.expr as ColumnRefItem; + return colRef.column === "*" || (typeof colRef.column === "object" && colRef.column?.expr?.value === "*"); +} + +/** + * Get the table qualifier from a star expression (e.g., "u" from "u.*") + * Returns null if there's no table qualifier (plain "*") + */ +export function getStarTableQualifier(column: AstColumn): string | null { + if (!isStar(column)) return null; + const colRef = column.expr as ColumnRefItem; + if (!colRef.table) return null; + return typeof colRef.table === "string" ? colRef.table : (colRef.table as { type: string; value: string }).value; +} + export function formatInputColumnName(column: ColumnRefItem): string { return `${column.table ? `${column.table}.` : ""}${getInputColumnName(column)}`; } @@ -422,7 +455,7 @@ export function getDirectTransformationsFromExprValue( const aggExpr = expr as AggrFunc; const merged: Record = {}; - + // Extract lineage from aggregate function arguments if (aggExpr.args?.expr) { const argTransformations = getDirectTransformationsFromExprValue( @@ -466,8 +499,7 @@ export function getDirectTransformationsFromExprValue( mergeTransformations(parentTransformation, { ...DIRECT_TRANSFORMATION, masking: - funcExpr.name.name.length > 0 && - MASKING_FUNCTIONS.has(funcExpr.name.name.at(-1)!.value.toUpperCase()), + funcExpr.name.name.length > 0 && MASKING_FUNCTIONS.has(funcExpr.name.name.at(-1)!.value.toUpperCase()), }), ); Object.entries(argTransformations).forEach(([key, value]) => { @@ -479,9 +511,7 @@ export function getDirectTransformationsFromExprValue( // For window functions (function with OVER clause like RANK(), ROW_NUMBER()), // extract columns from PARTITION BY/ORDER BY since these functions have no arguments if ("over" in funcExpr && funcExpr.over) { - const windowExprs = extractWindowExpressionsFromOver( - (funcExpr as AstFunction & { over: OverClause }).over, - ); + const windowExprs = extractWindowExpressionsFromOver((funcExpr as AstFunction & { over: OverClause }).over); for (const windowExpr of windowExprs) { const windowTransformations = getDirectTransformationsFromExprValue( windowExpr, @@ -647,9 +677,7 @@ function extractInputFieldsFromExpressions( schema: Schema, transformation: Transformation, ): InputField[] { - return expressions.flatMap((expr) => - extractInputFieldsFromExpression(expr, regularTables, schema, transformation), - ); + return expressions.flatMap((expr) => extractInputFieldsFromExpression(expr, regularTables, schema, transformation)); } // ============================================================================ @@ -730,8 +758,51 @@ function normalizeGroupByItems(groupby: Select["groupby"]): ExpressionValue[] { return [groupby as unknown as ExpressionValue]; } +/** + * Build a map of output aliases to their source expressions from SELECT columns. + * This allows ORDER BY alias resolution. + */ +function buildAliasToExpressionMap(select: Select): Map { + const aliasMap = new Map(); + + if (!select.columns || typeof select.columns === "string") { + return aliasMap; + } + + select.columns.forEach((col) => { + if (!isColumn(col)) return; + + const outputName = getOutputColumnName(col); + if (outputName && col.expr) { + aliasMap.set(outputName, col.expr); + } + }); + + return aliasMap; +} + +/** + * Resolve an ORDER BY expression to its underlying column references. + * If the expression is a column reference that matches an alias, resolve it to the aliased expression. + */ +function resolveOrderByExpression(expr: ExpressionValue, aliasMap: Map): ExpressionValue { + // If it's a column_ref, check if it's an alias + if (expr.type === "column_ref") { + const colRef = expr as ColumnRefItem; + const columnName = getInputColumnName(colRef); + + // Only resolve if there's no table qualifier (aliases don't have table qualifiers) + if (columnName && !colRef.table && aliasMap.has(columnName)) { + return aliasMap.get(columnName)!; + } + } + + return expr; +} + /** * Extract ORDER BY lineage (SORT) + * Resolves alias references to their underlying column expressions. */ export function getOrderByLineage(select: Select, schema: Schema): InputField[] { if (!select.orderby) return []; @@ -739,8 +810,22 @@ export function getOrderByLineage(select: Select, schema: Schema): InputField[] const orderByItems = Array.isArray(select.orderby) ? select.orderby : [select.orderby]; const { regularTables } = getTableExpressionsFromSelect(select); - const expressions = orderByItems.map((item) => ("expr" in item ? item.expr : item) as ExpressionValue); - return extractInputFieldsFromExpressions(expressions, regularTables, schema, INDIRECT_SORT); + // Build alias map to resolve ORDER BY alias references + const aliasMap = buildAliasToExpressionMap(select); + + const inputFields: InputField[] = []; + + orderByItems.forEach((item) => { + const expr = ("expr" in item ? item.expr : item) as ExpressionValue; + + // Resolve the expression - if it's an alias, get the underlying expression + const resolvedExpr = resolveOrderByExpression(expr, aliasMap); + + // Extract input fields from the resolved expression + inputFields.push(...extractInputFieldsFromExpression(resolvedExpr, regularTables, schema, INDIRECT_SORT)); + }); + + return inputFields; } /** @@ -787,7 +872,12 @@ export function getHavingLineage(select: Select, schema: Schema): InputField[] { if (!select.having) return []; const { regularTables } = getTableExpressionsFromSelect(select); - return extractInputFieldsFromExpression(select.having as unknown as ExpressionValue, regularTables, schema, INDIRECT_FILTER); + return extractInputFieldsFromExpression( + select.having as unknown as ExpressionValue, + regularTables, + schema, + INDIRECT_FILTER, + ); } // ============================================================================ @@ -859,6 +949,118 @@ export function mergeTransformationSet(parent: TransformationSet, child: Transfo return merged; } +/** + * Expand a star column into individual columns based on schema and FROM clause. + * For "*", returns all columns from all tables in FROM clause. + * For "table.*", returns all columns from that specific table. + */ +export function expandStarColumn(column: AstColumn, select: Select, schema: Schema): AstColumn[] { + if (!isStar(column)) return [column]; + + const tableQualifier = getStarTableQualifier(column); + const { regularTables, selectTables } = getTableExpressionsFromSelect(select); + const expandedColumns: AstColumn[] = []; + + // Process regular tables (from schema) + regularTables.forEach((fromTable) => { + // If there's a table qualifier, skip tables that don't match + if (tableQualifier && tableQualifier !== fromTable.table && tableQualifier !== fromTable.as) { + return; + } + + const schemaTable = schema.tables.find((t) => t.name === fromTable.table); + if (!schemaTable) return; + + for (const colName of schemaTable.columns) { + expandedColumns.push({ + expr: { + type: "column_ref", + table: fromTable.as ?? fromTable.table, + column: colName, + } as ExpressionValue, + as: colName, + }); + } + }); + + // Process subquery/CTE tables + selectTables.forEach((selectTable) => { + // If there's a table qualifier, skip tables that don't match + if (tableQualifier && tableQualifier !== selectTable.as) { + return; + } + + // Get columns from the subquery/CTE + if (selectTable.columns && typeof selectTable.columns !== "string") { + selectTable.columns.forEach((subCol) => { + if (!isColumn(subCol)) return; + + // Handle star in subquery recursively + if (isStar(subCol)) { + const expandedSubCols = expandStarColumn(subCol, selectTable, schema); + expandedSubCols.forEach((expandedSubCol) => { + const outputName = getOutputColumnName(expandedSubCol); + if (outputName) { + expandedColumns.push({ + expr: { + type: "column_ref", + table: selectTable.as ?? null, + column: outputName, + } as unknown as ExpressionValue, + as: outputName, + }); + } + }); + } else { + const outputName = getOutputColumnName(subCol); + if (outputName) { + expandedColumns.push({ + expr: { + type: "column_ref", + table: selectTable.as ?? null, + column: outputName, + } as unknown as ExpressionValue, + as: outputName, + }); + } + } + }); + } + }); + + return expandedColumns; +} + +// ============================================================================ +// Set Operation Helpers +// ============================================================================ + +/** + * Check if a SELECT has set operations (UNION, INTERSECT, EXCEPT) + */ +export function hasSetOperation(select: Select): select is SelectWithSetOp { + return "set_op" in select && select.set_op != null; +} + +/** + * Get all SELECT statements in a set operation chain. + * Returns an array of SELECT statements, where the first element is the base select + * and subsequent elements are the _next selects in the chain. + */ +export function getSetOperationSelects(select: Select): Select[] { + const selects: Select[] = [select]; + + if (hasSetOperation(select)) { + let current: SelectWithSetOp | null | undefined = select._next; + while (current) { + selects.push(current as Select); + current = hasSetOperation(current) ? current._next : null; + } + } + + return selects; +} + // ============================================================================ // Main Lineage Functions // ============================================================================ @@ -934,10 +1136,9 @@ export function getColumnLineage( } /** - * Get all dataset-level indirect lineage (columns that affect the entire result set) - * This includes lineage from CTEs and subqueries that contribute to the final result. + * Get dataset-level indirect lineage for a single SELECT (without following set operations) */ -export function getDatasetLineage(select: Select, schema: Schema): InputField[] { +function getDatasetLineageForSingleSelect(select: Select, schema: Schema): InputField[] { const allIndirectFields: InputField[] = []; // Collect all indirect lineage from the outermost SELECT @@ -954,6 +1155,23 @@ export function getDatasetLineage(select: Select, schema: Schema): InputField[] allIndirectFields.push(...getDatasetLineage(selectTable, schema)); } + return allIndirectFields; +} + +/** + * Get all dataset-level indirect lineage (columns that affect the entire result set) + * This includes lineage from CTEs, subqueries, and set operations (UNION, INTERSECT, EXCEPT) + * that contribute to the final result. + */ +export function getDatasetLineage(select: Select, schema: Schema): InputField[] { + const allIndirectFields: InputField[] = []; + + // Handle set operations (UNION, INTERSECT, EXCEPT) + const setOpSelects = getSetOperationSelects(select); + setOpSelects.forEach((setOpSelect) => { + allIndirectFields.push(...getDatasetLineageForSingleSelect(setOpSelect, schema)); + }); + // Deduplicate by creating a map keyed by namespace.table.field.type.subtype const deduped = new Map(); for (const field of allIndirectFields) { @@ -967,30 +1185,116 @@ export function getDatasetLineage(select: Select, schema: Schema): InputField[] return Array.from(deduped.values()); } +/** + * Get field-level lineage for a single SELECT (without following set operations) + */ +function getLineageForSingleSelect(select: Select, schema: Schema): ColumnLineageDatasetFacet["fields"] { + let unknownCount = 0; + + // Handle the case where columns is the string "*" (entire result is star) + if (typeof select.columns === "string" && select.columns === "*") { + return {}; + } + + return select.columns.reduce( + (acc, column) => { + if (!isColumn(column)) { + return acc; + } + + // Expand star columns into individual columns + if (isStar(column)) { + const expandedColumns = expandStarColumn(column, select, schema); + expandedColumns.forEach((expandedCol) => { + let outputFieldName = getOutputColumnName(expandedCol); + if (!outputFieldName) { + outputFieldName = `unknown_${unknownCount++}`; + } + acc = { + ...acc, + [outputFieldName]: { + inputFields: getColumnLineage(select, schema, expandedCol), + }, + }; + }); + + return acc; + } + + let outputFieldName = getOutputColumnName(column); + + if (!outputFieldName) { + outputFieldName = `unknown_${unknownCount++}`; + } + + return { + ...acc, + [outputFieldName]: { + inputFields: getColumnLineage(select, schema, column), + }, + }; + }, + {} as ColumnLineageDatasetFacet["fields"], + ); +} + +/** + * Merge input fields from multiple sources, deduplicating by field identity + */ +function mergeInputFields(existing: InputField[], incoming: InputField[]): InputField[] { + const hasher = (value: InputField) => { + const transformationsString = + value.transformations?.map((t) => transformationHasher(t as Transformation)).join("-") ?? ""; + return `${value.namespace}-${value.name}-${value.field}-${transformationsString}`; + }; + const mergedMap = new Map(existing.map((field) => [hasher(field), field] as const, {})); + + incoming.forEach((field) => mergedMap.set(hasher(field), field)); + + return [...mergedMap.values()]; +} + /** * Main lineage extraction function - returns field-level lineage only (backward compatible) + * Handles set operations (UNION, INTERSECT, EXCEPT) by merging lineages from all parts. + * Output column names are determined by the first SELECT in the set operation. */ export function getLineage(select: Select, schema: Schema): ColumnLineageDatasetFacet["fields"] { - let unknownCount = 0; + // Get all SELECT statements in the set operation chain + const setOpSelects = getSetOperationSelects(select); - return select.columns.reduce((acc, column) => { - if (!isColumn(column)) { - return acc; - } + // Get lineage from the first SELECT (determines output column names) + const baseLineage = getLineageForSingleSelect(setOpSelects[0]!, schema); + + // If no set operations, return base lineage + if (setOpSelects.length === 1) { + return baseLineage; + } + + // Merge lineages from subsequent SELECTs in the set operation + // Output columns are matched by position, not name + const baseColumns = Object.keys(baseLineage); - let outputFieldName = getOutputColumnName(column); + for (let i = 1; i < setOpSelects.length; i++) { + const nextSelect = setOpSelects[i]!; + const nextLineage = getLineageForSingleSelect(nextSelect, schema); + const nextColumns = Object.keys(nextLineage); - if (!outputFieldName) { - outputFieldName = `unknown_${unknownCount++}`; + // Match columns by position and merge input fields + for (let j = 0; j < baseColumns.length && j < nextColumns.length; j++) { + const baseCol = baseColumns[j]!; + const nextCol = nextColumns[j]!; + + if (baseLineage[baseCol] && nextLineage[nextCol]) { + baseLineage[baseCol]!.inputFields = mergeInputFields( + baseLineage[baseCol]!.inputFields, + nextLineage[nextCol]!.inputFields, + ); + } } + } - return { - ...acc, - [outputFieldName]: { - inputFields: getColumnLineage(select, schema, column), - }, - }; - }, {}); + return baseLineage; } /** diff --git a/packages/lineage/test/extendedLineage.test.ts b/packages/lineage/test/extendedLineage.test.ts index d1d0614..4db15d6 100644 --- a/packages/lineage/test/extendedLineage.test.ts +++ b/packages/lineage/test/extendedLineage.test.ts @@ -188,6 +188,371 @@ describe("getExtendedLineage - JOIN only", () => { expect(joinLineage.length).toBe(2); }); + test("FULL OUTER JOIN", () => { + const sql = ` + SELECT u.id, u.name, o.total + FROM users u + FULL OUTER JOIN orders o ON u.id = o.user_id + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage + expect(result.fields.id?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(result.fields.name?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + expect(result.fields.total?.inputFields).toContainEqual({ + namespace: "trino", + name: "orders", + field: "total", + transformations: [DIRECT_IDENTITY], + }); + + // Dataset-level lineage - JOIN columns from both tables + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(2); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [INDIRECT_JOIN], + }); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "user_id", + transformations: [INDIRECT_JOIN], + }); + }); + + test("FULL JOIN (shorthand)", () => { + const sql = ` + SELECT u.id, o.total + FROM users u + FULL JOIN orders o ON u.id = o.user_id + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(2); + }); + + test("FULL OUTER JOIN with complex ON condition", () => { + const sql = ` + SELECT u.id, u.name, o.total + FROM users u + FULL OUTER JOIN orders o ON u.id = o.user_id AND u.region = o.region + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name", "region"]), + createTable("orders", ["id", "user_id", "region", "total"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(4); // u.id, o.user_id, u.region, o.region + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [INDIRECT_JOIN], + }); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "user_id", + transformations: [INDIRECT_JOIN], + }); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "region", + transformations: [INDIRECT_JOIN], + }); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "region", + transformations: [INDIRECT_JOIN], + }); + }); + + test("FULL OUTER JOIN with WHERE clause", () => { + const sql = ` + SELECT u.id, u.name, o.total + FROM users u + FULL OUTER JOIN orders o ON u.id = o.user_id + WHERE u.status = 'active' OR o.status = 'completed' + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name", "status"]), + createTable("orders", ["id", "user_id", "status", "total"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // JOIN lineage + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(2); + + // FILTER lineage from WHERE clause + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(2); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "status", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "status", + transformations: [INDIRECT_FILTER], + }); + }); + + test("CROSS JOIN", () => { + const sql = ` + SELECT u.id, u.name, p.name as product_name + FROM users u + CROSS JOIN products p + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("products", ["id", "name"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage should work correctly + expect(result.fields.id?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(result.fields.name?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + expect(result.fields.product_name?.inputFields).toContainEqual({ + namespace: "trino", + name: "products", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + + // CROSS JOIN has no ON clause, so no JOIN lineage in dataset + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(0); + }); + + test("CROSS JOIN with WHERE clause", () => { + const sql = ` + SELECT u.id, u.name, p.name as product_name + FROM users u + CROSS JOIN products p + WHERE u.status = 'active' AND p.category = 'electronics' + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name", "status"]), + createTable("products", ["id", "name", "category"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // CROSS JOIN has no ON clause, so no JOIN lineage + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(0); + + // FILTER lineage from WHERE clause should be captured + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(2); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "status", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "products", + field: "category", + transformations: [INDIRECT_FILTER], + }); + }); + + test("CROSS JOIN combined with regular JOIN", () => { + const sql = ` + SELECT u.id, o.total, p.name as product_name + FROM users u + JOIN orders o ON u.id = o.user_id + CROSS JOIN products p + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id", "total"]), + createTable("products", ["id", "name"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage + expect(result.fields.id?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(result.fields.total?.inputFields).toContainEqual({ + namespace: "trino", + name: "orders", + field: "total", + transformations: [DIRECT_IDENTITY], + }); + expect(result.fields.product_name?.inputFields).toContainEqual({ + namespace: "trino", + name: "products", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + + // JOIN lineage only from the regular JOIN (not CROSS JOIN) + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(2); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [INDIRECT_JOIN], + }); + expect(joinLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "user_id", + transformations: [INDIRECT_JOIN], + }); + }); + + test("implicit CROSS JOIN (comma syntax)", () => { + const sql = ` + SELECT u.id, u.name, p.name as product_name + FROM users u, products p + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("products", ["id", "name"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage should work correctly + expect(result.fields.id?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(result.fields.product_name?.inputFields).toContainEqual({ + namespace: "trino", + name: "products", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + + // No JOIN lineage since implicit cross join has no ON clause + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(0); + }); + + test("implicit CROSS JOIN with WHERE acting as JOIN condition", () => { + const sql = ` + SELECT u.id, o.total + FROM users u, orders o + WHERE u.id = o.user_id + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field-level lineage + expect(result.fields.id?.inputFields).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(result.fields.total?.inputFields).toContainEqual({ + namespace: "trino", + name: "orders", + field: "total", + transformations: [DIRECT_IDENTITY], + }); + + // No JOIN lineage (CROSS JOIN has no ON clause) + const joinLineage = findBySubtype(result.dataset, "JOIN"); + expect(joinLineage.length).toBe(0); + + // The WHERE condition is captured as FILTER lineage + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage.length).toBe(2); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "users", + field: "id", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "user_id", + transformations: [INDIRECT_FILTER], + }); + }); + test("multiple JOINs", () => { const sql = ` SELECT u.name, o.total, p.name as product_name @@ -556,6 +921,88 @@ describe("getExtendedLineage - ORDER BY only (SORT)", () => { const sortLineage = findBySubtype(result.dataset, "SORT"); expect(sortLineage.length).toBe(1); }); + + test("ORDER BY alias resolves to base column", () => { + const sql = ` + SELECT country, SUM(revenue) as total_revenue + FROM orders + GROUP BY country + ORDER BY total_revenue DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("orders", ["id", "country", "revenue"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // ORDER BY total_revenue should resolve to the base column 'revenue' used in SUM(revenue) + const sortLineage = findBySubtype(result.dataset, "SORT"); + expect(sortLineage.length).toBe(1); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "revenue", + transformations: [INDIRECT_SORT], + }); + }); + + test("ORDER BY alias with multiple columns in expression", () => { + const sql = ` + SELECT product_id, (quantity * price) as total_value + FROM order_items + ORDER BY total_value DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("order_items", ["id", "product_id", "quantity", "price"])]); + + const result = getExtendedLineage(ast as Select, schema); + + // ORDER BY total_value should resolve to both 'quantity' and 'price' columns + const sortLineage = findBySubtype(result.dataset, "SORT"); + expect(sortLineage.length).toBe(2); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "order_items", + field: "quantity", + transformations: [INDIRECT_SORT], + }); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "order_items", + field: "price", + transformations: [INDIRECT_SORT], + }); + }); + + test("ORDER BY with mix of alias and direct column references", () => { + const sql = ` + SELECT country, SUM(revenue) as total_revenue + FROM orders + GROUP BY country + ORDER BY country ASC, total_revenue DESC + `; + + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("orders", ["id", "country", "revenue"])]); + + const result = getExtendedLineage(ast as Select, schema); + + const sortLineage = findBySubtype(result.dataset, "SORT"); + expect(sortLineage.length).toBe(2); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "country", + transformations: [INDIRECT_SORT], + }); + expect(sortLineage).toContainEqual({ + namespace: "trino", + name: "orders", + field: "revenue", + transformations: [INDIRECT_SORT], + }); + }); }); describe("getExtendedLineage - HAVING only", () => { @@ -2084,3 +2531,183 @@ describe("getExtendedLineage - Transformation type verification", () => { expect(result.fields.total_salary?.inputFields[0]?.transformations).toContainEqual(DIRECT_AGGREGATION); }); }); + +// Helper to parse SQL for PostgreSQL (which supports INTERSECT and EXCEPT) +function parseSQLPostgres(sql: string): AST { + const result = parser.astify(sql, { database: "postgresql" }); + const ast = Array.isArray(result) ? result[0] : result; + + if (!ast) { + throw new Error("Failed to parse SQL"); + } + + return ast; +} + +describe("getExtendedLineage - Set Operations (UNION, INTERSECT, EXCEPT)", () => { + test("UNION with WHERE clauses captures all dataset lineage", () => { + const sql = ` + SELECT id, name FROM users WHERE status = 'active' + UNION + SELECT id, name FROM customers WHERE verified = true + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "name", "status"]), + createTable("customers", ["id", "name", "verified"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field lineage should combine both sources + expect(result.fields.id?.inputFields).toHaveLength(2); + expect(result.fields.name?.inputFields).toHaveLength(2); + + // Dataset lineage should include filters from both queries + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage).toHaveLength(2); + expect(filterLineage).toContainEqual({ + namespace: "postgres", + name: "users", + field: "status", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "postgres", + name: "customers", + field: "verified", + transformations: [INDIRECT_FILTER], + }); + }); + + test("INTERSECT with GROUP BY captures all dataset lineage", () => { + const sql = ` + SELECT department_id FROM employees GROUP BY department_id + INTERSECT + SELECT department_id FROM managers GROUP BY department_id + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("employees", ["id", "department_id"]), + createTable("managers", ["id", "department_id"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Field lineage combines both sources + expect(result.fields.department_id?.inputFields).toHaveLength(2); + + // Dataset lineage should include GROUP BY from both queries + const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); + expect(groupByLineage).toHaveLength(2); + expect(groupByLineage).toContainEqual({ + namespace: "postgres", + name: "employees", + field: "department_id", + transformations: [INDIRECT_GROUP_BY], + }); + expect(groupByLineage).toContainEqual({ + namespace: "postgres", + name: "managers", + field: "department_id", + transformations: [INDIRECT_GROUP_BY], + }); + }); + + test("EXCEPT with ORDER BY captures all dataset lineage", () => { + const sql = ` + SELECT id FROM users ORDER BY created_at + EXCEPT + SELECT id FROM banned_users ORDER BY banned_at + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "created_at"]), + createTable("banned_users", ["id", "banned_at"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Dataset lineage should include ORDER BY from both queries + const sortLineage = findBySubtype(result.dataset, "SORT"); + expect(sortLineage).toHaveLength(2); + expect(sortLineage).toContainEqual({ + namespace: "postgres", + name: "users", + field: "created_at", + transformations: [INDIRECT_SORT], + }); + expect(sortLineage).toContainEqual({ + namespace: "postgres", + name: "banned_users", + field: "banned_at", + transformations: [INDIRECT_SORT], + }); + }); + + test("chained UNION captures dataset lineage from all parts", () => { + const sql = ` + SELECT id FROM users WHERE region = 'US' + UNION + SELECT id FROM customers WHERE region = 'EU' + UNION + SELECT id FROM vendors WHERE region = 'APAC' + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "region"]), + createTable("customers", ["id", "region"]), + createTable("vendors", ["id", "region"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Dataset lineage should include filters from all three queries + const filterLineage = findBySubtype(result.dataset, "FILTER"); + expect(filterLineage).toHaveLength(3); + expect(filterLineage).toContainEqual({ + namespace: "postgres", + name: "users", + field: "region", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "postgres", + name: "customers", + field: "region", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "postgres", + name: "vendors", + field: "region", + transformations: [INDIRECT_FILTER], + }); + }); + + test("UNION with JOINs captures dataset lineage from both parts", () => { + const sql = ` + SELECT u.id, u.name + FROM users u + JOIN orders o ON u.id = o.user_id + UNION + SELECT c.id, c.name + FROM customers c + JOIN purchases p ON c.id = p.customer_id + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id"]), + createTable("customers", ["id", "name"]), + createTable("purchases", ["id", "customer_id"]), + ]); + + const result = getExtendedLineage(ast as Select, schema); + + // Dataset lineage should include JOIN conditions from both queries + const joinLineage = findBySubtype(result.dataset, "JOIN"); + // Each JOIN contributes 2 fields (from ON condition) + expect(joinLineage.length).toBeGreaterThanOrEqual(4); + }); +}); diff --git a/packages/lineage/test/index.test.ts b/packages/lineage/test/index.test.ts index 705a305..d43cb48 100644 --- a/packages/lineage/test/index.test.ts +++ b/packages/lineage/test/index.test.ts @@ -776,4 +776,568 @@ ORDER BY net_revenue DESC`; }, }); }); + + test("select * from single table", () => { + const sql = `SELECT * FROM users`; + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + + const lineage = getLineage(ast as Select, schema); + + expect(lineage).toEqual({ + id: { + inputFields: [ + { + name: "users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + email: { + inputFields: [ + { + name: "users", + namespace: "trino", + field: "email", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("select * from multiple tables (JOIN)", () => { + const sql = `SELECT * FROM users u JOIN orders o ON u.id = o.user_id`; + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + // Note: When both tables have "id", the second one (orders.id) overwrites the first (users.id) + // This is expected behavior since the output column names would conflict + expect(lineage).toEqual({ + id: { + inputFields: [ + { + name: "orders", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + user_id: { + inputFields: [ + { + name: "orders", + namespace: "trino", + field: "user_id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + total: { + inputFields: [ + { + name: "orders", + namespace: "trino", + field: "total", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("select table.* from specific table", () => { + const sql = `SELECT u.* FROM users u JOIN orders o ON u.id = o.user_id`; + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + expect(lineage).toEqual({ + id: { + inputFields: [ + { + name: "users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("select * mixed with specific columns", () => { + const sql = `SELECT o.*, u.name as user_name FROM users u JOIN orders o ON u.id = o.user_id`; + const ast = parseSQL(sql); + const schema = createSchema("trino", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "user_id", "total"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + expect(lineage).toEqual({ + id: { + inputFields: [ + { + name: "orders", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + user_id: { + inputFields: [ + { + name: "orders", + namespace: "trino", + field: "user_id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + total: { + inputFields: [ + { + name: "orders", + namespace: "trino", + field: "total", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + user_name: { + inputFields: [ + { + name: "users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("select * from CTE", () => { + const sql = ` + WITH filtered_users AS ( + SELECT id, name FROM users WHERE active = true + ) + SELECT * FROM filtered_users + `; + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "active"])]); + + const lineage = getLineage(ast as Select, schema); + + expect(lineage).toEqual({ + id: { + inputFields: [ + { + name: "users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("select * from nested subquery", () => { + const sql = `SELECT * FROM (SELECT id, name FROM users) AS subq`; + const ast = parseSQL(sql); + const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + + const lineage = getLineage(ast as Select, schema); + + expect(lineage).toEqual({ + id: { + inputFields: [ + { + name: "users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); +}); + +// Helper to parse SQL for PostgreSQL (which supports INTERSECT and EXCEPT) +function parseSQLPostgres(sql: string): AST { + const result = parser.astify(sql, { database: "postgresql" }); + const ast = Array.isArray(result) ? result[0] : result; + + if (!ast) { + throw new Error("Failed to parse SQL"); + } + + return ast; +} + +describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { + test("simple UNION combines lineage from both queries", () => { + const sql = ` + SELECT id, name FROM users + UNION + SELECT id, name FROM customers + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "name", "email"]), + createTable("customers", ["id", "name", "address"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + // Output columns are from the first SELECT, but input fields include both tables + expect(lineage.id?.inputFields).toHaveLength(2); + expect(lineage.id?.inputFields).toContainEqual({ + name: "users", + namespace: "postgres", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(lineage.id?.inputFields).toContainEqual({ + name: "customers", + namespace: "postgres", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + + expect(lineage.name?.inputFields).toHaveLength(2); + expect(lineage.name?.inputFields).toContainEqual({ + name: "users", + namespace: "postgres", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + expect(lineage.name?.inputFields).toContainEqual({ + name: "customers", + namespace: "postgres", + field: "name", + transformations: [DIRECT_IDENTITY], + }); + }); + + test("UNION ALL combines lineage from both queries", () => { + const sql = ` + SELECT id FROM users + UNION ALL + SELECT id FROM orders + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "product"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + expect(lineage.id?.inputFields).toHaveLength(2); + expect(lineage.id?.inputFields).toContainEqual({ + name: "users", + namespace: "postgres", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(lineage.id?.inputFields).toContainEqual({ + name: "orders", + namespace: "postgres", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + }); + + test("INTERSECT combines lineage from both queries", () => { + const sql = ` + SELECT id FROM users + INTERSECT + SELECT id FROM premium_users + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "name"]), + createTable("premium_users", ["id", "tier"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + expect(lineage.id?.inputFields).toHaveLength(2); + expect(lineage.id?.inputFields).toContainEqual({ + name: "users", + namespace: "postgres", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(lineage.id?.inputFields).toContainEqual({ + name: "premium_users", + namespace: "postgres", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + }); + + test("EXCEPT combines lineage from both queries", () => { + const sql = ` + SELECT id FROM users + EXCEPT + SELECT id FROM banned_users + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "name"]), + createTable("banned_users", ["id", "reason"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + expect(lineage.id?.inputFields).toHaveLength(2); + expect(lineage.id?.inputFields).toContainEqual({ + name: "users", + namespace: "postgres", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(lineage.id?.inputFields).toContainEqual({ + name: "banned_users", + namespace: "postgres", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + }); + + test("chained UNION combines lineage from all queries", () => { + const sql = ` + SELECT id, name FROM users + UNION + SELECT id, name FROM customers + UNION + SELECT id, name FROM vendors + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "name"]), + createTable("customers", ["id", "name"]), + createTable("vendors", ["id", "name"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + // All three tables contribute to the lineage + expect(lineage.id?.inputFields).toHaveLength(3); + expect(lineage.name?.inputFields).toHaveLength(3); + }); + + test("UNION with aliases preserves first SELECT column names", () => { + const sql = ` + SELECT id AS user_id, name AS full_name FROM users + UNION + SELECT customer_id, customer_name FROM customers + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "name"]), + createTable("customers", ["customer_id", "customer_name"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + // Output columns should be named according to the first SELECT + expect(Object.keys(lineage)).toEqual(["user_id", "full_name"]); + expect(lineage.user_id?.inputFields).toContainEqual({ + name: "users", + namespace: "postgres", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + expect(lineage.user_id?.inputFields).toContainEqual({ + name: "customers", + namespace: "postgres", + field: "customer_id", + transformations: [DIRECT_IDENTITY], + }); + }); + + test("UNION with transformations", () => { + const sql = ` + SELECT UPPER(name) AS name FROM users + UNION + SELECT LOWER(name) AS name FROM customers + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "name"]), + createTable("customers", ["id", "name"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + // Both inputs should have TRANSFORMATION type + expect(lineage.name?.inputFields).toHaveLength(2); + expect(lineage.name?.inputFields).toContainEqual({ + name: "users", + namespace: "postgres", + field: "name", + transformations: [DIRECT_TRANSFORMATION], + }); + expect(lineage.name?.inputFields).toContainEqual({ + name: "customers", + namespace: "postgres", + field: "name", + transformations: [DIRECT_TRANSFORMATION], + }); + }); + + test("UNION with aggregation", () => { + const sql = ` + SELECT SUM(amount) AS total FROM sales + UNION + SELECT SUM(amount) AS total FROM refunds + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("sales", ["id", "amount"]), + createTable("refunds", ["id", "amount"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + expect(lineage.total?.inputFields).toHaveLength(2); + expect(lineage.total?.inputFields).toContainEqual({ + name: "sales", + namespace: "postgres", + field: "amount", + transformations: [DIRECT_AGGREGATION], + }); + expect(lineage.total?.inputFields).toContainEqual({ + name: "refunds", + namespace: "postgres", + field: "amount", + transformations: [DIRECT_AGGREGATION], + }); + }); + + test("UNION with different column expressions", () => { + const sql = ` + SELECT id, first_name || ' ' || last_name AS full_name FROM users + UNION + SELECT id, company_name FROM customers + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "first_name", "last_name"]), + createTable("customers", ["id", "company_name"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + // First SELECT contributes first_name and last_name, second contributes company_name + expect(lineage.full_name?.inputFields.length).toBeGreaterThanOrEqual(3); + }); + + test("UNION with subqueries", () => { + const sql = ` + SELECT id FROM (SELECT id FROM users WHERE active = true) AS active_users + UNION + SELECT id FROM (SELECT id FROM customers WHERE verified = true) AS verified_customers + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [ + createTable("users", ["id", "active"]), + createTable("customers", ["id", "verified"]), + ]); + + const lineage = getLineage(ast as Select, schema); + + expect(lineage.id?.inputFields).toHaveLength(2); + }); + + test("UNION deduplicates identical input fields", () => { + const sql = ` + SELECT id FROM users + UNION + SELECT id FROM users + `; + const ast = parseSQLPostgres(sql); + const schema = createSchema("postgres", [createTable("users", ["id", "name"])]); + + const lineage = getLineage(ast as Select, schema); + + // Same table appears in both SELECTs, but should be deduplicated + expect(lineage.id?.inputFields).toHaveLength(1); + expect(lineage.id?.inputFields).toContainEqual({ + name: "users", + namespace: "postgres", + field: "id", + transformations: [DIRECT_IDENTITY], + }); + }); }); From 0618275a9a80eb3bcf1c488815604a0b7eb1a137 Mon Sep 17 00:00:00 2001 From: its-elad <59926027+its-elad@users.noreply.github.com> Date: Sun, 1 Feb 2026 16:23:38 +0200 Subject: [PATCH 05/10] feat: added support for schema.table --- packages/lineage/src/index.ts | 163 +++++--- packages/lineage/test/extendedLineage.test.ts | 146 +++---- packages/lineage/test/index.test.ts | 383 ++++++++++++++++-- 3 files changed, 529 insertions(+), 163 deletions(-) diff --git a/packages/lineage/src/index.ts b/packages/lineage/src/index.ts index b245768..e5ee3b4 100644 --- a/packages/lineage/src/index.ts +++ b/packages/lineage/src/index.ts @@ -162,15 +162,21 @@ export type Column = { }; export type Table = { - name: string; + name: string; // Format: schemaName.tableName columns: string[]; }; -export type Schema = { +export type Namespace = { namespace: string; - tables: Table[]; + tables?: Table[]; + defaultSchema?: string; }; +/** + * @deprecated Use Namespace instead + */ +export type Schema = Namespace; + export type InputColumn = { name: string; table?: string; @@ -248,6 +254,39 @@ export function parseInputColumnName(column: string): InputColumn { return { name, table }; } +/** + * Parse a fully qualified table name (schemaName.tableName) into its parts + */ +export function parseTableName(tableName: string): { schema: string; table: string } { + const parts = tableName.split("."); + if (parts.length === 1) { + return { schema: "", table: parts[0]! }; + } + return { schema: parts[0]!, table: parts.slice(1).join(".") }; +} + +/** + * Check if an AST table reference matches a schema table + * Takes into account the db property from AST and the defaultSchema from namespace + */ +function astTableMatchesSchemaTable( + astTable: BaseFrom, + schemaTableName: string, + defaultSchema?: string, +): boolean { + const parsed = parseTableName(schemaTableName); + const astDb = (astTable as BaseFrom & { db?: string }).db; + const effectiveAstSchema = astDb || defaultSchema || ""; + + // Compare schema (or default schema if not specified) + if (parsed.schema && effectiveAstSchema && parsed.schema !== effectiveAstSchema) { + return false; + } + + // Compare table name + return parsed.table === astTable.table; +} + export function getInputColumnName(column: ColumnRefItem): string | null { return typeof column.column === "string" ? column.column @@ -610,33 +649,34 @@ export function getIndirectTransformationsFromExpr( // ============================================================================ /** - * Resolves a column reference to an InputField by finding the matching table in schema. + * Resolves a column reference to an InputField by finding the matching table in namespace. * This is the core helper that eliminates repetitive table lookup logic. */ function resolveColumnRefToInputField( ref: ColumnRefItem, regularTables: BaseFrom[], - schema: Schema, + namespace: Namespace, transformation: Transformation, ): InputField | null { const columnName = getInputColumnName(ref); const tableName = ref.table; if (!columnName) return null; + if (!namespace.tables) return null; const table = regularTables.find( (t) => (!tableName || tableName === t.table || tableName === t.as) && - schema.tables.some((s) => s.name === t.table && s.columns.includes(columnName)), + namespace.tables!.some((s) => astTableMatchesSchemaTable(t, s.name, namespace.defaultSchema) && s.columns.includes(columnName)), ); if (!table) return null; - const schemaTable = schema.tables.find((s) => s.name === table.table); + const schemaTable = namespace.tables.find((s) => astTableMatchesSchemaTable(table, s.name, namespace.defaultSchema)); if (!schemaTable) return null; return { - namespace: schema.namespace, + namespace: namespace.namespace, name: schemaTable.name, field: columnName, transformations: [transformation], @@ -650,7 +690,7 @@ function resolveColumnRefToInputField( function extractInputFieldsFromExpression( expr: ExpressionValue | null | undefined, regularTables: BaseFrom[], - schema: Schema, + namespace: Namespace, transformation: Transformation, ): InputField[] { if (!expr) return []; @@ -659,7 +699,7 @@ function extractInputFieldsFromExpression( const inputFields: InputField[] = []; for (const ref of columnRefs) { - const inputField = resolveColumnRefToInputField(ref, regularTables, schema, transformation); + const inputField = resolveColumnRefToInputField(ref, regularTables, namespace, transformation); if (inputField) { inputFields.push(inputField); } @@ -674,10 +714,10 @@ function extractInputFieldsFromExpression( function extractInputFieldsFromExpressions( expressions: (ExpressionValue | null | undefined)[], regularTables: BaseFrom[], - schema: Schema, + namespace: Namespace, transformation: Transformation, ): InputField[] { - return expressions.flatMap((expr) => extractInputFieldsFromExpression(expr, regularTables, schema, transformation)); + return expressions.flatMap((expr) => extractInputFieldsFromExpression(expr, regularTables, namespace, transformation)); } // ============================================================================ @@ -687,8 +727,9 @@ function extractInputFieldsFromExpressions( /** * Extract JOIN lineage from FROM clause (ON and USING conditions) */ -export function getJoinLineage(select: Select, schema: Schema): InputField[] { +export function getJoinLineage(select: Select, namespace: Namespace): InputField[] { if (!select.from) return []; + if (!namespace.tables) return []; const fromItems = Array.isArray(select.from) ? select.from : [select.from]; const { regularTables } = getTableExpressionsFromSelect(select); @@ -698,17 +739,21 @@ export function getJoinLineage(select: Select, schema: Schema): InputField[] { // Handle ON clause if ("on" in item && item.on) { inputFields.push( - ...extractInputFieldsFromExpression(item.on as ExpressionValue, regularTables, schema, INDIRECT_JOIN), + ...extractInputFieldsFromExpression(item.on as ExpressionValue, regularTables, namespace, INDIRECT_JOIN), ); } // Handle USING clause - columns exist in multiple tables if ("using" in item && Array.isArray(item.using)) { for (const usingCol of item.using) { - for (const schemaTable of schema.tables) { - if (schemaTable.columns.includes(usingCol)) { + // Find tables that match the FROM clause and have this column + for (const schemaTable of namespace.tables) { + const matchingFromTable = regularTables.find((t) => + astTableMatchesSchemaTable(t, schemaTable.name, namespace.defaultSchema) + ); + if (matchingFromTable && schemaTable.columns.includes(usingCol)) { inputFields.push({ - namespace: schema.namespace, + namespace: namespace.namespace, name: schemaTable.name, field: usingCol, transformations: [INDIRECT_JOIN], @@ -725,24 +770,24 @@ export function getJoinLineage(select: Select, schema: Schema): InputField[] { /** * Extract WHERE clause lineage (FILTER) */ -export function getFilterLineage(select: Select, schema: Schema): InputField[] { +export function getFilterLineage(select: Select, namespace: Namespace): InputField[] { if (!select.where) return []; const { regularTables } = getTableExpressionsFromSelect(select); - return extractInputFieldsFromExpression(select.where as ExpressionValue, regularTables, schema, INDIRECT_FILTER); + return extractInputFieldsFromExpression(select.where as ExpressionValue, regularTables, namespace, INDIRECT_FILTER); } /** * Extract GROUP BY lineage */ -export function getGroupByLineage(select: Select, schema: Schema): InputField[] { +export function getGroupByLineage(select: Select, namespace: Namespace): InputField[] { if (!select.groupby) return []; // Normalize GROUP BY to array format const groupByItems = normalizeGroupByItems(select.groupby); const { regularTables } = getTableExpressionsFromSelect(select); - return extractInputFieldsFromExpressions(groupByItems, regularTables, schema, INDIRECT_GROUP_BY); + return extractInputFieldsFromExpressions(groupByItems, regularTables, namespace, INDIRECT_GROUP_BY); } /** @@ -804,7 +849,7 @@ function resolveOrderByExpression(expr: ExpressionValue, aliasMap: Map { // If there's a table qualifier, skip tables that don't match if (tableQualifier && tableQualifier !== fromTable.table && tableQualifier !== fromTable.as) { return; } - const schemaTable = schema.tables.find((t) => t.name === fromTable.table); + const schemaTable = namespace.tables!.find((t) => astTableMatchesSchemaTable(fromTable, t.name, namespace.defaultSchema)); if (!schemaTable) return; for (const colName of schemaTable.columns) { @@ -997,7 +1043,7 @@ export function expandStarColumn(column: AstColumn, select: Select, schema: Sche // Handle star in subquery recursively if (isStar(subCol)) { - const expandedSubCols = expandStarColumn(subCol, selectTable, schema); + const expandedSubCols = expandStarColumn(subCol, selectTable, namespace); expandedSubCols.forEach((expandedSubCol) => { const outputName = getOutputColumnName(expandedSubCol); if (outputName) { @@ -1067,7 +1113,7 @@ export function getSetOperationSelects(select: Select): Select[] { export function getColumnLineage( select: Select, - schema: Schema, + namespace: Namespace, column: AstColumn, transformations?: TransformationSet, ): InputField[] { @@ -1088,19 +1134,22 @@ export function getColumnLineage( const inputFields: InputField[] = []; + if (!namespace.tables) return inputFields; + for (const [inputColumnName, transformations] of Object.entries(transformationsByColumns)) { const inputColumn = parseInputColumnName(inputColumnName); const table = regularTables.find( (t) => (!inputColumn.table || inputColumn.table === t.table || inputColumn.table === t.as) && - schema.tables.some((s) => s.name === t.table && s.columns.includes(inputColumn.name)), + namespace.tables!.some((s) => astTableMatchesSchemaTable(t, s.name, namespace.defaultSchema) && s.columns.includes(inputColumn.name)), ); if (table) { + const schemaTable = namespace.tables.find((s) => astTableMatchesSchemaTable(table, s.name, namespace.defaultSchema)); inputFields.push({ - namespace: schema.namespace, - name: table.table, + namespace: namespace.namespace, + name: schemaTable!.name, field: inputColumn.name, transformations: Array.from(transformations), }); @@ -1127,7 +1176,7 @@ export function getColumnLineage( } } - inputFields.push(...getColumnLineage(selectTable, schema, nextColumn, transformations)); + inputFields.push(...getColumnLineage(selectTable, namespace, nextColumn, transformations)); } } } @@ -1138,21 +1187,21 @@ export function getColumnLineage( /** * Get dataset-level indirect lineage for a single SELECT (without following set operations) */ -function getDatasetLineageForSingleSelect(select: Select, schema: Schema): InputField[] { +function getDatasetLineageForSingleSelect(select: Select, namespace: Namespace): InputField[] { const allIndirectFields: InputField[] = []; // Collect all indirect lineage from the outermost SELECT - allIndirectFields.push(...getJoinLineage(select, schema)); - allIndirectFields.push(...getFilterLineage(select, schema)); - allIndirectFields.push(...getGroupByLineage(select, schema)); - allIndirectFields.push(...getOrderByLineage(select, schema)); - allIndirectFields.push(...getWindowLineage(select, schema)); - allIndirectFields.push(...getHavingLineage(select, schema)); + allIndirectFields.push(...getJoinLineage(select, namespace)); + allIndirectFields.push(...getFilterLineage(select, namespace)); + allIndirectFields.push(...getGroupByLineage(select, namespace)); + allIndirectFields.push(...getOrderByLineage(select, namespace)); + allIndirectFields.push(...getWindowLineage(select, namespace)); + allIndirectFields.push(...getHavingLineage(select, namespace)); // Recursively collect dataset lineage from CTEs and subqueries const { selectTables } = getTableExpressionsFromSelect(select); for (const selectTable of selectTables) { - allIndirectFields.push(...getDatasetLineage(selectTable, schema)); + allIndirectFields.push(...getDatasetLineage(selectTable, namespace)); } return allIndirectFields; @@ -1163,13 +1212,13 @@ function getDatasetLineageForSingleSelect(select: Select, schema: Schema): Input * This includes lineage from CTEs, subqueries, and set operations (UNION, INTERSECT, EXCEPT) * that contribute to the final result. */ -export function getDatasetLineage(select: Select, schema: Schema): InputField[] { +export function getDatasetLineage(select: Select, namespace: Namespace): InputField[] { const allIndirectFields: InputField[] = []; // Handle set operations (UNION, INTERSECT, EXCEPT) const setOpSelects = getSetOperationSelects(select); setOpSelects.forEach((setOpSelect) => { - allIndirectFields.push(...getDatasetLineageForSingleSelect(setOpSelect, schema)); + allIndirectFields.push(...getDatasetLineageForSingleSelect(setOpSelect, namespace)); }); // Deduplicate by creating a map keyed by namespace.table.field.type.subtype @@ -1188,7 +1237,7 @@ export function getDatasetLineage(select: Select, schema: Schema): InputField[] /** * Get field-level lineage for a single SELECT (without following set operations) */ -function getLineageForSingleSelect(select: Select, schema: Schema): ColumnLineageDatasetFacet["fields"] { +function getLineageForSingleSelect(select: Select, namespace: Namespace): ColumnLineageDatasetFacet["fields"] { let unknownCount = 0; // Handle the case where columns is the string "*" (entire result is star) @@ -1204,7 +1253,7 @@ function getLineageForSingleSelect(select: Select, schema: Schema): ColumnLineag // Expand star columns into individual columns if (isStar(column)) { - const expandedColumns = expandStarColumn(column, select, schema); + const expandedColumns = expandStarColumn(column, select, namespace); expandedColumns.forEach((expandedCol) => { let outputFieldName = getOutputColumnName(expandedCol); if (!outputFieldName) { @@ -1213,7 +1262,7 @@ function getLineageForSingleSelect(select: Select, schema: Schema): ColumnLineag acc = { ...acc, [outputFieldName]: { - inputFields: getColumnLineage(select, schema, expandedCol), + inputFields: getColumnLineage(select, namespace, expandedCol), }, }; }); @@ -1230,7 +1279,7 @@ function getLineageForSingleSelect(select: Select, schema: Schema): ColumnLineag return { ...acc, [outputFieldName]: { - inputFields: getColumnLineage(select, schema, column), + inputFields: getColumnLineage(select, namespace, column), }, }; }, @@ -1259,12 +1308,12 @@ function mergeInputFields(existing: InputField[], incoming: InputField[]): Input * Handles set operations (UNION, INTERSECT, EXCEPT) by merging lineages from all parts. * Output column names are determined by the first SELECT in the set operation. */ -export function getLineage(select: Select, schema: Schema): ColumnLineageDatasetFacet["fields"] { +export function getLineage(select: Select, namespace: Namespace): ColumnLineageDatasetFacet["fields"] { // Get all SELECT statements in the set operation chain const setOpSelects = getSetOperationSelects(select); // Get lineage from the first SELECT (determines output column names) - const baseLineage = getLineageForSingleSelect(setOpSelects[0]!, schema); + const baseLineage = getLineageForSingleSelect(setOpSelects[0]!, namespace); // If no set operations, return base lineage if (setOpSelects.length === 1) { @@ -1277,7 +1326,7 @@ export function getLineage(select: Select, schema: Schema): ColumnLineageDataset for (let i = 1; i < setOpSelects.length; i++) { const nextSelect = setOpSelects[i]!; - const nextLineage = getLineageForSingleSelect(nextSelect, schema); + const nextLineage = getLineageForSingleSelect(nextSelect, namespace); const nextColumns = Object.keys(nextLineage); // Match columns by position and merge input fields @@ -1302,10 +1351,10 @@ export function getLineage(select: Select, schema: Schema): ColumnLineageDataset */ export function getExtendedLineage( select: Select, - schema: Schema, + namespace: Namespace, ): Pick { return { - fields: getLineage(select, schema), - dataset: getDatasetLineage(select, schema), + fields: getLineage(select, namespace), + dataset: getDatasetLineage(select, namespace), }; } diff --git a/packages/lineage/test/extendedLineage.test.ts b/packages/lineage/test/extendedLineage.test.ts index 4db15d6..2c85a3d 100644 --- a/packages/lineage/test/extendedLineage.test.ts +++ b/packages/lineage/test/extendedLineage.test.ts @@ -3,7 +3,7 @@ import { Parser } from "node-sql-parser"; import type { AST, Select } from "node-sql-parser"; import { getExtendedLineage, - type Schema, + type Namespace, type Table, INDIRECT_JOIN, INDIRECT_FILTER, @@ -17,9 +17,9 @@ import { const parser = new Parser(); -// Helper function to create schemas -function createSchema(namespace: string, tables: Table[]): Schema { - return { namespace, tables }; +// Helper function to create namespaces +function createNamespace(namespace: string, tables: Table[], defaultSchema?: string): Namespace { + return { namespace, tables, defaultSchema }; } function createTable(name: string, columns: string[]): Table { @@ -61,7 +61,7 @@ describe("getExtendedLineage - Simple SELECT (no indirect lineage)", () => { const sql = `SELECT id, name FROM users`; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); const result = getExtendedLineage(ast as Select, schema); @@ -89,7 +89,7 @@ describe("getExtendedLineage - Simple SELECT (no indirect lineage)", () => { const sql = `SELECT id as user_id, name as user_name FROM users`; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name"])]); const result = getExtendedLineage(ast as Select, schema); @@ -118,7 +118,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -158,7 +158,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -177,7 +177,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -196,7 +196,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -248,7 +248,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -267,7 +267,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name", "region"]), createTable("orders", ["id", "user_id", "region", "total"]), ]); @@ -311,7 +311,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name", "status"]), createTable("orders", ["id", "user_id", "status", "total"]), ]); @@ -347,7 +347,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("products", ["id", "name"]), ]); @@ -388,7 +388,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name", "status"]), createTable("products", ["id", "name", "category"]), ]); @@ -425,7 +425,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "user_id", "total"]), createTable("products", ["id", "name"]), @@ -477,7 +477,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("products", ["id", "name"]), ]); @@ -511,7 +511,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -562,7 +562,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "user_id", "product_id", "total"]), createTable("products", ["id", "name"]), @@ -582,7 +582,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "status"]), createTable("orders", ["id", "user_id", "status", "total"]), ]); @@ -601,7 +601,7 @@ describe("getExtendedLineage - JOIN only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("employees", ["id", "name", "manager_id"])]); + const schema = createNamespace("trino", [createTable("employees", ["id", "name", "manager_id"])]); const result = getExtendedLineage(ast as Select, schema); @@ -619,7 +619,7 @@ describe("getExtendedLineage - WHERE only (FILTER)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "status"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "status"])]); const result = getExtendedLineage(ast as Select, schema); @@ -641,7 +641,7 @@ describe("getExtendedLineage - WHERE only (FILTER)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "status", "age"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "status", "age"])]); const result = getExtendedLineage(ast as Select, schema); @@ -669,7 +669,7 @@ describe("getExtendedLineage - WHERE only (FILTER)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "status", "country"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "status", "country"])]); const result = getExtendedLineage(ast as Select, schema); @@ -685,7 +685,7 @@ describe("getExtendedLineage - WHERE only (FILTER)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name", "status", "age", "country", "verified"]), ]); @@ -703,7 +703,7 @@ describe("getExtendedLineage - WHERE only (FILTER)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "country"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "country"])]); const result = getExtendedLineage(ast as Select, schema); @@ -725,7 +725,7 @@ describe("getExtendedLineage - WHERE only (FILTER)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "age"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "age"])]); const result = getExtendedLineage(ast as Select, schema); @@ -741,7 +741,7 @@ describe("getExtendedLineage - WHERE only (FILTER)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name"])]); const result = getExtendedLineage(ast as Select, schema); @@ -757,7 +757,7 @@ describe("getExtendedLineage - WHERE only (FILTER)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); const result = getExtendedLineage(ast as Select, schema); @@ -775,7 +775,7 @@ describe("getExtendedLineage - GROUP BY only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "country"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "country"])]); const result = getExtendedLineage(ast as Select, schema); @@ -797,7 +797,7 @@ describe("getExtendedLineage - GROUP BY only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "country", "city"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "country", "city"])]); const result = getExtendedLineage(ast as Select, schema); @@ -829,7 +829,7 @@ describe("getExtendedLineage - GROUP BY only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("employees", ["id", "department", "salary", "age", "hire_date"]), ]); @@ -864,7 +864,7 @@ describe("getExtendedLineage - ORDER BY only (SORT)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "created_at"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "created_at"])]); const result = getExtendedLineage(ast as Select, schema); @@ -886,7 +886,7 @@ describe("getExtendedLineage - ORDER BY only (SORT)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "country"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "country"])]); const result = getExtendedLineage(ast as Select, schema); @@ -914,7 +914,7 @@ describe("getExtendedLineage - ORDER BY only (SORT)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); const result = getExtendedLineage(ast as Select, schema); @@ -931,7 +931,7 @@ describe("getExtendedLineage - ORDER BY only (SORT)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("orders", ["id", "country", "revenue"])]); + const schema = createNamespace("trino", [createTable("orders", ["id", "country", "revenue"])]); const result = getExtendedLineage(ast as Select, schema); @@ -954,7 +954,7 @@ describe("getExtendedLineage - ORDER BY only (SORT)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("order_items", ["id", "product_id", "quantity", "price"])]); + const schema = createNamespace("trino", [createTable("order_items", ["id", "product_id", "quantity", "price"])]); const result = getExtendedLineage(ast as Select, schema); @@ -984,7 +984,7 @@ describe("getExtendedLineage - ORDER BY only (SORT)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("orders", ["id", "country", "revenue"])]); + const schema = createNamespace("trino", [createTable("orders", ["id", "country", "revenue"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1015,7 +1015,7 @@ describe("getExtendedLineage - HAVING only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "country"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "country"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1035,7 +1035,7 @@ describe("getExtendedLineage - HAVING only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("employees", ["id", "department", "salary"])]); + const schema = createNamespace("trino", [createTable("employees", ["id", "department", "salary"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1062,7 +1062,7 @@ describe("getExtendedLineage - HAVING only", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("employees", ["id", "department", "age"])]); + const schema = createNamespace("trino", [createTable("employees", ["id", "department", "age"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1082,7 +1082,7 @@ describe("getExtendedLineage - WINDOW functions", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("employees", ["id", "department", "salary"])]); + const schema = createNamespace("trino", [createTable("employees", ["id", "department", "salary"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1134,7 +1134,7 @@ describe("getExtendedLineage - WINDOW functions", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("transactions", ["id", "amount", "date", "status"])]); + const schema = createNamespace("trino", [createTable("transactions", ["id", "amount", "date", "status"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1168,7 +1168,7 @@ describe("getExtendedLineage - WINDOW functions", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("orders", ["id", "category", "created_at", "amount", "user_id"])]); + const schema = createNamespace("trino", [createTable("orders", ["id", "category", "created_at", "amount", "user_id"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1210,7 +1210,7 @@ describe("getExtendedLineage - CASE expressions (CONDITION)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "status"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "status"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1237,7 +1237,7 @@ describe("getExtendedLineage - CASE expressions (CONDITION)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "age"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "age"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1257,7 +1257,7 @@ describe("getExtendedLineage - CASE expressions (CONDITION)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("products", ["id", "price", "discount_type", "discount_value"])]); + const schema = createNamespace("trino", [createTable("products", ["id", "price", "discount_type", "discount_value"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1282,7 +1282,7 @@ describe("getExtendedLineage - JOIN + WHERE", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name", "status"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -1324,7 +1324,7 @@ describe("getExtendedLineage - JOIN + GROUP BY", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "country"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -1366,7 +1366,7 @@ describe("getExtendedLineage - WHERE + GROUP BY + HAVING", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("employees", ["id", "department", "salary", "status"])]); + const schema = createNamespace("trino", [createTable("employees", ["id", "department", "salary", "status"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1407,7 +1407,7 @@ describe("getExtendedLineage - Full query with all clauses", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "country", "status"]), createTable("orders", ["id", "user_id", "total", "order_date"]), ]); @@ -1453,7 +1453,7 @@ describe("getExtendedLineage - Full query with all clauses", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("employees", ["id", "name", "department_id", "salary", "status"]), createTable("departments", ["id", "name"]), ]); @@ -1532,7 +1532,7 @@ describe("getExtendedLineage - WITH clause (CTEs)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "country", "status"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "country", "status"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1578,7 +1578,7 @@ describe("getExtendedLineage - WITH clause (CTEs)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name", "country", "status"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -1671,7 +1671,7 @@ describe("getExtendedLineage - WITH clause (CTEs)", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("sales", ["id", "product_id", "store_id", "quantity", "price", "sale_date"]), createTable("stores", ["id", "name"]), ]); @@ -1756,7 +1756,7 @@ describe("getExtendedLineage - Subqueries", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "country", "status"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "country", "status"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1786,7 +1786,7 @@ describe("getExtendedLineage - Edge cases", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "status"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "status"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1822,7 +1822,7 @@ describe("getExtendedLineage - Edge cases", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name", "favorite_product_id"]), createTable("products", ["id", "name"]), ]); @@ -1867,7 +1867,7 @@ describe("getExtendedLineage - Edge cases", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "status"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "status"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1881,7 +1881,7 @@ describe("getExtendedLineage - Edge cases", () => { const sql = `SELECT id, UPPER(name) as upper_name FROM users`; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1900,7 +1900,7 @@ describe("getExtendedLineage - Edge cases", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "email", "ssn", "phone"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "email", "ssn", "phone"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1918,7 +1918,7 @@ describe("getExtendedLineage - Edge cases", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "country"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "country"])]); const result = getExtendedLineage(ast as Select, schema); @@ -1955,7 +1955,7 @@ describe("getExtendedLineage - Real-world complex queries", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("categories", ["id", "name"]), createTable("products", ["id", "name", "category_id"]), createTable("order_items", ["id", "order_id", "product_id", "quantity", "unit_price"]), @@ -2154,7 +2154,7 @@ describe("getExtendedLineage - Real-world complex queries", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("employees", ["id", "department_id", "salary", "status"]), createTable("departments", ["id", "name"]), ]); @@ -2245,7 +2245,7 @@ describe("getExtendedLineage - Real-world complex queries", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("events", ["id", "event_date", "event_type", "user_id"])]); + const schema = createNamespace("trino", [createTable("events", ["id", "event_date", "event_type", "user_id"])]); const result = getExtendedLineage(ast as Select, schema); @@ -2356,7 +2356,7 @@ describe("getExtendedLineage - Real-world complex queries", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("sales", ["id", "store_id", "sale_timestamp", "amount"]), createTable("stores", ["id", "name", "region"]), ]); @@ -2459,7 +2459,7 @@ describe("getExtendedLineage - Transformation type verification", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "country", "status"]), createTable("orders", ["id", "user_id"]), ]); @@ -2505,7 +2505,7 @@ describe("getExtendedLineage - Transformation type verification", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("employees", ["id", "name", "email", "age", "status", "salary"]), ]); @@ -2552,7 +2552,7 @@ describe("getExtendedLineage - Set Operations (UNION, INTERSECT, EXCEPT)", () => SELECT id, name FROM customers WHERE verified = true `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "name", "status"]), createTable("customers", ["id", "name", "verified"]), ]); @@ -2587,7 +2587,7 @@ describe("getExtendedLineage - Set Operations (UNION, INTERSECT, EXCEPT)", () => SELECT department_id FROM managers GROUP BY department_id `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("employees", ["id", "department_id"]), createTable("managers", ["id", "department_id"]), ]); @@ -2621,7 +2621,7 @@ describe("getExtendedLineage - Set Operations (UNION, INTERSECT, EXCEPT)", () => SELECT id FROM banned_users ORDER BY banned_at `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "created_at"]), createTable("banned_users", ["id", "banned_at"]), ]); @@ -2654,7 +2654,7 @@ describe("getExtendedLineage - Set Operations (UNION, INTERSECT, EXCEPT)", () => SELECT id FROM vendors WHERE region = 'APAC' `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "region"]), createTable("customers", ["id", "region"]), createTable("vendors", ["id", "region"]), @@ -2696,7 +2696,7 @@ describe("getExtendedLineage - Set Operations (UNION, INTERSECT, EXCEPT)", () => JOIN purchases p ON c.id = p.customer_id `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "user_id"]), createTable("customers", ["id", "name"]), diff --git a/packages/lineage/test/index.test.ts b/packages/lineage/test/index.test.ts index d43cb48..4674e32 100644 --- a/packages/lineage/test/index.test.ts +++ b/packages/lineage/test/index.test.ts @@ -3,7 +3,7 @@ import { Parser } from "node-sql-parser"; import type { AST, Select } from "node-sql-parser"; import { getLineage, - type Schema, + type Namespace, type Table, DIRECT_AGGREGATION, DIRECT_IDENTITY, @@ -12,9 +12,9 @@ import { const parser = new Parser(); -// Helper function to create schemas -function createSchema(namespace: string, tables: Table[]): Schema { - return { namespace, tables }; +// Helper function to create namespaces +function createNamespace(namespace: string, tables: Table[], defaultSchema?: string): Namespace { + return { namespace, tables, defaultSchema }; } function createTable(name: string, columns: string[]): Table { @@ -48,7 +48,7 @@ describe("Select Lineage", () => { FROM u `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); const lineage = getLineage(ast as Select, schema); @@ -86,7 +86,7 @@ describe("Select Lineage", () => { name as wow FROM (SELECT * FROM u) AS t`; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); const lineage = getLineage(ast as Select, schema); @@ -152,7 +152,7 @@ describe("Select Lineage", () => { `; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name", "email", "status"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -247,7 +247,7 @@ FROM final_report ORDER BY net_revenue DESC`; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("product_sales", [ "product_id", "store_id", @@ -352,7 +352,7 @@ ORDER BY net_revenue DESC`; `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); const lineage = getLineage(ast as Select, schema); @@ -386,7 +386,7 @@ ORDER BY net_revenue DESC`; GROUP BY country`; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("cities", ["country", "city"])]); + const schema = createNamespace("trino", [createTable("cities", ["country", "city"])]); const lineage = getLineage(ast as Select, schema); @@ -419,7 +419,7 @@ ORDER BY net_revenue DESC`; FROM users`; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name"])]); const lineage = getLineage(ast as Select, schema); @@ -463,7 +463,7 @@ ORDER BY net_revenue DESC`; JOIN orders o ON u.id = o.user_id`; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name", "email"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -503,7 +503,7 @@ ORDER BY net_revenue DESC`; FROM users`; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); const lineage = getLineage(ast as Select, schema); @@ -567,7 +567,7 @@ ORDER BY net_revenue DESC`; FROM orders`; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("orders", ["id", "price", "tax", "quantity", "discount"])]); + const schema = createNamespace("trino", [createTable("orders", ["id", "price", "tax", "quantity", "discount"])]); const lineage = getLineage(ast as Select, schema); @@ -663,7 +663,7 @@ ORDER BY net_revenue DESC`; FROM orders`; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("orders", ["id", "price", "tax", "quantity", "discount"])]); + const schema = createNamespace("trino", [createTable("orders", ["id", "price", "tax", "quantity", "discount"])]); const lineage = getLineage(ast as Select, schema); @@ -729,7 +729,7 @@ ORDER BY net_revenue DESC`; GROUP BY country`; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("cities", ["country", "city", "population", "area"])]); + const schema = createNamespace("trino", [createTable("cities", ["country", "city", "population", "area"])]); const lineage = getLineage(ast as Select, schema); @@ -780,7 +780,7 @@ ORDER BY net_revenue DESC`; test("select * from single table", () => { const sql = `SELECT * FROM users`; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); const lineage = getLineage(ast as Select, schema); @@ -821,7 +821,7 @@ ORDER BY net_revenue DESC`; test("select * from multiple tables (JOIN)", () => { const sql = `SELECT * FROM users u JOIN orders o ON u.id = o.user_id`; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -877,7 +877,7 @@ ORDER BY net_revenue DESC`; test("select table.* from specific table", () => { const sql = `SELECT u.* FROM users u JOIN orders o ON u.id = o.user_id`; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -911,7 +911,7 @@ ORDER BY net_revenue DESC`; test("select * mixed with specific columns", () => { const sql = `SELECT o.*, u.name as user_name FROM users u JOIN orders o ON u.id = o.user_id`; const ast = parseSQL(sql); - const schema = createSchema("trino", [ + const schema = createNamespace("trino", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "user_id", "total"]), ]); @@ -970,7 +970,7 @@ ORDER BY net_revenue DESC`; SELECT * FROM filtered_users `; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "active"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "active"])]); const lineage = getLineage(ast as Select, schema); @@ -1001,7 +1001,7 @@ ORDER BY net_revenue DESC`; test("select * from nested subquery", () => { const sql = `SELECT * FROM (SELECT id, name FROM users) AS subq`; const ast = parseSQL(sql); - const schema = createSchema("trino", [createTable("users", ["id", "name", "email"])]); + const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); const lineage = getLineage(ast as Select, schema); @@ -1050,7 +1050,7 @@ describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { SELECT id, name FROM customers `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "name", "email"]), createTable("customers", ["id", "name", "address"]), ]); @@ -1094,7 +1094,7 @@ describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { SELECT id FROM orders `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "name"]), createTable("orders", ["id", "product"]), ]); @@ -1123,7 +1123,7 @@ describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { SELECT id FROM premium_users `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "name"]), createTable("premium_users", ["id", "tier"]), ]); @@ -1152,7 +1152,7 @@ describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { SELECT id FROM banned_users `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "name"]), createTable("banned_users", ["id", "reason"]), ]); @@ -1183,7 +1183,7 @@ describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { SELECT id, name FROM vendors `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "name"]), createTable("customers", ["id", "name"]), createTable("vendors", ["id", "name"]), @@ -1203,7 +1203,7 @@ describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { SELECT customer_id, customer_name FROM customers `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "name"]), createTable("customers", ["customer_id", "customer_name"]), ]); @@ -1233,7 +1233,7 @@ describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { SELECT LOWER(name) AS name FROM customers `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "name"]), createTable("customers", ["id", "name"]), ]); @@ -1263,7 +1263,7 @@ describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { SELECT SUM(amount) AS total FROM refunds `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("sales", ["id", "amount"]), createTable("refunds", ["id", "amount"]), ]); @@ -1292,7 +1292,7 @@ describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { SELECT id, company_name FROM customers `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "first_name", "last_name"]), createTable("customers", ["id", "company_name"]), ]); @@ -1310,7 +1310,7 @@ describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { SELECT id FROM (SELECT id FROM customers WHERE verified = true) AS verified_customers `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [ + const schema = createNamespace("postgres", [ createTable("users", ["id", "active"]), createTable("customers", ["id", "verified"]), ]); @@ -1327,7 +1327,7 @@ describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { SELECT id FROM users `; const ast = parseSQLPostgres(sql); - const schema = createSchema("postgres", [createTable("users", ["id", "name"])]); + const schema = createNamespace("postgres", [createTable("users", ["id", "name"])]); const lineage = getLineage(ast as Select, schema); @@ -1341,3 +1341,320 @@ describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { }); }); }); + +describe("Multi-Schema Support", () => { + test("select from table with explicit schema", () => { + const sql = `SELECT id, name FROM myschema.users`; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [ + createTable("myschema.users", ["id", "name", "email"]), + createTable("otherschema.users", ["id", "username"]), + ]); + + const lineage = getLineage(ast as Select, namespace); + + expect(lineage).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("select from table with default schema", () => { + const sql = `SELECT id, name FROM users`; + const ast = parseSQL(sql); + const namespace = createNamespace( + "trino", + [ + createTable("myschema.users", ["id", "name", "email"]), + createTable("otherschema.users", ["id", "username"]), + ], + "myschema", // default schema + ); + + const lineage = getLineage(ast as Select, namespace); + + expect(lineage).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("join across different schemas", () => { + const sql = ` + SELECT + u.id, + u.name, + o.total + FROM myschema.users u + JOIN orders_schema.orders o ON u.id = o.user_id + `; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [ + createTable("myschema.users", ["id", "name"]), + createTable("orders_schema.orders", ["id", "user_id", "total"]), + ]); + + const lineage = getLineage(ast as Select, namespace); + + expect(lineage).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + total: { + inputFields: [ + { + name: "orders_schema.orders", + namespace: "trino", + field: "total", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("mix explicit and default schema tables", () => { + const sql = ` + SELECT + u.id, + u.name, + o.total + FROM users u + JOIN orders_schema.orders o ON u.id = o.user_id + `; + const ast = parseSQL(sql); + const namespace = createNamespace( + "trino", + [ + createTable("myschema.users", ["id", "name"]), + createTable("orders_schema.orders", ["id", "user_id", "total"]), + ], + "myschema", // default schema + ); + + const lineage = getLineage(ast as Select, namespace); + + expect(lineage).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + total: { + inputFields: [ + { + name: "orders_schema.orders", + namespace: "trino", + field: "total", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("same table name in different schemas", () => { + const sql = ` + SELECT + u1.id as user1_id, + u2.id as user2_id + FROM schema1.users u1 + JOIN schema2.users u2 ON u1.id = u2.id + `; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [ + createTable("schema1.users", ["id", "name"]), + createTable("schema2.users", ["id", "username"]), + ]); + + const lineage = getLineage(ast as Select, namespace); + + expect(lineage).toEqual({ + user1_id: { + inputFields: [ + { + name: "schema1.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + user2_id: { + inputFields: [ + { + name: "schema2.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("CTE with schema-qualified tables", () => { + const sql = ` + WITH active_users AS ( + SELECT id, name FROM myschema.users WHERE status = 'active' + ) + SELECT + au.id, + au.name, + o.total + FROM active_users au + JOIN orders_schema.orders o ON au.id = o.user_id + `; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [ + createTable("myschema.users", ["id", "name", "status"]), + createTable("orders_schema.orders", ["id", "user_id", "total"]), + ]); + + const lineage = getLineage(ast as Select, namespace); + + expect(lineage).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + total: { + inputFields: [ + { + name: "orders_schema.orders", + namespace: "trino", + field: "total", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("select * from schema-qualified table", () => { + const sql = `SELECT * FROM myschema.users`; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [ + createTable("myschema.users", ["id", "name", "email"]), + ]); + + const lineage = getLineage(ast as Select, namespace); + + expect(lineage).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + email: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "email", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); +}); From 020aa272e13212081e79b850a70d4823a55a612b Mon Sep 17 00:00:00 2001 From: its-elad <59926027+its-elad@users.noreply.github.com> Date: Mon, 2 Feb 2026 10:02:19 +0200 Subject: [PATCH 06/10] orginized tests --- apps/demo/src/App.tsx | 2 - packages/lineage/src/index.ts | 88 +- packages/lineage/test/extendedLineage.test.ts | 3596 +++++++---------- 3 files changed, 1574 insertions(+), 2112 deletions(-) diff --git a/apps/demo/src/App.tsx b/apps/demo/src/App.tsx index f632899..1410509 100644 --- a/apps/demo/src/App.tsx +++ b/apps/demo/src/App.tsx @@ -47,8 +47,6 @@ export default function App() { setLineageData(lineageResult); }, []); - useEffect(() => console.log(lineageData), [lineageData]); - return (
{/* Header */} diff --git a/packages/lineage/src/index.ts b/packages/lineage/src/index.ts index e5ee3b4..8764519 100644 --- a/packages/lineage/src/index.ts +++ b/packages/lineage/src/index.ts @@ -191,27 +191,16 @@ export type SelectWithAlias = Select & { */ export type SetOperation = "union" | "union all" | "intersect" | "intersect all" | "except" | "except all"; -/** - * Extended Select type that includes types for set operations (_next and set_op) - */ -export type SelectWithSetOp = Select & { - set_op?: SetOperation | null; - _next?: SelectWithSetOp | null; -}; - /** * Extended lineage result that includes both field-level and dataset-level lineage */ -export interface ExtendedLineageResult { - fields: ColumnLineageDatasetFacet["fields"]; - dataset?: InputField[]; -} +export type ExtendedLineageResult = Pick // ============================================================================ // Column Name Utilities // ============================================================================ -export function isColumn(selectColumn: Select["columns"][number]): selectColumn is AstColumn { +function isColumn(selectColumn: Select["columns"][number]): selectColumn is AstColumn { return ( typeof selectColumn === "object" && selectColumn !== null && @@ -225,7 +214,7 @@ export function isColumn(selectColumn: Select["columns"][number]): selectColumn /** * Check if a column expression is a star (wildcard) expression like * or table.* */ -export function isStar(column: AstColumn): boolean { +function isStar(column: AstColumn): boolean { if (column.expr.type !== "column_ref") return false; const colRef = column.expr as ColumnRefItem; return colRef.column === "*" || (typeof colRef.column === "object" && colRef.column?.expr?.value === "*"); @@ -235,7 +224,7 @@ export function isStar(column: AstColumn): boolean { * Get the table qualifier from a star expression (e.g., "u" from "u.*") * Returns null if there's no table qualifier (plain "*") */ -export function getStarTableQualifier(column: AstColumn): string | null { +function getStarTableQualifier(column: AstColumn): string | null { if (!isStar(column)) return null; const colRef = column.expr as ColumnRefItem; if (!colRef.table) return null; @@ -269,17 +258,16 @@ export function parseTableName(tableName: string): { schema: string; table: stri * Check if an AST table reference matches a schema table * Takes into account the db property from AST and the defaultSchema from namespace */ -function astTableMatchesSchemaTable( - astTable: BaseFrom, - schemaTableName: string, - defaultSchema?: string, -): boolean { +function astTableMatchesSchemaTable(astTable: BaseFrom, schemaTableName: string, defaultSchema?: string): boolean { const parsed = parseTableName(schemaTableName); - const astDb = (astTable as BaseFrom & { db?: string }).db; - const effectiveAstSchema = astDb || defaultSchema || ""; + const astDb = astTable.db; + const effectiveAstSchema = astDb || defaultSchema; // Compare schema (or default schema if not specified) - if (parsed.schema && effectiveAstSchema && parsed.schema !== effectiveAstSchema) { + if ( + (parsed.schema && !defaultSchema && !astDb) || + (parsed.schema && effectiveAstSchema && parsed.schema !== effectiveAstSchema) + ) { return false; } @@ -308,7 +296,7 @@ export function getOutputColumnName(column: AstColumn): string | null { /** * Extract column references from any expression value */ -export function extractColumnRefs(expr: ExpressionValue | null | undefined): ColumnRefItem[] { +function extractColumnRefs(expr: ExpressionValue | null | undefined): ColumnRefItem[] { if (!expr) return []; const refs: ColumnRefItem[] = []; @@ -449,7 +437,7 @@ function extractWindowExpressionsFromOver(over: OverClause): ExpressionValue[] { /** * Get transformations from expression, supporting CASE/IF for CONDITION subtype */ -export function getDirectTransformationsFromExprValue( +function getDirectTransformationsFromExprValue( expr: ExpressionValue, parentTransformation?: Transformation, ): Record { @@ -625,7 +613,7 @@ export function getDirectTransformationsFromExprValue( /** * Get indirect transformations from an expression with a specific transformation type */ -export function getIndirectTransformationsFromExpr( +function getIndirectTransformationsFromExpr( expr: ExpressionValue | null | undefined, transformation: Transformation, ): Record { @@ -667,7 +655,9 @@ function resolveColumnRefToInputField( const table = regularTables.find( (t) => (!tableName || tableName === t.table || tableName === t.as) && - namespace.tables!.some((s) => astTableMatchesSchemaTable(t, s.name, namespace.defaultSchema) && s.columns.includes(columnName)), + namespace.tables!.some( + (s) => astTableMatchesSchemaTable(t, s.name, namespace.defaultSchema) && s.columns.includes(columnName), + ), ); if (!table) return null; @@ -717,7 +707,9 @@ function extractInputFieldsFromExpressions( namespace: Namespace, transformation: Transformation, ): InputField[] { - return expressions.flatMap((expr) => extractInputFieldsFromExpression(expr, regularTables, namespace, transformation)); + return expressions.flatMap((expr) => + extractInputFieldsFromExpression(expr, regularTables, namespace, transformation), + ); } // ============================================================================ @@ -727,7 +719,7 @@ function extractInputFieldsFromExpressions( /** * Extract JOIN lineage from FROM clause (ON and USING conditions) */ -export function getJoinLineage(select: Select, namespace: Namespace): InputField[] { +function getJoinLineage(select: Select, namespace: Namespace): InputField[] { if (!select.from) return []; if (!namespace.tables) return []; @@ -749,7 +741,7 @@ export function getJoinLineage(select: Select, namespace: Namespace): InputField // Find tables that match the FROM clause and have this column for (const schemaTable of namespace.tables) { const matchingFromTable = regularTables.find((t) => - astTableMatchesSchemaTable(t, schemaTable.name, namespace.defaultSchema) + astTableMatchesSchemaTable(t, schemaTable.name, namespace.defaultSchema), ); if (matchingFromTable && schemaTable.columns.includes(usingCol)) { inputFields.push({ @@ -770,7 +762,7 @@ export function getJoinLineage(select: Select, namespace: Namespace): InputField /** * Extract WHERE clause lineage (FILTER) */ -export function getFilterLineage(select: Select, namespace: Namespace): InputField[] { +function getFilterLineage(select: Select, namespace: Namespace): InputField[] { if (!select.where) return []; const { regularTables } = getTableExpressionsFromSelect(select); @@ -780,7 +772,7 @@ export function getFilterLineage(select: Select, namespace: Namespace): InputFie /** * Extract GROUP BY lineage */ -export function getGroupByLineage(select: Select, namespace: Namespace): InputField[] { +function getGroupByLineage(select: Select, namespace: Namespace): InputField[] { if (!select.groupby) return []; // Normalize GROUP BY to array format @@ -849,7 +841,7 @@ function resolveOrderByExpression(expr: ExpressionValue, aliasMap: Map { @@ -999,7 +991,7 @@ export function mergeTransformationSet(parent: TransformationSet, child: Transfo * For "*", returns all columns from all tables in FROM clause. * For "table.*", returns all columns from that specific table. */ -export function expandStarColumn(column: AstColumn, select: Select, namespace: Namespace): AstColumn[] { +function expandStarColumn(column: AstColumn, select: Select, namespace: Namespace): AstColumn[] { if (!isStar(column)) return [column]; if (!namespace.tables) return [column]; @@ -1014,7 +1006,9 @@ export function expandStarColumn(column: AstColumn, select: Select, namespace: N return; } - const schemaTable = namespace.tables!.find((t) => astTableMatchesSchemaTable(fromTable, t.name, namespace.defaultSchema)); + const schemaTable = namespace.tables!.find((t) => + astTableMatchesSchemaTable(fromTable, t.name, namespace.defaultSchema), + ); if (!schemaTable) return; for (const colName of schemaTable.columns) { @@ -1084,7 +1078,7 @@ export function expandStarColumn(column: AstColumn, select: Select, namespace: N /** * Check if a SELECT has set operations (UNION, INTERSECT, EXCEPT) */ -export function hasSetOperation(select: Select): select is SelectWithSetOp { +function hasSetOperation(select: Select): select is Select { return "set_op" in select && select.set_op != null; } @@ -1093,13 +1087,13 @@ export function hasSetOperation(select: Select): select is SelectWithSetOp { * Returns an array of SELECT statements, where the first element is the base select * and subsequent elements are the _next selects in the chain. */ -export function getSetOperationSelects(select: Select): Select[] { +function getSetOperationSelects(select: Select): Select[] { const selects: Select[] = [select]; if (hasSetOperation(select)) { - let current: SelectWithSetOp | null | undefined = select._next; + let current: Select | undefined | null = select._next; while (current) { - selects.push(current as Select); + selects.push(current); current = hasSetOperation(current) ? current._next : null; } } @@ -1142,11 +1136,15 @@ export function getColumnLineage( const table = regularTables.find( (t) => (!inputColumn.table || inputColumn.table === t.table || inputColumn.table === t.as) && - namespace.tables!.some((s) => astTableMatchesSchemaTable(t, s.name, namespace.defaultSchema) && s.columns.includes(inputColumn.name)), + namespace.tables!.some( + (s) => astTableMatchesSchemaTable(t, s.name, namespace.defaultSchema) && s.columns.includes(inputColumn.name), + ), ); if (table) { - const schemaTable = namespace.tables.find((s) => astTableMatchesSchemaTable(table, s.name, namespace.defaultSchema)); + const schemaTable = namespace.tables.find((s) => + astTableMatchesSchemaTable(table, s.name, namespace.defaultSchema), + ); inputFields.push({ namespace: namespace.namespace, name: schemaTable!.name, diff --git a/packages/lineage/test/extendedLineage.test.ts b/packages/lineage/test/extendedLineage.test.ts index 2c85a3d..4ba2ac6 100644 --- a/packages/lineage/test/extendedLineage.test.ts +++ b/packages/lineage/test/extendedLineage.test.ts @@ -10,15 +10,32 @@ import { INDIRECT_GROUP_BY, INDIRECT_SORT, INDIRECT_WINDOW, + INDIRECT_CONDITION, DIRECT_IDENTITY, DIRECT_TRANSFORMATION, DIRECT_AGGREGATION, } from "../src/index.js"; +const DEFAULT_SCHEMA = "public"; +const USERS_TABLE = createTable(`${DEFAULT_SCHEMA}.users`, [ + "id", + "name", + "email", + "first_name", + "last_name", + "status", + "age", + "country", + "city", + "region", + "verified", + "active", + "created_at", +]); + const parser = new Parser(); -// Helper function to create namespaces -function createNamespace(namespace: string, tables: Table[], defaultSchema?: string): Namespace { +function createNamespace(namespace: string, tables: Table[], defaultSchema: string = DEFAULT_SCHEMA): Namespace { return { namespace, tables, defaultSchema }; } @@ -26,2688 +43,2137 @@ function createTable(name: string, columns: string[]): Table { return { name, columns }; } -// Helper to ensure we get a single AST -function parseSQL(sql: string): AST { - const result = parser.astify(sql, { database: "trino" }); +function parseSQL(sql: string, database: "trino" | "postgresql" = "trino"): AST { + const result = parser.astify(sql, { database }); const ast = Array.isArray(result) ? result[0] : result; - - if (!ast) { - throw new Error("Failed to parse SQL"); - } - + if (!ast) throw new Error("Failed to parse SQL"); return ast; } -// Helper to find dataset lineage entries by transformation subtype -function findBySubtype(dataset: ReturnType["dataset"], subtype: string) { - return dataset?.filter((f) => f.transformations?.[0]?.subtype === subtype) ?? []; +// ============================================================================= +// EXACT ASSERTION HELPERS +// ============================================================================= + +/** Sort input fields for consistent comparison */ +function sortInputFields(fields: ReturnType["fields"]) { + const sorted: typeof fields = {}; + for (const [key, value] of Object.entries(fields)) { + sorted[key] = { + inputFields: [...value.inputFields].sort((a, b) => { + const aKey = `${a.namespace}.${a.name}.${a.field}`; + const bKey = `${b.namespace}.${b.name}.${b.field}`; + return aKey.localeCompare(bKey); + }), + }; + } + return sorted; } -// Helper to find dataset lineage entry by field name and subtype -function findFieldBySubtype( - dataset: ReturnType["dataset"], - fieldName: string, - subtype: string, -) { - return dataset?.find((f) => f.field === fieldName && f.transformations?.[0]?.subtype === subtype); +/** Sort dataset fields for consistent comparison */ +function sortDataset(dataset: ReturnType["dataset"]) { + if (!dataset) return []; + return [...dataset].sort((a, b) => { + const aKey = `${a.namespace}.${a.name}.${a.field}.${a.transformations?.[0]?.subtype}`; + const bKey = `${b.namespace}.${b.name}.${b.field}.${b.transformations?.[0]?.subtype}`; + return aKey.localeCompare(bKey); + }); } // ============================================================================= -// SIMPLE TESTS - Single Clause Scenarios +// SECTION 1: FIELD-LEVEL LINEAGE - DIRECT TRANSFORMATIONS // ============================================================================= -describe("getExtendedLineage - Simple SELECT (no indirect lineage)", () => { - test("simple select without any indirect clauses", () => { - const sql = `SELECT id, name FROM users`; - +describe("Field-Level Lineage: DIRECT/IDENTITY", () => { + test("single column select", () => { + const sql = `SELECT id FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage should exist - expect(result.fields.id).toBeDefined(); - expect(result.fields.name).toBeDefined(); - expect(result.fields.id?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(result.fields.name?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "name", - transformations: [DIRECT_IDENTITY], + expect(result.fields).toEqual({ + id: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }], + }, }); - - // Dataset-level lineage should be empty expect(result.dataset).toEqual([]); }); - test("select with alias", () => { - const sql = `SELECT id as user_id, name as user_name FROM users`; - + test("multiple columns select", () => { + const sql = `SELECT id, name, email FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name"])]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - expect(result.fields.user_id?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(result.fields.user_name?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "name", - transformations: [DIRECT_IDENTITY], + expect(result.fields).toEqual({ + id: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }], + }, + name: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + }, + email: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "email", transformations: [DIRECT_IDENTITY] }], + }, }); expect(result.dataset).toEqual([]); }); -}); - -describe("getExtendedLineage - JOIN only", () => { - test("simple INNER JOIN", () => { - const sql = ` - SELECT u.id, u.name, o.total - FROM users u - JOIN orders o ON u.id = o.user_id - `; + test("column with alias", () => { + const sql = `SELECT id as user_id, name as user_name FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage - expect(result.fields.id?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [DIRECT_IDENTITY], + expect(result.fields).toEqual({ + user_id: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }], + }, + user_name: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + }, }); + }); - // Dataset-level lineage - JOIN - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(2); - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [INDIRECT_JOIN], - }); - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "user_id", - transformations: [INDIRECT_JOIN], + test("table-qualified column", () => { + const sql = `SELECT u.id, u.name FROM users u`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + id: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }], + }, + name: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + }, }); }); +}); - test("LEFT JOIN", () => { - const sql = ` - SELECT u.id, o.total - FROM users u - LEFT JOIN orders o ON u.id = o.user_id - `; - +describe("Field-Level Lineage: DIRECT/TRANSFORMATION", () => { + test("function transformation - UPPER", () => { + const sql = `SELECT UPPER(name) as upper_name FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(2); + expect(result.fields).toEqual({ + upper_name: { + inputFields: [ + { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_TRANSFORMATION] }, + ], + }, + }); }); - test("RIGHT JOIN", () => { - const sql = ` - SELECT u.id, o.total - FROM users u - RIGHT JOIN orders o ON u.id = o.user_id - `; - + test("function transformation - CONCAT", () => { + const sql = `SELECT CONCAT(first_name, last_name) as full_name FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id", "total"]), - ]); + const schema = createNamespace("ns", [USERS_TABLE]); + const result = getExtendedLineage(ast as Select, schema); + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + full_name: { + inputFields: [ + { namespace: "ns", name: USERS_TABLE.name, field: "first_name", transformations: [DIRECT_TRANSFORMATION] }, + { namespace: "ns", name: USERS_TABLE.name, field: "last_name", transformations: [DIRECT_TRANSFORMATION] }, + ], + }, + }), + ); + }); + + test("arithmetic transformation - addition", () => { + const sql = `SELECT price + tax as total FROM products`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.products`, ["price", "tax"])]); const result = getExtendedLineage(ast as Select, schema); - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(2); + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + total: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "price", transformations: [DIRECT_TRANSFORMATION] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "tax", transformations: [DIRECT_TRANSFORMATION] }, + ], + }, + }), + ); }); - test("FULL OUTER JOIN", () => { - const sql = ` - SELECT u.id, u.name, o.total - FROM users u - FULL OUTER JOIN orders o ON u.id = o.user_id - `; + test("arithmetic transformation - multiplication", () => { + const sql = `SELECT quantity * unit_price as line_total FROM order_items`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.order_items`, ["quantity", "unit_price"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + line_total: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [DIRECT_TRANSFORMATION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "unit_price", + transformations: [DIRECT_TRANSFORMATION], + }, + ], + }, + }), + ); + }); + test("CAST transformation", () => { + const sql = `SELECT CAST(price AS VARCHAR) as price_str FROM products`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.products`, ["price"])]); const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage - expect(result.fields.id?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(result.fields.name?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "name", - transformations: [DIRECT_IDENTITY], - }); - expect(result.fields.total?.inputFields).toContainEqual({ - namespace: "trino", - name: "orders", - field: "total", - transformations: [DIRECT_IDENTITY], - }); - - // Dataset-level lineage - JOIN columns from both tables - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(2); - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [INDIRECT_JOIN], - }); - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "user_id", - transformations: [INDIRECT_JOIN], + expect(result.fields).toEqual({ + price_str: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "price", transformations: [DIRECT_TRANSFORMATION] }, + ], + }, }); }); - test("FULL JOIN (shorthand)", () => { - const sql = ` - SELECT u.id, o.total - FROM users u - FULL JOIN orders o ON u.id = o.user_id - `; - + test("nested function transformation", () => { + const sql = `SELECT LOWER(TRIM(name)) as clean_name FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(2); + expect(result.fields).toEqual({ + clean_name: { + inputFields: [ + { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_TRANSFORMATION] }, + ], + }, + }); }); +}); - test("FULL OUTER JOIN with complex ON condition", () => { - const sql = ` - SELECT u.id, u.name, o.total - FROM users u - FULL OUTER JOIN orders o ON u.id = o.user_id AND u.region = o.region - `; - +describe("Field-Level Lineage: DIRECT/AGGREGATION", () => { + test("SUM aggregation", () => { + const sql = `SELECT SUM(amount) as total FROM transactions`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name", "region"]), - createTable("orders", ["id", "user_id", "region", "total"]), - ]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.transactions`, ["amount"])]); const result = getExtendedLineage(ast as Select, schema); - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(4); // u.id, o.user_id, u.region, o.region - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [INDIRECT_JOIN], - }); - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "user_id", - transformations: [INDIRECT_JOIN], - }); - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "region", - transformations: [INDIRECT_JOIN], - }); - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "region", - transformations: [INDIRECT_JOIN], + expect(result.fields).toEqual({ + total: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "amount", transformations: [DIRECT_AGGREGATION] }, + ], + }, }); }); - test("FULL OUTER JOIN with WHERE clause", () => { - const sql = ` - SELECT u.id, u.name, o.total - FROM users u - FULL OUTER JOIN orders o ON u.id = o.user_id - WHERE u.status = 'active' OR o.status = 'completed' - `; - + test("AVG aggregation", () => { + const sql = `SELECT AVG(salary) as avg_salary FROM employees`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name", "status"]), - createTable("orders", ["id", "user_id", "status", "total"]), - ]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["salary"])]); const result = getExtendedLineage(ast as Select, schema); - // JOIN lineage - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(2); - - // FILTER lineage from WHERE clause - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(2); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "status", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "status", - transformations: [INDIRECT_FILTER], + expect(result.fields).toEqual({ + avg_salary: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [DIRECT_AGGREGATION] }, + ], + }, }); }); - test("CROSS JOIN", () => { - const sql = ` - SELECT u.id, u.name, p.name as product_name - FROM users u - CROSS JOIN products p - `; - + test("MIN aggregation", () => { + const sql = `SELECT MIN(price) as min_price FROM products`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("products", ["id", "name"]), - ]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.products`, ["price"])]); const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage should work correctly - expect(result.fields.id?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(result.fields.name?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "name", - transformations: [DIRECT_IDENTITY], - }); - expect(result.fields.product_name?.inputFields).toContainEqual({ - namespace: "trino", - name: "products", - field: "name", - transformations: [DIRECT_IDENTITY], + expect(result.fields).toEqual({ + min_price: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "price", transformations: [DIRECT_AGGREGATION] }, + ], + }, }); - - // CROSS JOIN has no ON clause, so no JOIN lineage in dataset - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(0); }); - test("CROSS JOIN with WHERE clause", () => { - const sql = ` - SELECT u.id, u.name, p.name as product_name - FROM users u - CROSS JOIN products p - WHERE u.status = 'active' AND p.category = 'electronics' - `; - + test("MAX aggregation", () => { + const sql = `SELECT MAX(created_at) as latest FROM events`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name", "status"]), - createTable("products", ["id", "name", "category"]), - ]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.events`, ["created_at"])]); const result = getExtendedLineage(ast as Select, schema); - // CROSS JOIN has no ON clause, so no JOIN lineage - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(0); - - // FILTER lineage from WHERE clause should be captured - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(2); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "status", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "products", - field: "category", - transformations: [INDIRECT_FILTER], + expect(result.fields).toEqual({ + latest: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.events`, field: "created_at", transformations: [DIRECT_AGGREGATION] }, + ], + }, }); }); - test("CROSS JOIN combined with regular JOIN", () => { - const sql = ` - SELECT u.id, o.total, p.name as product_name - FROM users u - JOIN orders o ON u.id = o.user_id - CROSS JOIN products p - `; - + test("COUNT with column - has masking", () => { + const sql = `SELECT COUNT(id) as count FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id", "total"]), - createTable("products", ["id", "name"]), - ]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage - expect(result.fields.id?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(result.fields.total?.inputFields).toContainEqual({ - namespace: "trino", - name: "orders", - field: "total", - transformations: [DIRECT_IDENTITY], - }); - expect(result.fields.product_name?.inputFields).toContainEqual({ - namespace: "trino", - name: "products", - field: "name", - transformations: [DIRECT_IDENTITY], + expect(result.fields).toEqual({ + count: { + inputFields: [ + { + namespace: "ns", + name: USERS_TABLE.name, + field: "id", + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + }, + ], + }, }); + }); - // JOIN lineage only from the regular JOIN (not CROSS JOIN) - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(2); - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [INDIRECT_JOIN], - }); - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "user_id", - transformations: [INDIRECT_JOIN], + test("COUNT DISTINCT - has masking", () => { + const sql = `SELECT COUNT(DISTINCT user_id) as unique_users FROM orders`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + unique_users: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, + field: "user_id", + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + }, + ], + }, }); }); - test("implicit CROSS JOIN (comma syntax)", () => { - const sql = ` - SELECT u.id, u.name, p.name as product_name - FROM users u, products p - `; - + test("aggregation with expression inside", () => { + const sql = `SELECT SUM(quantity * price) as revenue FROM order_items`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("products", ["id", "name"]), - ]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.order_items`, ["quantity", "price"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + revenue: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "price", transformations: [DIRECT_AGGREGATION] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "quantity", transformations: [DIRECT_AGGREGATION] }, + ], + }, + }), + ); + }); +}); +describe("Field-Level Lineage: Masking Functions", () => { + test("MD5 masking", () => { + const sql = `SELECT MD5(email) as hashed_email FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage should work correctly - expect(result.fields.id?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(result.fields.product_name?.inputFields).toContainEqual({ - namespace: "trino", - name: "products", - field: "name", - transformations: [DIRECT_IDENTITY], + expect(result.fields).toEqual({ + hashed_email: { + inputFields: [ + { + namespace: "ns", + name: USERS_TABLE.name, + field: "email", + transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + }, + ], + }, }); - - // No JOIN lineage since implicit cross join has no ON clause - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(0); }); - test("implicit CROSS JOIN with WHERE acting as JOIN condition", () => { - const sql = ` - SELECT u.id, o.total - FROM users u, orders o - WHERE u.id = o.user_id - `; - + test("SHA256 masking", () => { + const sql = `SELECT SHA256(ssn) as hashed_ssn FROM employees`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["ssn"])]); const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage - expect(result.fields.id?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(result.fields.total?.inputFields).toContainEqual({ - namespace: "trino", - name: "orders", - field: "total", - transformations: [DIRECT_IDENTITY], + expect(result.fields).toEqual({ + hashed_ssn: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "ssn", + transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + }, + ], + }, }); + }); - // No JOIN lineage (CROSS JOIN has no ON clause) - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(0); + test("MASK function", () => { + const sql = `SELECT MASK(phone) as masked_phone FROM contacts`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.contacts`, ["phone"])]); + const result = getExtendedLineage(ast as Select, schema); - // The WHERE condition is captured as FILTER lineage - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(2); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "id", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "user_id", - transformations: [INDIRECT_FILTER], + expect(result.fields).toEqual({ + masked_phone: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.contacts`, + field: "phone", + transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + }, + ], + }, }); }); +}); - test("multiple JOINs", () => { +describe("Field-Level Lineage: CASE Expressions", () => { + test("simple CASE WHEN", () => { const sql = ` - SELECT u.name, o.total, p.name as product_name - FROM users u - JOIN orders o ON u.id = o.user_id - JOIN products p ON o.product_id = p.id + SELECT + CASE WHEN status = 'active' THEN 'Active' ELSE 'Inactive' END as status_label + FROM users `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id", "product_id", "total"]), - createTable("products", ["id", "name"]), - ]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(4); // u.id, o.user_id, o.product_id, p.id + // CASE WHEN condition column gets INDIRECT/CONDITION + expect(result.fields.status_label).toBeDefined(); + expect(result.fields.status_label?.inputFields).toContainEqual({ + namespace: "ns", + name: USERS_TABLE.name, + field: "status", + transformations: [INDIRECT_CONDITION], + }); }); - test("JOIN with complex ON condition", () => { + test("CASE with column in result", () => { const sql = ` - SELECT u.id, o.total - FROM users u - JOIN orders o ON u.id = o.user_id AND u.status = o.status + SELECT + CASE WHEN is_premium THEN discount_rate ELSE 0 END as applied_discount + FROM customers `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "status"]), - createTable("orders", ["id", "user_id", "status", "total"]), - ]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.customers`, ["is_premium", "discount_rate"])]); const result = getExtendedLineage(ast as Select, schema); - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(4); // u.id, o.user_id, u.status, o.status + const inputFieldNames = result.fields.applied_discount?.inputFields.map((f) => f.field); + expect(inputFieldNames).toContain("is_premium"); + expect(inputFieldNames).toContain("discount_rate"); }); - test("self JOIN", () => { + test("CASE with multiple conditions and results", () => { const sql = ` - SELECT e.name as employee, m.name as manager - FROM employees e - JOIN employees m ON e.manager_id = m.id + SELECT + CASE + WHEN age < 18 THEN minor_price + WHEN age < 65 THEN adult_price + ELSE senior_price + END as ticket_price + FROM visitors `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("employees", ["id", "name", "manager_id"])]); - + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.visitors`, ["age", "minor_price", "adult_price", "senior_price"]), + ]); const result = getExtendedLineage(ast as Select, schema); - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBeGreaterThanOrEqual(2); + const inputFieldNames = result.fields.ticket_price?.inputFields.map((f) => f.field); + expect(inputFieldNames).toContain("age"); + expect(inputFieldNames).toContain("minor_price"); + expect(inputFieldNames).toContain("adult_price"); + expect(inputFieldNames).toContain("senior_price"); }); }); -describe("getExtendedLineage - WHERE only (FILTER)", () => { - test("simple WHERE clause", () => { +// ============================================================================= +// SECTION 2: DATASET-LEVEL LINEAGE - INDIRECT TRANSFORMATIONS +// ============================================================================= + +describe("Dataset-Level Lineage: INDIRECT/JOIN", () => { + test("simple INNER JOIN", () => { const sql = ` - SELECT id, name - FROM users - WHERE status = 'active' + SELECT u.id, o.total + FROM users u + JOIN orders o ON u.id = o.user_id `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "status"])]); - + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); const result = getExtendedLineage(ast as Select, schema); - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(1); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "status", - transformations: [INDIRECT_FILTER], - }); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + ]), + ); }); - test("WHERE with AND", () => { + test("LEFT JOIN", () => { const sql = ` - SELECT id, name - FROM users - WHERE status = 'active' AND age > 18 + SELECT u.id, o.total + FROM users u + LEFT JOIN orders o ON u.id = o.user_id `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "status", "age"])]); - + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); const result = getExtendedLineage(ast as Select, schema); - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(2); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "status", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "age", - transformations: [INDIRECT_FILTER], - }); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + ]), + ); }); - test("WHERE with OR", () => { + test("RIGHT JOIN", () => { const sql = ` - SELECT id, name - FROM users - WHERE status = 'active' OR country = 'US' + SELECT u.id, o.total + FROM users u + RIGHT JOIN orders o ON u.id = o.user_id `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "status", "country"])]); - + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); const result = getExtendedLineage(ast as Select, schema); - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(2); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + ]), + ); }); - test("WHERE with complex nested conditions", () => { + test("FULL OUTER JOIN", () => { const sql = ` - SELECT id, name - FROM users - WHERE (status = 'active' AND age > 18) OR (country = 'US' AND verified = true) + SELECT u.id, o.total + FROM users u + FULL OUTER JOIN orders o ON u.id = o.user_id `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name", "status", "age", "country", "verified"]), - ]); - + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); const result = getExtendedLineage(ast as Select, schema); - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(4); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + ]), + ); }); - test("WHERE with IN clause", () => { + test("JOIN with compound ON condition (AND)", () => { const sql = ` - SELECT id, name - FROM users - WHERE country IN ('US', 'UK', 'CA') + SELECT u.id + FROM users u + JOIN orders o ON u.id = o.user_id AND u.region = o.region `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "country"])]); - + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "region"])]); const result = getExtendedLineage(ast as Select, schema); - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(1); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "country", - transformations: [INDIRECT_FILTER], - }); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: USERS_TABLE.name, field: "region", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "region", transformations: [INDIRECT_JOIN] }, + ]), + ); }); - test("WHERE with BETWEEN", () => { + test("multiple JOINs - three tables", () => { const sql = ` - SELECT id, name - FROM users - WHERE age BETWEEN 18 AND 65 + SELECT u.id, o.total, p.name + FROM users u + JOIN orders o ON u.id = o.user_id + JOIN products p ON o.product_id = p.id `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "age"])]); - + const schema = createNamespace("ns", [ + USERS_TABLE, + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "product_id", "total"]), + createTable(`${DEFAULT_SCHEMA}.products`, ["id", "name"]), + ]); const result = getExtendedLineage(ast as Select, schema); - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(1); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "product_id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "id", transformations: [INDIRECT_JOIN] }, + ]), + ); }); - test("WHERE with LIKE", () => { + test("CROSS JOIN - no dataset lineage", () => { const sql = ` - SELECT id, name - FROM users - WHERE name LIKE 'John%' + SELECT u.id, p.name + FROM users u + CROSS JOIN products p `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name"])]); - + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.products`, ["name"])]); const result = getExtendedLineage(ast as Select, schema); - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(1); + expect(result.dataset).toEqual([]); }); - test("WHERE with IS NULL", () => { + test("self JOIN", () => { const sql = ` - SELECT id, name - FROM users - WHERE email IS NULL + SELECT e.name, m.name as manager_name + FROM employees e + JOIN employees m ON e.manager_id = m.id `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "name", "manager_id"])]); const result = getExtendedLineage(ast as Select, schema); - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(1); + // Self join should have both columns from same table + expect(result.dataset?.length).toBeGreaterThanOrEqual(2); }); }); -describe("getExtendedLineage - GROUP BY only", () => { - test("simple GROUP BY", () => { - const sql = ` - SELECT country, COUNT(*) as count - FROM users - GROUP BY country - `; - +describe("Dataset-Level Lineage: INDIRECT/FILTER (WHERE)", () => { + test("simple WHERE equality", () => { + const sql = `SELECT id FROM users WHERE status = 'active'`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "country"])]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage.length).toBe(1); - expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "country", - transformations: [INDIRECT_GROUP_BY], - }); + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + ]); }); - test("multiple GROUP BY columns", () => { - const sql = ` - SELECT country, city, COUNT(*) as count - FROM users - GROUP BY country, city - `; - + test("WHERE with AND", () => { + const sql = `SELECT id FROM users WHERE status = 'active' AND age > 18`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "country", "city"])]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage.length).toBe(2); - expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "country", - transformations: [INDIRECT_GROUP_BY], - }); - expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "city", - transformations: [INDIRECT_GROUP_BY], - }); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: USERS_TABLE.name, field: "age", transformations: [INDIRECT_FILTER] }, + ]), + ); }); - test("GROUP BY with aggregation functions", () => { - const sql = ` - SELECT - department, - SUM(salary) as total_salary, - AVG(age) as avg_age, - MIN(hire_date) as first_hire - FROM employees - GROUP BY department - `; - + test("WHERE with OR", () => { + const sql = `SELECT id FROM users WHERE status = 'active' OR status = 'pending'`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("employees", ["id", "department", "salary", "age", "hire_date"]), + const schema = createNamespace("ns", [USERS_TABLE]); + const result = getExtendedLineage(ast as Select, schema); + + // Same column referenced twice, should be deduplicated + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, ]); + }); + test("WHERE with IN clause", () => { + const sql = `SELECT id FROM users WHERE country IN ('US', 'UK', 'CA')`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage should show aggregations - expect(result.fields.total_salary?.inputFields).toContainEqual({ - namespace: "trino", - name: "employees", - field: "salary", - transformations: [DIRECT_AGGREGATION], - }); - - // Dataset-level lineage should show GROUP_BY - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage.length).toBe(1); - expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "employees", - field: "department", - transformations: [INDIRECT_GROUP_BY], - }); + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_FILTER] }, + ]); }); -}); - -describe("getExtendedLineage - ORDER BY only (SORT)", () => { - test("simple ORDER BY", () => { - const sql = ` - SELECT id, name - FROM users - ORDER BY created_at DESC - `; + test("WHERE with BETWEEN", () => { + const sql = `SELECT id FROM users WHERE age BETWEEN 18 AND 65`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "created_at"])]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - const sortLineage = findBySubtype(result.dataset, "SORT"); - expect(sortLineage.length).toBe(1); - expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "created_at", - transformations: [INDIRECT_SORT], - }); + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "age", transformations: [INDIRECT_FILTER] }, + ]); }); - test("multiple ORDER BY columns", () => { - const sql = ` - SELECT id, name - FROM users - ORDER BY country ASC, name DESC - `; - + test("WHERE with LIKE", () => { + const sql = `SELECT id FROM users WHERE name LIKE 'John%'`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "country"])]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - const sortLineage = findBySubtype(result.dataset, "SORT"); - expect(sortLineage.length).toBe(2); - expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "country", - transformations: [INDIRECT_SORT], - }); - expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "name", - transformations: [INDIRECT_SORT], - }); + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [INDIRECT_FILTER] }, + ]); }); - test("ORDER BY with NULLS FIRST/LAST", () => { - const sql = ` - SELECT id, name - FROM users - ORDER BY email NULLS LAST - `; - + test("WHERE with IS NULL", () => { + const sql = `SELECT id FROM users WHERE email IS NULL`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - const sortLineage = findBySubtype(result.dataset, "SORT"); - expect(sortLineage.length).toBe(1); + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "email", transformations: [INDIRECT_FILTER] }, + ]); }); - test("ORDER BY alias resolves to base column", () => { - const sql = ` - SELECT country, SUM(revenue) as total_revenue - FROM orders - GROUP BY country - ORDER BY total_revenue DESC - `; - + test("WHERE with IS NOT NULL", () => { + const sql = `SELECT id FROM users WHERE email IS NOT NULL`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("orders", ["id", "country", "revenue"])]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - // ORDER BY total_revenue should resolve to the base column 'revenue' used in SUM(revenue) - const sortLineage = findBySubtype(result.dataset, "SORT"); - expect(sortLineage.length).toBe(1); - expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "revenue", - transformations: [INDIRECT_SORT], - }); + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "email", transformations: [INDIRECT_FILTER] }, + ]); }); - test("ORDER BY alias with multiple columns in expression", () => { - const sql = ` - SELECT product_id, (quantity * price) as total_value - FROM order_items - ORDER BY total_value DESC - `; - + test("WHERE with nested complex conditions", () => { + const sql = `SELECT id FROM users WHERE (status = 'active' AND age > 18) OR (country = 'US' AND verified = true)`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("order_items", ["id", "product_id", "quantity", "price"])]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - // ORDER BY total_value should resolve to both 'quantity' and 'price' columns - const sortLineage = findBySubtype(result.dataset, "SORT"); - expect(sortLineage.length).toBe(2); - expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "order_items", - field: "quantity", - transformations: [INDIRECT_SORT], - }); - expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "order_items", - field: "price", - transformations: [INDIRECT_SORT], - }); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: USERS_TABLE.name, field: "age", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: USERS_TABLE.name, field: "verified", transformations: [INDIRECT_FILTER] }, + ]), + ); }); +}); - test("ORDER BY with mix of alias and direct column references", () => { - const sql = ` - SELECT country, SUM(revenue) as total_revenue - FROM orders - GROUP BY country - ORDER BY country ASC, total_revenue DESC - `; +describe("Dataset-Level Lineage: INDIRECT/GROUP_BY", () => { + test("simple GROUP BY single column", () => { + const sql = `SELECT country, COUNT(*) FROM users GROUP BY country`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_GROUP_BY] }, + ]); + }); + test("GROUP BY multiple columns", () => { + const sql = `SELECT country, city, COUNT(*) FROM users GROUP BY country, city`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("orders", ["id", "country", "revenue"])]); + const schema = createNamespace("ns", [USERS_TABLE]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_GROUP_BY] }, + { namespace: "ns", name: USERS_TABLE.name, field: "city", transformations: [INDIRECT_GROUP_BY] }, + ]), + ); + }); + test("GROUP BY with expression", () => { + const sql = `SELECT DATE_TRUNC('month', created_at) as month, COUNT(*) FROM events GROUP BY DATE_TRUNC('month', created_at)`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.events`, ["id", "created_at"])]); const result = getExtendedLineage(ast as Select, schema); - const sortLineage = findBySubtype(result.dataset, "SORT"); - expect(sortLineage.length).toBe(2); - expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "country", - transformations: [INDIRECT_SORT], - }); - expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "revenue", - transformations: [INDIRECT_SORT], - }); + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.events`, field: "created_at", transformations: [INDIRECT_GROUP_BY] }, + ]); }); }); -describe("getExtendedLineage - HAVING only", () => { - test("simple HAVING clause", () => { - const sql = ` - SELECT country, COUNT(*) as count - FROM users - GROUP BY country - HAVING COUNT(*) > 10 - `; +describe("Dataset-Level Lineage: INDIRECT/SORT (ORDER BY)", () => { + test("simple ORDER BY single column", () => { + const sql = `SELECT id, name FROM users ORDER BY created_at`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "created_at", transformations: [INDIRECT_SORT] }, + ]); + }); + test("ORDER BY multiple columns", () => { + const sql = `SELECT id, name FROM users ORDER BY country ASC, name DESC`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "country"])]); + const schema = createNamespace("ns", [USERS_TABLE]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_SORT] }, + { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [INDIRECT_SORT] }, + ]), + ); + }); + test("ORDER BY alias resolves to base columns", () => { + const sql = `SELECT quantity * price as total FROM order_items ORDER BY total`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.order_items`, ["quantity", "price"])]); const result = getExtendedLineage(ast as Select, schema); - // GROUP BY lineage - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage.length).toBe(1); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "price", transformations: [INDIRECT_SORT] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "quantity", transformations: [INDIRECT_SORT] }, + ]), + ); + }); + + test("ORDER BY with NULLS LAST", () => { + const sql = `SELECT id, name FROM users ORDER BY email NULLS LAST`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE]); + const result = getExtendedLineage(ast as Select, schema); - // Note: COUNT(*) doesn't reference a specific column, so HAVING may not add to dataset + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "email", transformations: [INDIRECT_SORT] }, + ]); }); +}); - test("HAVING with column reference", () => { +describe("Dataset-Level Lineage: INDIRECT/FILTER (HAVING)", () => { + test("HAVING with aggregation column reference", () => { const sql = ` - SELECT department, SUM(salary) as total_salary + SELECT department, SUM(salary) as total FROM employees GROUP BY department HAVING SUM(salary) > 100000 `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("employees", ["id", "department", "salary"])]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["department", "salary"])]); const result = getExtendedLineage(ast as Select, schema); - // Should have GROUP_BY - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage.length).toBe(1); - - // HAVING filters should show as FILTER - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "employees", - field: "salary", - transformations: [INDIRECT_FILTER], - }); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "department", transformations: [INDIRECT_GROUP_BY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [INDIRECT_FILTER] }, + ]), + ); }); test("HAVING with multiple conditions", () => { const sql = ` - SELECT department, AVG(age) as avg_age + SELECT department, AVG(age), COUNT(id) FROM employees GROUP BY department HAVING AVG(age) > 30 AND COUNT(id) > 5 `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("employees", ["id", "department", "age"])]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "department", "age"])]); const result = getExtendedLineage(ast as Select, schema); - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBeGreaterThanOrEqual(1); + const filterLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "FILTER"); + expect(filterLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "age", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "id", + transformations: [INDIRECT_FILTER], + }); }); }); -describe("getExtendedLineage - WINDOW functions", () => { - test("window function with PARTITION BY and ORDER BY - full lineage captured", () => { +describe("Dataset-Level Lineage: INDIRECT/WINDOW", () => { + test("window function with PARTITION BY only", () => { const sql = ` - SELECT - id, - department, - SUM(salary) OVER (PARTITION BY department ORDER BY salary DESC) as running_total - FROM employees + SELECT id, SUM(amount) OVER (PARTITION BY category) as category_total + FROM transactions `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("employees", ["id", "department", "salary"])]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.transactions`, ["id", "amount", "category"])]); const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage is captured correctly - expect(result.fields.id?.inputFields).toContainEqual({ - namespace: "trino", - name: "employees", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(result.fields.department?.inputFields).toContainEqual({ - namespace: "trino", - name: "employees", - field: "department", - transformations: [DIRECT_IDENTITY], - }); - // The aggregated column in window function is captured - expect(result.fields.running_total?.inputFields).toContainEqual({ - namespace: "trino", - name: "employees", - field: "salary", - transformations: [DIRECT_AGGREGATION], - }); - - // Dataset-level WINDOW lineage from PARTITION BY and ORDER BY - const windowLineage = findBySubtype(result.dataset, "WINDOW"); - expect(windowLineage.length).toBe(2); - expect(windowLineage).toContainEqual({ - namespace: "trino", - name: "employees", - field: "department", - transformations: [INDIRECT_WINDOW], - }); - expect(windowLineage).toContainEqual({ - namespace: "trino", - name: "employees", - field: "salary", - transformations: [INDIRECT_WINDOW], - }); - }); - - test("window function with filter - combined lineage", () => { - const sql = ` - SELECT - id, - SUM(amount) OVER (ORDER BY date) as running_total - FROM transactions - WHERE status = 'completed' - `; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("transactions", ["id", "amount", "date", "status"])]); - - const result = getExtendedLineage(ast as Select, schema); - - // WINDOW lineage from ORDER BY in OVER clause - const windowLineage = findBySubtype(result.dataset, "WINDOW"); - expect(windowLineage.length).toBe(1); - expect(windowLineage).toContainEqual({ - namespace: "trino", - name: "transactions", - field: "date", - transformations: [INDIRECT_WINDOW], - }); - - // FILTER lineage from WHERE is captured - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "transactions", - field: "status", - transformations: [INDIRECT_FILTER], - }); + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "category", transformations: [INDIRECT_WINDOW] }, + ]); }); - test("multiple window functions", () => { + test("window function with ORDER BY only", () => { const sql = ` - SELECT - id, - ROW_NUMBER() OVER (PARTITION BY category ORDER BY created_at) as row_num, - SUM(amount) OVER (PARTITION BY user_id ORDER BY created_at) as running_total - FROM orders + SELECT id, ROW_NUMBER() OVER (ORDER BY created_at) as row_num + FROM events `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("orders", ["id", "category", "created_at", "amount", "user_id"])]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.events`, ["id", "created_at"])]); const result = getExtendedLineage(ast as Select, schema); - const windowLineage = findBySubtype(result.dataset, "WINDOW"); - // category, created_at (from first window), user_id, created_at (from second window) - // created_at should be deduplicated - expect(windowLineage.length).toBe(3); - expect(windowLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "category", - transformations: [INDIRECT_WINDOW], - }); - expect(windowLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "user_id", - transformations: [INDIRECT_WINDOW], - }); - expect(windowLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "created_at", - transformations: [INDIRECT_WINDOW], - }); + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.events`, field: "created_at", transformations: [INDIRECT_WINDOW] }, + ]); }); -}); -describe("getExtendedLineage - CASE expressions (CONDITION)", () => { - test("simple CASE WHEN", () => { + test("window function with PARTITION BY and ORDER BY", () => { const sql = ` - SELECT - id, - CASE - WHEN status = 'active' THEN 'Active' - ELSE 'Inactive' - END as status_label - FROM users + SELECT id, SUM(amount) OVER (PARTITION BY user_id ORDER BY created_at) as running_total + FROM transactions `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "status"])]); - + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.transactions`, ["id", "amount", "user_id", "created_at"]), + ]); const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage should contain the status column with CONDITION - expect(result.fields.status_label).toBeDefined(); - const hasConditionOrTransformation = result.fields.status_label?.inputFields.some( - (f) => - f.transformations?.some((t) => t.subtype === "CONDITION") || - f.transformations?.some((t) => t.subtype === "TRANSFORMATION"), + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "user_id", transformations: [INDIRECT_WINDOW] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "created_at", transformations: [INDIRECT_WINDOW] }, + ]), ); - expect(hasConditionOrTransformation).toBe(true); }); - test("CASE with multiple conditions", () => { + test("multiple window functions", () => { const sql = ` SELECT id, - CASE - WHEN age < 18 THEN 'Minor' - WHEN age < 65 THEN 'Adult' - ELSE 'Senior' - END as age_group - FROM users + ROW_NUMBER() OVER (PARTITION BY category ORDER BY created_at) as row_num, + SUM(amount) OVER (PARTITION BY user_id ORDER BY created_at) as running_total + FROM orders `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "age"])]); - + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.orders`, ["id", "category", "created_at", "amount", "user_id"]), + ]); const result = getExtendedLineage(ast as Select, schema); - expect(result.fields.age_group).toBeDefined(); - expect(result.fields.age_group?.inputFields.length).toBeGreaterThan(0); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "category", transformations: [INDIRECT_WINDOW] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "created_at", transformations: [INDIRECT_WINDOW] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_WINDOW] }, + ]), + ); }); - test("CASE with column in result", () => { + test("RANK window function", () => { const sql = ` - SELECT - id, - CASE - WHEN discount_type = 'percent' THEN price * (1 - discount_value / 100) - ELSE price - discount_value - END as final_price - FROM products + SELECT id, RANK() OVER (PARTITION BY department ORDER BY salary DESC) as salary_rank + FROM employees `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("products", ["id", "price", "discount_type", "discount_value"])]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "department", "salary"])]); const result = getExtendedLineage(ast as Select, schema); - const inputFields = result.fields.final_price?.inputFields.map((f) => f.field); - expect(inputFields).toContain("price"); - expect(inputFields).toContain("discount_type"); - expect(inputFields).toContain("discount_value"); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "department", transformations: [INDIRECT_WINDOW] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [INDIRECT_WINDOW] }, + ]), + ); }); }); // ============================================================================= -// COMPLEX TESTS - Multiple Clauses Combined +// SECTION 3: COMBINED CLAUSES // ============================================================================= -describe("getExtendedLineage - JOIN + WHERE", () => { - test("JOIN with WHERE filter", () => { +describe("Combined Clauses: JOIN + WHERE", () => { + test("JOIN with WHERE on both tables", () => { const sql = ` - SELECT u.id, u.name, o.total + SELECT u.id, o.total FROM users u JOIN orders o ON u.id = o.user_id WHERE u.status = 'active' AND o.total > 100 `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name", "status"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); const result = getExtendedLineage(ast as Select, schema); - // JOIN lineage - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(2); - - // FILTER lineage - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(2); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "status", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "total", - transformations: [INDIRECT_FILTER], - }); - }); -}); - -describe("getExtendedLineage - JOIN + GROUP BY", () => { - test("JOIN with GROUP BY aggregation", () => { - const sql = ` - SELECT - u.country, - COUNT(o.id) as order_count, - SUM(o.total) as total_revenue - FROM users u - JOIN orders o ON u.id = o.user_id - GROUP BY u.country - `; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "country"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - - const result = getExtendedLineage(ast as Select, schema); - - // Field-level lineage with aggregations - expect(result.fields.order_count?.inputFields[0]?.transformations).toContainEqual( - expect.objectContaining({ type: "DIRECT", subtype: "AGGREGATION" }), + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "total", transformations: [INDIRECT_FILTER] }, + ]), ); - - // JOIN lineage - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(2); - - // GROUP BY lineage - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage.length).toBe(1); - expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "country", - transformations: [INDIRECT_GROUP_BY], - }); }); }); -describe("getExtendedLineage - WHERE + GROUP BY + HAVING", () => { +describe("Combined Clauses: WHERE + GROUP BY + HAVING", () => { test("full aggregation query", () => { const sql = ` - SELECT - department, - COUNT(*) as employee_count, - AVG(salary) as avg_salary + SELECT department, COUNT(*) as cnt, AVG(salary) as avg_sal FROM employees WHERE status = 'active' GROUP BY department HAVING COUNT(*) > 5 `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("employees", ["id", "department", "salary", "status"])]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "department", "salary", "status"])]); const result = getExtendedLineage(ast as Select, schema); - // FILTER lineage (from WHERE) - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "employees", - field: "status", - transformations: [INDIRECT_FILTER], - }); - - // GROUP BY lineage - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "employees", - field: "department", - transformations: [INDIRECT_GROUP_BY], - }); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "department", transformations: [INDIRECT_GROUP_BY] }, + // HAVING COUNT(*) doesn't add field lineage since COUNT(*) doesn't reference a column + ]), + ); }); }); -describe("getExtendedLineage - Full query with all clauses", () => { - test("comprehensive query with all indirect lineage types", () => { +describe("Combined Clauses: GROUP BY + ORDER BY", () => { + test("aggregation with sorting", () => { const sql = ` - SELECT - u.country, - COUNT(u.id) as user_count, - SUM(o.total) as total_revenue, - ROW_NUMBER() OVER (ORDER BY SUM(o.total) DESC) as revenue_rank - FROM users u - JOIN orders o ON u.id = o.user_id - WHERE u.status = 'active' AND o.order_date >= '2024-01-01' - GROUP BY u.country - HAVING SUM(o.total) > 1000 - ORDER BY total_revenue DESC + SELECT country, COUNT(*) as cnt + FROM users + GROUP BY country + ORDER BY cnt DESC `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "country", "status"]), - createTable("orders", ["id", "user_id", "total", "order_date"]), - ]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage - expect(result.fields.country).toBeDefined(); - expect(result.fields.user_count).toBeDefined(); - expect(result.fields.total_revenue).toBeDefined(); - - // Dataset-level lineage - expect(result.dataset).toBeDefined(); - expect(result.dataset!.length).toBeGreaterThan(0); - - // JOIN lineage - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBeGreaterThan(0); - - // FILTER lineage (from WHERE) - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBeGreaterThan(0); - - // GROUP BY lineage - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage.length).toBeGreaterThan(0); - - // SORT lineage (from ORDER BY) - // Note: ORDER BY total_revenue references an alias, may not resolve to base column + // ORDER BY cnt references alias, which resolves to COUNT(*) - no additional lineage + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_GROUP_BY] }, + ]); }); +}); - test("analytics query with aggregate window functions and multiple joins", () => { +describe("Combined Clauses: WINDOW + WHERE + ORDER BY", () => { + test("window function with filter and sort", () => { const sql = ` SELECT - d.name as department_name, - e.name as employee_name, - e.salary, - SUM(e.salary) OVER (PARTITION BY e.department_id ORDER BY e.salary DESC) as running_salary - FROM employees e - JOIN departments d ON e.department_id = d.id - WHERE e.status = 'active' - ORDER BY d.name, e.salary DESC + id, + SUM(amount) OVER (PARTITION BY category ORDER BY created_at) as running_total + FROM transactions + WHERE status = 'completed' + ORDER BY created_at `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("employees", ["id", "name", "department_id", "salary", "status"]), - createTable("departments", ["id", "name"]), + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.transactions`, ["id", "amount", "category", "created_at", "status"]), ]); - const result = getExtendedLineage(ast as Select, schema); - // JOIN lineage - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(2); - - // WINDOW lineage from PARTITION BY department_id and ORDER BY salary DESC in OVER clause - const windowLineage = findBySubtype(result.dataset, "WINDOW"); - expect(windowLineage.length).toBe(2); - expect(windowLineage).toContainEqual({ - namespace: "trino", - name: "employees", - field: "department_id", - transformations: [INDIRECT_WINDOW], - }); - expect(windowLineage).toContainEqual({ - namespace: "trino", - name: "employees", - field: "salary", - transformations: [INDIRECT_WINDOW], - }); - - // FILTER lineage - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "employees", - field: "status", - transformations: [INDIRECT_FILTER], - }); - - // SORT lineage - const sortLineage = findBySubtype(result.dataset, "SORT"); - expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "departments", - field: "name", - transformations: [INDIRECT_SORT], - }); - expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "employees", - field: "salary", - transformations: [INDIRECT_SORT], - }); - - // Field-level lineage captures the aggregation - expect(result.fields.running_salary?.inputFields).toContainEqual({ - namespace: "trino", - name: "employees", - field: "salary", - transformations: [DIRECT_AGGREGATION], - }); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "category", transformations: [INDIRECT_WINDOW] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "created_at", transformations: [INDIRECT_WINDOW] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "created_at", transformations: [INDIRECT_SORT] }, + ]), + ); }); }); // ============================================================================= -// CTE (WITH clause) TESTS +// SECTION 4: CTEs (WITH clause) // ============================================================================= -describe("getExtendedLineage - WITH clause (CTEs)", () => { - test("simple CTE with filter - dataset lineage propagation", () => { +describe("CTEs: Basic WITH clause", () => { + test("simple CTE propagates field lineage", () => { const sql = ` - WITH active_users AS ( - SELECT id, name, country - FROM users - WHERE status = 'active' + WITH active AS ( + SELECT id, name FROM users WHERE status = 'active' ) - SELECT country, COUNT(*) as count - FROM active_users - GROUP BY country + SELECT id, name FROM active `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "country", "status"])]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage should trace back to users table - expect(result.fields.country?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "country", - transformations: [DIRECT_IDENTITY], + expect(result.fields).toEqual({ + id: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }], + }, + name: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + }, }); - // Dataset-level lineage from the CTE's WHERE clause should be propagated - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(1); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "status", - transformations: [INDIRECT_FILTER], + // Dataset lineage should include the WHERE from the CTE + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + ]); + }); + + test("CTE with aggregation", () => { + const sql = ` + WITH summary AS ( + SELECT department, SUM(salary) as total_salary + FROM employees + GROUP BY department + ) + SELECT department, total_salary FROM summary + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["department", "salary"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + department: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "department", transformations: [DIRECT_IDENTITY] }, + ], + }, + total_salary: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [DIRECT_AGGREGATION] }, + ], + }, }); - // GROUP BY from outer query should also be captured (but references the CTE, not direct table) - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - // country GROUP BY references active_users.country which resolves to users.country - expect(groupByLineage.length).toBe(0); // GROUP BY references CTE alias, not resolved to base table + // GROUP BY from CTE should be in dataset lineage + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "department", transformations: [INDIRECT_GROUP_BY] }, + ]); }); +}); - test("multiple CTEs", () => { +describe("CTEs: Multiple CTEs", () => { + test("two CTEs with JOIN", () => { const sql = ` WITH - active_users AS ( - SELECT id, name, country FROM users WHERE status = 'active' + users_cte AS ( + SELECT id, name FROM users WHERE status = 'active' ), - user_orders AS ( + orders_cte AS ( SELECT user_id, SUM(total) as total_spent FROM orders GROUP BY user_id ) - SELECT - au.name, - au.country, - uo.total_spent - FROM active_users au - JOIN user_orders uo ON au.id = uo.user_id + SELECT u.name, o.total_spent + FROM users_cte u + JOIN orders_cte o ON u.id = o.user_id `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name", "country", "status"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); const result = getExtendedLineage(ast as Select, schema); - // ========== Field-level lineage ========== - - // name traces back to users.name with IDENTITY - expect(result.fields.name).toBeDefined(); - expect(result.fields.name?.inputFields.length).toBe(1); - expect(result.fields.name?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "name", - transformations: [DIRECT_IDENTITY], - }); - - // country traces back to users.country with IDENTITY - expect(result.fields.country).toBeDefined(); - expect(result.fields.country?.inputFields.length).toBe(1); - expect(result.fields.country?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "country", - transformations: [DIRECT_IDENTITY], - }); - - // total_spent traces back to orders.total with AGGREGATION (through SUM in CTE) - expect(result.fields.total_spent).toBeDefined(); - expect(result.fields.total_spent?.inputFields.length).toBe(1); - expect(result.fields.total_spent?.inputFields).toContainEqual({ - namespace: "trino", - name: "orders", - field: "total", - transformations: [DIRECT_AGGREGATION], + expect(result.fields).toEqual({ + name: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + }, + total_spent: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "total", transformations: [DIRECT_AGGREGATION] }, + ], + }, }); - // ========== Dataset-level lineage ========== - // Dataset lineage from CTEs is now propagated to outer query - - // FILTER from active_users CTE (WHERE status = 'active') - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(1); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "status", - transformations: [INDIRECT_FILTER], - }); - - // GROUP BY from user_orders CTE (GROUP BY user_id) - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage.length).toBe(1); - expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "user_id", - transformations: [INDIRECT_GROUP_BY], - }); - - // Verify we have the correct number of output fields - expect(Object.keys(result.fields).length).toBe(3); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_GROUP_BY] }, + ]), + ); }); +}); - test("nested CTEs with complex transformations", () => { +describe("CTEs: Nested transformations through CTEs", () => { + test("transformation propagation through nested CTEs", () => { const sql = ` WITH - base_data AS ( - SELECT - product_id, - store_id, - quantity * price as revenue - FROM sales - WHERE sale_date >= '2024-01-01' + base AS ( + SELECT id, quantity * price as revenue FROM sales WHERE sale_date >= '2024-01-01' ), - store_summary AS ( - SELECT - store_id, - SUM(revenue) as total_revenue - FROM base_data - GROUP BY store_id + summary AS ( + SELECT SUM(revenue) as total_revenue FROM base ) - SELECT - s.name as store_name, - ss.total_revenue - FROM store_summary ss - JOIN stores s ON ss.store_id = s.id - ORDER BY ss.total_revenue DESC + SELECT total_revenue FROM summary `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("sales", ["id", "product_id", "store_id", "quantity", "price", "sale_date"]), - createTable("stores", ["id", "name"]), - ]); - + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.sales`, ["id", "quantity", "price", "sale_date"])]); const result = getExtendedLineage(ast as Select, schema); - // ========== Field-level lineage ========== - - // store_name should trace back to stores.name - expect(result.fields.store_name).toBeDefined(); - expect(result.fields.store_name?.inputFields).toContainEqual({ - namespace: "trino", - name: "stores", - field: "name", - transformations: [DIRECT_IDENTITY], - }); - - // total_revenue should trace back through CTEs to quantity and price with AGGREGATION - expect(result.fields.total_revenue).toBeDefined(); - expect(result.fields.total_revenue?.inputFields.length).toBe(2); - expect(result.fields.total_revenue?.inputFields).toContainEqual({ - namespace: "trino", - name: "sales", - field: "quantity", - transformations: [DIRECT_AGGREGATION], - }); - expect(result.fields.total_revenue?.inputFields).toContainEqual({ - namespace: "trino", - name: "sales", - field: "price", - transformations: [DIRECT_AGGREGATION], - }); - - // ========== Dataset-level lineage ========== - - // JOIN lineage - should capture the join between store_summary and stores - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(1); - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "stores", - field: "id", - transformations: [INDIRECT_JOIN], - }); - - // FILTER lineage - propagated from base_data CTE (WHERE sale_date >= '2024-01-01') - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(1); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "sales", - field: "sale_date", - transformations: [INDIRECT_FILTER], - }); + // total_revenue -> SUM(revenue) -> quantity * price + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + total_revenue: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "price", transformations: [DIRECT_AGGREGATION] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "quantity", transformations: [DIRECT_AGGREGATION] }, + ], + }, + }), + ); - // Note: GROUP BY in store_summary (GROUP BY store_id) references base_data.store_id - // which is a CTE column, not a direct table column. Dataset lineage extraction - // for GROUP BY/ORDER BY only resolves to direct table columns, not CTE columns. - // This is a known limitation - CTE-to-CTE indirect lineage is not resolved. - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage.length).toBe(0); + // Dataset lineage includes WHERE from base CTE + expect(result.dataset).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "sale_date", transformations: [INDIRECT_FILTER] }, + ]); }); }); // ============================================================================= -// SUBQUERY TESTS +// SECTION 5: SUBQUERIES // ============================================================================= -describe("getExtendedLineage - Subqueries", () => { - test("subquery in FROM clause", () => { +describe("Subqueries: FROM clause subquery", () => { + test("simple subquery in FROM", () => { const sql = ` - SELECT - sub.country, - sub.user_count + SELECT sub.country, sub.cnt FROM ( - SELECT country, COUNT(*) as user_count + SELECT country, COUNT(*) as cnt FROM users WHERE status = 'active' GROUP BY country ) sub - ORDER BY sub.user_count DESC `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "country", "status"])]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - expect(result.fields.country?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "country", - transformations: [DIRECT_IDENTITY], - }); + expect(result.fields.country?.inputFields).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [DIRECT_IDENTITY] }, + ]); + + // Dataset lineage from subquery + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_GROUP_BY] }, + ]), + ); }); }); // ============================================================================= -// EDGE CASES AND SPECIAL SCENARIOS +// SECTION 6: SET OPERATIONS (UNION, INTERSECT, EXCEPT) // ============================================================================= -describe("getExtendedLineage - Edge cases", () => { - test("same column used in multiple contexts", () => { +describe("Set Operations: UNION", () => { + test("UNION merges field lineage from both sides", () => { const sql = ` - SELECT - status, - COUNT(*) as count - FROM users - WHERE status != 'deleted' - GROUP BY status - ORDER BY status + SELECT id, name FROM users WHERE status = 'active' + UNION + SELECT id, name FROM customers WHERE verified = true `; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "status"])]); - + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.customers`, ["id", "name", "verified"])]); const result = getExtendedLineage(ast as Select, schema); - // status appears in: - // 1. SELECT (DIRECT/IDENTITY) - // 2. WHERE (INDIRECT/FILTER) - // 3. GROUP BY (INDIRECT/GROUP_BY) - // 4. ORDER BY (INDIRECT/SORT) - - expect(result.fields.status?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "status", - transformations: [DIRECT_IDENTITY], - }); + // Field lineage combines both sources + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.customers`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + name: { + inputFields: [ + { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.customers`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); - const filterLineage = findFieldBySubtype(result.dataset, "status", "FILTER"); - expect(filterLineage).toBeDefined(); + // Dataset lineage includes filters from both + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.customers`, field: "verified", transformations: [INDIRECT_FILTER] }, + ]), + ); + }); - const groupByLineage = findFieldBySubtype(result.dataset, "status", "GROUP_BY"); - expect(groupByLineage).toBeDefined(); + test("UNION ALL with GROUP BY on both sides", () => { + const sql = ` + SELECT department FROM employees GROUP BY department + UNION ALL + SELECT department FROM contractors GROUP BY department + `; + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "department"]), + createTable(`${DEFAULT_SCHEMA}.contractors`, ["id", "department"]), + ]); + const result = getExtendedLineage(ast as Select, schema); - const sortLineage = findFieldBySubtype(result.dataset, "status", "SORT"); - expect(sortLineage).toBeDefined(); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "department", transformations: [INDIRECT_GROUP_BY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.contractors`, field: "department", transformations: [INDIRECT_GROUP_BY] }, + ]), + ); }); +}); - test("column from multiple tables with same name", () => { +describe("Set Operations: INTERSECT", () => { + test("INTERSECT with ORDER BY on both sides", () => { const sql = ` - SELECT u.name as user_name, p.name as product_name - FROM users u - JOIN products p ON u.favorite_product_id = p.id - WHERE u.name LIKE 'A%' AND p.name LIKE 'Widget%' + SELECT id FROM active_users ORDER BY created_at + INTERSECT + SELECT id FROM premium_users ORDER BY upgraded_at `; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name", "favorite_product_id"]), - createTable("products", ["id", "name"]), + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.active_users`, ["id", "created_at"]), + createTable(`${DEFAULT_SCHEMA}.premium_users`, ["id", "upgraded_at"]), ]); - const result = getExtendedLineage(ast as Select, schema); - // Field-level lineage should distinguish the two name columns - expect(result.fields.user_name?.inputFields).toContainEqual({ - namespace: "trino", - name: "users", - field: "name", - transformations: [DIRECT_IDENTITY], - }); - expect(result.fields.product_name?.inputFields).toContainEqual({ - namespace: "trino", - name: "products", - field: "name", - transformations: [DIRECT_IDENTITY], - }); - - // FILTER lineage should have both name columns - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "users", - field: "name", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "products", - field: "name", - transformations: [INDIRECT_FILTER], - }); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.active_users`, field: "created_at", transformations: [INDIRECT_SORT] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.premium_users`, field: "upgraded_at", transformations: [INDIRECT_SORT] }, + ]), + ); }); +}); - test("deduplication of dataset lineage", () => { +describe("Set Operations: EXCEPT", () => { + test("EXCEPT with WHERE on both sides", () => { const sql = ` - SELECT id, name - FROM users - WHERE status = 'active' AND status != 'banned' + SELECT id FROM users WHERE active = true + EXCEPT + SELECT id FROM banned_users WHERE ban_date > '2024-01-01' `; + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.banned_users`, ["id", "ban_date"])]); + const result = getExtendedLineage(ast as Select, schema); - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "status"])]); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "active", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.banned_users`, field: "ban_date", transformations: [INDIRECT_FILTER] }, + ]), + ); + }); +}); +describe("Set Operations: Chained", () => { + test("triple UNION with different clauses", () => { + const sql = ` + SELECT id FROM users WHERE region = 'US' + UNION + SELECT id FROM customers WHERE region = 'EU' + UNION + SELECT id FROM vendors WHERE region = 'APAC' + `; + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [ + USERS_TABLE, + createTable(`${DEFAULT_SCHEMA}.customers`, ["id", "region"]), + createTable(`${DEFAULT_SCHEMA}.vendors`, ["id", "region"]), + ]); const result = getExtendedLineage(ast as Select, schema); - // Even though status appears twice in WHERE, it should be deduplicated - const filterLineage = findBySubtype(result.dataset, "FILTER"); - const statusFilters = filterLineage.filter((f) => f.field === "status"); - expect(statusFilters.length).toBe(1); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "region", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.customers`, field: "region", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.vendors`, field: "region", transformations: [INDIRECT_FILTER] }, + ]), + ); }); +}); - test("empty dataset lineage when no indirect clauses", () => { - const sql = `SELECT id, UPPER(name) as upper_name FROM users`; +// ============================================================================= +// SECTION 7: STAR (*) EXPANSION +// ============================================================================= +describe("Star Expansion", () => { + test("SELECT * expands to all columns", () => { + const sql = `SELECT * FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name"])]); + const schema = createNamespace("ns", [USERS_TABLE]); + const result = getExtendedLineage(ast as Select, schema); + expect(result.fields).toEqual( + USERS_TABLE.columns.reduce( + (acc, col) => { + acc[col] = { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: col, transformations: [DIRECT_IDENTITY] }], + }; + return acc; + }, + {} as Record, + ), + ); + }); + + test("table.* with multiple tables", () => { + const sql = `SELECT u.*, o.total FROM users u JOIN orders o ON u.id = o.user_id`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); const result = getExtendedLineage(ast as Select, schema); - expect(result.fields.id).toBeDefined(); - expect(result.fields.upper_name).toBeDefined(); - expect(result.dataset).toEqual([]); + expect(result.fields.id?.inputFields).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }, + ]); + expect(result.fields.name?.inputFields).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }, + ]); + expect(result.fields.total?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "total", transformations: [DIRECT_IDENTITY] }, + ]); }); +}); + +// ============================================================================= +// SECTION 8: EDGE CASES +// ============================================================================= - test("transformation functions with masking", () => { +describe("Edge Cases", () => { + test("same column in multiple contexts", () => { const sql = ` - SELECT - MD5(email) as email_hash, - SHA256(ssn) as ssn_hash, - MASK(phone) as masked_phone + SELECT status, COUNT(*) as cnt FROM users + WHERE status != 'deleted' + GROUP BY status + ORDER BY status `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "email", "ssn", "phone"])]); - + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - // All three should have masking: true in their transformations - expect(result.fields.email_hash?.inputFields[0]?.transformations?.[0]?.masking).toBe(true); - expect(result.fields.ssn_hash?.inputFields[0]?.transformations?.[0]?.masking).toBe(true); - expect(result.fields.masked_phone?.inputFields[0]?.transformations?.[0]?.masking).toBe(true); + // Field lineage + expect(result.fields.status?.inputFields).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [DIRECT_IDENTITY] }, + ]); + + // Dataset lineage should have all three subtypes + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_GROUP_BY] }, + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_SORT] }, + ]), + ); }); - test("COUNT aggregation has masking flag", () => { + test("column name collision from different tables", () => { const sql = ` - SELECT country, COUNT(id) as user_count - FROM users - GROUP BY country + SELECT u.name as user_name, p.name as product_name + FROM users u + JOIN products p ON u.favorite_product = p.id + WHERE u.name LIKE 'A%' AND p.name LIKE 'Widget%' `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.products`, ["id", "name"])]); + const result = getExtendedLineage(ast as Select, schema); + expect(result.fields.user_name?.inputFields).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }, + ]); + expect(result.fields.product_name?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [DIRECT_IDENTITY] }, + ]); + + const filterLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "FILTER"); + expect(sortDataset(filterLineage)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [INDIRECT_FILTER] }, + ]), + ); + }); + + test("deduplication of repeated column in same clause", () => { + const sql = `SELECT id FROM users WHERE status = 'active' AND status != 'banned'`; const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "country"])]); + const schema = createNamespace("ns", [USERS_TABLE]); + const result = getExtendedLineage(ast as Select, schema); + + // status appears twice but should be deduplicated + expect(result.dataset).toEqual([ + { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + ]); + }); + test("empty dataset lineage when no indirect clauses", () => { + const sql = `SELECT id, UPPER(name) as upper_name FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE]); const result = getExtendedLineage(ast as Select, schema); - // COUNT should have masking: true - expect(result.fields.user_count?.inputFields[0]?.transformations).toContainEqual({ - type: "DIRECT", - subtype: "AGGREGATION", - masking: true, - }); + expect(result.dataset).toEqual([]); }); }); // ============================================================================= -// REAL-WORLD COMPLEX QUERIES +// SECTION 9: COMPREHENSIVE MEGA-QUERIES // ============================================================================= -describe("getExtendedLineage - Real-world complex queries", () => { - test("e-commerce analytics query", () => { +describe("Comprehensive: Everything Together", () => { + test("mega query with all features", () => { + const sql = ` + WITH + filtered_sales AS ( + SELECT + product_id, + store_id, + quantity, + unit_price, + quantity * unit_price as line_total + FROM sales + WHERE sale_date >= '2024-01-01' + AND status = 'completed' + ), + store_totals AS ( + SELECT + store_id, + SUM(line_total) as total_revenue, + COUNT(product_id) as product_count, + AVG(unit_price) as avg_price + FROM filtered_sales + GROUP BY store_id + HAVING SUM(line_total) > 1000 + ) + SELECT + s.name as store_name, + s.region, + st.total_revenue, + st.product_count, + st.avg_price, + CASE + WHEN st.total_revenue > 100000 THEN 'Premium' + WHEN st.total_revenue > 50000 THEN 'Standard' + ELSE 'Basic' + END as tier, + RANK() OVER (PARTITION BY s.region ORDER BY st.total_revenue DESC) as region_rank, + MD5(s.name) as store_hash + FROM store_totals st + JOIN stores s ON st.store_id = s.id + WHERE s.active = true + ORDER BY s.region, st.total_revenue DESC + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.sales`, ["id", "product_id", "store_id", "quantity", "unit_price", "sale_date", "status"]), + createTable(`${DEFAULT_SCHEMA}.stores`, ["id", "name", "region", "active"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + // ========== FIELD-LEVEL LINEAGE ========== + + // store_name -> stores.name (IDENTITY) + expect(result.fields.store_name?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "name", transformations: [DIRECT_IDENTITY] }, + ]); + + // region -> stores.region (IDENTITY) + expect(result.fields.region?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "region", transformations: [DIRECT_IDENTITY] }, + ]); + + // total_revenue -> SUM(quantity * unit_price) via CTEs (AGGREGATION) + expect(sortInputFields({ total_revenue: result.fields.total_revenue! })).toEqual( + sortInputFields({ + total_revenue: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "quantity", transformations: [DIRECT_AGGREGATION] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "unit_price", transformations: [DIRECT_AGGREGATION] }, + ], + }, + }), + ); + + // product_count -> COUNT(product_id) (AGGREGATION with masking) + expect(result.fields.product_count?.inputFields).toEqual([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "product_id", + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + }, + ]); + + // avg_price -> AVG(unit_price) (AGGREGATION) + expect(result.fields.avg_price?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "unit_price", transformations: [DIRECT_AGGREGATION] }, + ]); + + // tier -> CASE on total_revenue which traces to quantity and unit_price + const tierFields = result.fields.tier?.inputFields.map((f) => f.field); + expect(tierFields).toContain("quantity"); + expect(tierFields).toContain("unit_price"); + + // region_rank -> RANK() OVER (...) tracks columns from PARTITION BY and ORDER BY + const rankFields = result.fields.region_rank?.inputFields; + expect(rankFields?.map((f) => f.field)).toContain("region"); + + // store_hash -> MD5(name) (TRANSFORMATION with masking) + expect(result.fields.store_hash?.inputFields).toEqual([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.stores`, + field: "name", + transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + }, + ]); + + // ========== DATASET-LEVEL LINEAGE ========== + + // FILTER from filtered_sales CTE (WHERE sale_date >= ... AND status = ...) + const filterLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "FILTER"); + expect(filterLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "sale_date", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "status", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.stores`, + field: "active", + transformations: [INDIRECT_FILTER], + }); + + // JOIN lineage from st.store_id = s.id + const joinLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "JOIN"); + expect(joinLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.stores`, + field: "id", + transformations: [INDIRECT_JOIN], + }); + + // SORT lineage from ORDER BY s.region, st.total_revenue + const sortLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "SORT"); + expect(sortLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.stores`, + field: "region", + transformations: [INDIRECT_SORT], + }); + + // WINDOW lineage from PARTITION BY s.region ORDER BY st.total_revenue + const windowLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "WINDOW"); + expect(windowLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.stores`, + field: "region", + transformations: [INDIRECT_WINDOW], + }); + + // Verify we have all 8 output fields + expect(Object.keys(result.fields).length).toBe(8); + }); + + test("e-commerce analytics mega query", () => { const sql = ` SELECT c.name as category_name, p.name as product_name, - SUM(oi.quantity) as total_quantity, - SUM(oi.quantity * oi.unit_price) as total_revenue, - AVG(oi.unit_price) as avg_price + SUM(oi.quantity) as total_qty, + SUM(oi.quantity * oi.price) as revenue, + AVG(oi.price) as avg_price, + COUNT(DISTINCT o.customer_id) as unique_customers, + ROW_NUMBER() OVER (PARTITION BY c.id ORDER BY SUM(oi.quantity * oi.price) DESC) as category_rank FROM categories c JOIN products p ON c.id = p.category_id JOIN order_items oi ON p.id = oi.product_id JOIN orders o ON oi.order_id = o.id - WHERE o.status = 'completed' AND o.order_date >= '2024-01-01' + WHERE o.status = 'completed' + AND o.created_at >= '2024-01-01' + AND p.active = true GROUP BY c.id, c.name, p.id, p.name HAVING SUM(oi.quantity) > 10 - ORDER BY c.name, total_revenue DESC + ORDER BY c.name, revenue DESC `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("categories", ["id", "name"]), - createTable("products", ["id", "name", "category_id"]), - createTable("order_items", ["id", "order_id", "product_id", "quantity", "unit_price"]), - createTable("orders", ["id", "status", "order_date"]), + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.categories`, ["id", "name"]), + createTable(`${DEFAULT_SCHEMA}.products`, ["id", "name", "category_id", "active"]), + createTable(`${DEFAULT_SCHEMA}.order_items`, ["id", "order_id", "product_id", "quantity", "price"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["id", "customer_id", "status", "created_at"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + // ========== FIELD-LEVEL LINEAGE ========== + + expect(result.fields.category_name?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.categories`, field: "name", transformations: [DIRECT_IDENTITY] }, ]); - const result = getExtendedLineage(ast as Select, schema); + expect(result.fields.product_name?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [DIRECT_IDENTITY] }, + ]); - // ========== Field-level lineage ========== - - // category_name traces to categories.name - expect(result.fields.category_name).toBeDefined(); - expect(result.fields.category_name?.inputFields).toContainEqual({ - namespace: "trino", - name: "categories", - field: "name", - transformations: [DIRECT_IDENTITY], - }); + expect(result.fields.total_qty?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "quantity", transformations: [DIRECT_AGGREGATION] }, + ]); - // product_name traces to products.name - expect(result.fields.product_name).toBeDefined(); - expect(result.fields.product_name?.inputFields).toContainEqual({ - namespace: "trino", - name: "products", - field: "name", - transformations: [DIRECT_IDENTITY], - }); + expect(sortInputFields({ revenue: result.fields.revenue! })).toEqual( + sortInputFields({ + revenue: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "price", transformations: [DIRECT_AGGREGATION] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "quantity", transformations: [DIRECT_AGGREGATION] }, + ], + }, + }), + ); - // total_quantity traces to order_items.quantity with AGGREGATION - expect(result.fields.total_quantity).toBeDefined(); - expect(result.fields.total_quantity?.inputFields).toContainEqual({ - namespace: "trino", - name: "order_items", - field: "quantity", - transformations: [DIRECT_AGGREGATION], - }); + expect(result.fields.avg_price?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "price", transformations: [DIRECT_AGGREGATION] }, + ]); - // total_revenue traces to both quantity and unit_price with AGGREGATION - expect(result.fields.total_revenue).toBeDefined(); - expect(result.fields.total_revenue?.inputFields.length).toBe(2); - expect(result.fields.total_revenue?.inputFields).toContainEqual({ - namespace: "trino", - name: "order_items", - field: "quantity", - transformations: [DIRECT_AGGREGATION], - }); - expect(result.fields.total_revenue?.inputFields).toContainEqual({ - namespace: "trino", - name: "order_items", - field: "unit_price", - transformations: [DIRECT_AGGREGATION], - }); + expect(result.fields.unique_customers?.inputFields).toEqual([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, + field: "customer_id", + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + }, + ]); - // avg_price traces to order_items.unit_price with AGGREGATION - expect(result.fields.avg_price).toBeDefined(); - expect(result.fields.avg_price?.inputFields).toContainEqual({ - namespace: "trino", - name: "order_items", - field: "unit_price", - transformations: [DIRECT_AGGREGATION], - }); + // ========== DATASET-LEVEL LINEAGE ========== - // ========== Dataset-level lineage ========== - - // JOIN lineage - 3 joins with 2 columns each = 6 join columns - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBe(6); - // First join: c.id = p.category_id + // JOIN lineage - 3 joins with 2 columns each = 6 total + const joinLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "JOIN"); + expect(joinLineage?.length).toBe(6); expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "categories", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, field: "id", transformations: [INDIRECT_JOIN], }); expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "products", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, field: "category_id", transformations: [INDIRECT_JOIN], }); - // Second join: p.id = oi.product_id expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "products", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, field: "id", transformations: [INDIRECT_JOIN], }); expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "order_items", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, field: "product_id", transformations: [INDIRECT_JOIN], }); - // Third join: oi.order_id = o.id expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "order_items", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, field: "order_id", transformations: [INDIRECT_JOIN], }); expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "orders", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, field: "id", transformations: [INDIRECT_JOIN], }); - // FILTER lineage - status and order_date from WHERE clause - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBeGreaterThanOrEqual(2); + // FILTER lineage - status, created_at, active + HAVING quantity + const filterLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "FILTER"); expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "orders", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, field: "status", transformations: [INDIRECT_FILTER], }); expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "orders", - field: "order_date", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, + field: "created_at", + transformations: [INDIRECT_FILTER], + }); + expect(filterLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "active", transformations: [INDIRECT_FILTER], }); - // HAVING also contributes to filter lineage expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "order_items", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, field: "quantity", transformations: [INDIRECT_FILTER], }); // GROUP BY lineage - c.id, c.name, p.id, p.name - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage.length).toBe(4); + const groupByLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "GROUP_BY"); + expect(groupByLineage?.length).toBe(4); expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "categories", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, field: "id", transformations: [INDIRECT_GROUP_BY], }); expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "categories", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, field: "name", transformations: [INDIRECT_GROUP_BY], }); expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "products", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, field: "id", transformations: [INDIRECT_GROUP_BY], }); expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "products", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [INDIRECT_GROUP_BY], }); - // SORT lineage - c.name (total_revenue is alias, may not resolve) - const sortLineage = findBySubtype(result.dataset, "SORT"); + // SORT lineage - c.name + const sortLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "SORT"); expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "categories", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, field: "name", transformations: [INDIRECT_SORT], }); + + // WINDOW lineage - PARTITION BY c.id + const windowLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "WINDOW"); + expect(windowLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, + field: "id", + transformations: [INDIRECT_WINDOW], + }); + + // Verify 7 output fields + expect(Object.keys(result.fields).length).toBe(7); }); - test("HR analytics query with employee hierarchy", () => { + test("HR analytics mega query with complex CTEs and CASE", () => { const sql = ` - WITH department_stats AS ( + WITH + active_employees AS ( SELECT + id, department_id, - COUNT(*) as employee_count, - AVG(salary) as avg_salary, - SUM(salary) as total_salary + salary, + hire_date, + performance_score FROM employees - WHERE status = 'active' + WHERE status = 'active' AND terminated_at IS NULL + ), + dept_stats AS ( + SELECT + department_id, + COUNT(id) as headcount, + SUM(salary) as total_compensation, + AVG(salary) as avg_salary, + MIN(hire_date) as oldest_hire, + AVG(performance_score) as avg_performance + FROM active_employees GROUP BY department_id + HAVING COUNT(id) >= 3 ) SELECT d.name as department_name, - ds.employee_count, + d.location, + ds.headcount, + ds.total_compensation, ds.avg_salary, - ds.total_salary, + ds.avg_performance, CASE - WHEN ds.avg_salary > 100000 THEN 'High' - WHEN ds.avg_salary > 50000 THEN 'Medium' - ELSE 'Low' - END as salary_tier, - RANK() OVER (ORDER BY ds.total_salary DESC) as budget_rank - FROM department_stats ds + WHEN ds.avg_performance >= 4.5 THEN 'Exceptional' + WHEN ds.avg_performance >= 3.5 THEN 'Good' + WHEN ds.avg_performance >= 2.5 THEN 'Average' + ELSE 'Needs Improvement' + END as performance_tier, + DENSE_RANK() OVER (ORDER BY ds.total_compensation DESC) as compensation_rank, + ROW_NUMBER() OVER (PARTITION BY d.location ORDER BY ds.headcount DESC) as location_rank, + SHA256(d.name) as dept_hash + FROM dept_stats ds JOIN departments d ON ds.department_id = d.id - WHERE ds.employee_count >= 5 - ORDER BY budget_rank - `; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("employees", ["id", "department_id", "salary", "status"]), - createTable("departments", ["id", "name"]), + WHERE d.active = true + ORDER BY d.location, ds.total_compensation DESC + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.employees`, [ + "id", + "department_id", + "salary", + "hire_date", + "performance_score", + "status", + "terminated_at", + ]), + createTable(`${DEFAULT_SCHEMA}.departments`, ["id", "name", "location", "active"]), ]); - const result = getExtendedLineage(ast as Select, schema); - // ========== Field-level lineage ========== - - // department_name traces to departments.name - expect(result.fields.department_name).toBeDefined(); - expect(result.fields.department_name?.inputFields).toContainEqual({ - namespace: "trino", - name: "departments", - field: "name", - transformations: [DIRECT_IDENTITY], - }); - - // employee_count comes from COUNT(*) in CTE - no specific field traced - expect(result.fields.employee_count).toBeDefined(); - // COUNT(*) doesn't reference a specific column, so inputFields may be empty + // ========== FIELD-LEVEL LINEAGE ========== - // avg_salary traces to employees.salary with AGGREGATION - expect(result.fields.avg_salary).toBeDefined(); - expect(result.fields.avg_salary?.inputFields).toContainEqual({ - namespace: "trino", - name: "employees", - field: "salary", - transformations: [DIRECT_AGGREGATION], - }); + expect(result.fields.department_name?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "name", transformations: [DIRECT_IDENTITY] }, + ]); - // total_salary traces to employees.salary with AGGREGATION - expect(result.fields.total_salary).toBeDefined(); - expect(result.fields.total_salary?.inputFields).toContainEqual({ - namespace: "trino", - name: "employees", - field: "salary", - transformations: [DIRECT_AGGREGATION], - }); + expect(result.fields.location?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "location", transformations: [DIRECT_IDENTITY] }, + ]); - // salary_tier uses CASE with avg_salary in conditions - traces back to salary - expect(result.fields.salary_tier).toBeDefined(); - expect(result.fields.salary_tier?.inputFields.length).toBeGreaterThan(0); - // The salary_tier CASE statement references avg_salary which traces to employees.salary - const salaryTierFields = result.fields.salary_tier?.inputFields.map((f) => f.field); - expect(salaryTierFields).toContain("salary"); - // Should have CONDITION subtype for the WHEN clauses - const hasConditionTransformation = result.fields.salary_tier?.inputFields.some( - (f) => f.transformations?.some((t) => t.subtype === "CONDITION" || t.subtype === "AGGREGATION"), - ); - expect(hasConditionTransformation).toBe(true); - - // budget_rank from RANK() OVER (ORDER BY ds.total_salary DESC) - now tracks columns from OVER clause - expect(result.fields.budget_rank).toBeDefined(); - // RANK() has no arguments, but it should capture total_salary from ORDER BY in OVER clause - // total_salary traces back through CTE to employees.salary - expect(result.fields.budget_rank?.inputFields.length).toBeGreaterThan(0); - const budgetRankFields = result.fields.budget_rank?.inputFields.map((f) => f.field); - expect(budgetRankFields).toContain("salary"); - - // ========== Dataset-level lineage ========== - - // JOIN lineage - ds.department_id = d.id - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBeGreaterThanOrEqual(1); - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "departments", - field: "id", - transformations: [INDIRECT_JOIN], - }); + expect(result.fields.headcount?.inputFields).toEqual([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "id", + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + }, + ]); - // Verify total output field count - expect(Object.keys(result.fields).length).toBe(6); - }); + expect(result.fields.total_compensation?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [DIRECT_AGGREGATION] }, + ]); - test("time-series analysis query", () => { - const sql = ` - SELECT - DATE_TRUNC('month', event_date) as month, - event_type, - COUNT(*) as event_count, - COUNT(DISTINCT user_id) as unique_users - FROM events - WHERE event_date >= '2024-01-01' - AND event_type IN ('login', 'purchase', 'view') - GROUP BY DATE_TRUNC('month', event_date), event_type - ORDER BY month, event_type - `; + expect(result.fields.avg_salary?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [DIRECT_AGGREGATION] }, + ]); - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("events", ["id", "event_date", "event_type", "user_id"])]); + expect(result.fields.avg_performance?.inputFields).toEqual([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "performance_score", transformations: [DIRECT_AGGREGATION] }, + ]); - const result = getExtendedLineage(ast as Select, schema); + // performance_tier CASE uses avg_performance -> performance_score + const tierFields = result.fields.performance_tier?.inputFields; + expect(tierFields?.map((f) => f.field)).toContain("performance_score"); + + // dept_hash uses MD5 -> masking + expect(result.fields.dept_hash?.inputFields).toEqual([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.departments`, + field: "name", + transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + }, + ]); - // ========== Field-level lineage ========== - - // month traces to event_date with TRANSFORMATION (DATE_TRUNC) - expect(result.fields.month).toBeDefined(); - expect(result.fields.month?.inputFields.length).toBe(1); - expect(result.fields.month?.inputFields[0]?.field).toBe("event_date"); - expect(result.fields.month?.inputFields[0]?.transformations?.[0]?.subtype).toBe("TRANSFORMATION"); + // ========== DATASET-LEVEL LINEAGE ========== - // event_type is direct IDENTITY - expect(result.fields.event_type).toBeDefined(); - expect(result.fields.event_type?.inputFields).toContainEqual({ - namespace: "trino", - name: "events", - field: "event_type", - transformations: [DIRECT_IDENTITY], + // FILTER from active_employees CTE + const filterLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "FILTER"); + expect(filterLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "status", + transformations: [INDIRECT_FILTER], }); - - // event_count from COUNT(*) - no specific field - expect(result.fields.event_count).toBeDefined(); - - // unique_users from COUNT(DISTINCT user_id) - traces to user_id with AGGREGATION + masking - expect(result.fields.unique_users).toBeDefined(); - expect(result.fields.unique_users?.inputFields.length).toBe(1); - expect(result.fields.unique_users?.inputFields[0]?.field).toBe("user_id"); - expect(result.fields.unique_users?.inputFields[0]?.transformations?.[0]?.subtype).toBe("AGGREGATION"); - expect(result.fields.unique_users?.inputFields[0]?.transformations?.[0]?.masking).toBe(true); - - // Verify total output field count - expect(Object.keys(result.fields).length).toBe(4); - - // ========== Dataset-level lineage ========== - - // FILTER lineage - event_date and event_type from WHERE - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage.length).toBe(2); expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "events", - field: "event_date", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "terminated_at", transformations: [INDIRECT_FILTER], }); expect(filterLineage).toContainEqual({ - namespace: "trino", - name: "events", - field: "event_type", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.departments`, + field: "active", transformations: [INDIRECT_FILTER], }); - // GROUP BY lineage - DATE_TRUNC('month', event_date) and event_type - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage.length).toBe(2); - expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "events", - field: "event_date", - transformations: [INDIRECT_GROUP_BY], - }); - expect(groupByLineage).toContainEqual({ - namespace: "trino", - name: "events", - field: "event_type", - transformations: [INDIRECT_GROUP_BY], + // JOIN + const joinLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "JOIN"); + expect(joinLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.departments`, + field: "id", + transformations: [INDIRECT_JOIN], }); - // SORT lineage - month and event_type - // Note: month is an alias that may not resolve to base column - const sortLineage = findBySubtype(result.dataset, "SORT"); + // SORT + const sortLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "SORT"); expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "events", - field: "event_type", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.departments`, + field: "location", transformations: [INDIRECT_SORT], }); + + // WINDOW + const windowLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "WINDOW"); + expect(windowLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.departments`, + field: "location", + transformations: [INDIRECT_WINDOW], + }); + + // Verify 10 output fields + expect(Object.keys(result.fields).length).toBe(10); }); - test("multi-level aggregation query", () => { + test("UNION with CTEs and window functions mega query", () => { const sql = ` - WITH daily_sales AS ( + WITH + us_sales AS ( SELECT - store_id, - DATE(sale_timestamp) as sale_date, - SUM(amount) as daily_total + product_id, + SUM(amount) as total_amount, + COUNT(*) as sale_count FROM sales - WHERE sale_timestamp >= '2024-01-01' - GROUP BY store_id, DATE(sale_timestamp) + WHERE region = 'US' AND sale_date >= '2024-01-01' + GROUP BY product_id ), - weekly_sales AS ( + eu_sales AS ( SELECT - store_id, - DATE_TRUNC('week', sale_date) as week_start, - SUM(daily_total) as weekly_total, - AVG(daily_total) as daily_avg - FROM daily_sales - GROUP BY store_id, DATE_TRUNC('week', sale_date) + product_id, + SUM(amount) as total_amount, + COUNT(*) as sale_count + FROM sales + WHERE region = 'EU' AND sale_date >= '2024-01-01' + GROUP BY product_id ) SELECT - s.name as store_name, - s.region, - ws.week_start, - ws.weekly_total, - ws.daily_avg - FROM weekly_sales ws - JOIN stores s ON ws.store_id = s.id - ORDER BY s.region, ws.week_start - `; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("sales", ["id", "store_id", "sale_timestamp", "amount"]), - createTable("stores", ["id", "name", "region"]), + 'US' as region, + p.name as product_name, + us.total_amount, + us.sale_count, + RANK() OVER (ORDER BY us.total_amount DESC) as revenue_rank + FROM us_sales us + JOIN products p ON us.product_id = p.id + WHERE p.active = true + + UNION ALL + + SELECT + 'EU' as region, + p.name as product_name, + eu.total_amount, + eu.sale_count, + RANK() OVER (ORDER BY eu.total_amount DESC) as revenue_rank + FROM eu_sales eu + JOIN products p ON eu.product_id = p.id + WHERE p.active = true + `; + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.sales`, ["id", "product_id", "amount", "region", "sale_date"]), + createTable(`${DEFAULT_SCHEMA}.products`, ["id", "name", "active"]), ]); - const result = getExtendedLineage(ast as Select, schema); - // ========== Field-level lineage ========== - - // store_name traces to stores.name - expect(result.fields.store_name).toBeDefined(); - expect(result.fields.store_name?.inputFields).toContainEqual({ - namespace: "trino", - name: "stores", + // Field lineage - product_name comes from products.name + // Both UNION parts join the same products table, so we get one unique entry per field + // (the mergeInputFields deduplicates by full field identity including transformations) + expect(result.fields.product_name?.inputFields).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [DIRECT_IDENTITY], }); - // region traces to stores.region - expect(result.fields.region).toBeDefined(); - expect(result.fields.region?.inputFields).toContainEqual({ - namespace: "trino", - name: "stores", - field: "region", - transformations: [DIRECT_IDENTITY], - }); - - // week_start traces through CTEs to sale_timestamp (via DATE and DATE_TRUNC transformations) - expect(result.fields.week_start).toBeDefined(); - expect(result.fields.week_start?.inputFields.length).toBe(1); - expect(result.fields.week_start?.inputFields[0]?.field).toBe("sale_timestamp"); - expect(result.fields.week_start?.inputFields[0]?.name).toBe("sales"); - // Should have TRANSFORMATION due to DATE_TRUNC/DATE functions - expect(result.fields.week_start?.inputFields[0]?.transformations?.[0]?.type).toBe("DIRECT"); - - // weekly_total traces through CTEs: SUM(SUM(amount)) -> amount with AGGREGATION - expect(result.fields.weekly_total).toBeDefined(); - expect(result.fields.weekly_total?.inputFields.length).toBe(1); - expect(result.fields.weekly_total?.inputFields).toContainEqual({ - namespace: "trino", - name: "sales", + // total_amount traces through CTEs to sales.amount (AGGREGATION) + expect(result.fields.total_amount?.inputFields).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, field: "amount", transformations: [DIRECT_AGGREGATION], }); - // daily_avg traces through CTEs: AVG(SUM(amount)) -> amount with AGGREGATION - expect(result.fields.daily_avg).toBeDefined(); - expect(result.fields.daily_avg?.inputFields.length).toBe(1); - expect(result.fields.daily_avg?.inputFields).toContainEqual({ - namespace: "trino", - name: "sales", - field: "amount", - transformations: [DIRECT_AGGREGATION], - }); - - // Verify total output field count - expect(Object.keys(result.fields).length).toBe(5); - - // ========== Dataset-level lineage ========== - - // JOIN lineage - ws.store_id = s.id - // Note: store_id from CTE doesn't resolve, but stores.id does - const joinLineage = findBySubtype(result.dataset, "JOIN"); - expect(joinLineage.length).toBeGreaterThanOrEqual(1); - expect(joinLineage).toContainEqual({ - namespace: "trino", - name: "stores", - field: "id", - transformations: [INDIRECT_JOIN], - }); + // Dataset lineage from both CTEs and both UNION parts + const filterLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "FILTER"); - // SORT lineage - s.region and ws.week_start - // s.region should resolve to stores.region - const sortLineage = findBySubtype(result.dataset, "SORT"); - expect(sortLineage.length).toBeGreaterThanOrEqual(1); - expect(sortLineage).toContainEqual({ - namespace: "trino", - name: "stores", + // Both CTEs have WHERE region = '...' AND sale_date >= '...' + // Plus both outer queries have WHERE p.active = true + // Check for key filters + expect(filterLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, field: "region", - transformations: [INDIRECT_SORT], - }); - }); -}); - -// ============================================================================= -// TRANSFORMATION TYPE VERIFICATION -// ============================================================================= - -describe("getExtendedLineage - Transformation type verification", () => { - test("verifies all transformation types are correct", () => { - const sql = ` - SELECT - u.country, - COUNT(u.id) as user_count, - ROW_NUMBER() OVER (ORDER BY COUNT(u.id) DESC) as rank - FROM users u - JOIN orders o ON u.id = o.user_id - WHERE u.status = 'active' - GROUP BY u.country - ORDER BY u.country - `; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "country", "status"]), - createTable("orders", ["id", "user_id"]), - ]); - - const result = getExtendedLineage(ast as Select, schema); - - // Verify JOIN transformation structure - const joinLineage = findBySubtype(result.dataset, "JOIN"); - joinLineage.forEach((field) => { - expect(field.transformations?.[0]).toEqual(INDIRECT_JOIN); - }); - - // Verify FILTER transformation structure - const filterLineage = findBySubtype(result.dataset, "FILTER"); - filterLineage.forEach((field) => { - expect(field.transformations?.[0]).toEqual(INDIRECT_FILTER); - }); - - // Verify GROUP_BY transformation structure - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - groupByLineage.forEach((field) => { - expect(field.transformations?.[0]).toEqual(INDIRECT_GROUP_BY); - }); - - // Verify SORT transformation structure - const sortLineage = findBySubtype(result.dataset, "SORT"); - sortLineage.forEach((field) => { - expect(field.transformations?.[0]).toEqual(INDIRECT_SORT); - }); - }); - - test("field-level transformations for direct lineage", () => { - const sql = ` - SELECT - id, - name, - UPPER(email) as upper_email, - age + 1 as next_age, - COUNT(status) as status_count, - SUM(salary) as total_salary - FROM employees - GROUP BY id, name, email, age - `; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("employees", ["id", "name", "email", "age", "status", "salary"]), - ]); - - const result = getExtendedLineage(ast as Select, schema); - - // IDENTITY transformations - expect(result.fields.id?.inputFields[0]?.transformations).toContainEqual(DIRECT_IDENTITY); - expect(result.fields.name?.inputFields[0]?.transformations).toContainEqual(DIRECT_IDENTITY); - - // TRANSFORMATION (function) - expect(result.fields.upper_email?.inputFields[0]?.transformations).toContainEqual(DIRECT_TRANSFORMATION); - - // TRANSFORMATION (arithmetic) - expect(result.fields.next_age?.inputFields[0]?.transformations).toContainEqual(DIRECT_TRANSFORMATION); - - // AGGREGATION (with masking for COUNT) - expect(result.fields.status_count?.inputFields[0]?.transformations).toContainEqual({ - ...DIRECT_AGGREGATION, - masking: true, + transformations: [INDIRECT_FILTER], }); - - // AGGREGATION (without masking for SUM) - expect(result.fields.total_salary?.inputFields[0]?.transformations).toContainEqual(DIRECT_AGGREGATION); - }); -}); - -// Helper to parse SQL for PostgreSQL (which supports INTERSECT and EXCEPT) -function parseSQLPostgres(sql: string): AST { - const result = parser.astify(sql, { database: "postgresql" }); - const ast = Array.isArray(result) ? result[0] : result; - - if (!ast) { - throw new Error("Failed to parse SQL"); - } - - return ast; -} - -describe("getExtendedLineage - Set Operations (UNION, INTERSECT, EXCEPT)", () => { - test("UNION with WHERE clauses captures all dataset lineage", () => { - const sql = ` - SELECT id, name FROM users WHERE status = 'active' - UNION - SELECT id, name FROM customers WHERE verified = true - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "name", "status"]), - createTable("customers", ["id", "name", "verified"]), - ]); - - const result = getExtendedLineage(ast as Select, schema); - - // Field lineage should combine both sources - expect(result.fields.id?.inputFields).toHaveLength(2); - expect(result.fields.name?.inputFields).toHaveLength(2); - - // Dataset lineage should include filters from both queries - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage).toHaveLength(2); expect(filterLineage).toContainEqual({ - namespace: "postgres", - name: "users", - field: "status", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "sale_date", transformations: [INDIRECT_FILTER], }); expect(filterLineage).toContainEqual({ - namespace: "postgres", - name: "customers", - field: "verified", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "active", transformations: [INDIRECT_FILTER], }); - }); - - test("INTERSECT with GROUP BY captures all dataset lineage", () => { - const sql = ` - SELECT department_id FROM employees GROUP BY department_id - INTERSECT - SELECT department_id FROM managers GROUP BY department_id - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("employees", ["id", "department_id"]), - createTable("managers", ["id", "department_id"]), - ]); - - const result = getExtendedLineage(ast as Select, schema); - - // Field lineage combines both sources - expect(result.fields.department_id?.inputFields).toHaveLength(2); - // Dataset lineage should include GROUP BY from both queries - const groupByLineage = findBySubtype(result.dataset, "GROUP_BY"); - expect(groupByLineage).toHaveLength(2); + // GROUP BY from both CTEs (deduplicated since same table.field) + const groupByLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "GROUP_BY"); expect(groupByLineage).toContainEqual({ - namespace: "postgres", - name: "employees", - field: "department_id", - transformations: [INDIRECT_GROUP_BY], - }); - expect(groupByLineage).toContainEqual({ - namespace: "postgres", - name: "managers", - field: "department_id", + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "product_id", transformations: [INDIRECT_GROUP_BY], }); - }); - - test("EXCEPT with ORDER BY captures all dataset lineage", () => { - const sql = ` - SELECT id FROM users ORDER BY created_at - EXCEPT - SELECT id FROM banned_users ORDER BY banned_at - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "created_at"]), - createTable("banned_users", ["id", "banned_at"]), - ]); - - const result = getExtendedLineage(ast as Select, schema); - // Dataset lineage should include ORDER BY from both queries - const sortLineage = findBySubtype(result.dataset, "SORT"); - expect(sortLineage).toHaveLength(2); - expect(sortLineage).toContainEqual({ - namespace: "postgres", - name: "users", - field: "created_at", - transformations: [INDIRECT_SORT], - }); - expect(sortLineage).toContainEqual({ - namespace: "postgres", - name: "banned_users", - field: "banned_at", - transformations: [INDIRECT_SORT], + // JOIN from both UNION parts + const joinLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "JOIN"); + expect(joinLineage).toContainEqual({ + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "id", + transformations: [INDIRECT_JOIN], }); + + // Verify 5 output fields + expect(Object.keys(result.fields).length).toBe(5); }); +}); - test("chained UNION captures dataset lineage from all parts", () => { - const sql = ` - SELECT id FROM users WHERE region = 'US' - UNION - SELECT id FROM customers WHERE region = 'EU' - UNION - SELECT id FROM vendors WHERE region = 'APAC' - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "region"]), - createTable("customers", ["id", "region"]), - createTable("vendors", ["id", "region"]), - ]); +// ============================================================================= +// SECTION 10: DEFAULT SCHEMA HANDLING +// ============================================================================= +describe("Default Schema Handling", () => { + test("matches table with default schema", () => { + const sql = `SELECT id, name FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE], "public"); const result = getExtendedLineage(ast as Select, schema); - // Dataset lineage should include filters from all three queries - const filterLineage = findBySubtype(result.dataset, "FILTER"); - expect(filterLineage).toHaveLength(3); - expect(filterLineage).toContainEqual({ - namespace: "postgres", - name: "users", - field: "region", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "postgres", - name: "customers", - field: "region", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "postgres", - name: "vendors", - field: "region", - transformations: [INDIRECT_FILTER], + expect(result.fields).toEqual({ + id: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }], + }, + name: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + }, }); }); - test("UNION with JOINs captures dataset lineage from both parts", () => { - const sql = ` - SELECT u.id, u.name - FROM users u - JOIN orders o ON u.id = o.user_id - UNION - SELECT c.id, c.name - FROM customers c - JOIN purchases p ON c.id = p.customer_id - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id"]), - createTable("customers", ["id", "name"]), - createTable("purchases", ["id", "customer_id"]), - ]); - + test("schema-qualified table name", () => { + const sql = `SELECT u.id FROM analytics.users u WHERE u.status = 'active'`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [createTable("analytics.users", ["id", "status"])]); const result = getExtendedLineage(ast as Select, schema); - // Dataset lineage should include JOIN conditions from both queries - const joinLineage = findBySubtype(result.dataset, "JOIN"); - // Each JOIN contributes 2 fields (from ON condition) - expect(joinLineage.length).toBeGreaterThanOrEqual(4); + expect(result.fields.id?.inputFields).toEqual([ + { namespace: "ns", name: "analytics.users", field: "id", transformations: [DIRECT_IDENTITY] }, + ]); + expect(result.dataset).toEqual([ + { namespace: "ns", name: "analytics.users", field: "status", transformations: [INDIRECT_FILTER] }, + ]); }); }); From 683ed2a79785123e0515c37c167a2cfb828a94a5 Mon Sep 17 00:00:00 2001 From: its-elad <59926027+its-elad@users.noreply.github.com> Date: Mon, 2 Feb 2026 12:44:45 +0200 Subject: [PATCH 07/10] feat: removed some comments --- apps/demo/src/App.tsx | 4 +- packages/lineage/src/index.ts | 69 +++++------------------------------ 2 files changed, 13 insertions(+), 60 deletions(-) diff --git a/apps/demo/src/App.tsx b/apps/demo/src/App.tsx index 1410509..b16e1f4 100644 --- a/apps/demo/src/App.tsx +++ b/apps/demo/src/App.tsx @@ -47,6 +47,8 @@ export default function App() { setLineageData(lineageResult); }, []); + useEffect(() => console.log(lineageData), [lineageData]); + return (
{/* Header */} @@ -94,7 +96,7 @@ export default function App() { - + Data Lineage Graph diff --git a/packages/lineage/src/index.ts b/packages/lineage/src/index.ts index 8764519..1ac2b82 100644 --- a/packages/lineage/src/index.ts +++ b/packages/lineage/src/index.ts @@ -1,7 +1,3 @@ -// ============================================================================ -// Imports -// ============================================================================ - import { type ColumnLineageDatasetFacet, type InputField, @@ -23,10 +19,6 @@ import { } from "node-sql-parser"; import { HashSet } from "./hashset"; -// ============================================================================ -// Types -// ============================================================================ - type Transformation = Exclude<_Transformation, "masking"> & { masking: boolean; // output boolean only for easier testing }; @@ -48,10 +40,6 @@ const MASKING_FUNCTIONS = new Set([ "REDACT", ]); -// ============================================================================ -// Transformation Constants -// ============================================================================ - // Direct transformation constants export const DIRECT_TRANSFORMATION: Transformation = { type: "DIRECT", @@ -153,10 +141,6 @@ class TransformationSet extends HashSet { } } -// ============================================================================ -// Exported Types -// ============================================================================ - export type Column = { name: string; }; @@ -194,11 +178,7 @@ export type SetOperation = "union" | "union all" | "intersect" | "intersect all" /** * Extended lineage result that includes both field-level and dataset-level lineage */ -export type ExtendedLineageResult = Pick - -// ============================================================================ -// Column Name Utilities -// ============================================================================ +export type ExtendedLineageResult = Pick; function isColumn(selectColumn: Select["columns"][number]): selectColumn is AstColumn { return ( @@ -430,10 +410,6 @@ function extractWindowExpressionsFromOver(over: OverClause): ExpressionValue[] { return expressions; } -// ============================================================================ -// Direct Transformation Extraction -// ============================================================================ - /** * Get transformations from expression, supporting CASE/IF for CONDITION subtype */ @@ -525,6 +501,7 @@ function getDirectTransformationsFromExprValue( arg, mergeTransformations(parentTransformation, { ...DIRECT_TRANSFORMATION, + // TODO - copilot edits masking: funcExpr.name.name.length > 0 && MASKING_FUNCTIONS.has(funcExpr.name.name.at(-1)!.value.toUpperCase()), }), @@ -632,10 +609,6 @@ function getIndirectTransformationsFromExpr( return result; } -// ============================================================================ -// Indirect Lineage Extraction Helpers -// ============================================================================ - /** * Resolves a column reference to an InputField by finding the matching table in namespace. * This is the core helper that eliminates repetitive table lookup logic. @@ -712,10 +685,6 @@ function extractInputFieldsFromExpressions( ); } -// ============================================================================ -// Clause-Specific Lineage Extractors -// ============================================================================ - /** * Extract JOIN lineage from FROM clause (ON and USING conditions) */ @@ -917,10 +886,6 @@ function getHavingLineage(select: Select, namespace: Namespace): InputField[] { ); } -// ============================================================================ -// Table Expression Helpers -// ============================================================================ - function getTableExpressionsFromSelect(select: Select): { regularTables: BaseFrom[]; selectTables: SelectWithAlias[]; @@ -1071,10 +1036,6 @@ function expandStarColumn(column: AstColumn, select: Select, namespace: Namespac return expandedColumns; } -// ============================================================================ -// Set Operation Helpers -// ============================================================================ - /** * Check if a SELECT has set operations (UNION, INTERSECT, EXCEPT) */ @@ -1257,12 +1218,7 @@ function getLineageForSingleSelect(select: Select, namespace: Namespace): Column if (!outputFieldName) { outputFieldName = `unknown_${unknownCount++}`; } - acc = { - ...acc, - [outputFieldName]: { - inputFields: getColumnLineage(select, namespace, expandedCol), - }, - }; + acc[outputFieldName] = { inputFields: getColumnLineage(select, namespace, expandedCol) }; }); return acc; @@ -1274,12 +1230,8 @@ function getLineageForSingleSelect(select: Select, namespace: Namespace): Column outputFieldName = `unknown_${unknownCount++}`; } - return { - ...acc, - [outputFieldName]: { - inputFields: getColumnLineage(select, namespace, column), - }, - }; + acc[outputFieldName] = { inputFields: getColumnLineage(select, namespace, column) }; + return acc; }, {} as ColumnLineageDatasetFacet["fields"], ); @@ -1289,16 +1241,15 @@ function getLineageForSingleSelect(select: Select, namespace: Namespace): Column * Merge input fields from multiple sources, deduplicating by field identity */ function mergeInputFields(existing: InputField[], incoming: InputField[]): InputField[] { - const hasher = (value: InputField) => { + const hashset = new HashSet((value: InputField) => { const transformationsString = value.transformations?.map((t) => transformationHasher(t as Transformation)).join("-") ?? ""; return `${value.namespace}-${value.name}-${value.field}-${transformationsString}`; - }; - const mergedMap = new Map(existing.map((field) => [hasher(field), field] as const, {})); - - incoming.forEach((field) => mergedMap.set(hasher(field), field)); + }); + existing.forEach((field) => hashset.add(field)); + incoming.forEach((field) => hashset.add(field)); - return [...mergedMap.values()]; + return [...hashset.values()]; } /** From 9ce3df13f370cdef6002c67d58a834c1734ab82c Mon Sep 17 00:00:00 2001 From: its-elad <59926027+its-elad@users.noreply.github.com> Date: Tue, 3 Feb 2026 08:39:51 +0200 Subject: [PATCH 08/10] feat: added documentation --- packages/lineage/src/index.ts | 393 +++++++++++++++++++++++++--------- 1 file changed, 290 insertions(+), 103 deletions(-) diff --git a/packages/lineage/src/index.ts b/packages/lineage/src/index.ts index 1ac2b82..f4cf21f 100644 --- a/packages/lineage/src/index.ts +++ b/packages/lineage/src/index.ts @@ -96,6 +96,19 @@ export const INDIRECT_CONDITION: Transformation = { masking: false, }; +/** + * Merges two transformations, combining their properties based on precedence rules. + * + * Precedence rules: + * - If types differ (DIRECT vs INDIRECT), keeps the child transformation + * - For DIRECT types: AGGREGATION > TRANSFORMATION > IDENTITY + * - For INDIRECT types: prefers the child (more recent context) + * - Masking is OR'd together (if either is masked, result is masked) + * + * @param parent - The parent/outer transformation (may be undefined) + * @param child - The child/inner transformation to merge + * @returns The merged transformation with combined properties + */ function mergeTransformations(parent: Transformation | undefined, child: Transformation): Transformation { if (!parent) { return child; @@ -104,8 +117,8 @@ function mergeTransformations(parent: Transformation | undefined, child: Transfo // If types differ, prefer the more specific one // INDIRECT is generally more specific than DIRECT for the same column if (parent.type !== child.type) { - // Keep the child transformation but merge masking - return { ...child, masking: parent.masking || child.masking }; + let leading: Transformation = child.type === "INDIRECT" ? child : parent; + return { ...leading, masking: parent.masking || child.masking }; } if (child.type === "DIRECT" && parent.type === "DIRECT") { @@ -131,6 +144,7 @@ function mergeTransformations(parent: Transformation | undefined, child: Transfo const transformationHasher = (value: Transformation): string => `${value.type}-${value.subtype}-${value.masking ? "MASKED" : "UNMASKED"}`; + class TransformationSet extends HashSet { constructor(values?: readonly Transformation[]) { super((value: Transformation) => transformationHasher(value)); @@ -141,6 +155,12 @@ class TransformationSet extends HashSet { } } +/** + * Unified column lineage result that contains both direct and indirect transformations + * per column reference. Used internally to collect all transformations for columns. + */ +type ColumnTransformations = Record; + export type Column = { name: string; }; @@ -192,7 +212,7 @@ function isColumn(selectColumn: Select["columns"][number]): selectColumn is AstC } /** - * Check if a column expression is a star (wildcard) expression like * or table.* + * Checks if a column expression is a star (wildcard) expression like `*` or `table.*`. */ function isStar(column: AstColumn): boolean { if (column.expr.type !== "column_ref") return false; @@ -201,8 +221,9 @@ function isStar(column: AstColumn): boolean { } /** - * Get the table qualifier from a star expression (e.g., "u" from "u.*") - * Returns null if there's no table qualifier (plain "*") + * Extracts the table qualifier from a star expression. + * @returns The table alias/name (e.g., "u" from "u.*"), or null for plain "*" + * @calls isStar - To verify the column is a star expression */ function getStarTableQualifier(column: AstColumn): string | null { if (!isStar(column)) return null; @@ -211,10 +232,19 @@ function getStarTableQualifier(column: AstColumn): string | null { return typeof colRef.table === "string" ? colRef.table : (colRef.table as { type: string; value: string }).value; } +/** + * Formats a column reference into a string identifier. + * @returns Formatted string like "table.column" or just "column" if no table qualifier + * @calls getInputColumnName - To extract the column name from the reference + */ export function formatInputColumnName(column: ColumnRefItem): string { return `${column.table ? `${column.table}.` : ""}${getInputColumnName(column)}`; } +/** + * Parses a formatted column name string back into its components. + * @returns InputColumn object with name and optional table properties + */ export function parseInputColumnName(column: string): InputColumn { const parts = column.split("."); const name = parts.pop() || ""; @@ -224,7 +254,8 @@ export function parseInputColumnName(column: string): InputColumn { } /** - * Parse a fully qualified table name (schemaName.tableName) into its parts + * Parses a fully qualified table name into schema and table components. + * @returns Object with schema (empty string if not specified) and table name */ export function parseTableName(tableName: string): { schema: string; table: string } { const parts = tableName.split("."); @@ -235,8 +266,11 @@ export function parseTableName(tableName: string): { schema: string; table: stri } /** - * Check if an AST table reference matches a schema table - * Takes into account the db property from AST and the defaultSchema from namespace + * Checks if an AST table reference matches a schema table definition. + * Handles schema resolution including default schema fallback. + * @returns True if the AST table matches the schema table + * + * @calls parseTableName - To parse the schema table name into components */ function astTableMatchesSchemaTable(astTable: BaseFrom, schemaTableName: string, defaultSchema?: string): boolean { const parsed = parseTableName(schemaTableName); @@ -255,6 +289,11 @@ function astTableMatchesSchemaTable(astTable: BaseFrom, schemaTableName: string, return parsed.table === astTable.table; } +/** + * Extracts the column name from a ColumnRefItem AST node. + * Handles both simple string columns and complex expression columns. + * @returns The column name string, or null if it cannot be extracted + */ export function getInputColumnName(column: ColumnRefItem): string | null { return typeof column.column === "string" ? column.column @@ -263,6 +302,16 @@ export function getInputColumnName(column: ColumnRefItem): string | null { : null; } +/** + * Determines the output column name for a SELECT column. + * Uses the alias if present, otherwise extracts from the column reference. + * @returns The output column name (alias or original name), or null if undetermined + * @calls getInputColumnName - When no alias is present and expr is a column_ref + * @example + * // For "SELECT id AS user_id" returns "user_id" + * // For "SELECT id" returns "id" + * // For "SELECT 1 + 1" returns null (no determinable name) + */ export function getOutputColumnName(column: AstColumn): string | null { if (column.as) { return typeof column.as === "string" ? column.as : column.as.value; @@ -274,14 +323,16 @@ export function getOutputColumnName(column: AstColumn): string | null { } /** - * Extract column references from any expression value + * Recursively extracts all column references from any SQL expression. + * This is the unified function for finding all columns referenced in expressions. + * @returns Array of ColumnRefItem objects found in the expression + * @calls extractColumnRefs - Recursively for nested expressions */ -function extractColumnRefs(expr: ExpressionValue | null | undefined): ColumnRefItem[] { +export function extractColumnRefs(expr: ExpressionValue | null | undefined): ColumnRefItem[] { if (!expr) return []; const refs: ColumnRefItem[] = []; - // TODO - why not "exp" ? switch (expr.type) { case "column_ref": refs.push(expr as ColumnRefItem); @@ -359,10 +410,6 @@ function extractColumnRefs(expr: ExpressionValue | null | undefined): ColumnRefI return refs; } -// ============================================================================ -// Window Function Helpers (needed by both field-level and dataset-level lineage) -// ============================================================================ - /** * Type for OVER clause structure (shared between aggr_func and function types) */ @@ -380,8 +427,9 @@ type OverClause = { }; /** - * Extract expressions from an OVER clause object (PARTITION BY and ORDER BY) - * This is a helper used by both field-level and dataset-level lineage extraction. + * Extracts PARTITION BY and ORDER BY expressions from an OVER clause. + * Handles both direct structure and nested Trino parser output structure. + * @returns Array of expressions from PARTITION BY and ORDER BY clauses */ function extractWindowExpressionsFromOver(over: OverClause): ExpressionValue[] { const expressions: ExpressionValue[] = []; @@ -411,12 +459,31 @@ function extractWindowExpressionsFromOver(over: OverClause): ExpressionValue[] { } /** - * Get transformations from expression, supporting CASE/IF for CONDITION subtype + * Core unified function for extracting column transformations from any SQL expression. + * Returns a map of column names to their transformation sets. + * + * This function handles: + * - column_ref: Returns DIRECT/IDENTITY transformation + * - binary_expr: Returns DIRECT/TRANSFORMATION for both operands + * - aggr_func: Returns DIRECT/AGGREGATION (with masking for COUNT) + * - function: Returns DIRECT/TRANSFORMATION (with masking for hash functions) + * - case: Returns INDIRECT/CONDITION for conditions, DIRECT/IDENTITY for results + * - cast/interval: Returns DIRECT/TRANSFORMATION + * + * @param expr - The expression to extract transformations from + * @param parentTransformation - Optional parent transformation to merge with child transformations + * @returns Map of column names (e.g., "table.column") to their TransformationSet + * + * @calls formatInputColumnName - To format column references as keys + * @calls mergeTransformations - To combine parent and child transformations + * @calls extractWindowExpressionsFromOver - For window function OVER clauses + * @calls extractTransformationsWithType - For CASE condition columns + * @calls extractTransformationsFromExpr - Recursively for nested expressions */ -function getDirectTransformationsFromExprValue( +function extractTransformationsFromExpr( expr: ExpressionValue, parentTransformation?: Transformation, -): Record { +): ColumnTransformations { switch (expr.type) { case "column_ref": { const inputColumnName = formatInputColumnName(expr as ColumnRefItem); @@ -431,16 +498,16 @@ function getDirectTransformationsFromExprValue( case "binary_expr": { const { left, right } = expr as Binary; - const merged: Record = {}; + const merged: ColumnTransformations = {}; Object.entries( - getDirectTransformationsFromExprValue(left, mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION)), + extractTransformationsFromExpr(left, mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION)), ).forEach(([key, value]) => { merged[key] = value; }); Object.entries( - getDirectTransformationsFromExprValue(right, mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION)), + extractTransformationsFromExpr(right, mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION)), ).forEach(([key, value]) => { const prev = merged[key]; @@ -457,11 +524,11 @@ function getDirectTransformationsFromExprValue( case "aggr_func": { const aggExpr = expr as AggrFunc; - const merged: Record = {}; + const merged: ColumnTransformations = {}; // Extract lineage from aggregate function arguments if (aggExpr.args?.expr) { - const argTransformations = getDirectTransformationsFromExprValue( + const argTransformations = extractTransformationsFromExpr( aggExpr.args.expr, mergeTransformations(parentTransformation, { ...DIRECT_AGGREGATION, @@ -477,9 +544,9 @@ function getDirectTransformationsFromExprValue( if ("over" in aggExpr && aggExpr.over) { const windowExprs = extractWindowExpressionsFromOver(aggExpr.over); for (const windowExpr of windowExprs) { - const windowTransformations = getDirectTransformationsFromExprValue( + const windowTransformations = extractTransformationsFromExpr( windowExpr, - mergeTransformations(parentTransformation, DIRECT_AGGREGATION), + mergeTransformations(parentTransformation, INDIRECT_WINDOW), ); Object.entries(windowTransformations).forEach(([key, value]) => { merged[key] = merged[key] ? merged[key].union(value) : value; @@ -492,16 +559,15 @@ function getDirectTransformationsFromExprValue( case "function": { const funcExpr = expr as AstFunction; - const merged: Record = {}; + const merged: ColumnTransformations = {}; // Extract lineage from function arguments if (funcExpr.args?.value) { for (const arg of funcExpr.args.value) { - const argTransformations = getDirectTransformationsFromExprValue( + const argTransformations = extractTransformationsFromExpr( arg, mergeTransformations(parentTransformation, { ...DIRECT_TRANSFORMATION, - // TODO - copilot edits masking: funcExpr.name.name.length > 0 && MASKING_FUNCTIONS.has(funcExpr.name.name.at(-1)!.value.toUpperCase()), }), @@ -517,9 +583,9 @@ function getDirectTransformationsFromExprValue( if ("over" in funcExpr && funcExpr.over) { const windowExprs = extractWindowExpressionsFromOver((funcExpr as AstFunction & { over: OverClause }).over); for (const windowExpr of windowExprs) { - const windowTransformations = getDirectTransformationsFromExprValue( + const windowTransformations = extractTransformationsFromExpr( windowExpr, - mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION), + mergeTransformations(parentTransformation, INDIRECT_WINDOW), ); Object.entries(windowTransformations).forEach(([key, value]) => { merged[key] = merged[key] ? merged[key].union(value) : value; @@ -532,21 +598,21 @@ function getDirectTransformationsFromExprValue( case "case": { const caseExpr = expr as Case; - const merged: Record = {}; + const merged: ColumnTransformations = {}; if (caseExpr.args) { for (const arg of caseExpr.args) { - // Condition columns get INDIRECT/CONDITION + // Condition columns get INDIRECT/CONDITION (per-column indirect transformation) if (arg.type === "when" && arg.cond) { - const condTransformations = getIndirectTransformationsFromExpr(arg.cond, INDIRECT_CONDITION); + const condTransformations = extractTransformationsWithType(arg.cond, INDIRECT_CONDITION); Object.entries(condTransformations).forEach(([key, value]) => { merged[key] = merged[key] ? merged[key].union(value) : value; }); } - // Result columns get DIRECT/TRANSFORMATION (value is transformed through CASE) + // Result columns get DIRECT/IDENTITY (value is taken from CASE result) if (arg.result) { - const resultTransformations = getDirectTransformationsFromExprValue( + const resultTransformations = extractTransformationsFromExpr( arg.result, mergeTransformations(parentTransformation, DIRECT_IDENTITY), ); @@ -563,7 +629,7 @@ function getDirectTransformationsFromExprValue( case "cast": { const castExpr = expr as Cast; if (castExpr.expr) { - return getDirectTransformationsFromExprValue( + return extractTransformationsFromExpr( castExpr.expr, mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION), ); @@ -574,7 +640,7 @@ function getDirectTransformationsFromExprValue( case "interval": { const intervalExpr = expr as Interval; if (intervalExpr.expr) { - return getDirectTransformationsFromExprValue( + return extractTransformationsFromExpr( intervalExpr.expr, mergeTransformations(parentTransformation, DIRECT_TRANSFORMATION), ); @@ -588,16 +654,24 @@ function getDirectTransformationsFromExprValue( } /** - * Get indirect transformations from an expression with a specific transformation type + * Extracts column references and applies a uniform transformation type to all. + * Simpler than extractTransformationsFromExpr - doesn't analyze expression structure. + * + * Used for dataset-level indirect transformations where all columns in an expression + * receive the same transformation type (e.g., all columns in WHERE get FILTER). + * @returns Map of column names to TransformationSet containing the single transformation + * + * @calls extractColumnRefs - To find all column references in the expression + * @calls formatInputColumnName - To format column references as keys */ -function getIndirectTransformationsFromExpr( +function extractTransformationsWithType( expr: ExpressionValue | null | undefined, transformation: Transformation, -): Record { +): ColumnTransformations { if (!expr) return {}; const columnRefs = extractColumnRefs(expr); - const result: Record = {}; + const result: ColumnTransformations = {}; for (const ref of columnRefs) { const columnName = formatInputColumnName(ref); @@ -611,7 +685,11 @@ function getIndirectTransformationsFromExpr( /** * Resolves a column reference to an InputField by finding the matching table in namespace. - * This is the core helper that eliminates repetitive table lookup logic. + * This is the core helper for converting AST column refs to OpenLineage InputField format. + * @returns InputField object if table found, null otherwise + * + * @calls getInputColumnName - To extract the column name + * @calls astTableMatchesSchemaTable - To match AST table to namespace table */ function resolveColumnRefToInputField( ref: ColumnRefItem, @@ -647,8 +725,12 @@ function resolveColumnRefToInputField( } /** - * Extracts InputFields from column references in an expression. - * Common pattern used by WHERE, HAVING, GROUP BY, ORDER BY, etc. + * Extracts InputFields from all column references in an expression. + * Used by dataset-level lineage extraction (WHERE, HAVING, GROUP BY, ORDER BY, etc.). + * @returns Array of InputField objects for columns that could be resolved + * + * @calls extractColumnRefs - To find all column references + * @calls resolveColumnRefToInputField - To convert each ref to InputField */ function extractInputFieldsFromExpression( expr: ExpressionValue | null | undefined, @@ -672,21 +754,13 @@ function extractInputFieldsFromExpression( } /** - * Extracts InputFields from multiple expressions. - */ -function extractInputFieldsFromExpressions( - expressions: (ExpressionValue | null | undefined)[], - regularTables: BaseFrom[], - namespace: Namespace, - transformation: Transformation, -): InputField[] { - return expressions.flatMap((expr) => - extractInputFieldsFromExpression(expr, regularTables, namespace, transformation), - ); -} - -/** - * Extract JOIN lineage from FROM clause (ON and USING conditions) + * Extracts JOIN lineage from the FROM clause (ON and USING conditions). + * All columns in JOIN conditions receive INDIRECT/JOIN transformation. + * @returns Array of InputFields for columns used in JOIN conditions + * + * @calls getTableExpressionsFromSelect - To get regular tables from FROM + * @calls extractInputFieldsFromExpression - For ON clause columns + * @calls astTableMatchesSchemaTable - For USING clause table matching */ function getJoinLineage(select: Select, namespace: Namespace): InputField[] { if (!select.from) return []; @@ -699,9 +773,7 @@ function getJoinLineage(select: Select, namespace: Namespace): InputField[] { for (const item of fromItems) { // Handle ON clause if ("on" in item && item.on) { - inputFields.push( - ...extractInputFieldsFromExpression(item.on as ExpressionValue, regularTables, namespace, INDIRECT_JOIN), - ); + inputFields.push(...extractInputFieldsFromExpression(item.on, regularTables, namespace, INDIRECT_JOIN)); } // Handle USING clause - columns exist in multiple tables @@ -729,17 +801,28 @@ function getJoinLineage(select: Select, namespace: Namespace): InputField[] { } /** - * Extract WHERE clause lineage (FILTER) + * Extracts WHERE clause lineage. + * All columns in WHERE conditions receive INDIRECT/FILTER transformation. + * @returns Array of InputFields for columns used in WHERE clause + * + * @calls getTableExpressionsFromSelect - To get regular tables from FROM + * @calls extractInputFieldsFromExpression - To extract columns with FILTER transformation */ function getFilterLineage(select: Select, namespace: Namespace): InputField[] { if (!select.where) return []; const { regularTables } = getTableExpressionsFromSelect(select); - return extractInputFieldsFromExpression(select.where as ExpressionValue, regularTables, namespace, INDIRECT_FILTER); + return extractInputFieldsFromExpression(select.where, regularTables, namespace, INDIRECT_FILTER); } /** - * Extract GROUP BY lineage + * Extracts GROUP BY clause lineage. + * All columns in GROUP BY receive INDIRECT/GROUP_BY transformation. + * @returns Array of InputFields for columns used in GROUP BY clause + * + * @calls normalizeGroupByItems - To handle different GROUP BY AST formats + * @calls getTableExpressionsFromSelect - To get regular tables from FROM + * @calls extractInputFieldsFromExpression - To extract columns with GROUP_BY transformation */ function getGroupByLineage(select: Select, namespace: Namespace): InputField[] { if (!select.groupby) return []; @@ -748,11 +831,15 @@ function getGroupByLineage(select: Select, namespace: Namespace): InputField[] { const groupByItems = normalizeGroupByItems(select.groupby); const { regularTables } = getTableExpressionsFromSelect(select); - return extractInputFieldsFromExpressions(groupByItems, regularTables, namespace, INDIRECT_GROUP_BY); + return groupByItems.flatMap((expr) => + extractInputFieldsFromExpression(expr, regularTables, namespace, INDIRECT_GROUP_BY), + ); } /** - * Normalize GROUP BY clause to array of ExpressionValue + * Normalizes GROUP BY clause to a consistent array format. + * Handles different AST representations from various SQL parsers. + * @returns Array of ExpressionValue for each GROUP BY item */ function normalizeGroupByItems(groupby: Select["groupby"]): ExpressionValue[] { if (Array.isArray(groupby)) { @@ -765,8 +852,12 @@ function normalizeGroupByItems(groupby: Select["groupby"]): ExpressionValue[] { } /** - * Build a map of output aliases to their source expressions from SELECT columns. - * This allows ORDER BY alias resolution. + * Builds a map of output column aliases to their source expressions. + * Used to resolve ORDER BY alias references to their underlying columns. + * @returns Map where keys are output aliases, values are the source expressions + * + * @calls isColumn - To filter valid columns + * @calls getOutputColumnName - To get the alias/output name */ function buildAliasToExpressionMap(select: Select): Map { const aliasMap = new Map(); @@ -788,8 +879,11 @@ function buildAliasToExpressionMap(select: Select): Map } /** - * Resolve an ORDER BY expression to its underlying column references. - * If the expression is a column reference that matches an alias, resolve it to the aliased expression. + * Resolves an ORDER BY expression to its underlying column reference. + * If the expression is an alias (unqualified column_ref matching an alias), returns the aliased expression. + * @returns The resolved expression (original if not an alias, or the aliased expression) + * + * @calls getInputColumnName - To extract column name from column_ref */ function resolveOrderByExpression(expr: ExpressionValue, aliasMap: Map): ExpressionValue { // If it's a column_ref, check if it's an alias @@ -807,8 +901,15 @@ function resolveOrderByExpression(expr: ExpressionValue, aliasMap: Map + extractInputFieldsFromExpression(expr, regularTables, namespace, INDIRECT_WINDOW), + ), + ); } return inputFields; } /** - * Extract expressions from OVER clause in an expression (PARTITION BY and ORDER BY) - * Handles the parser structure: over.as_window_specification.window_specification.{partitionby,orderby} - * Supports both aggr_func (e.g., SUM() OVER) and function types (e.g., ROW_NUMBER() OVER) + * Extracts PARTITION BY and ORDER BY expressions from a window function expression. + * Supports both aggr_func (SUM() OVER) and function types (ROW_NUMBER() OVER). + * @returns Array of expressions from the OVER clause, empty if not a window function + * + * @calls extractWindowExpressionsFromOver - To parse the OVER clause structure */ function extractWindowExpressions(expr: ExpressionValue): ExpressionValue[] { // Support both aggr_func and function types with OVER clause @@ -872,7 +986,12 @@ function extractWindowExpressions(expr: ExpressionValue): ExpressionValue[] { } /** - * Extract HAVING clause lineage (FILTER in aggregation context) + * Extracts HAVING clause lineage. + * All columns in HAVING conditions receive INDIRECT/FILTER transformation. + * @returns Array of InputFields for columns used in HAVING clause + * + * @calls getTableExpressionsFromSelect - To get regular tables from FROM + * @calls extractInputFieldsFromExpression - To extract columns with FILTER transformation */ function getHavingLineage(select: Select, namespace: Namespace): InputField[] { if (!select.having) return []; @@ -886,6 +1005,13 @@ function getHavingLineage(select: Select, namespace: Namespace): InputField[] { ); } +/** + * Extracts and categorizes table expressions from a SELECT statement. + * Separates regular tables (physical tables) from select tables (CTEs, subqueries). + * @returns Object with: + * - regularTables: Physical tables from namespace + * - selectTables: CTEs and subqueries (as SelectWithAlias) + */ function getTableExpressionsFromSelect(select: Select): { regularTables: BaseFrom[]; selectTables: SelectWithAlias[]; @@ -939,6 +1065,13 @@ function getTableExpressionsFromSelect(select: Select): { return { regularTables, selectTables }; } +/** + * Merges two TransformationSets by combining each parent transformation with each child. + * Creates a Cartesian product of transformations, merging each pair. + * @returns New TransformationSet with all merged combinations + * + * @calls mergeTransformations - To merge each parent-child pair + */ function mergeTransformationSet(parent: TransformationSet, child: TransformationSet): TransformationSet { const merged = new TransformationSet(); @@ -952,9 +1085,16 @@ function mergeTransformationSet(parent: TransformationSet, child: Transformation } /** - * Expand a star column into individual columns based on namespace and FROM clause. - * For "*", returns all columns from all tables in FROM clause. - * For "table.*", returns all columns from that specific table. + * Expands a star (wildcard) column into individual column entries. + * Handles both "*" (all tables) and "table.*" (specific table) patterns. + * @returns Array of AstColumn entries for each expanded column + * + * @calls isStar - To verify it's a star column + * @calls getStarTableQualifier - To get table qualifier if present + * @calls getTableExpressionsFromSelect - To get tables from FROM clause + * @calls astTableMatchesSchemaTable - To match tables to namespace + * @calls expandStarColumn - Recursively for nested star expressions in subqueries + * @calls getOutputColumnName - To get column names from subquery columns */ function expandStarColumn(column: AstColumn, select: Select, namespace: Namespace): AstColumn[] { if (!isStar(column)) return [column]; @@ -1037,16 +1177,19 @@ function expandStarColumn(column: AstColumn, select: Select, namespace: Namespac } /** - * Check if a SELECT has set operations (UNION, INTERSECT, EXCEPT) + * Type guard to check if a SELECT has set operations (UNION, INTERSECT, EXCEPT). + * @returns True if the SELECT has a set_op property with a value */ function hasSetOperation(select: Select): select is Select { return "set_op" in select && select.set_op != null; } /** - * Get all SELECT statements in a set operation chain. - * Returns an array of SELECT statements, where the first element is the base select - * and subsequent elements are the _next selects in the chain. + * Collects all SELECT statements in a set operation chain. + * Follows the _next chain for UNION/INTERSECT/EXCEPT operations. + * @returns Array of SELECT statements, first element is the base select + * + * @calls hasSetOperation - To check if there are more SELECTs in the chain */ function getSetOperationSelects(select: Select): Select[] { const selects: Select[] = [select]; @@ -1062,17 +1205,30 @@ function getSetOperationSelects(select: Select): Select[] { return selects; } -// ============================================================================ -// Main Lineage Functions -// ============================================================================ - +/** + * Computes field-level lineage for a single output column. + * Traces the column back to its source columns in the namespace tables. + * + * Process: + * 1. Extracts transformations from the column expression + * 2. Merges with any parent transformations (for nested queries) + * 3. Resolves each column reference to InputField via regular tables or recursion into CTEs/subqueries + * @returns Array of InputField objects representing source columns with transformations + * + * @calls extractTransformationsFromExpr - To get column transformations from expression + * @calls mergeTransformationSet - To combine with parent transformations + * @calls getTableExpressionsFromSelect - To separate regular tables from CTEs/subqueries + * @calls parseInputColumnName - To parse column identifiers + * @calls astTableMatchesSchemaTable - To match columns to namespace tables + * @calls getColumnLineage - Recursively for columns from CTEs/subqueries + */ export function getColumnLineage( select: Select, namespace: Namespace, column: AstColumn, transformations?: TransformationSet, ): InputField[] { - let transformationsByColumns = getDirectTransformationsFromExprValue(column.expr); + let transformationsByColumns = extractTransformationsFromExpr(column.expr); if (transformations) { transformationsByColumns = Object.entries(transformationsByColumns).reduce( @@ -1144,7 +1300,14 @@ export function getColumnLineage( } /** - * Get dataset-level indirect lineage for a single SELECT (without following set operations) + * Extracts dataset-level indirect lineage for a single SELECT statement. + * Collects all columns that affect the entire result set through indirect transformations. + * @returns Array of InputFields for all indirect lineage columns + * + * @calls getJoinLineage, getFilterLineage, getGroupByLineage, getOrderByLineage, + * getWindowLineage, getHavingLineage - To collect each type of indirect lineage + * @calls getTableExpressionsFromSelect - To find CTEs/subqueries + * @calls getDatasetLineage - Recursively for CTEs/subqueries */ function getDatasetLineageForSingleSelect(select: Select, namespace: Namespace): InputField[] { const allIndirectFields: InputField[] = []; @@ -1167,9 +1330,12 @@ function getDatasetLineageForSingleSelect(select: Select, namespace: Namespace): } /** - * Get all dataset-level indirect lineage (columns that affect the entire result set) - * This includes lineage from CTEs, subqueries, and set operations (UNION, INTERSECT, EXCEPT) - * that contribute to the final result. + * Computes all dataset-level indirect lineage for a SELECT, including set operations. + * Returns columns that affect the entire result set (not mapped to specific output columns). + * @returns Deduplicated array of InputFields for dataset-level lineage + * + * @calls getSetOperationSelects - To collect all SELECTs in set operation chain + * @calls getDatasetLineageForSingleSelect - To get lineage for each SELECT */ export function getDatasetLineage(select: Select, namespace: Namespace): InputField[] { const allIndirectFields: InputField[] = []; @@ -1194,7 +1360,15 @@ export function getDatasetLineage(select: Select, namespace: Namespace): InputFi } /** - * Get field-level lineage for a single SELECT (without following set operations) + * Computes field-level lineage for a single SELECT (without set operations). + * Maps each output column to its source columns with transformations. + * @returns Object mapping output column names to their FieldLineage (inputFields array) + * + * @calls isColumn - To filter valid columns + * @calls isStar - To detect wildcard columns + * @calls expandStarColumn - To expand * into individual columns + * @calls getOutputColumnName - To determine output column name + * @calls getColumnLineage - To compute lineage for each column */ function getLineageForSingleSelect(select: Select, namespace: Namespace): ColumnLineageDatasetFacet["fields"] { let unknownCount = 0; @@ -1238,7 +1412,11 @@ function getLineageForSingleSelect(select: Select, namespace: Namespace): Column } /** - * Merge input fields from multiple sources, deduplicating by field identity + * Merges and deduplicates InputField arrays. + * Used when combining lineage from multiple sources (e.g., UNION branches). + * @returns Combined array with duplicates removed + * + * @calls transformationHasher - To create unique keys for transformations */ function mergeInputFields(existing: InputField[], incoming: InputField[]): InputField[] { const hashset = new HashSet((value: InputField) => { @@ -1253,9 +1431,17 @@ function mergeInputFields(existing: InputField[], incoming: InputField[]): Input } /** - * Main lineage extraction function - returns field-level lineage only (backward compatible) - * Handles set operations (UNION, INTERSECT, EXCEPT) by merging lineages from all parts. - * Output column names are determined by the first SELECT in the set operation. + * Main field-level lineage extraction function. + * Returns a map of output columns to their source columns with transformations. + * + * Handles set operations (UNION, INTERSECT, EXCEPT) by: + * 1. Using the first SELECT's column names as output names + * 2. Merging lineage from subsequent SELECTs by column position + * @returns Object mapping output column names to FieldLineage objects + * + * @calls getSetOperationSelects - To collect all SELECTs in chain + * @calls getLineageForSingleSelect - To compute lineage for each SELECT + * @calls mergeInputFields - To combine lineage from set operation branches */ export function getLineage(select: Select, namespace: Namespace): ColumnLineageDatasetFacet["fields"] { // Get all SELECT statements in the set operation chain @@ -1296,7 +1482,8 @@ export function getLineage(select: Select, namespace: Namespace): ColumnLineageD } /** - * Extended lineage extraction function - returns both field-level and dataset-level lineage + * Extended lineage extraction returning both field-level and dataset-level lineage. + * Follows the OpenLineage ColumnLineageDatasetFacet specification. */ export function getExtendedLineage( select: Select, From 2acbba034a056ea9eec8276d9ea8a0ffc07a18a7 Mon Sep 17 00:00:00 2001 From: its-elad <59926027+its-elad@users.noreply.github.com> Date: Thu, 5 Feb 2026 14:14:41 +0200 Subject: [PATCH 09/10] fix: normalized tests --- apps/demo/src/App.tsx | 2 - packages/lineage/src/hashset.ts | 2 +- packages/lineage/src/index.ts | 4 +- packages/lineage/test/extendedLineage.test.ts | 2000 ++++++++++++----- packages/lineage/test/index.test.ts | 1660 -------------- 5 files changed, 1482 insertions(+), 2186 deletions(-) delete mode 100644 packages/lineage/test/index.test.ts diff --git a/apps/demo/src/App.tsx b/apps/demo/src/App.tsx index b16e1f4..e1e8b4c 100644 --- a/apps/demo/src/App.tsx +++ b/apps/demo/src/App.tsx @@ -47,8 +47,6 @@ export default function App() { setLineageData(lineageResult); }, []); - useEffect(() => console.log(lineageData), [lineageData]); - return (
{/* Header */} diff --git a/packages/lineage/src/hashset.ts b/packages/lineage/src/hashset.ts index 13b9605..6e03de3 100644 --- a/packages/lineage/src/hashset.ts +++ b/packages/lineage/src/hashset.ts @@ -61,7 +61,7 @@ export class HashSet implements Set { return intersection; } - union(other: Set): HashSet { + union(other: HashSet): HashSet { const union = new HashSet(this.hasher); for (const value of this) { union.add(value); diff --git a/packages/lineage/src/index.ts b/packages/lineage/src/index.ts index f4cf21f..037c3f8 100644 --- a/packages/lineage/src/index.ts +++ b/packages/lineage/src/index.ts @@ -564,12 +564,12 @@ function extractTransformationsFromExpr( // Extract lineage from function arguments if (funcExpr.args?.value) { for (const arg of funcExpr.args.value) { + const maskingExpr = funcExpr.name.name.at(-1)?.value.toUpperCase(); const argTransformations = extractTransformationsFromExpr( arg, mergeTransformations(parentTransformation, { ...DIRECT_TRANSFORMATION, - masking: - funcExpr.name.name.length > 0 && MASKING_FUNCTIONS.has(funcExpr.name.name.at(-1)!.value.toUpperCase()), + masking: !!maskingExpr && MASKING_FUNCTIONS.has(maskingExpr), }), ); Object.entries(argTransformations).forEach(([key, value]) => { diff --git a/packages/lineage/test/extendedLineage.test.ts b/packages/lineage/test/extendedLineage.test.ts index 4ba2ac6..0af092a 100644 --- a/packages/lineage/test/extendedLineage.test.ts +++ b/packages/lineage/test/extendedLineage.test.ts @@ -30,6 +30,7 @@ const USERS_TABLE = createTable(`${DEFAULT_SCHEMA}.users`, [ "region", "verified", "active", + "favorite_product", "created_at", ]); @@ -50,6 +51,13 @@ function parseSQL(sql: string, database: "trino" | "postgresql" = "trino"): AST return ast; } +function parseSQLPostgres(sql: string): AST { + const result = parser.astify(sql, { database: "postgresql" }); + const ast = Array.isArray(result) ? result[0] : result; + if (!ast) throw new Error("Failed to parse SQL"); + return ast; +} + // ============================================================================= // EXACT ASSERTION HELPERS // ============================================================================= @@ -195,8 +203,18 @@ describe("Field-Level Lineage: DIRECT/TRANSFORMATION", () => { sortInputFields({ total: { inputFields: [ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "price", transformations: [DIRECT_TRANSFORMATION] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "tax", transformations: [DIRECT_TRANSFORMATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "price", + transformations: [DIRECT_TRANSFORMATION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "tax", + transformations: [DIRECT_TRANSFORMATION], + }, ], }, }), @@ -240,7 +258,12 @@ describe("Field-Level Lineage: DIRECT/TRANSFORMATION", () => { expect(result.fields).toEqual({ price_str: { inputFields: [ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "price", transformations: [DIRECT_TRANSFORMATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "price", + transformations: [DIRECT_TRANSFORMATION], + }, ], }, }); @@ -272,7 +295,12 @@ describe("Field-Level Lineage: DIRECT/AGGREGATION", () => { expect(result.fields).toEqual({ total: { inputFields: [ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "amount", transformations: [DIRECT_AGGREGATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.transactions`, + field: "amount", + transformations: [DIRECT_AGGREGATION], + }, ], }, }); @@ -287,7 +315,12 @@ describe("Field-Level Lineage: DIRECT/AGGREGATION", () => { expect(result.fields).toEqual({ avg_salary: { inputFields: [ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [DIRECT_AGGREGATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "salary", + transformations: [DIRECT_AGGREGATION], + }, ], }, }); @@ -302,7 +335,12 @@ describe("Field-Level Lineage: DIRECT/AGGREGATION", () => { expect(result.fields).toEqual({ min_price: { inputFields: [ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "price", transformations: [DIRECT_AGGREGATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "price", + transformations: [DIRECT_AGGREGATION], + }, ], }, }); @@ -317,7 +355,12 @@ describe("Field-Level Lineage: DIRECT/AGGREGATION", () => { expect(result.fields).toEqual({ latest: { inputFields: [ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.events`, field: "created_at", transformations: [DIRECT_AGGREGATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.events`, + field: "created_at", + transformations: [DIRECT_AGGREGATION], + }, ], }, }); @@ -373,8 +416,18 @@ describe("Field-Level Lineage: DIRECT/AGGREGATION", () => { sortInputFields({ revenue: { inputFields: [ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "price", transformations: [DIRECT_AGGREGATION] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "quantity", transformations: [DIRECT_AGGREGATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "price", + transformations: [DIRECT_AGGREGATION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }, ], }, }), @@ -457,12 +510,14 @@ describe("Field-Level Lineage: CASE Expressions", () => { // CASE WHEN condition column gets INDIRECT/CONDITION expect(result.fields.status_label).toBeDefined(); - expect(result.fields.status_label?.inputFields).toContainEqual({ - namespace: "ns", - name: USERS_TABLE.name, - field: "status", - transformations: [INDIRECT_CONDITION], - }); + expect(result.fields.status_label?.inputFields).toEqual([ + { + namespace: "ns", + name: USERS_TABLE.name, + field: "status", + transformations: [INDIRECT_CONDITION], + }, + ]); }); test("CASE with column in result", () => { @@ -475,9 +530,26 @@ describe("Field-Level Lineage: CASE Expressions", () => { const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.customers`, ["is_premium", "discount_rate"])]); const result = getExtendedLineage(ast as Select, schema); - const inputFieldNames = result.fields.applied_discount?.inputFields.map((f) => f.field); - expect(inputFieldNames).toContain("is_premium"); - expect(inputFieldNames).toContain("discount_rate"); + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + applied_discount: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.customers`, + field: "is_premium", + transformations: [INDIRECT_CONDITION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.customers`, + field: "discount_rate", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }), + ); }); test("CASE with multiple conditions and results", () => { @@ -496,11 +568,38 @@ describe("Field-Level Lineage: CASE Expressions", () => { ]); const result = getExtendedLineage(ast as Select, schema); - const inputFieldNames = result.fields.ticket_price?.inputFields.map((f) => f.field); - expect(inputFieldNames).toContain("age"); - expect(inputFieldNames).toContain("minor_price"); - expect(inputFieldNames).toContain("adult_price"); - expect(inputFieldNames).toContain("senior_price"); + expect(sortInputFields(result.fields)).toEqual( + sortInputFields({ + ticket_price: { + inputFields: [ + { + field: "age", + name: `${DEFAULT_SCHEMA}.visitors`, + namespace: "ns", + transformations: [INDIRECT_CONDITION], + }, + { + field: "minor_price", + name: `${DEFAULT_SCHEMA}.visitors`, + namespace: "ns", + transformations: [DIRECT_IDENTITY], + }, + { + field: "adult_price", + name: `${DEFAULT_SCHEMA}.visitors`, + namespace: "ns", + transformations: [DIRECT_IDENTITY], + }, + { + field: "senior_price", + name: `${DEFAULT_SCHEMA}.visitors`, + namespace: "ns", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }), + ); }); }); @@ -649,8 +748,12 @@ describe("Dataset-Level Lineage: INDIRECT/JOIN", () => { const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "name", "manager_id"])]); const result = getExtendedLineage(ast as Select, schema); - // Self join should have both columns from same table - expect(result.dataset?.length).toBeGreaterThanOrEqual(2); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "manager_id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "id", transformations: [INDIRECT_JOIN] }, + ]), + ); }); }); @@ -868,7 +971,12 @@ describe("Dataset-Level Lineage: INDIRECT/FILTER (HAVING)", () => { expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "department", transformations: [INDIRECT_GROUP_BY] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "department", + transformations: [INDIRECT_GROUP_BY], + }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [INDIRECT_FILTER] }, ]), ); @@ -912,7 +1020,12 @@ describe("Dataset-Level Lineage: INDIRECT/WINDOW", () => { const result = getExtendedLineage(ast as Select, schema); expect(result.dataset).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "category", transformations: [INDIRECT_WINDOW] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.transactions`, + field: "category", + transformations: [INDIRECT_WINDOW], + }, ]); }); @@ -943,8 +1056,18 @@ describe("Dataset-Level Lineage: INDIRECT/WINDOW", () => { expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "user_id", transformations: [INDIRECT_WINDOW] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "created_at", transformations: [INDIRECT_WINDOW] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.transactions`, + field: "user_id", + transformations: [INDIRECT_WINDOW], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.transactions`, + field: "created_at", + transformations: [INDIRECT_WINDOW], + }, ]), ); }); @@ -983,7 +1106,12 @@ describe("Dataset-Level Lineage: INDIRECT/WINDOW", () => { expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "department", transformations: [INDIRECT_WINDOW] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "department", + transformations: [INDIRECT_WINDOW], + }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [INDIRECT_WINDOW] }, ]), ); @@ -1027,13 +1155,20 @@ describe("Combined Clauses: WHERE + GROUP BY + HAVING", () => { HAVING COUNT(*) > 5 `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "department", "salary", "status"])]); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "department", "salary", "status"]), + ]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "status", transformations: [INDIRECT_FILTER] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "department", transformations: [INDIRECT_GROUP_BY] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "department", + transformations: [INDIRECT_GROUP_BY], + }, // HAVING COUNT(*) doesn't add field lineage since COUNT(*) doesn't reference a column ]), ); @@ -1077,10 +1212,30 @@ describe("Combined Clauses: WINDOW + WHERE + ORDER BY", () => { expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "status", transformations: [INDIRECT_FILTER] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "category", transformations: [INDIRECT_WINDOW] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "created_at", transformations: [INDIRECT_WINDOW] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, field: "created_at", transformations: [INDIRECT_SORT] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.transactions`, + field: "status", + transformations: [INDIRECT_FILTER], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.transactions`, + field: "category", + transformations: [INDIRECT_WINDOW], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.transactions`, + field: "created_at", + transformations: [INDIRECT_WINDOW], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.transactions`, + field: "created_at", + transformations: [INDIRECT_SORT], + }, ]), ); }); @@ -1133,19 +1288,34 @@ describe("CTEs: Basic WITH clause", () => { expect(result.fields).toEqual({ department: { inputFields: [ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "department", transformations: [DIRECT_IDENTITY] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "department", + transformations: [DIRECT_IDENTITY], + }, ], }, total_salary: { inputFields: [ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [DIRECT_AGGREGATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "salary", + transformations: [DIRECT_AGGREGATION], + }, ], }, }); // GROUP BY from CTE should be in dataset lineage expect(result.dataset).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "department", transformations: [INDIRECT_GROUP_BY] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "department", + transformations: [INDIRECT_GROUP_BY], + }, ]); }); }); @@ -1201,7 +1371,9 @@ describe("CTEs: Nested transformations through CTEs", () => { SELECT total_revenue FROM summary `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.sales`, ["id", "quantity", "price", "sale_date"])]); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.sales`, ["id", "quantity", "price", "sale_date"]), + ]); const result = getExtendedLineage(ast as Select, schema); // total_revenue -> SUM(revenue) -> quantity * price @@ -1210,7 +1382,12 @@ describe("CTEs: Nested transformations through CTEs", () => { total_revenue: { inputFields: [ { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "price", transformations: [DIRECT_AGGREGATION] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "quantity", transformations: [DIRECT_AGGREGATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }, ], }, }), @@ -1261,14 +1438,49 @@ describe("Subqueries: FROM clause subquery", () => { // ============================================================================= describe("Set Operations: UNION", () => { - test("UNION merges field lineage from both sides", () => { + test("simple UNION combines lineage from both queries", () => { + const sql = ` + SELECT id, name FROM users + UNION + SELECT id, name FROM customers + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name", "email"]), + createTable("customers", ["id", "name", "address"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + name: { + inputFields: [ + { namespace: "postgres", name: "users", field: "name", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); + }); + + test("UNION with WHERE clauses combines field and dataset lineage", () => { const sql = ` SELECT id, name FROM users WHERE status = 'active' UNION SELECT id, name FROM customers WHERE verified = true `; const ast = parseSQL(sql, "postgresql"); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.customers`, ["id", "name", "verified"])]); + const schema = createNamespace("ns", [ + USERS_TABLE, + createTable(`${DEFAULT_SCHEMA}.customers`, ["id", "name", "verified"]), + ]); const result = getExtendedLineage(ast as Select, schema); // Field lineage combines both sources @@ -1298,73 +1510,101 @@ describe("Set Operations: UNION", () => { ); }); - test("UNION ALL with GROUP BY on both sides", () => { + test("UNION ALL combines lineage from both queries", () => { const sql = ` - SELECT department FROM employees GROUP BY department + SELECT id FROM users UNION ALL - SELECT department FROM contractors GROUP BY department + SELECT id FROM orders `; - const ast = parseSQL(sql, "postgresql"); - const schema = createNamespace("ns", [ - createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "department"]), - createTable(`${DEFAULT_SCHEMA}.contractors`, ["id", "department"]), + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name"]), + createTable("orders", ["id", "product"]), ]); - const result = getExtendedLineage(ast as Select, schema); - expect(sortDataset(result.dataset)).toEqual( - sortDataset([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "department", transformations: [INDIRECT_GROUP_BY] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.contractors`, field: "department", transformations: [INDIRECT_GROUP_BY] }, - ]), + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "orders", field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), ); }); -}); -describe("Set Operations: INTERSECT", () => { - test("INTERSECT with ORDER BY on both sides", () => { + test("UNION ALL with GROUP BY on both sides", () => { const sql = ` - SELECT id FROM active_users ORDER BY created_at - INTERSECT - SELECT id FROM premium_users ORDER BY upgraded_at + SELECT department FROM employees GROUP BY department + UNION ALL + SELECT department FROM contractors GROUP BY department `; const ast = parseSQL(sql, "postgresql"); const schema = createNamespace("ns", [ - createTable(`${DEFAULT_SCHEMA}.active_users`, ["id", "created_at"]), - createTable(`${DEFAULT_SCHEMA}.premium_users`, ["id", "upgraded_at"]), + createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "department"]), + createTable(`${DEFAULT_SCHEMA}.contractors`, ["id", "department"]), ]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.active_users`, field: "created_at", transformations: [INDIRECT_SORT] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.premium_users`, field: "upgraded_at", transformations: [INDIRECT_SORT] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "department", + transformations: [INDIRECT_GROUP_BY], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.contractors`, + field: "department", + transformations: [INDIRECT_GROUP_BY], + }, ]), ); }); -}); -describe("Set Operations: EXCEPT", () => { - test("EXCEPT with WHERE on both sides", () => { + test("chained UNION combines lineage from all queries", () => { const sql = ` - SELECT id FROM users WHERE active = true - EXCEPT - SELECT id FROM banned_users WHERE ban_date > '2024-01-01' + SELECT id, name FROM users + UNION + SELECT id, name FROM customers + UNION + SELECT id, name FROM vendors `; - const ast = parseSQL(sql, "postgresql"); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.banned_users`, ["id", "ban_date"])]); - const result = getExtendedLineage(ast as Select, schema); + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name"]), + createTable("customers", ["id", "name"]), + createTable("vendors", ["id", "name"]), + ]); - expect(sortDataset(result.dataset)).toEqual( - sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "active", transformations: [INDIRECT_FILTER] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.banned_users`, field: "ban_date", transformations: [INDIRECT_FILTER] }, - ]), + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "vendors", field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + name: { + inputFields: [ + { namespace: "postgres", name: "users", field: "name", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "name", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "vendors", field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), ); }); -}); -describe("Set Operations: Chained", () => { - test("triple UNION with different clauses", () => { + test("triple UNION with WHERE clauses", () => { const sql = ` SELECT id FROM users WHERE region = 'US' UNION @@ -1388,73 +1628,370 @@ describe("Set Operations: Chained", () => { ]), ); }); -}); -// ============================================================================= -// SECTION 7: STAR (*) EXPANSION -// ============================================================================= + test("UNION with aliases preserves first SELECT column names", () => { + const sql = ` + SELECT id AS user_id, name AS full_name FROM users + UNION + SELECT customer_id, customer_name FROM customers + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name"]), + createTable("customers", ["customer_id", "customer_name"]), + ]); -describe("Star Expansion", () => { - test("SELECT * expands to all columns", () => { - const sql = `SELECT * FROM users`; - const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); - const result = getExtendedLineage(ast as Select, schema); + const lineage = getExtendedLineage(ast as Select, schema); - expect(result.fields).toEqual( - USERS_TABLE.columns.reduce( - (acc, col) => { - acc[col] = { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: col, transformations: [DIRECT_IDENTITY] }], - }; - return acc; + // Output columns should be named according to the first SELECT + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + user_id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "customer_id", transformations: [DIRECT_IDENTITY] }, + ], }, - {} as Record, - ), + full_name: { + inputFields: [ + { namespace: "postgres", name: "users", field: "name", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "customer_name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), ); }); - test("table.* with multiple tables", () => { - const sql = `SELECT u.*, o.total FROM users u JOIN orders o ON u.id = o.user_id`; - const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); - const result = getExtendedLineage(ast as Select, schema); - - expect(result.fields.id?.inputFields).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }, - ]); - expect(result.fields.name?.inputFields).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }, - ]); - expect(result.fields.total?.inputFields).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "total", transformations: [DIRECT_IDENTITY] }, - ]); - }); -}); - -// ============================================================================= -// SECTION 8: EDGE CASES -// ============================================================================= - -describe("Edge Cases", () => { - test("same column in multiple contexts", () => { + test("UNION with transformations", () => { const sql = ` - SELECT status, COUNT(*) as cnt - FROM users - WHERE status != 'deleted' - GROUP BY status - ORDER BY status + SELECT UPPER(name) AS name FROM users + UNION + SELECT LOWER(name) AS name FROM customers `; - const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); - const result = getExtendedLineage(ast as Select, schema); - - // Field lineage - expect(result.fields.status?.inputFields).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [DIRECT_IDENTITY] }, + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name"]), + createTable("customers", ["id", "name"]), ]); - // Dataset lineage should have all three subtypes + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + name: { + inputFields: [ + { namespace: "postgres", name: "users", field: "name", transformations: [DIRECT_TRANSFORMATION] }, + { namespace: "postgres", name: "customers", field: "name", transformations: [DIRECT_TRANSFORMATION] }, + ], + }, + }), + ); + }); + + test("UNION with aggregation", () => { + const sql = ` + SELECT SUM(amount) AS total FROM sales + UNION + SELECT SUM(amount) AS total FROM refunds + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("sales", ["id", "amount"]), + createTable("refunds", ["id", "amount"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + total: { + inputFields: [ + { namespace: "postgres", name: "sales", field: "amount", transformations: [DIRECT_AGGREGATION] }, + { namespace: "postgres", name: "refunds", field: "amount", transformations: [DIRECT_AGGREGATION] }, + ], + }, + }), + ); + }); + + test("UNION with different column expressions", () => { + const sql = ` + SELECT id, first_name || ' ' || last_name AS full_name FROM users + UNION + SELECT id, company_name FROM customers + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "first_name", "last_name"]), + createTable("customers", ["id", "company_name"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + full_name: { + inputFields: [ + { namespace: "postgres", name: "users", field: "first_name", transformations: [DIRECT_TRANSFORMATION] }, + { namespace: "postgres", name: "users", field: "last_name", transformations: [DIRECT_TRANSFORMATION] }, + { namespace: "postgres", name: "customers", field: "company_name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); + }); + + test("UNION with subqueries", () => { + const sql = ` + SELECT id FROM (SELECT id FROM users WHERE active = true) AS active_users + UNION + SELECT id FROM (SELECT id FROM customers WHERE verified = true) AS verified_customers + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "active"]), + createTable("customers", ["id", "verified"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "customers", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "users", field: "active", transformations: [INDIRECT_FILTER] }, + { namespace: "postgres", name: "customers", field: "verified", transformations: [INDIRECT_FILTER] }, + ], + }, + }), + ); + }); + + test("UNION deduplicates identical input fields", () => { + const sql = ` + SELECT id FROM users + UNION + SELECT id FROM users + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [createTable("users", ["id", "name"])]); + + const lineage = getExtendedLineage(ast as Select, schema); + + // Same table appears in both SELECTs, but should be deduplicated + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "users", + namespace: "postgres", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); +}); + +describe("Set Operations: INTERSECT", () => { + test("simple INTERSECT combines lineage from both queries", () => { + const sql = ` + SELECT id FROM users + INTERSECT + SELECT id FROM premium_users + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name"]), + createTable("premium_users", ["id", "tier"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "premium_users", field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); + }); + + test("INTERSECT with ORDER BY on both sides", () => { + const sql = ` + SELECT id FROM active_users ORDER BY created_at + INTERSECT + SELECT id FROM premium_users ORDER BY upgraded_at + `; + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.active_users`, ["id", "created_at"]), + createTable(`${DEFAULT_SCHEMA}.premium_users`, ["id", "upgraded_at"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.active_users`, + field: "created_at", + transformations: [INDIRECT_SORT], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.premium_users`, + field: "upgraded_at", + transformations: [INDIRECT_SORT], + }, + ]), + ); + }); +}); + +describe("Set Operations: EXCEPT", () => { + test("simple EXCEPT combines lineage from both queries", () => { + const sql = ` + SELECT id FROM users + EXCEPT + SELECT id FROM banned_users + `; + const ast = parseSQLPostgres(sql); + const schema = createNamespace("postgres", [ + createTable("users", ["id", "name"]), + createTable("banned_users", ["id", "reason"]), + ]); + + const lineage = getExtendedLineage(ast as Select, schema); + + expect(sortInputFields(lineage.fields)).toEqual( + sortInputFields({ + id: { + inputFields: [ + { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "postgres", name: "banned_users", field: "id", transformations: [DIRECT_IDENTITY] }, + ], + }, + }), + ); + }); + + test("EXCEPT with WHERE on both sides", () => { + const sql = ` + SELECT id FROM users WHERE active = true + EXCEPT + SELECT id FROM banned_users WHERE ban_date > '2024-01-01' + `; + const ast = parseSQL(sql, "postgresql"); + const schema = createNamespace("ns", [ + USERS_TABLE, + createTable(`${DEFAULT_SCHEMA}.banned_users`, ["id", "ban_date"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: USERS_TABLE.name, field: "active", transformations: [INDIRECT_FILTER] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.banned_users`, + field: "ban_date", + transformations: [INDIRECT_FILTER], + }, + ]), + ); + }); +}); + +// ============================================================================= +// SECTION 7: STAR (*) EXPANSION +// ============================================================================= + +describe("Star Expansion", () => { + test("SELECT * expands to all columns", () => { + const sql = `SELECT * FROM users`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual( + USERS_TABLE.columns.reduce( + (acc, col) => { + acc[col] = { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: col, transformations: [DIRECT_IDENTITY] }], + }; + return acc; + }, + {} as Record, + ), + ); + }); + + test("table.* with multiple tables", () => { + const sql = `SELECT u.*, o.total FROM users u JOIN orders o ON u.id = o.user_id`; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); + const result = getExtendedLineage(ast as Select, schema); + + expect(result.fields).toEqual({ + ...USERS_TABLE.columns.reduce( + (acc, col) => { + acc[col] = { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: col, transformations: [DIRECT_IDENTITY] }], + }; + return acc; + }, + {} as Record, + ), + total: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "total", transformations: [DIRECT_IDENTITY] }, + ], + }, + }); + }); +}); + +// ============================================================================= +// SECTION 8: EDGE CASES +// ============================================================================= + +describe("Edge Cases", () => { + test("same column in multiple contexts", () => { + const sql = ` + SELECT status, COUNT(*) as cnt + FROM users + WHERE status != 'deleted' + GROUP BY status + ORDER BY status + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [USERS_TABLE]); + const result = getExtendedLineage(ast as Select, schema); + + // Field lineage + expect(result.fields).toEqual({ + status: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [DIRECT_IDENTITY] }], + }, + cnt: { + inputFields: [], + }, + }); + + // Dataset lineage should have all three subtypes expect(sortDataset(result.dataset)).toEqual( sortDataset([ { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, @@ -1475,18 +2012,23 @@ describe("Edge Cases", () => { const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.products`, ["id", "name"])]); const result = getExtendedLineage(ast as Select, schema); - expect(result.fields.user_name?.inputFields).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }, - ]); - expect(result.fields.product_name?.inputFields).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [DIRECT_IDENTITY] }, - ]); + expect(result.fields).toEqual({ + user_name: { + inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + }, + product_name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + }); - const filterLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "FILTER"); - expect(sortDataset(filterLineage)).toEqual( + expect(sortDataset(result.dataset)).toEqual( sortDataset([ { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [INDIRECT_FILTER] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: USERS_TABLE.name, field: "favorite_product", transformations: [INDIRECT_JOIN] }, ]), ); }); @@ -1562,118 +2104,142 @@ describe("Comprehensive: Everything Together", () => { `; const ast = parseSQL(sql); const schema = createNamespace("ns", [ - createTable(`${DEFAULT_SCHEMA}.sales`, ["id", "product_id", "store_id", "quantity", "unit_price", "sale_date", "status"]), + createTable(`${DEFAULT_SCHEMA}.sales`, [ + "id", + "product_id", + "store_id", + "quantity", + "unit_price", + "sale_date", + "status", + ]), createTable(`${DEFAULT_SCHEMA}.stores`, ["id", "name", "region", "active"]), ]); const result = getExtendedLineage(ast as Select, schema); // ========== FIELD-LEVEL LINEAGE ========== - // store_name -> stores.name (IDENTITY) - expect(result.fields.store_name?.inputFields).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "name", transformations: [DIRECT_IDENTITY] }, - ]); - - // region -> stores.region (IDENTITY) - expect(result.fields.region?.inputFields).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "region", transformations: [DIRECT_IDENTITY] }, - ]); - - // total_revenue -> SUM(quantity * unit_price) via CTEs (AGGREGATION) - expect(sortInputFields({ total_revenue: result.fields.total_revenue! })).toEqual( - sortInputFields({ - total_revenue: { - inputFields: [ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "quantity", transformations: [DIRECT_AGGREGATION] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "unit_price", transformations: [DIRECT_AGGREGATION] }, - ], - }, - }), - ); + // ========== FIELD-LEVEL LINEAGE ========== - // product_count -> COUNT(product_id) (AGGREGATION with masking) - expect(result.fields.product_count?.inputFields).toEqual([ - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.sales`, - field: "product_id", - transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], - }, - ]); - - // avg_price -> AVG(unit_price) (AGGREGATION) - expect(result.fields.avg_price?.inputFields).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "unit_price", transformations: [DIRECT_AGGREGATION] }, - ]); - - // tier -> CASE on total_revenue which traces to quantity and unit_price - const tierFields = result.fields.tier?.inputFields.map((f) => f.field); - expect(tierFields).toContain("quantity"); - expect(tierFields).toContain("unit_price"); - - // region_rank -> RANK() OVER (...) tracks columns from PARTITION BY and ORDER BY - const rankFields = result.fields.region_rank?.inputFields; - expect(rankFields?.map((f) => f.field)).toContain("region"); - - // store_hash -> MD5(name) (TRANSFORMATION with masking) - expect(result.fields.store_hash?.inputFields).toEqual([ - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.stores`, - field: "name", - transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + expect(result.fields).toEqual({ + store_name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + region: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "region", transformations: [DIRECT_IDENTITY] }, + ], + }, + total_revenue: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "unit_price", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + product_count: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "product_id", + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + }, + ], + }, + avg_price: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "unit_price", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + tier: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "quantity", + transformations: [INDIRECT_CONDITION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "unit_price", + transformations: [INDIRECT_CONDITION], + }, + ], + }, + region_rank: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "region", transformations: [INDIRECT_WINDOW] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "quantity", + transformations: [INDIRECT_WINDOW], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.sales`, + field: "unit_price", + transformations: [INDIRECT_WINDOW], + }, + ], + }, + store_hash: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.stores`, + field: "name", + transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + }, + ], }, - ]); - - // ========== DATASET-LEVEL LINEAGE ========== - - // FILTER from filtered_sales CTE (WHERE sale_date >= ... AND status = ...) - const filterLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "FILTER"); - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.sales`, - field: "sale_date", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.sales`, - field: "status", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.stores`, - field: "active", - transformations: [INDIRECT_FILTER], - }); - - // JOIN lineage from st.store_id = s.id - const joinLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "JOIN"); - expect(joinLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.stores`, - field: "id", - transformations: [INDIRECT_JOIN], }); - // SORT lineage from ORDER BY s.region, st.total_revenue - const sortLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "SORT"); - expect(sortLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.stores`, - field: "region", - transformations: [INDIRECT_SORT], - }); + // ========== DATASET-LEVEL LINEAGE ========== - // WINDOW lineage from PARTITION BY s.region ORDER BY st.total_revenue - const windowLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "WINDOW"); - expect(windowLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.stores`, - field: "region", - transformations: [INDIRECT_WINDOW], - }); + // TODO - FIX + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + // FILTER from filtered_sales CTE (WHERE sale_date >= ... AND status = ...) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "sale_date", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "status", transformations: [INDIRECT_FILTER] }, + // FILTER from main query (WHERE s.active = true) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "active", transformations: [INDIRECT_FILTER] }, + // FILTER from store_totals CTE (HAVING SUM(line_total) > 1000) + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "quantity", transformations: [INDIRECT_FILTER] }, + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "unit_price", transformations: [INDIRECT_FILTER] }, + // JOIN from main query (st.store_id = s.id) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "id", transformations: [INDIRECT_JOIN] }, + // GROUP BY from store_totals CTE (GROUP BY store_id) + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "store_id", transformations: [INDIRECT_GROUP_BY] }, + // SORT from main query (ORDER BY s.region, st.total_revenue DESC) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "region", transformations: [INDIRECT_SORT] }, + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "quantity", transformations: [INDIRECT_SORT] }, + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "unit_price", transformations: [INDIRECT_SORT] }, + // WINDOW from RANK() OVER (PARTITION BY s.region ORDER BY st.total_revenue DESC) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "region", transformations: [INDIRECT_WINDOW] }, + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "quantity", transformations: [INDIRECT_WINDOW] }, + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "unit_price", transformations: [INDIRECT_WINDOW] }, + ]), + ); // Verify we have all 8 output fields expect(Object.keys(result.fields).length).toBe(8); @@ -1711,6 +2277,9 @@ describe("Comprehensive: Everything Together", () => { // ========== FIELD-LEVEL LINEAGE ========== + // Verify we have all 7 output fields + expect(Object.keys(result.fields).length).toBe(7); + expect(result.fields.category_name?.inputFields).toEqual([ { namespace: "ns", name: `${DEFAULT_SCHEMA}.categories`, field: "name", transformations: [DIRECT_IDENTITY] }, ]); @@ -1720,15 +2289,30 @@ describe("Comprehensive: Everything Together", () => { ]); expect(result.fields.total_qty?.inputFields).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "quantity", transformations: [DIRECT_AGGREGATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }, ]); expect(sortInputFields({ revenue: result.fields.revenue! })).toEqual( sortInputFields({ revenue: { inputFields: [ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "price", transformations: [DIRECT_AGGREGATION] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.order_items`, field: "quantity", transformations: [DIRECT_AGGREGATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "price", + transformations: [DIRECT_AGGREGATION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [DIRECT_AGGREGATION], + }, ], }, }), @@ -1747,123 +2331,159 @@ describe("Comprehensive: Everything Together", () => { }, ]); - // ========== DATASET-LEVEL LINEAGE ========== - - // JOIN lineage - 3 joins with 2 columns each = 6 total - const joinLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "JOIN"); - expect(joinLineage?.length).toBe(6); - expect(joinLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.categories`, - field: "id", - transformations: [INDIRECT_JOIN], - }); - expect(joinLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.products`, - field: "category_id", - transformations: [INDIRECT_JOIN], - }); - expect(joinLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.products`, - field: "id", - transformations: [INDIRECT_JOIN], - }); - expect(joinLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.order_items`, - field: "product_id", - transformations: [INDIRECT_JOIN], - }); - expect(joinLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.order_items`, - field: "order_id", - transformations: [INDIRECT_JOIN], - }); - expect(joinLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.orders`, - field: "id", - transformations: [INDIRECT_JOIN], - }); - - // FILTER lineage - status, created_at, active + HAVING quantity - const filterLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "FILTER"); - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.orders`, - field: "status", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.orders`, - field: "created_at", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.products`, - field: "active", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.order_items`, - field: "quantity", - transformations: [INDIRECT_FILTER], - }); - - // GROUP BY lineage - c.id, c.name, p.id, p.name - const groupByLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "GROUP_BY"); - expect(groupByLineage?.length).toBe(4); - expect(groupByLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.categories`, - field: "id", - transformations: [INDIRECT_GROUP_BY], - }); - expect(groupByLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.categories`, - field: "name", - transformations: [INDIRECT_GROUP_BY], - }); - expect(groupByLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.products`, - field: "id", - transformations: [INDIRECT_GROUP_BY], - }); - expect(groupByLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.products`, - field: "name", - transformations: [INDIRECT_GROUP_BY], - }); - - // SORT lineage - c.name - const sortLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "SORT"); - expect(sortLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.categories`, - field: "name", - transformations: [INDIRECT_SORT], - }); + expect(sortInputFields({ category_rank: result.fields.category_rank! })).toEqual( + sortInputFields({ + category_rank: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.categories`, field: "id", transformations: [INDIRECT_WINDOW] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "price", + transformations: [INDIRECT_WINDOW], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [INDIRECT_WINDOW], + }, + ], + }, + }), + ); - // WINDOW lineage - PARTITION BY c.id - const windowLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "WINDOW"); - expect(windowLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.categories`, - field: "id", - transformations: [INDIRECT_WINDOW], - }); + // ========== DATASET-LEVEL LINEAGE ========== - // Verify 7 output fields - expect(Object.keys(result.fields).length).toBe(7); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + // JOIN lineage - 3 joins with 2 columns each = 6 total + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, + field: "id", + transformations: [INDIRECT_JOIN], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "category_id", + transformations: [INDIRECT_JOIN], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "id", + transformations: [INDIRECT_JOIN], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "product_id", + transformations: [INDIRECT_JOIN], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "order_id", + transformations: [INDIRECT_JOIN], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, + field: "id", + transformations: [INDIRECT_JOIN], + }, + // FILTER lineage - status, created_at, active + HAVING quantity + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, + field: "status", + transformations: [INDIRECT_FILTER], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.orders`, + field: "created_at", + transformations: [INDIRECT_FILTER], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "active", + transformations: [INDIRECT_FILTER], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [INDIRECT_FILTER], + }, + // GROUP BY lineage - c.id, c.name, p.id, p.name + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, + field: "id", + transformations: [INDIRECT_GROUP_BY], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, + field: "name", + transformations: [INDIRECT_GROUP_BY], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "id", + transformations: [INDIRECT_GROUP_BY], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.products`, + field: "name", + transformations: [INDIRECT_GROUP_BY], + }, + // SORT lineage - c.name, revenue (quantity * price) + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, + field: "name", + transformations: [INDIRECT_SORT], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "price", + transformations: [INDIRECT_SORT], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [INDIRECT_SORT], + }, + // WINDOW lineage - PARTITION BY c.id ORDER BY revenue (quantity * price) + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.categories`, + field: "id", + transformations: [INDIRECT_WINDOW], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "price", + transformations: [INDIRECT_WINDOW], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.order_items`, + field: "quantity", + transformations: [INDIRECT_WINDOW], + }, + ]), + ); }); test("HR analytics mega query with complex CTEs and CASE", () => { @@ -1929,102 +2549,152 @@ describe("Comprehensive: Everything Together", () => { // ========== FIELD-LEVEL LINEAGE ========== - expect(result.fields.department_name?.inputFields).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "name", transformations: [DIRECT_IDENTITY] }, - ]); - - expect(result.fields.location?.inputFields).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "location", transformations: [DIRECT_IDENTITY] }, - ]); - - expect(result.fields.headcount?.inputFields).toEqual([ - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.employees`, - field: "id", - transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + expect(result.fields).toEqual({ + department_name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], }, - ]); - - expect(result.fields.total_compensation?.inputFields).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [DIRECT_AGGREGATION] }, - ]); - - expect(result.fields.avg_salary?.inputFields).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [DIRECT_AGGREGATION] }, - ]); - - expect(result.fields.avg_performance?.inputFields).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "performance_score", transformations: [DIRECT_AGGREGATION] }, - ]); - - // performance_tier CASE uses avg_performance -> performance_score - const tierFields = result.fields.performance_tier?.inputFields; - expect(tierFields?.map((f) => f.field)).toContain("performance_score"); - - // dept_hash uses MD5 -> masking - expect(result.fields.dept_hash?.inputFields).toEqual([ - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.departments`, - field: "name", - transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + location: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.departments`, + field: "location", + transformations: [DIRECT_IDENTITY], + }, + ], }, - ]); - - // ========== DATASET-LEVEL LINEAGE ========== - - // FILTER from active_employees CTE - const filterLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "FILTER"); - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.employees`, - field: "status", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.employees`, - field: "terminated_at", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.departments`, - field: "active", - transformations: [INDIRECT_FILTER], - }); - - // JOIN - const joinLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "JOIN"); - expect(joinLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.departments`, - field: "id", - transformations: [INDIRECT_JOIN], - }); - - // SORT - const sortLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "SORT"); - expect(sortLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.departments`, - field: "location", - transformations: [INDIRECT_SORT], - }); - - // WINDOW - const windowLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "WINDOW"); - expect(windowLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.departments`, - field: "location", - transformations: [INDIRECT_WINDOW], - }); - - // Verify 10 output fields - expect(Object.keys(result.fields).length).toBe(10); - }); + headcount: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "id", + transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], + }, + ], + }, + total_compensation: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "salary", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + avg_salary: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "salary", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + avg_performance: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "performance_score", + transformations: [DIRECT_AGGREGATION], + }, + ], + }, + performance_tier: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "performance_score", + transformations: [INDIRECT_CONDITION], + }, + ], + }, + compensation_rank: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "salary", + transformations: [INDIRECT_WINDOW], + }, + ], + }, + location_rank: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.departments`, + field: "location", + transformations: [INDIRECT_WINDOW], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "id", + transformations: [{ type: "INDIRECT", subtype: "WINDOW", masking: true }], + }, + ], + }, + dept_hash: { + inputFields: [ + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.departments`, + field: "name", + transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], + }, + ], + }, + }); + + // ========== DATASET-LEVEL LINEAGE ========== + + // FILTER from active_employees CTE + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + // FILTER from active_employees CTE (WHERE status = 'active' AND terminated_at IS NULL) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "status", transformations: [INDIRECT_FILTER] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.employees`, + field: "terminated_at", + transformations: [INDIRECT_FILTER], + }, + // FILTER from main query (WHERE d.active = true) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "active", transformations: [INDIRECT_FILTER] }, + // FILTER from dept_stats CTE (HAVING COUNT(id) >= 3) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "id", transformations: [INDIRECT_FILTER] }, + // JOIN from main query (ds.department_id = d.id) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "id", transformations: [INDIRECT_JOIN] }, + // GROUP BY from dept_stats CTE (GROUP BY department_id) + // { + // namespace: "ns", + // name: `${DEFAULT_SCHEMA}.employees`, + // field: "department_id", + // transformations: [INDIRECT_GROUP_BY], + // }, + // SORT from main query (ORDER BY d.location, ds.total_compensation DESC) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "location", transformations: [INDIRECT_SORT] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [INDIRECT_SORT] }, + // WINDOW from DENSE_RANK() OVER (ORDER BY ds.total_compensation DESC) + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [INDIRECT_WINDOW] }, + // WINDOW from ROW_NUMBER() OVER (PARTITION BY d.location ORDER BY ds.headcount DESC) + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.departments`, + field: "location", + transformations: [INDIRECT_WINDOW], + }, + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "id", transformations: [INDIRECT_WINDOW] }, + ]), + ); + }); test("UNION with CTEs and window functions mega query", () => { const sql = ` @@ -2079,71 +2749,47 @@ describe("Comprehensive: Everything Together", () => { // Field lineage - product_name comes from products.name // Both UNION parts join the same products table, so we get one unique entry per field // (the mergeInputFields deduplicates by full field identity including transformations) - expect(result.fields.product_name?.inputFields).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.products`, - field: "name", - transformations: [DIRECT_IDENTITY], - }); - - // total_amount traces through CTEs to sales.amount (AGGREGATION) - expect(result.fields.total_amount?.inputFields).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.sales`, - field: "amount", - transformations: [DIRECT_AGGREGATION], + expect(result.fields).toEqual({ + region: { + inputFields: [], + }, + product_name: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], + }, + total_amount: { + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "amount", transformations: [DIRECT_AGGREGATION] }, + ], + }, + sale_count: { + inputFields: [], + }, + revenue_rank: { + inputFields: [], + }, }); // Dataset lineage from both CTEs and both UNION parts - const filterLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "FILTER"); - - // Both CTEs have WHERE region = '...' AND sale_date >= '...' - // Plus both outer queries have WHERE p.active = true - // Check for key filters - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.sales`, - field: "region", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.sales`, - field: "sale_date", - transformations: [INDIRECT_FILTER], - }); - expect(filterLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.products`, - field: "active", - transformations: [INDIRECT_FILTER], - }); - - // GROUP BY from both CTEs (deduplicated since same table.field) - const groupByLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "GROUP_BY"); - expect(groupByLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.sales`, - field: "product_id", - transformations: [INDIRECT_GROUP_BY], - }); - - // JOIN from both UNION parts - const joinLineage = result.dataset?.filter((d) => d.transformations?.[0]?.subtype === "JOIN"); - expect(joinLineage).toContainEqual({ - namespace: "ns", - name: `${DEFAULT_SCHEMA}.products`, - field: "id", - transformations: [INDIRECT_JOIN], - }); - - // Verify 5 output fields - expect(Object.keys(result.fields).length).toBe(5); + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + // FILTER from both CTEs: WHERE region = '...' AND sale_date >= '...' + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "region", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "sale_date", transformations: [INDIRECT_FILTER] }, + // FILTER from both outer queries: WHERE p.active = true + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "active", transformations: [INDIRECT_FILTER] }, + // GROUP BY from both CTEs (deduplicated since same table.field) + { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "product_id", transformations: [INDIRECT_GROUP_BY] }, + // JOIN from both UNION parts + { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "id", transformations: [INDIRECT_JOIN] }, + ]), + ); }); }); // ============================================================================= -// SECTION 10: DEFAULT SCHEMA HANDLING +// SECTION 10: SCHEMA HANDLING (DEFAULT & MULTI-SCHEMA SUPPORT) // ============================================================================= describe("Default Schema Handling", () => { @@ -2169,11 +2815,323 @@ describe("Default Schema Handling", () => { const schema = createNamespace("ns", [createTable("analytics.users", ["id", "status"])]); const result = getExtendedLineage(ast as Select, schema); - expect(result.fields.id?.inputFields).toEqual([ - { namespace: "ns", name: "analytics.users", field: "id", transformations: [DIRECT_IDENTITY] }, - ]); + expect(result.fields).toEqual({ + id: { + inputFields: [{ namespace: "ns", name: "analytics.users", field: "id", transformations: [DIRECT_IDENTITY] }], + }, + }); + expect(result.dataset).toEqual([ { namespace: "ns", name: "analytics.users", field: "status", transformations: [INDIRECT_FILTER] }, ]); }); }); + +describe("Multi-Schema Support", () => { + test("select from table with explicit schema", () => { + const sql = `SELECT id, name FROM myschema.users`; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [ + createTable("myschema.users", ["id", "name", "email"]), + createTable("otherschema.users", ["id", "username"]), + ]); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("select from table with default schema", () => { + const sql = `SELECT id, name FROM users`; + const ast = parseSQL(sql); + const namespace = createNamespace( + "trino", + [createTable("myschema.users", ["id", "name", "email"]), createTable("otherschema.users", ["id", "username"])], + "myschema", // default schema + ); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("join across different schemas", () => { + const sql = ` + SELECT + u.id, + u.name, + o.total + FROM myschema.users u + JOIN orders_schema.orders o ON u.id = o.user_id + `; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [ + createTable("myschema.users", ["id", "name"]), + createTable("orders_schema.orders", ["id", "user_id", "total"]), + ]); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + total: { + inputFields: [ + { + name: "orders_schema.orders", + namespace: "trino", + field: "total", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("mix explicit and default schema tables", () => { + const sql = ` + SELECT + u.id, + u.name, + o.total + FROM users u + JOIN orders_schema.orders o ON u.id = o.user_id + `; + const ast = parseSQL(sql); + const namespace = createNamespace( + "trino", + [createTable("myschema.users", ["id", "name"]), createTable("orders_schema.orders", ["id", "user_id", "total"])], + "myschema", // default schema + ); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + total: { + inputFields: [ + { + name: "orders_schema.orders", + namespace: "trino", + field: "total", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("same table name in different schemas", () => { + const sql = ` + SELECT + u1.id as user1_id, + u2.id as user2_id + FROM schema1.users u1 + JOIN schema2.users u2 ON u1.id = u2.id + `; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [ + createTable("schema1.users", ["id", "name"]), + createTable("schema2.users", ["id", "username"]), + ]); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + user1_id: { + inputFields: [ + { + name: "schema1.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + user2_id: { + inputFields: [ + { + name: "schema2.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("CTE with schema-qualified tables", () => { + const sql = ` + WITH active_users AS ( + SELECT id, name FROM myschema.users WHERE status = 'active' + ) + SELECT + au.id, + au.name, + o.total + FROM active_users au + JOIN orders_schema.orders o ON au.id = o.user_id + `; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [ + createTable("myschema.users", ["id", "name", "status"]), + createTable("orders_schema.orders", ["id", "user_id", "total"]), + ]); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + total: { + inputFields: [ + { + name: "orders_schema.orders", + namespace: "trino", + field: "total", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); + + test("select * from schema-qualified table", () => { + const sql = `SELECT * FROM myschema.users`; + const ast = parseSQL(sql); + const namespace = createNamespace("trino", [createTable("myschema.users", ["id", "name", "email"])]); + + const lineage = getExtendedLineage(ast as Select, namespace); + + expect(lineage.fields).toEqual({ + id: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "id", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + name: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "name", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + email: { + inputFields: [ + { + name: "myschema.users", + namespace: "trino", + field: "email", + transformations: [DIRECT_IDENTITY], + }, + ], + }, + }); + }); +}); diff --git a/packages/lineage/test/index.test.ts b/packages/lineage/test/index.test.ts deleted file mode 100644 index 4674e32..0000000 --- a/packages/lineage/test/index.test.ts +++ /dev/null @@ -1,1660 +0,0 @@ -import { describe, test, expect } from "bun:test"; -import { Parser } from "node-sql-parser"; -import type { AST, Select } from "node-sql-parser"; -import { - getLineage, - type Namespace, - type Table, - DIRECT_AGGREGATION, - DIRECT_IDENTITY, - DIRECT_TRANSFORMATION, -} from "../src/index.js"; - -const parser = new Parser(); - -// Helper function to create namespaces -function createNamespace(namespace: string, tables: Table[], defaultSchema?: string): Namespace { - return { namespace, tables, defaultSchema }; -} - -function createTable(name: string, columns: string[]): Table { - return { name, columns }; -} - -// Helper to ensure we get a single AST -function parseSQL(sql: string): AST { - const result = parser.astify(sql, { database: "trino" }); - const ast = Array.isArray(result) ? result[0] : result; - - if (!ast) { - throw new Error("Failed to parse SQL"); - } - - return ast; -} - -describe("Select Lineage", () => { - test("select from cte", () => { - const sql = ` - WITH u AS ( - SELECT - id, - name - FROM users - ) - SELECT - id, - name as wow - FROM u - `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - wow: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select from cte with *", () => { - const sql = ` - WITH u AS ( - SELECT * FROM users - ) - SELECT - id, - name as wow - FROM (SELECT * FROM u) AS t`; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - wow: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select from multiple ctes", () => { - const sql = ` - WITH active_users AS ( - SELECT - id, - name, - email - FROM users - WHERE status = 'active' - ), - user_orders AS ( - SELECT - user_id, - COUNT(user_id) as order_count, - SUM(total) as total_spent - FROM orders - GROUP BY user_id - ), - enriched_users AS ( - SELECT - au.id, - au.name, - au.email, - COALESCE(uo.order_count, 0) as order_count, - COALESCE(uo.total_spent, 0) as total_spent - FROM active_users au - LEFT JOIN user_orders uo ON au.id = uo.user_id - ) - SELECT - id, - name as full_name, - order_count, - total_spent * 1.1 as total_with_tax - FROM enriched_users - WHERE order_count > 0 - `; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name", "email", "status"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - full_name: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - order_count: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "user_id", - transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], - }, - ], - }, - total_with_tax: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "total", - transformations: [DIRECT_AGGREGATION], - }, - ], - }, - }); - }); - - test("product sales analysis with multiple ctes", () => { - const sql = `-- Product sales analysis with store information using CTEs -WITH filtered_sales AS ( - SELECT - product_id, - store_id, - quantity_sold, - unit_price, - discount_percentage - FROM product_sales - WHERE sale_date >= '2023-01-01' -), -store_sales_summary AS ( - SELECT - fs.product_id, - fs.store_id, - SUM(fs.quantity_sold) as total_quantity, - AVG(fs.unit_price) as avg_price, - SUM(fs.quantity_sold * fs.unit_price * (1 - fs.discount_percentage/100)) as net_revenue - FROM filtered_sales fs - GROUP BY fs.product_id, fs.store_id -), -final_report AS ( - SELECT - sss.product_id, - s.store_name, - s.region, - sss.total_quantity, - sss.avg_price, - sss.net_revenue - FROM store_sales_summary sss - JOIN stores s ON sss.store_id = s.id -) -SELECT - product_id, - store_name, - region, - total_quantity, - avg_price, - net_revenue -FROM final_report -ORDER BY net_revenue DESC`; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("product_sales", [ - "product_id", - "store_id", - "quantity_sold", - "unit_price", - "discount_percentage", - "sale_date", - ]), - createTable("stores", ["id", "store_name", "region"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - product_id: { - inputFields: [ - { - name: "product_sales", - namespace: "trino", - field: "product_id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - store_name: { - inputFields: [ - { - name: "stores", - namespace: "trino", - field: "store_name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - region: { - inputFields: [ - { - name: "stores", - namespace: "trino", - field: "region", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - total_quantity: { - inputFields: [ - { - name: "product_sales", - namespace: "trino", - field: "quantity_sold", - transformations: [DIRECT_AGGREGATION], - }, - ], - }, - avg_price: { - inputFields: [ - { - name: "product_sales", - namespace: "trino", - field: "unit_price", - transformations: [DIRECT_AGGREGATION], - }, - ], - }, - net_revenue: { - inputFields: [ - { - name: "product_sales", - namespace: "trino", - field: "quantity_sold", - transformations: [DIRECT_AGGREGATION], - }, - { - name: "product_sales", - namespace: "trino", - field: "unit_price", - transformations: [DIRECT_AGGREGATION], - }, - { - name: "product_sales", - namespace: "trino", - field: "discount_percentage", - transformations: [DIRECT_AGGREGATION], - }, - ], - }, - }); - }); - - test("select with lots of aliases", () => { - const sql = ` - WITH u AS ( - SELECT - id as i, - name as n - FROM users - ) - SELECT - i as id, - n as wow - FROM u - `; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - wow: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select with group by", () => { - const sql = `SELECT country, count(city) as city_count - FROM cities - GROUP BY country`; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("cities", ["country", "city"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - country: { - inputFields: [ - { - name: "cities", - namespace: "trino", - field: "country", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - city_count: { - inputFields: [ - { - name: "cities", - namespace: "trino", - field: "city", - transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], - }, - ], - }, - }); - }); - - test("select with binary expression", () => { - const sql = `SELECT id, name, id + 1 as next_id - FROM users`; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - next_id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - }); - }); - - test("select same column different tables", () => { - const sql = `SELECT u.id, o.id as order_id - FROM users u - JOIN orders o ON u.id = o.user_id`; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name", "email"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - order_id: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select with function transformation", () => { - const sql = `SELECT - id, - UPPER(name) as upper_name, - LENGTH(email) as email_length, - CONCAT(name, email) as name_email - FROM users`; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - upper_name: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - email_length: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "email", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - name_email: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "users", - namespace: "trino", - field: "email", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - }); - }); - - test("select with arithmetic operations", () => { - const sql = `SELECT - id, - price + tax as total_price, - quantity * price as line_total, - (price + tax) * quantity as grand_total, - price - discount as discounted_price - FROM orders`; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("orders", ["id", "price", "tax", "quantity", "discount"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - total_price: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "price", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "tax", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - line_total: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "quantity", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "price", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - grand_total: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "price", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "tax", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "quantity", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - discounted_price: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "price", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "discount", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - }); - }); - - test("select with complex nested arithmetic", () => { - const sql = `SELECT - id, - ((price + tax) * quantity) / discount as complex_calc, - price % 10 as price_remainder - FROM orders`; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("orders", ["id", "price", "tax", "quantity", "discount"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - complex_calc: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "price", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "tax", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "quantity", - transformations: [DIRECT_TRANSFORMATION], - }, - { - name: "orders", - namespace: "trino", - field: "discount", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - price_remainder: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "price", - transformations: [DIRECT_TRANSFORMATION], - }, - ], - }, - }); - }); - - test("select with mixed aggregation and arithmetic", () => { - const sql = `SELECT - country, - SUM(population) as total_population, - AVG(area) * 2 as double_avg_area, - COUNT(city) + 1 as city_count_plus_one - FROM cities - GROUP BY country`; - - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("cities", ["country", "city", "population", "area"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - country: { - inputFields: [ - { - name: "cities", - namespace: "trino", - field: "country", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - total_population: { - inputFields: [ - { - name: "cities", - namespace: "trino", - field: "population", - transformations: [{ ...DIRECT_AGGREGATION }], - }, - ], - }, - double_avg_area: { - inputFields: [ - { - name: "cities", - namespace: "trino", - field: "area", - transformations: [DIRECT_AGGREGATION], - }, - ], - }, - city_count_plus_one: { - inputFields: [ - { - name: "cities", - namespace: "trino", - field: "city", - transformations: [{ ...DIRECT_AGGREGATION, masking: true }], - }, - ], - }, - }); - }); - - test("select * from single table", () => { - const sql = `SELECT * FROM users`; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - email: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "email", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select * from multiple tables (JOIN)", () => { - const sql = `SELECT * FROM users u JOIN orders o ON u.id = o.user_id`; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - // Note: When both tables have "id", the second one (orders.id) overwrites the first (users.id) - // This is expected behavior since the output column names would conflict - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - user_id: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "user_id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - total: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "total", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select table.* from specific table", () => { - const sql = `SELECT u.* FROM users u JOIN orders o ON u.id = o.user_id`; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select * mixed with specific columns", () => { - const sql = `SELECT o.*, u.name as user_name FROM users u JOIN orders o ON u.id = o.user_id`; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "user_id", "total"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - user_id: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "user_id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - total: { - inputFields: [ - { - name: "orders", - namespace: "trino", - field: "total", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - user_name: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select * from CTE", () => { - const sql = ` - WITH filtered_users AS ( - SELECT id, name FROM users WHERE active = true - ) - SELECT * FROM filtered_users - `; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "active"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select * from nested subquery", () => { - const sql = `SELECT * FROM (SELECT id, name FROM users) AS subq`; - const ast = parseSQL(sql); - const schema = createNamespace("trino", [createTable("users", ["id", "name", "email"])]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); -}); - -// Helper to parse SQL for PostgreSQL (which supports INTERSECT and EXCEPT) -function parseSQLPostgres(sql: string): AST { - const result = parser.astify(sql, { database: "postgresql" }); - const ast = Array.isArray(result) ? result[0] : result; - - if (!ast) { - throw new Error("Failed to parse SQL"); - } - - return ast; -} - -describe("Set Operations (UNION, INTERSECT, EXCEPT)", () => { - test("simple UNION combines lineage from both queries", () => { - const sql = ` - SELECT id, name FROM users - UNION - SELECT id, name FROM customers - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "name", "email"]), - createTable("customers", ["id", "name", "address"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - // Output columns are from the first SELECT, but input fields include both tables - expect(lineage.id?.inputFields).toHaveLength(2); - expect(lineage.id?.inputFields).toContainEqual({ - name: "users", - namespace: "postgres", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(lineage.id?.inputFields).toContainEqual({ - name: "customers", - namespace: "postgres", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - - expect(lineage.name?.inputFields).toHaveLength(2); - expect(lineage.name?.inputFields).toContainEqual({ - name: "users", - namespace: "postgres", - field: "name", - transformations: [DIRECT_IDENTITY], - }); - expect(lineage.name?.inputFields).toContainEqual({ - name: "customers", - namespace: "postgres", - field: "name", - transformations: [DIRECT_IDENTITY], - }); - }); - - test("UNION ALL combines lineage from both queries", () => { - const sql = ` - SELECT id FROM users - UNION ALL - SELECT id FROM orders - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "name"]), - createTable("orders", ["id", "product"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage.id?.inputFields).toHaveLength(2); - expect(lineage.id?.inputFields).toContainEqual({ - name: "users", - namespace: "postgres", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(lineage.id?.inputFields).toContainEqual({ - name: "orders", - namespace: "postgres", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - }); - - test("INTERSECT combines lineage from both queries", () => { - const sql = ` - SELECT id FROM users - INTERSECT - SELECT id FROM premium_users - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "name"]), - createTable("premium_users", ["id", "tier"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage.id?.inputFields).toHaveLength(2); - expect(lineage.id?.inputFields).toContainEqual({ - name: "users", - namespace: "postgres", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(lineage.id?.inputFields).toContainEqual({ - name: "premium_users", - namespace: "postgres", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - }); - - test("EXCEPT combines lineage from both queries", () => { - const sql = ` - SELECT id FROM users - EXCEPT - SELECT id FROM banned_users - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "name"]), - createTable("banned_users", ["id", "reason"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage.id?.inputFields).toHaveLength(2); - expect(lineage.id?.inputFields).toContainEqual({ - name: "users", - namespace: "postgres", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(lineage.id?.inputFields).toContainEqual({ - name: "banned_users", - namespace: "postgres", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - }); - - test("chained UNION combines lineage from all queries", () => { - const sql = ` - SELECT id, name FROM users - UNION - SELECT id, name FROM customers - UNION - SELECT id, name FROM vendors - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "name"]), - createTable("customers", ["id", "name"]), - createTable("vendors", ["id", "name"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - // All three tables contribute to the lineage - expect(lineage.id?.inputFields).toHaveLength(3); - expect(lineage.name?.inputFields).toHaveLength(3); - }); - - test("UNION with aliases preserves first SELECT column names", () => { - const sql = ` - SELECT id AS user_id, name AS full_name FROM users - UNION - SELECT customer_id, customer_name FROM customers - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "name"]), - createTable("customers", ["customer_id", "customer_name"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - // Output columns should be named according to the first SELECT - expect(Object.keys(lineage)).toEqual(["user_id", "full_name"]); - expect(lineage.user_id?.inputFields).toContainEqual({ - name: "users", - namespace: "postgres", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - expect(lineage.user_id?.inputFields).toContainEqual({ - name: "customers", - namespace: "postgres", - field: "customer_id", - transformations: [DIRECT_IDENTITY], - }); - }); - - test("UNION with transformations", () => { - const sql = ` - SELECT UPPER(name) AS name FROM users - UNION - SELECT LOWER(name) AS name FROM customers - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "name"]), - createTable("customers", ["id", "name"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - // Both inputs should have TRANSFORMATION type - expect(lineage.name?.inputFields).toHaveLength(2); - expect(lineage.name?.inputFields).toContainEqual({ - name: "users", - namespace: "postgres", - field: "name", - transformations: [DIRECT_TRANSFORMATION], - }); - expect(lineage.name?.inputFields).toContainEqual({ - name: "customers", - namespace: "postgres", - field: "name", - transformations: [DIRECT_TRANSFORMATION], - }); - }); - - test("UNION with aggregation", () => { - const sql = ` - SELECT SUM(amount) AS total FROM sales - UNION - SELECT SUM(amount) AS total FROM refunds - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("sales", ["id", "amount"]), - createTable("refunds", ["id", "amount"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage.total?.inputFields).toHaveLength(2); - expect(lineage.total?.inputFields).toContainEqual({ - name: "sales", - namespace: "postgres", - field: "amount", - transformations: [DIRECT_AGGREGATION], - }); - expect(lineage.total?.inputFields).toContainEqual({ - name: "refunds", - namespace: "postgres", - field: "amount", - transformations: [DIRECT_AGGREGATION], - }); - }); - - test("UNION with different column expressions", () => { - const sql = ` - SELECT id, first_name || ' ' || last_name AS full_name FROM users - UNION - SELECT id, company_name FROM customers - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "first_name", "last_name"]), - createTable("customers", ["id", "company_name"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - // First SELECT contributes first_name and last_name, second contributes company_name - expect(lineage.full_name?.inputFields.length).toBeGreaterThanOrEqual(3); - }); - - test("UNION with subqueries", () => { - const sql = ` - SELECT id FROM (SELECT id FROM users WHERE active = true) AS active_users - UNION - SELECT id FROM (SELECT id FROM customers WHERE verified = true) AS verified_customers - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [ - createTable("users", ["id", "active"]), - createTable("customers", ["id", "verified"]), - ]); - - const lineage = getLineage(ast as Select, schema); - - expect(lineage.id?.inputFields).toHaveLength(2); - }); - - test("UNION deduplicates identical input fields", () => { - const sql = ` - SELECT id FROM users - UNION - SELECT id FROM users - `; - const ast = parseSQLPostgres(sql); - const schema = createNamespace("postgres", [createTable("users", ["id", "name"])]); - - const lineage = getLineage(ast as Select, schema); - - // Same table appears in both SELECTs, but should be deduplicated - expect(lineage.id?.inputFields).toHaveLength(1); - expect(lineage.id?.inputFields).toContainEqual({ - name: "users", - namespace: "postgres", - field: "id", - transformations: [DIRECT_IDENTITY], - }); - }); -}); - -describe("Multi-Schema Support", () => { - test("select from table with explicit schema", () => { - const sql = `SELECT id, name FROM myschema.users`; - const ast = parseSQL(sql); - const namespace = createNamespace("trino", [ - createTable("myschema.users", ["id", "name", "email"]), - createTable("otherschema.users", ["id", "username"]), - ]); - - const lineage = getLineage(ast as Select, namespace); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select from table with default schema", () => { - const sql = `SELECT id, name FROM users`; - const ast = parseSQL(sql); - const namespace = createNamespace( - "trino", - [ - createTable("myschema.users", ["id", "name", "email"]), - createTable("otherschema.users", ["id", "username"]), - ], - "myschema", // default schema - ); - - const lineage = getLineage(ast as Select, namespace); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("join across different schemas", () => { - const sql = ` - SELECT - u.id, - u.name, - o.total - FROM myschema.users u - JOIN orders_schema.orders o ON u.id = o.user_id - `; - const ast = parseSQL(sql); - const namespace = createNamespace("trino", [ - createTable("myschema.users", ["id", "name"]), - createTable("orders_schema.orders", ["id", "user_id", "total"]), - ]); - - const lineage = getLineage(ast as Select, namespace); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - total: { - inputFields: [ - { - name: "orders_schema.orders", - namespace: "trino", - field: "total", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("mix explicit and default schema tables", () => { - const sql = ` - SELECT - u.id, - u.name, - o.total - FROM users u - JOIN orders_schema.orders o ON u.id = o.user_id - `; - const ast = parseSQL(sql); - const namespace = createNamespace( - "trino", - [ - createTable("myschema.users", ["id", "name"]), - createTable("orders_schema.orders", ["id", "user_id", "total"]), - ], - "myschema", // default schema - ); - - const lineage = getLineage(ast as Select, namespace); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - total: { - inputFields: [ - { - name: "orders_schema.orders", - namespace: "trino", - field: "total", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("same table name in different schemas", () => { - const sql = ` - SELECT - u1.id as user1_id, - u2.id as user2_id - FROM schema1.users u1 - JOIN schema2.users u2 ON u1.id = u2.id - `; - const ast = parseSQL(sql); - const namespace = createNamespace("trino", [ - createTable("schema1.users", ["id", "name"]), - createTable("schema2.users", ["id", "username"]), - ]); - - const lineage = getLineage(ast as Select, namespace); - - expect(lineage).toEqual({ - user1_id: { - inputFields: [ - { - name: "schema1.users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - user2_id: { - inputFields: [ - { - name: "schema2.users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("CTE with schema-qualified tables", () => { - const sql = ` - WITH active_users AS ( - SELECT id, name FROM myschema.users WHERE status = 'active' - ) - SELECT - au.id, - au.name, - o.total - FROM active_users au - JOIN orders_schema.orders o ON au.id = o.user_id - `; - const ast = parseSQL(sql); - const namespace = createNamespace("trino", [ - createTable("myschema.users", ["id", "name", "status"]), - createTable("orders_schema.orders", ["id", "user_id", "total"]), - ]); - - const lineage = getLineage(ast as Select, namespace); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - total: { - inputFields: [ - { - name: "orders_schema.orders", - namespace: "trino", - field: "total", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); - - test("select * from schema-qualified table", () => { - const sql = `SELECT * FROM myschema.users`; - const ast = parseSQL(sql); - const namespace = createNamespace("trino", [ - createTable("myschema.users", ["id", "name", "email"]), - ]); - - const lineage = getLineage(ast as Select, namespace); - - expect(lineage).toEqual({ - id: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "id", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - name: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "name", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - email: { - inputFields: [ - { - name: "myschema.users", - namespace: "trino", - field: "email", - transformations: [DIRECT_IDENTITY], - }, - ], - }, - }); - }); -}); From cf53c594d7b7c9d2003d39fbb0ca47d224cd5778 Mon Sep 17 00:00:00 2001 From: its-elad <59926027+its-elad@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:56:24 +0200 Subject: [PATCH 10/10] feat: added support for expr_list in extractColumnRefs --- apps/demo/src/App.tsx | 2 +- packages/lineage/src/index.ts | 186 ++---- packages/lineage/test/extendedLineage.test.ts | 533 ++++++++---------- 3 files changed, 292 insertions(+), 429 deletions(-) diff --git a/apps/demo/src/App.tsx b/apps/demo/src/App.tsx index e1e8b4c..2f8d046 100644 --- a/apps/demo/src/App.tsx +++ b/apps/demo/src/App.tsx @@ -1,4 +1,4 @@ -import { useState, useCallback, useEffect } from "react"; +import { useState, useCallback } from "react"; import { GitBranch, Github } from "lucide-react"; import { Card, CardContent, CardHeader, CardTitle } from "@meta-sql/ui"; import { SQLEditor } from "./components/editor"; diff --git a/packages/lineage/src/index.ts b/packages/lineage/src/index.ts index 037c3f8..7e121aa 100644 --- a/packages/lineage/src/index.ts +++ b/packages/lineage/src/index.ts @@ -16,6 +16,9 @@ import { Case, Interval, Cast, + type AsWindowSpec, + NamedWindowExpr, + ExprList, } from "node-sql-parser"; import { HashSet } from "./hashset"; @@ -190,16 +193,6 @@ export type SelectWithAlias = Select & { as?: string | null; }; -/** - * Set operation type for UNION, INTERSECT, EXCEPT - */ -export type SetOperation = "union" | "union all" | "intersect" | "intersect all" | "except" | "except all"; - -/** - * Extended lineage result that includes both field-level and dataset-level lineage - */ -export type ExtendedLineageResult = Pick; - function isColumn(selectColumn: Select["columns"][number]): selectColumn is AstColumn { return ( typeof selectColumn === "object" && @@ -356,9 +349,7 @@ export function extractColumnRefs(expr: ExpressionValue | null | undefined): Col case "function": { const func = expr as AstFunction; if (func.args?.value) { - for (const arg of func.args.value) { - refs.push(...extractColumnRefs(arg)); - } + refs.push(...extractColumnRefs(func.args)); } break; } @@ -394,67 +385,58 @@ export function extractColumnRefs(expr: ExpressionValue | null | undefined): Col break; } - default: - // Handle nested expressions in unknown types - if (typeof expr === "object" && expr !== null) { - for (const key of Object.keys(expr)) { - const value = (expr as Record)[key]; - if (value && typeof value === "object" && "type" in value) { - refs.push(...extractColumnRefs(value as ExpressionValue)); - } + case "expr_list": { + const cast = expr as ExprList; + if (cast.value) { + for (const subExpr of cast.value) { + refs.push(...extractColumnRefs(subExpr)); } } break; + } + + default: + console.warn("UNHANDLED EXPR TYPE IN EXTRACT COLUMN REFS:", expr.type, expr); + break; } return refs; } /** - * Type for OVER clause structure (shared between aggr_func and function types) + * Type for OVER clause in window functions. + * Uses the library's AsWindowSpec type which matches the actual parser output. */ -type OverClause = { - // Direct structure (legacy/simple case) - partitionby?: ExpressionValue[]; - orderby?: Array<{ expr: ExpressionValue }>; - // Nested structure (Trino parser output) - as_window_specification?: { - window_specification?: { - partitionby?: Array<{ expr: ExpressionValue }>; - orderby?: Array<{ expr: ExpressionValue }>; - }; - }; +type OverClause = NamedWindowExpr & { + type: "window"; + as_window_specification: AsWindowSpec; }; /** * Extracts PARTITION BY and ORDER BY expressions from an OVER clause. - * Handles both direct structure and nested Trino parser output structure. * @returns Array of expressions from PARTITION BY and ORDER BY clauses */ function extractWindowExpressionsFromOver(over: OverClause): ExpressionValue[] { const expressions: ExpressionValue[] = []; - // Handle nested structure (Trino parser output) - const windowSpec = over.as_window_specification?.window_specification; + // Handle string reference (named window) + if (typeof over.as_window_specification === "string") { + // Named window reference - no direct expressions to extract + return expressions; + } + + const windowSpec = over.as_window_specification.window_specification; if (windowSpec) { if (windowSpec.partitionby) { - expressions.push(...windowSpec.partitionby.map((item) => item.expr)); + expressions.push( + ...windowSpec.partitionby.flatMap((item) => (Array.isArray(item.expr) ? item.expr : [item.expr])), + ); } if (windowSpec.orderby) { expressions.push(...windowSpec.orderby.map((item) => item.expr)); } } - // Handle direct structure (legacy/simple case) as fallback - if (expressions.length === 0) { - if (over.partitionby) { - expressions.push(...over.partitionby); - } - if (over.orderby) { - expressions.push(...over.orderby.map((item) => item.expr)); - } - } - return expressions; } @@ -477,7 +459,7 @@ function extractWindowExpressionsFromOver(over: OverClause): ExpressionValue[] { * @calls formatInputColumnName - To format column references as keys * @calls mergeTransformations - To combine parent and child transformations * @calls extractWindowExpressionsFromOver - For window function OVER clauses - * @calls extractTransformationsWithType - For CASE condition columns + * @calls extractColumnsWithUniformTransformation - For CASE condition columns * @calls extractTransformationsFromExpr - Recursively for nested expressions */ function extractTransformationsFromExpr( @@ -542,7 +524,7 @@ function extractTransformationsFromExpr( // For window functions (aggr_func with OVER clause), also extract columns from PARTITION BY/ORDER BY if ("over" in aggExpr && aggExpr.over) { - const windowExprs = extractWindowExpressionsFromOver(aggExpr.over); + const windowExprs = extractWindowExpressionsFromOver(aggExpr.over as OverClause); for (const windowExpr of windowExprs) { const windowTransformations = extractTransformationsFromExpr( windowExpr, @@ -581,7 +563,7 @@ function extractTransformationsFromExpr( // For window functions (function with OVER clause like RANK(), ROW_NUMBER()), // extract columns from PARTITION BY/ORDER BY since these functions have no arguments if ("over" in funcExpr && funcExpr.over) { - const windowExprs = extractWindowExpressionsFromOver((funcExpr as AstFunction & { over: OverClause }).over); + const windowExprs = extractWindowExpressionsFromOver(funcExpr.over as OverClause); for (const windowExpr of windowExprs) { const windowTransformations = extractTransformationsFromExpr( windowExpr, @@ -603,8 +585,8 @@ function extractTransformationsFromExpr( if (caseExpr.args) { for (const arg of caseExpr.args) { // Condition columns get INDIRECT/CONDITION (per-column indirect transformation) - if (arg.type === "when" && arg.cond) { - const condTransformations = extractTransformationsWithType(arg.cond, INDIRECT_CONDITION); + if (arg.type === "when") { + const condTransformations = extractColumnsWithUniformTransformation(arg.cond, INDIRECT_CONDITION); Object.entries(condTransformations).forEach(([key, value]) => { merged[key] = merged[key] ? merged[key].union(value) : value; }); @@ -664,7 +646,7 @@ function extractTransformationsFromExpr( * @calls extractColumnRefs - To find all column references in the expression * @calls formatInputColumnName - To format column references as keys */ -function extractTransformationsWithType( +function extractColumnsWithUniformTransformation( expr: ExpressionValue | null | undefined, transformation: Transformation, ): ColumnTransformations { @@ -866,14 +848,14 @@ function buildAliasToExpressionMap(select: Select): Map return aliasMap; } - select.columns.forEach((col) => { - if (!isColumn(col)) return; + for (const col of select.columns) { + if (!isColumn(col)) continue; const outputName = getOutputColumnName(col); if (outputName && col.expr) { aliasMap.set(outputName, col.expr); } - }); + } return aliasMap; } @@ -922,7 +904,7 @@ function getOrderByLineage(select: Select, namespace: Namespace): InputField[] { const inputFields: InputField[] = []; - orderByItems.forEach((item) => { + for (const item of orderByItems) { const expr = ("expr" in item ? item.expr : item) as ExpressionValue; // Resolve the expression - if it's an alias, get the underlying expression @@ -930,61 +912,11 @@ function getOrderByLineage(select: Select, namespace: Namespace): InputField[] { // Extract input fields from the resolved expression inputFields.push(...extractInputFieldsFromExpression(resolvedExpr, regularTables, namespace, INDIRECT_SORT)); - }); - - return inputFields; -} - -/** - * Extracts WINDOW function lineage from SELECT columns. - * Columns in PARTITION BY and ORDER BY clauses of OVER receive INDIRECT/WINDOW transformation. - * @returns Array of InputFields for columns used in window function OVER clauses - * - * @calls getTableExpressionsFromSelect - To get regular tables from FROM - * @calls isColumn - To filter valid columns - * @calls extractWindowExpressions - To get OVER clause expressions - * @calls extractInputFieldsFromExpression - To extract columns with WINDOW transformation - */ -function getWindowLineage(select: Select, namespace: Namespace): InputField[] { - if (!select.columns || (typeof select.columns === "string" && select.columns === "*")) { - return []; - } - - const { regularTables } = getTableExpressionsFromSelect(select); - const inputFields: InputField[] = []; - - for (const col of select.columns) { - if (!isColumn(col)) continue; - - const windowExprs = extractWindowExpressions(col.expr); - inputFields.push( - ...windowExprs.flatMap((expr) => - extractInputFieldsFromExpression(expr, regularTables, namespace, INDIRECT_WINDOW), - ), - ); } return inputFields; } -/** - * Extracts PARTITION BY and ORDER BY expressions from a window function expression. - * Supports both aggr_func (SUM() OVER) and function types (ROW_NUMBER() OVER). - * @returns Array of expressions from the OVER clause, empty if not a window function - * - * @calls extractWindowExpressionsFromOver - To parse the OVER clause structure - */ -function extractWindowExpressions(expr: ExpressionValue): ExpressionValue[] { - // Support both aggr_func and function types with OVER clause - if ((expr.type !== "aggr_func" && expr.type !== "function") || !("over" in expr)) return []; - - const exprWithOver = expr as (AggrFunc | AstFunction) & { over?: OverClause }; - - if (!exprWithOver.over) return []; - - return extractWindowExpressionsFromOver(exprWithOver.over); -} - /** * Extracts HAVING clause lineage. * All columns in HAVING conditions receive INDIRECT/FILTER transformation. @@ -1039,7 +971,7 @@ function getTableExpressionsFromSelect(select: Select): { if (select.from) { const fromItems = Array.isArray(select.from) ? select.from : [select.from]; - fromItems.forEach((item) => { + for (const item of fromItems) { if ("table" in item) { // might mention with statement in our select const matchingWith = withByNames.get(item.table); @@ -1059,7 +991,7 @@ function getTableExpressionsFromSelect(select: Select): { with: previousWiths, // propagate previous withs }); } - }); + } } return { regularTables, selectTables }; @@ -1105,16 +1037,16 @@ function expandStarColumn(column: AstColumn, select: Select, namespace: Namespac const expandedColumns: AstColumn[] = []; // Process regular tables (from namespace) - regularTables.forEach((fromTable) => { + for (const fromTable of regularTables) { // If there's a table qualifier, skip tables that don't match if (tableQualifier && tableQualifier !== fromTable.table && tableQualifier !== fromTable.as) { - return; + continue; } const schemaTable = namespace.tables!.find((t) => astTableMatchesSchemaTable(fromTable, t.name, namespace.defaultSchema), ); - if (!schemaTable) return; + if (!schemaTable) continue; for (const colName of schemaTable.columns) { expandedColumns.push({ @@ -1126,24 +1058,24 @@ function expandStarColumn(column: AstColumn, select: Select, namespace: Namespac as: colName, }); } - }); + } // Process subquery/CTE tables - selectTables.forEach((selectTable) => { + for (const selectTable of selectTables) { // If there's a table qualifier, skip tables that don't match if (tableQualifier && tableQualifier !== selectTable.as) { - return; + continue; } // Get columns from the subquery/CTE if (selectTable.columns && typeof selectTable.columns !== "string") { - selectTable.columns.forEach((subCol) => { - if (!isColumn(subCol)) return; + for (const subCol of selectTable.columns) { + if (!isColumn(subCol)) continue; // Handle star in subquery recursively if (isStar(subCol)) { const expandedSubCols = expandStarColumn(subCol, selectTable, namespace); - expandedSubCols.forEach((expandedSubCol) => { + for (const expandedSubCol of expandedSubCols) { const outputName = getOutputColumnName(expandedSubCol); if (outputName) { expandedColumns.push({ @@ -1155,7 +1087,7 @@ function expandStarColumn(column: AstColumn, select: Select, namespace: Namespac as: outputName, }); } - }); + } } else { const outputName = getOutputColumnName(subCol); if (outputName) { @@ -1169,9 +1101,9 @@ function expandStarColumn(column: AstColumn, select: Select, namespace: Namespac }); } } - }); + } } - }); + } return expandedColumns; } @@ -1233,9 +1165,10 @@ export function getColumnLineage( if (transformations) { transformationsByColumns = Object.entries(transformationsByColumns).reduce( (acc, [columnName, childTransformations]) => { - acc[columnName] = mergeTransformationSet(transformations, childTransformations); - - return acc; + return { + ...acc, + [columnName]: mergeTransformationSet(transformations, childTransformations), + }; }, {} as Record, ); @@ -1317,7 +1250,6 @@ function getDatasetLineageForSingleSelect(select: Select, namespace: Namespace): allIndirectFields.push(...getFilterLineage(select, namespace)); allIndirectFields.push(...getGroupByLineage(select, namespace)); allIndirectFields.push(...getOrderByLineage(select, namespace)); - allIndirectFields.push(...getWindowLineage(select, namespace)); allIndirectFields.push(...getHavingLineage(select, namespace)); // Recursively collect dataset lineage from CTEs and subqueries @@ -1387,13 +1319,13 @@ function getLineageForSingleSelect(select: Select, namespace: Namespace): Column // Expand star columns into individual columns if (isStar(column)) { const expandedColumns = expandStarColumn(column, select, namespace); - expandedColumns.forEach((expandedCol) => { + for (const expandedCol of expandedColumns) { let outputFieldName = getOutputColumnName(expandedCol); if (!outputFieldName) { outputFieldName = `unknown_${unknownCount++}`; } acc[outputFieldName] = { inputFields: getColumnLineage(select, namespace, expandedCol) }; - }); + } return acc; } diff --git a/packages/lineage/test/extendedLineage.test.ts b/packages/lineage/test/extendedLineage.test.ts index 0af092a..4918b32 100644 --- a/packages/lineage/test/extendedLineage.test.ts +++ b/packages/lineage/test/extendedLineage.test.ts @@ -17,22 +17,6 @@ import { } from "../src/index.js"; const DEFAULT_SCHEMA = "public"; -const USERS_TABLE = createTable(`${DEFAULT_SCHEMA}.users`, [ - "id", - "name", - "email", - "first_name", - "last_name", - "status", - "age", - "country", - "city", - "region", - "verified", - "active", - "favorite_product", - "created_at", -]); const parser = new Parser(); @@ -95,12 +79,14 @@ describe("Field-Level Lineage: DIRECT/IDENTITY", () => { test("single column select", () => { const sql = `SELECT id FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ id: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], }, }); expect(result.dataset).toEqual([]); @@ -109,18 +95,24 @@ describe("Field-Level Lineage: DIRECT/IDENTITY", () => { test("multiple columns select", () => { const sql = `SELECT id, name, email FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "email"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ id: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], }, name: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], }, email: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "email", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "email", transformations: [DIRECT_IDENTITY] }, + ], }, }); expect(result.dataset).toEqual([]); @@ -129,15 +121,19 @@ describe("Field-Level Lineage: DIRECT/IDENTITY", () => { test("column with alias", () => { const sql = `SELECT id as user_id, name as user_name FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ user_id: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], }, user_name: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], }, }); }); @@ -145,15 +141,19 @@ describe("Field-Level Lineage: DIRECT/IDENTITY", () => { test("table-qualified column", () => { const sql = `SELECT u.id, u.name FROM users u`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ id: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], }, name: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], }, }); }); @@ -163,13 +163,13 @@ describe("Field-Level Lineage: DIRECT/TRANSFORMATION", () => { test("function transformation - UPPER", () => { const sql = `SELECT UPPER(name) as upper_name FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["name"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ upper_name: { inputFields: [ - { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_TRANSFORMATION] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_TRANSFORMATION] }, ], }, }); @@ -178,15 +178,25 @@ describe("Field-Level Lineage: DIRECT/TRANSFORMATION", () => { test("function transformation - CONCAT", () => { const sql = `SELECT CONCAT(first_name, last_name) as full_name FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["first_name", "last_name"])]); const result = getExtendedLineage(ast as Select, schema); expect(sortInputFields(result.fields)).toEqual( sortInputFields({ full_name: { inputFields: [ - { namespace: "ns", name: USERS_TABLE.name, field: "first_name", transformations: [DIRECT_TRANSFORMATION] }, - { namespace: "ns", name: USERS_TABLE.name, field: "last_name", transformations: [DIRECT_TRANSFORMATION] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.users`, + field: "first_name", + transformations: [DIRECT_TRANSFORMATION], + }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.users`, + field: "last_name", + transformations: [DIRECT_TRANSFORMATION], + }, ], }, }), @@ -272,13 +282,13 @@ describe("Field-Level Lineage: DIRECT/TRANSFORMATION", () => { test("nested function transformation", () => { const sql = `SELECT LOWER(TRIM(name)) as clean_name FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["name"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ clean_name: { inputFields: [ - { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_TRANSFORMATION] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_TRANSFORMATION] }, ], }, }); @@ -369,7 +379,7 @@ describe("Field-Level Lineage: DIRECT/AGGREGATION", () => { test("COUNT with column - has masking", () => { const sql = `SELECT COUNT(id) as count FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ @@ -377,7 +387,7 @@ describe("Field-Level Lineage: DIRECT/AGGREGATION", () => { inputFields: [ { namespace: "ns", - name: USERS_TABLE.name, + name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [{ type: "DIRECT", subtype: "AGGREGATION", masking: true }], }, @@ -439,7 +449,7 @@ describe("Field-Level Lineage: Masking Functions", () => { test("MD5 masking", () => { const sql = `SELECT MD5(email) as hashed_email FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["email"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ @@ -447,7 +457,7 @@ describe("Field-Level Lineage: Masking Functions", () => { inputFields: [ { namespace: "ns", - name: USERS_TABLE.name, + name: `${DEFAULT_SCHEMA}.users`, field: "email", transformations: [{ type: "DIRECT", subtype: "TRANSFORMATION", masking: true }], }, @@ -505,7 +515,7 @@ describe("Field-Level Lineage: CASE Expressions", () => { FROM users `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["status"])]); const result = getExtendedLineage(ast as Select, schema); // CASE WHEN condition column gets INDIRECT/CONDITION @@ -513,7 +523,7 @@ describe("Field-Level Lineage: CASE Expressions", () => { expect(result.fields.status_label?.inputFields).toEqual([ { namespace: "ns", - name: USERS_TABLE.name, + name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_CONDITION], }, @@ -615,12 +625,15 @@ describe("Dataset-Level Lineage: INDIRECT/JOIN", () => { JOIN orders o ON u.id = o.user_id `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, ]), ); @@ -633,12 +646,15 @@ describe("Dataset-Level Lineage: INDIRECT/JOIN", () => { LEFT JOIN orders o ON u.id = o.user_id `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, ]), ); @@ -651,12 +667,15 @@ describe("Dataset-Level Lineage: INDIRECT/JOIN", () => { RIGHT JOIN orders o ON u.id = o.user_id `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, ]), ); @@ -669,12 +688,15 @@ describe("Dataset-Level Lineage: INDIRECT/JOIN", () => { FULL OUTER JOIN orders o ON u.id = o.user_id `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, ]), ); @@ -687,14 +709,17 @@ describe("Dataset-Level Lineage: INDIRECT/JOIN", () => { JOIN orders o ON u.id = o.user_id AND u.region = o.region `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "region"])]); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "region"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "region"]), + ]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, - { namespace: "ns", name: USERS_TABLE.name, field: "region", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "region", transformations: [INDIRECT_JOIN] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "region", transformations: [INDIRECT_JOIN] }, ]), ); @@ -709,7 +734,7 @@ describe("Dataset-Level Lineage: INDIRECT/JOIN", () => { `; const ast = parseSQL(sql); const schema = createNamespace("ns", [ - USERS_TABLE, + createTable(`${DEFAULT_SCHEMA}.users`, ["id"]), createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "product_id", "total"]), createTable(`${DEFAULT_SCHEMA}.products`, ["id", "name"]), ]); @@ -717,7 +742,7 @@ describe("Dataset-Level Lineage: INDIRECT/JOIN", () => { expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "product_id", transformations: [INDIRECT_JOIN] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "id", transformations: [INDIRECT_JOIN] }, @@ -732,7 +757,10 @@ describe("Dataset-Level Lineage: INDIRECT/JOIN", () => { CROSS JOIN products p `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.products`, ["name"])]); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id"]), + createTable(`${DEFAULT_SCHEMA}.products`, ["name"]), + ]); const result = getExtendedLineage(ast as Select, schema); expect(result.dataset).toEqual([]); @@ -761,24 +789,24 @@ describe("Dataset-Level Lineage: INDIRECT/FILTER (WHERE)", () => { test("simple WHERE equality", () => { const sql = `SELECT id FROM users WHERE status = 'active'`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "status"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, ]); }); test("WHERE with AND", () => { const sql = `SELECT id FROM users WHERE status = 'active' AND age > 18`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "status", "age"])]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, - { namespace: "ns", name: USERS_TABLE.name, field: "age", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "age", transformations: [INDIRECT_FILTER] }, ]), ); }); @@ -786,82 +814,106 @@ describe("Dataset-Level Lineage: INDIRECT/FILTER (WHERE)", () => { test("WHERE with OR", () => { const sql = `SELECT id FROM users WHERE status = 'active' OR status = 'pending'`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "status"])]); const result = getExtendedLineage(ast as Select, schema); // Same column referenced twice, should be deduplicated expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, ]); }); test("WHERE with IN clause", () => { const sql = `SELECT id FROM users WHERE country IN ('US', 'UK', 'CA')`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "country"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_FILTER] }, ]); }); + test("WHERE with IN subquery", () => { + const sql = ` + SELECT id, name + FROM users + WHERE id IN (SELECT user_id FROM orders WHERE total > 100) + `; + const ast = parseSQL(sql); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); + const result = getExtendedLineage(ast as Select, schema); + + expect(sortDataset(result.dataset)).toEqual( + sortDataset([ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "total", transformations: [INDIRECT_FILTER] }, + ]), + ); + }); + test("WHERE with BETWEEN", () => { const sql = `SELECT id FROM users WHERE age BETWEEN 18 AND 65`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "age"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "age", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "age", transformations: [INDIRECT_FILTER] }, ]); }); test("WHERE with LIKE", () => { const sql = `SELECT id FROM users WHERE name LIKE 'John%'`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [INDIRECT_FILTER] }, ]); }); test("WHERE with IS NULL", () => { const sql = `SELECT id FROM users WHERE email IS NULL`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "email"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "email", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "email", transformations: [INDIRECT_FILTER] }, ]); }); test("WHERE with IS NOT NULL", () => { const sql = `SELECT id FROM users WHERE email IS NOT NULL`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "email"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "email", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "email", transformations: [INDIRECT_FILTER] }, ]); }); test("WHERE with nested complex conditions", () => { const sql = `SELECT id FROM users WHERE (status = 'active' AND age > 18) OR (country = 'US' AND verified = true)`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "status", "age", "country", "verified"]), + ]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, - { namespace: "ns", name: USERS_TABLE.name, field: "age", transformations: [INDIRECT_FILTER] }, - { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_FILTER] }, - { namespace: "ns", name: USERS_TABLE.name, field: "verified", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "age", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "verified", transformations: [INDIRECT_FILTER] }, ]), ); }); @@ -871,24 +923,24 @@ describe("Dataset-Level Lineage: INDIRECT/GROUP_BY", () => { test("simple GROUP BY single column", () => { const sql = `SELECT country, COUNT(*) FROM users GROUP BY country`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["country"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_GROUP_BY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_GROUP_BY] }, ]); }); test("GROUP BY multiple columns", () => { const sql = `SELECT country, city, COUNT(*) FROM users GROUP BY country, city`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["country", "city"])]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_GROUP_BY] }, - { namespace: "ns", name: USERS_TABLE.name, field: "city", transformations: [INDIRECT_GROUP_BY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_GROUP_BY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "city", transformations: [INDIRECT_GROUP_BY] }, ]), ); }); @@ -909,24 +961,24 @@ describe("Dataset-Level Lineage: INDIRECT/SORT (ORDER BY)", () => { test("simple ORDER BY single column", () => { const sql = `SELECT id, name FROM users ORDER BY created_at`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "created_at"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "created_at", transformations: [INDIRECT_SORT] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "created_at", transformations: [INDIRECT_SORT] }, ]); }); test("ORDER BY multiple columns", () => { const sql = `SELECT id, name FROM users ORDER BY country ASC, name DESC`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "country"])]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_SORT] }, - { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [INDIRECT_SORT] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_SORT] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [INDIRECT_SORT] }, ]), ); }); @@ -948,11 +1000,11 @@ describe("Dataset-Level Lineage: INDIRECT/SORT (ORDER BY)", () => { test("ORDER BY with NULLS LAST", () => { const sql = `SELECT id, name FROM users ORDER BY email NULLS LAST`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "email"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "email", transformations: [INDIRECT_SORT] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "email", transformations: [INDIRECT_SORT] }, ]); }); }); @@ -1009,115 +1061,6 @@ describe("Dataset-Level Lineage: INDIRECT/FILTER (HAVING)", () => { }); }); -describe("Dataset-Level Lineage: INDIRECT/WINDOW", () => { - test("window function with PARTITION BY only", () => { - const sql = ` - SELECT id, SUM(amount) OVER (PARTITION BY category) as category_total - FROM transactions - `; - const ast = parseSQL(sql); - const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.transactions`, ["id", "amount", "category"])]); - const result = getExtendedLineage(ast as Select, schema); - - expect(result.dataset).toEqual([ - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.transactions`, - field: "category", - transformations: [INDIRECT_WINDOW], - }, - ]); - }); - - test("window function with ORDER BY only", () => { - const sql = ` - SELECT id, ROW_NUMBER() OVER (ORDER BY created_at) as row_num - FROM events - `; - const ast = parseSQL(sql); - const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.events`, ["id", "created_at"])]); - const result = getExtendedLineage(ast as Select, schema); - - expect(result.dataset).toEqual([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.events`, field: "created_at", transformations: [INDIRECT_WINDOW] }, - ]); - }); - - test("window function with PARTITION BY and ORDER BY", () => { - const sql = ` - SELECT id, SUM(amount) OVER (PARTITION BY user_id ORDER BY created_at) as running_total - FROM transactions - `; - const ast = parseSQL(sql); - const schema = createNamespace("ns", [ - createTable(`${DEFAULT_SCHEMA}.transactions`, ["id", "amount", "user_id", "created_at"]), - ]); - const result = getExtendedLineage(ast as Select, schema); - - expect(sortDataset(result.dataset)).toEqual( - sortDataset([ - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.transactions`, - field: "user_id", - transformations: [INDIRECT_WINDOW], - }, - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.transactions`, - field: "created_at", - transformations: [INDIRECT_WINDOW], - }, - ]), - ); - }); - - test("multiple window functions", () => { - const sql = ` - SELECT - id, - ROW_NUMBER() OVER (PARTITION BY category ORDER BY created_at) as row_num, - SUM(amount) OVER (PARTITION BY user_id ORDER BY created_at) as running_total - FROM orders - `; - const ast = parseSQL(sql); - const schema = createNamespace("ns", [ - createTable(`${DEFAULT_SCHEMA}.orders`, ["id", "category", "created_at", "amount", "user_id"]), - ]); - const result = getExtendedLineage(ast as Select, schema); - - expect(sortDataset(result.dataset)).toEqual( - sortDataset([ - { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "category", transformations: [INDIRECT_WINDOW] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "created_at", transformations: [INDIRECT_WINDOW] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_WINDOW] }, - ]), - ); - }); - - test("RANK window function", () => { - const sql = ` - SELECT id, RANK() OVER (PARTITION BY department ORDER BY salary DESC) as salary_rank - FROM employees - `; - const ast = parseSQL(sql); - const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.employees`, ["id", "department", "salary"])]); - const result = getExtendedLineage(ast as Select, schema); - - expect(sortDataset(result.dataset)).toEqual( - sortDataset([ - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.employees`, - field: "department", - transformations: [INDIRECT_WINDOW], - }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [INDIRECT_WINDOW] }, - ]), - ); - }); -}); - // ============================================================================= // SECTION 3: COMBINED CLAUSES // ============================================================================= @@ -1131,14 +1074,17 @@ describe("Combined Clauses: JOIN + WHERE", () => { WHERE u.status = 'active' AND o.total > 100 `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "status"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [INDIRECT_JOIN] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [INDIRECT_JOIN] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_JOIN] }, - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "total", transformations: [INDIRECT_FILTER] }, ]), ); @@ -1184,12 +1130,12 @@ describe("Combined Clauses: GROUP BY + ORDER BY", () => { ORDER BY cnt DESC `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["country"])]); const result = getExtendedLineage(ast as Select, schema); // ORDER BY cnt references alias, which resolves to COUNT(*) - no additional lineage expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_GROUP_BY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_GROUP_BY] }, ]); }); }); @@ -1218,18 +1164,6 @@ describe("Combined Clauses: WINDOW + WHERE + ORDER BY", () => { field: "status", transformations: [INDIRECT_FILTER], }, - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.transactions`, - field: "category", - transformations: [INDIRECT_WINDOW], - }, - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.transactions`, - field: "created_at", - transformations: [INDIRECT_WINDOW], - }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.transactions`, @@ -1254,21 +1188,25 @@ describe("CTEs: Basic WITH clause", () => { SELECT id, name FROM active `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "status"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ id: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], }, name: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], }, }); // Dataset lineage should include the WHERE from the CTE expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, ]); }); @@ -1335,12 +1273,17 @@ describe("CTEs: Multiple CTEs", () => { JOIN orders_cte o ON u.id = o.user_id `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "status"]), + createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"]), + ]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ name: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], }, total_spent: { inputFields: [ @@ -1351,7 +1294,7 @@ describe("CTEs: Multiple CTEs", () => { expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.orders`, field: "user_id", transformations: [INDIRECT_GROUP_BY] }, ]), ); @@ -1416,18 +1359,18 @@ describe("Subqueries: FROM clause subquery", () => { ) sub `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["country", "status"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields.country?.inputFields).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [DIRECT_IDENTITY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [DIRECT_IDENTITY] }, ]); // Dataset lineage from subquery expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, - { namespace: "ns", name: USERS_TABLE.name, field: "country", transformations: [INDIRECT_GROUP_BY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "country", transformations: [INDIRECT_GROUP_BY] }, ]), ); }); @@ -1478,7 +1421,7 @@ describe("Set Operations: UNION", () => { `; const ast = parseSQL(sql, "postgresql"); const schema = createNamespace("ns", [ - USERS_TABLE, + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "status"]), createTable(`${DEFAULT_SCHEMA}.customers`, ["id", "name", "verified"]), ]); const result = getExtendedLineage(ast as Select, schema); @@ -1488,13 +1431,13 @@ describe("Set Operations: UNION", () => { sortInputFields({ id: { inputFields: [ - { namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.customers`, field: "id", transformations: [DIRECT_IDENTITY] }, ], }, name: { inputFields: [ - { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.customers`, field: "name", transformations: [DIRECT_IDENTITY] }, ], }, @@ -1504,7 +1447,7 @@ describe("Set Operations: UNION", () => { // Dataset lineage includes filters from both expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.customers`, field: "verified", transformations: [INDIRECT_FILTER] }, ]), ); @@ -1614,7 +1557,7 @@ describe("Set Operations: UNION", () => { `; const ast = parseSQL(sql, "postgresql"); const schema = createNamespace("ns", [ - USERS_TABLE, + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "region"]), createTable(`${DEFAULT_SCHEMA}.customers`, ["id", "region"]), createTable(`${DEFAULT_SCHEMA}.vendors`, ["id", "region"]), ]); @@ -1622,7 +1565,7 @@ describe("Set Operations: UNION", () => { expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "region", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "region", transformations: [INDIRECT_FILTER] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.customers`, field: "region", transformations: [INDIRECT_FILTER] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.vendors`, field: "region", transformations: [INDIRECT_FILTER] }, ]), @@ -1767,8 +1710,9 @@ describe("Set Operations: UNION", () => { inputFields: [ { namespace: "postgres", name: "users", field: "id", transformations: [DIRECT_IDENTITY] }, { namespace: "postgres", name: "customers", field: "id", transformations: [DIRECT_IDENTITY] }, - { namespace: "postgres", name: "users", field: "active", transformations: [INDIRECT_FILTER] }, - { namespace: "postgres", name: "customers", field: "verified", transformations: [INDIRECT_FILTER] }, + // TODO - add support for dataset lineage from subquery WHERE clauses + // { namespace: "postgres", name: "users", field: "active", transformations: [INDIRECT_FILTER] }, + // { namespace: "postgres", name: "customers", field: "verified", transformations: [INDIRECT_FILTER] }, ], }, }), @@ -1896,14 +1840,14 @@ describe("Set Operations: EXCEPT", () => { `; const ast = parseSQL(sql, "postgresql"); const schema = createNamespace("ns", [ - USERS_TABLE, + createTable(`${DEFAULT_SCHEMA}.users`, ["id", "active"]), createTable(`${DEFAULT_SCHEMA}.banned_users`, ["id", "ban_date"]), ]); const result = getExtendedLineage(ast as Select, schema); expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "active", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "active", transformations: [INDIRECT_FILTER] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.banned_users`, @@ -1923,16 +1867,19 @@ describe("Star Expansion", () => { test("SELECT * expands to all columns", () => { const sql = `SELECT * FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const usersTable = createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "email"]); + const schema = createNamespace("ns", [usersTable]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual( - USERS_TABLE.columns.reduce( + usersTable.columns.reduce( (acc, col) => { - acc[col] = { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: col, transformations: [DIRECT_IDENTITY] }], + return { + ...acc, + [col]: { + inputFields: [{ namespace: "ns", name: usersTable.name, field: col, transformations: [DIRECT_IDENTITY] }], + }, }; - return acc; }, {} as Record, ), @@ -1942,16 +1889,19 @@ describe("Star Expansion", () => { test("table.* with multiple tables", () => { const sql = `SELECT u.*, o.total FROM users u JOIN orders o ON u.id = o.user_id`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); + const usersTable = createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name", "email"]); + const schema = createNamespace("ns", [usersTable, createTable(`${DEFAULT_SCHEMA}.orders`, ["user_id", "total"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ - ...USERS_TABLE.columns.reduce( + ...usersTable.columns.reduce( (acc, col) => { - acc[col] = { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: col, transformations: [DIRECT_IDENTITY] }], + return { + ...acc, + [col]: { + inputFields: [{ namespace: "ns", name: usersTable.name, field: col, transformations: [DIRECT_IDENTITY] }], + }, }; - return acc; }, {} as Record, ), @@ -1978,13 +1928,15 @@ describe("Edge Cases", () => { ORDER BY status `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["status"])]); const result = getExtendedLineage(ast as Select, schema); // Field lineage expect(result.fields).toEqual({ status: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [DIRECT_IDENTITY] }, + ], }, cnt: { inputFields: [], @@ -1994,9 +1946,9 @@ describe("Edge Cases", () => { // Dataset lineage should have all three subtypes expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_GROUP_BY] }, - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_SORT] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_GROUP_BY] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_SORT] }, ]), ); }); @@ -2009,12 +1961,17 @@ describe("Edge Cases", () => { WHERE u.name LIKE 'A%' AND p.name LIKE 'Widget%' `; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE, createTable(`${DEFAULT_SCHEMA}.products`, ["id", "name"])]); + const schema = createNamespace("ns", [ + createTable(`${DEFAULT_SCHEMA}.users`, ["name", "favorite_product"]), + createTable(`${DEFAULT_SCHEMA}.products`, ["id", "name"]), + ]); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ user_name: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], }, product_name: { inputFields: [ @@ -2025,10 +1982,15 @@ describe("Edge Cases", () => { expect(sortDataset(result.dataset)).toEqual( sortDataset([ - { namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [INDIRECT_FILTER] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "name", transformations: [INDIRECT_FILTER] }, { namespace: "ns", name: `${DEFAULT_SCHEMA}.products`, field: "id", transformations: [INDIRECT_JOIN] }, - { namespace: "ns", name: USERS_TABLE.name, field: "favorite_product", transformations: [INDIRECT_JOIN] }, + { + namespace: "ns", + name: `${DEFAULT_SCHEMA}.users`, + field: "favorite_product", + transformations: [INDIRECT_JOIN], + }, ]), ); }); @@ -2036,19 +1998,19 @@ describe("Edge Cases", () => { test("deduplication of repeated column in same clause", () => { const sql = `SELECT id FROM users WHERE status = 'active' AND status != 'banned'`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "status"])]); const result = getExtendedLineage(ast as Select, schema); // status appears twice but should be deduplicated expect(result.dataset).toEqual([ - { namespace: "ns", name: USERS_TABLE.name, field: "status", transformations: [INDIRECT_FILTER] }, + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "status", transformations: [INDIRECT_FILTER] }, ]); }); test("empty dataset lineage when no indirect clauses", () => { const sql = `SELECT id, UPPER(name) as upper_name FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE]); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name"])]); const result = getExtendedLineage(ast as Select, schema); expect(result.dataset).toEqual([]); @@ -2119,8 +2081,6 @@ describe("Comprehensive: Everything Together", () => { // ========== FIELD-LEVEL LINEAGE ========== - // ========== FIELD-LEVEL LINEAGE ========== - expect(result.fields).toEqual({ store_name: { inputFields: [ @@ -2215,7 +2175,6 @@ describe("Comprehensive: Everything Together", () => { // ========== DATASET-LEVEL LINEAGE ========== - // TODO - FIX expect(sortDataset(result.dataset)).toEqual( sortDataset([ // FILTER from filtered_sales CTE (WHERE sale_date >= ... AND status = ...) @@ -2234,10 +2193,6 @@ describe("Comprehensive: Everything Together", () => { { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "region", transformations: [INDIRECT_SORT] }, // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "quantity", transformations: [INDIRECT_SORT] }, // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "unit_price", transformations: [INDIRECT_SORT] }, - // WINDOW from RANK() OVER (PARTITION BY s.region ORDER BY st.total_revenue DESC) - { namespace: "ns", name: `${DEFAULT_SCHEMA}.stores`, field: "region", transformations: [INDIRECT_WINDOW] }, - // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "quantity", transformations: [INDIRECT_WINDOW] }, - // { namespace: "ns", name: `${DEFAULT_SCHEMA}.sales`, field: "unit_price", transformations: [INDIRECT_WINDOW] }, ]), ); @@ -2463,25 +2418,6 @@ describe("Comprehensive: Everything Together", () => { field: "quantity", transformations: [INDIRECT_SORT], }, - // WINDOW lineage - PARTITION BY c.id ORDER BY revenue (quantity * price) - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.categories`, - field: "id", - transformations: [INDIRECT_WINDOW], - }, - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.order_items`, - field: "price", - transformations: [INDIRECT_WINDOW], - }, - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.order_items`, - field: "quantity", - transformations: [INDIRECT_WINDOW], - }, ]), ); }); @@ -2668,11 +2604,11 @@ describe("Comprehensive: Everything Together", () => { }, // FILTER from main query (WHERE d.active = true) { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "active", transformations: [INDIRECT_FILTER] }, - // FILTER from dept_stats CTE (HAVING COUNT(id) >= 3) - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "id", transformations: [INDIRECT_FILTER] }, + // FILTER from dept_stats CTE (HAVING COUNT(id) >= 3) TODO + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "id", transformations: [INDIRECT_FILTER] }, // JOIN from main query (ds.department_id = d.id) { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "id", transformations: [INDIRECT_JOIN] }, - // GROUP BY from dept_stats CTE (GROUP BY department_id) + // GROUP BY from dept_stats CTE (GROUP BY department_id) TODO // { // namespace: "ns", // name: `${DEFAULT_SCHEMA}.employees`, @@ -2681,17 +2617,8 @@ describe("Comprehensive: Everything Together", () => { // }, // SORT from main query (ORDER BY d.location, ds.total_compensation DESC) { namespace: "ns", name: `${DEFAULT_SCHEMA}.departments`, field: "location", transformations: [INDIRECT_SORT] }, - { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [INDIRECT_SORT] }, - // WINDOW from DENSE_RANK() OVER (ORDER BY ds.total_compensation DESC) - // { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [INDIRECT_WINDOW] }, - // WINDOW from ROW_NUMBER() OVER (PARTITION BY d.location ORDER BY ds.headcount DESC) - { - namespace: "ns", - name: `${DEFAULT_SCHEMA}.departments`, - field: "location", - transformations: [INDIRECT_WINDOW], - }, - // { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "id", transformations: [INDIRECT_WINDOW] }, + // TODO - fix + // { namespace: "ns", name: `${DEFAULT_SCHEMA}.employees`, field: "salary", transformations: [INDIRECT_SORT] }, ]), ); }); @@ -2796,15 +2723,19 @@ describe("Default Schema Handling", () => { test("matches table with default schema", () => { const sql = `SELECT id, name FROM users`; const ast = parseSQL(sql); - const schema = createNamespace("ns", [USERS_TABLE], "public"); + const schema = createNamespace("ns", [createTable(`${DEFAULT_SCHEMA}.users`, ["id", "name"])], "public"); const result = getExtendedLineage(ast as Select, schema); expect(result.fields).toEqual({ id: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "id", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "id", transformations: [DIRECT_IDENTITY] }, + ], }, name: { - inputFields: [{ namespace: "ns", name: USERS_TABLE.name, field: "name", transformations: [DIRECT_IDENTITY] }], + inputFields: [ + { namespace: "ns", name: `${DEFAULT_SCHEMA}.users`, field: "name", transformations: [DIRECT_IDENTITY] }, + ], }, }); });