diff --git a/.changeset/postgres-kit-parity-tszod.md b/.changeset/postgres-kit-parity-tszod.md new file mode 100644 index 0000000..677cc6b --- /dev/null +++ b/.changeset/postgres-kit-parity-tszod.md @@ -0,0 +1,5 @@ +--- +"smooai-clickhouse-kit": minor +--- + +Add TS + Zod code emit (codegen feature) for schema/consumer parity with postgres-kit. diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f16e00..24261ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # @smooai/clickhouse-kit +## Unreleased + +### Patch Changes + +- Docs/metadata: neutralized external-toolkit references in the ROADMAP, package description/keywords, and source comments (now described as a schema-as-code toolkit with Zod schema emitters). No API change. + +### Minor Changes + +- TS + Zod code emit behind the `codegen` cargo feature (`src/codegen.rs`) — from a `TableSpec`, emit a TS row `interface`, a Zod **select** schema, and a Zod **insert** schema (columns with a ClickHouse `DEFAULT` become `.optional()`), for schema/consumer parity with `postgres-kit`. Mirrors the retired TS package's `createSelectSchema`/`createInsertSchema` output style: `camelCase` keys, 4-space formatting, and the same ClickHouse→TS/Zod type mapping (`String`/`UUID`/dates→`string`/`z.string()`, ints/floats→`number`/`z.number()`, `Bool`→`boolean`, `Array(String)`→`string[]`/`z.array(z.string())`, `Map(String,String)`→`Record`/`z.record(z.string(),z.string())`, `JSON`→`unknown`/`z.unknown()`, `Nullable(T)`→optional `T | null`/`.nullable()`, `LowCardinality(T)` transparent → `T`). + ## 0.2.0 ### Minor Changes diff --git a/README.md b/README.md index 99f1dc6..b9d2e90 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,8 @@ let ddl = to_create_table_sql(&table, &SchemaLimits::default())?; Every identifier is validated (`^[A-Za-z_][A-Za-z0-9_]*$` + a length bound, backtick-quoted on render), column counts are bounded, and `ORDER BY` entries must be real columns — so a malicious table/column name can't inject SQL. +Need an explicit precision/timezone? Use the parametrised `DateTime64` type — `{"datetime64": {"precision": 3, "timezone": "UTC"}}` renders `DateTime64(3, 'UTC')` (bare `"DateTime64"` still renders `DateTime64(3)`). Precision (`0..=9`) and the timezone charset (`^[A-Za-z0-9_+/-]{1,64}$`, the IANA shape) are validated before they reach SQL, so an untrusted timezone string can't inject. + ## The flexible (hybrid) table The most-reused multi-tenant shape in one call — your mandatory + promoted typed columns, plus an `attrs Map(String, String)` catch-all and a `raw String`: @@ -71,6 +73,40 @@ let table = flexible_table( )?; ``` +## Production-table DDL: partitioning, TTL, indexes, settings + +Real production tables need `PARTITION BY`, a TTL policy, data-skipping indexes, and `SETTINGS`. `TableSpec` (and `FlexibleConfig`) carry these as additive fields, rendered in canonical ClickHouse clause order — `ENGINE` → `PARTITION BY` → `ORDER BY` → `TTL` → `SETTINGS`, with `INDEX` lines inside the column parens: + +```rust +use clickhouse_kit::{IndexSpec, TableSpec, TtlMove, TtlSpec}; + +let table = TableSpec { + // ...columns, engine, order_by... + partition_by: Some("(organization_id, toDate(started_at))".into()), + indexes: vec![IndexSpec { + name: "idx_trace_id".into(), + expression: "trace_id".into(), + type_def: "bloom_filter(0.01)".into(), + granularity: 1, + }], + ttl: Some(TtlSpec { + column: "started_at".into(), + move_to_volume_after: Some(TtlMove { interval: "14 DAY".into(), volume: "cold".into() }), + delete_after: Some("180 DAY".into()), + }), + settings: vec![ + ("storage_policy".into(), "'hot_cold'".into()), + ("index_granularity".into(), "8192".into()), + ], + // ... +}; +// TTL toDateTime(started_at) + INTERVAL 14 DAY TO VOLUME 'cold', toDateTime(started_at) + INTERVAL 180 DAY DELETE +``` + +A `DateTime64` TTL column is automatically wrapped in `toDateTime(...)`. All four fields are optional/empty by default, so existing specs render exactly as before. + +**Safety posture:** these knobs are **app-controlled raw fragments** emitted verbatim — `partition_by`, the index `expression`/`type_def`, the TTL `interval`/`volume`/`delete_after`, and the settings RHS values are _not_ validated, exactly like `engine`. Only identifiers are validated: the index `name`, and the TTL `column` (which must also be a real column in the table). Never build the raw fragments from untrusted input. + ## Ingest: flatten + coerce Shape an arbitrary record to a (possibly dynamic) table — known keys land in their columns, the long tail flattens into `attrs`, and `raw` captures the original: diff --git a/ROADMAP.md b/ROADMAP.md index b430d57..a7796a7 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -2,7 +2,7 @@ ## v0.1 (shipped) — static, developer-authored schemas -"Drizzle for ClickHouse": a developer authors a table once, at compile time, as a literal — `clickhouseTable(name, columns, options)` → `toCreateTableSql` (DDL) + inferred row type (`InferSelect`) + `createSelectSchema`/`createInsertSchema` (drizzle-zod) + forward-only migrations (`generate`/`migrate`/`check`, no auto-diff). Column system: `ch.*` (`ChColumn`). Minimal, TS-only, forward-only, MIT. +A schema-as-code toolkit for ClickHouse: a developer authors a table once, at compile time, as a literal — `clickhouseTable(name, columns, options)` → `toCreateTableSql` (DDL) + inferred row type (`InferSelect`) + `createSelectSchema`/`createInsertSchema` (Zod schema emitters) + forward-only migrations (`generate`/`migrate`/`check`, no auto-diff). Column system: `ch.*` (`ChColumn`). Minimal, TS-only, forward-only, MIT. ## v0.2 — the safe foundation for flexible, user-driven, multi-tenant schemas @@ -40,3 +40,5 @@ The flexible / multi-tenant surface (runtime construction, the safety layer, fla Started: the Rust **safety core** (`crates/clickhouse-kit/src/safety.rs`) — `validate_identifier`/`quote_identifier`, the `ColumnTypeSpec` allowlist (+ `to_ch_type`/`is_datetime64`), bounds + reserved — plus runtime **table DDL generation** (`table.rs`: `to_create_table_sql` from an untrusted spec, with identifier/allowlist/bounds/dup guards). Verified **end-to-end against a real ClickHouse** via testcontainers (generate DDL → apply → introspect `system.columns` → insert/select round-trip); the ported adversarial unit suite (injection, disallowed types, bounds, dup columns) is green too. CI runs unit + the testcontainers integration. **Full surface landed (built via a 4-way Rust fan-out, lead-integrated):** `flexible_table` (the hybrid), `flatten_record` + `coerce_to_table`, `diff_columns` + `alter_add_columns_sql` (additive-only), and the I/O layer — a driver-agnostic `ChExecutor` trait + `run_migrations` (forward-only) + `check_drift` — with a second testcontainers integration exercising migrate + drift against a real ClickHouse. **38 unit + 2 real-ClickHouse integration tests, clippy `-D warnings` clean.** Published to crates.io as **`smooai-clickhouse-kit`** (manual `publish-crate.yml`, `SMOOAI_CARGO_REGISTRY_TOKEN`). No WASM binding — there are no TS consumers; the Rust services consume the crate directly. Rows stay Serde-native. + +**TS + Zod code emit (the `codegen` feature):** to keep parity with `postgres-kit`'s TS/Zod emitter, `src/codegen.rs` (behind the off-by-default `codegen` cargo feature, no extra deps) emits from a `TableSpec` — a TS row `interface`, a Zod **select** schema, and a Zod **insert** schema (columns with a ClickHouse `DEFAULT` become `.optional()`). This is the Rust-canonical port of the retired TS package's `createSelectSchema`/`createInsertSchema`: same `camelCase` keys, 4-space formatting, and ClickHouse→TS/Zod type mapping (`Nullable(T)` → optional `T | null`/`.nullable()`, `LowCardinality(T)` transparent, `Array(String)`/`Map(String,String)`/`JSON` handled). Golden snapshot unit tests cover `Nullable`, `Array`, `Map`, `LowCardinality`, `DateTime64`, and enum-ish `LowCardinality(String)`. diff --git a/crates/clickhouse-kit/Cargo.lock b/crates/clickhouse-kit/Cargo.lock index 0a7f22a..985408e 100644 --- a/crates/clickhouse-kit/Cargo.lock +++ b/crates/clickhouse-kit/Cargo.lock @@ -2068,7 +2068,7 @@ checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" [[package]] name = "smooai-clickhouse-kit" -version = "0.1.0" +version = "0.2.0" dependencies = [ "clickhouse", "serde", diff --git a/crates/clickhouse-kit/Cargo.toml b/crates/clickhouse-kit/Cargo.toml index 530687f..3f47ebb 100644 --- a/crates/clickhouse-kit/Cargo.toml +++ b/crates/clickhouse-kit/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "smooai-clickhouse-kit" -version = "0.1.0" +version = "0.2.0" edition = "2021" rust-version = "1.75" license = "MIT" @@ -18,6 +18,11 @@ categories = ["database", "data-structures"] name = "clickhouse_kit" path = "src/lib.rs" +[features] +# TS + Zod code emit from a `TableSpec` (see `src/codegen.rs`) — for schema/consumer +# parity with postgres-kit. Off by default; pulls in no extra dependencies. +codegen = [] + [dependencies] serde = { version = "1", features = ["derive"] } serde_json = "1" diff --git a/crates/clickhouse-kit/README.md b/crates/clickhouse-kit/README.md index 99f1dc6..b9d2e90 100644 --- a/crates/clickhouse-kit/README.md +++ b/crates/clickhouse-kit/README.md @@ -51,6 +51,8 @@ let ddl = to_create_table_sql(&table, &SchemaLimits::default())?; Every identifier is validated (`^[A-Za-z_][A-Za-z0-9_]*$` + a length bound, backtick-quoted on render), column counts are bounded, and `ORDER BY` entries must be real columns — so a malicious table/column name can't inject SQL. +Need an explicit precision/timezone? Use the parametrised `DateTime64` type — `{"datetime64": {"precision": 3, "timezone": "UTC"}}` renders `DateTime64(3, 'UTC')` (bare `"DateTime64"` still renders `DateTime64(3)`). Precision (`0..=9`) and the timezone charset (`^[A-Za-z0-9_+/-]{1,64}$`, the IANA shape) are validated before they reach SQL, so an untrusted timezone string can't inject. + ## The flexible (hybrid) table The most-reused multi-tenant shape in one call — your mandatory + promoted typed columns, plus an `attrs Map(String, String)` catch-all and a `raw String`: @@ -71,6 +73,40 @@ let table = flexible_table( )?; ``` +## Production-table DDL: partitioning, TTL, indexes, settings + +Real production tables need `PARTITION BY`, a TTL policy, data-skipping indexes, and `SETTINGS`. `TableSpec` (and `FlexibleConfig`) carry these as additive fields, rendered in canonical ClickHouse clause order — `ENGINE` → `PARTITION BY` → `ORDER BY` → `TTL` → `SETTINGS`, with `INDEX` lines inside the column parens: + +```rust +use clickhouse_kit::{IndexSpec, TableSpec, TtlMove, TtlSpec}; + +let table = TableSpec { + // ...columns, engine, order_by... + partition_by: Some("(organization_id, toDate(started_at))".into()), + indexes: vec![IndexSpec { + name: "idx_trace_id".into(), + expression: "trace_id".into(), + type_def: "bloom_filter(0.01)".into(), + granularity: 1, + }], + ttl: Some(TtlSpec { + column: "started_at".into(), + move_to_volume_after: Some(TtlMove { interval: "14 DAY".into(), volume: "cold".into() }), + delete_after: Some("180 DAY".into()), + }), + settings: vec![ + ("storage_policy".into(), "'hot_cold'".into()), + ("index_granularity".into(), "8192".into()), + ], + // ... +}; +// TTL toDateTime(started_at) + INTERVAL 14 DAY TO VOLUME 'cold', toDateTime(started_at) + INTERVAL 180 DAY DELETE +``` + +A `DateTime64` TTL column is automatically wrapped in `toDateTime(...)`. All four fields are optional/empty by default, so existing specs render exactly as before. + +**Safety posture:** these knobs are **app-controlled raw fragments** emitted verbatim — `partition_by`, the index `expression`/`type_def`, the TTL `interval`/`volume`/`delete_after`, and the settings RHS values are _not_ validated, exactly like `engine`. Only identifiers are validated: the index `name`, and the TTL `column` (which must also be a real column in the table). Never build the raw fragments from untrusted input. + ## Ingest: flatten + coerce Shape an arbitrary record to a (possibly dynamic) table — known keys land in their columns, the long tail flattens into `attrs`, and `raw` captures the original: diff --git a/crates/clickhouse-kit/src/codegen.rs b/crates/clickhouse-kit/src/codegen.rs new file mode 100644 index 0000000..e056a4a --- /dev/null +++ b/crates/clickhouse-kit/src/codegen.rs @@ -0,0 +1,449 @@ +//! TS + Zod code emit for a [`TableSpec`] — the Rust-canonical port of the retired +//! `@smooai/clickhouse-kit` `createSelectSchema`/`createInsertSchema` emitter. +//! +//! From a [`TableSpec`] this emits, mirroring the retired TS package's output style: +//! - a TS row `interface` (one field per column), +//! - a Zod **select** schema (`z.object(...)`), and +//! - a Zod **insert** schema (columns with a ClickHouse `DEFAULT` become `.optional()`). +//! +//! The ClickHouse-type → TS/Zod mapping: +//! +//! | ClickHouse | TS | Zod | +//! | ------------------------- | ------------------------ | ------------------------------------ | +//! | `String` | `string` | `z.string()` | +//! | `UUID` | `string` | `z.string()` | +//! | `Bool` | `boolean` | `z.boolean()` | +//! | `Int*` / `UInt*` | `number` | `z.number()` | +//! | `Float32` / `Float64` | `number` | `z.number()` | +//! | `Date`/`DateTime`/`DateTime64` | `string` | `z.string()` | +//! | `JSON` | `unknown` | `z.unknown()` | +//! | `Nullable(T)` | `T \| null` (optional `?`) | `.nullable()` | +//! | `LowCardinality(T)` | `T` | `` | +//! | `Array(String)` | `string[]` | `z.array(z.string())` | +//! | `Map(String, String)` | `Record` | `z.record(z.string(), z.string())` | +//! +//! Keys are emitted in `camelCase` (ClickHouse columns are conventionally +//! `snake_case`), the type/schema names are derived from the table name, and the +//! output uses 4-space indentation for parity with how `postgres-kit` emits. + +use crate::safety::{ColumnTypeSpec, ScalarType}; +use crate::table::{ColumnSpec, TableSpec}; + +// ── Naming helpers ─────────────────────────────────────────────────────────── + +/// `snake_case` / `kebab-case` → `camelCase` (e.g. `organization_id` → `organizationId`). +fn to_camel_case(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + let mut upper_next = false; + let mut first = true; + for c in s.chars() { + if c == '_' || c == '-' { + // Don't uppercase across a leading separator; keep it as a boundary. + upper_next = !first; + continue; + } + if upper_next { + out.extend(c.to_uppercase()); + upper_next = false; + } else { + out.push(c); + } + first = false; + } + out +} + +/// `snake_case` → `PascalCase` (e.g. `observability_traces` → `ObservabilityTraces`). +fn to_pascal_case(s: &str) -> String { + let camel = to_camel_case(s); + let mut chars = camel.chars(); + match chars.next() { + Some(first) => first.to_uppercase().collect::() + chars.as_str(), + None => camel, + } +} + +// ── Type mapping ───────────────────────────────────────────────────────────── + +fn scalar_ts(s: ScalarType) -> &'static str { + match s { + ScalarType::String + | ScalarType::Uuid + | ScalarType::Date + | ScalarType::DateTime + | ScalarType::DateTime64 => "string", + ScalarType::Bool => "boolean", + ScalarType::Int8 + | ScalarType::Int16 + | ScalarType::Int32 + | ScalarType::Int64 + | ScalarType::UInt8 + | ScalarType::UInt16 + | ScalarType::UInt32 + | ScalarType::UInt64 + | ScalarType::Float32 + | ScalarType::Float64 => "number", + ScalarType::Json => "unknown", + } +} + +fn scalar_zod(s: ScalarType) -> &'static str { + match s { + ScalarType::String + | ScalarType::Uuid + | ScalarType::Date + | ScalarType::DateTime + | ScalarType::DateTime64 => "z.string()", + ScalarType::Bool => "z.boolean()", + ScalarType::Int8 + | ScalarType::Int16 + | ScalarType::Int32 + | ScalarType::Int64 + | ScalarType::UInt8 + | ScalarType::UInt16 + | ScalarType::UInt32 + | ScalarType::UInt64 + | ScalarType::Float32 + | ScalarType::Float64 => "z.number()", + ScalarType::Json => "z.unknown()", + } +} + +/// The TS type for a column spec. `Nullable(T)` widens to `T | null`; +/// `LowCardinality(T)` is transparent (renders as `T`). +fn ts_type(spec: &ColumnTypeSpec) -> String { + match spec { + ColumnTypeSpec::Scalar(s) => scalar_ts(*s).to_string(), + ColumnTypeSpec::DateTime64 { .. } => "string".to_string(), + ColumnTypeSpec::Nullable { nullable } => format!("{} | null", ts_type(nullable)), + ColumnTypeSpec::LowCardinality { low_cardinality } => ts_type(low_cardinality), + ColumnTypeSpec::Array { .. } => "string[]".to_string(), + ColumnTypeSpec::Map { .. } => "Record".to_string(), + } +} + +/// The Zod expression for a column spec. `Nullable(T)` appends `.nullable()`; +/// `LowCardinality(T)` is transparent (renders as the inner Zod). +fn zod_type(spec: &ColumnTypeSpec) -> String { + match spec { + ColumnTypeSpec::Scalar(s) => scalar_zod(*s).to_string(), + ColumnTypeSpec::DateTime64 { .. } => "z.string()".to_string(), + ColumnTypeSpec::Nullable { nullable } => format!("{}.nullable()", zod_type(nullable)), + ColumnTypeSpec::LowCardinality { low_cardinality } => zod_type(low_cardinality), + ColumnTypeSpec::Array { .. } => "z.array(z.string())".to_string(), + ColumnTypeSpec::Map { .. } => "z.record(z.string(), z.string())".to_string(), + } +} + +/// Whether a column is nullable (a `Nullable(...)` at the core, seen through any +/// transparent `LowCardinality(...)` wrappers). Nullable columns become optional +/// (`field?`) in the emitted interface. +fn is_nullable(spec: &ColumnTypeSpec) -> bool { + match spec { + ColumnTypeSpec::Nullable { .. } => true, + ColumnTypeSpec::LowCardinality { low_cardinality } => is_nullable(low_cardinality), + _ => false, + } +} + +// ── Emit ───────────────────────────────────────────────────────────────────── + +/// The TS interface name for a table, e.g. `observability_traces` → `ObservabilityTracesRow`. +pub fn row_type_name(table: &TableSpec) -> String { + format!("{}Row", to_pascal_case(&table.name)) +} + +/// The Zod select-schema const name, e.g. `observabilityTracesSelectSchema`. +pub fn select_schema_name(table: &TableSpec) -> String { + format!("{}SelectSchema", to_camel_case(&table.name)) +} + +/// The Zod insert-schema const name, e.g. `observabilityTracesInsertSchema`. +pub fn insert_schema_name(table: &TableSpec) -> String { + format!("{}InsertSchema", to_camel_case(&table.name)) +} + +/// Emit the TS row `interface` for a table (one field per column). +pub fn emit_row_interface(table: &TableSpec) -> String { + let mut out = format!("export interface {} {{\n", row_type_name(table)); + for c in &table.columns { + let optional = if is_nullable(&c.type_spec) { "?" } else { "" }; + out.push_str(&format!( + " {}{}: {};\n", + to_camel_case(&c.name), + optional, + ts_type(&c.type_spec) + )); + } + out.push('}'); + out +} + +fn emit_zod_object(name: &str, columns: &[ColumnSpec], insert: bool) -> String { + let mut out = format!("export const {name} = z.object({{\n"); + for c in columns { + let mut zod = zod_type(&c.type_spec); + // Columns with a ClickHouse DEFAULT are optional on insert (the server fills them). + if insert && c.default.is_some() { + zod.push_str(".optional()"); + } + out.push_str(&format!(" {}: {},\n", to_camel_case(&c.name), zod)); + } + out.push_str("});"); + out +} + +/// Emit the Zod **select** schema (`z.object(...)`) for a table. +pub fn emit_select_schema(table: &TableSpec) -> String { + emit_zod_object(&select_schema_name(table), &table.columns, false) +} + +/// Emit the Zod **insert** schema — columns with a `DEFAULT` become `.optional()`. +pub fn emit_insert_schema(table: &TableSpec) -> String { + emit_zod_object(&insert_schema_name(table), &table.columns, true) +} + +/// Emit a full TS module for a table: the `zod` import, the row interface, and the +/// select + insert schemas, separated by blank lines. +pub fn emit_ts_module(table: &TableSpec) -> String { + format!( + "import {{ z }} from \"zod\";\n\n{}\n\n{}\n\n{}\n", + emit_row_interface(table), + emit_select_schema(table), + emit_insert_schema(table), + ) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::safety::{ScalarType, StringOnly}; + + fn col(name: &str, t: ColumnTypeSpec) -> ColumnSpec { + ColumnSpec { + name: name.into(), + type_spec: t, + default: None, + } + } + + fn lc(inner: ColumnTypeSpec) -> ColumnTypeSpec { + ColumnTypeSpec::LowCardinality { + low_cardinality: Box::new(inner), + } + } + + fn nullable(inner: ColumnTypeSpec) -> ColumnTypeSpec { + ColumnTypeSpec::Nullable { + nullable: Box::new(inner), + } + } + + /// A representative table covering every interesting case: scalar, UUID, + /// DateTime64, Bool, numeric, `Array(String)`, `Map(String, String)`, JSON, + /// enum-ish `LowCardinality(String)`, `LowCardinality(Nullable(String))`, and a + /// `DEFAULT`-bearing column (optional on insert). + fn sample() -> TableSpec { + TableSpec { + name: "events".into(), + columns: vec![ + col("id", ColumnTypeSpec::Scalar(ScalarType::Uuid)), + col( + "occurred_at", + ColumnTypeSpec::Scalar(ScalarType::DateTime64), + ), + // enum-ish LowCardinality(String) + col("status", lc(ColumnTypeSpec::Scalar(ScalarType::String))), + // nullable, through a LowCardinality wrapper + col( + "region", + lc(nullable(ColumnTypeSpec::Scalar(ScalarType::String))), + ), + col("score", ColumnTypeSpec::Scalar(ScalarType::Float64)), + col("retry_count", ColumnTypeSpec::Scalar(ScalarType::UInt32)), + col("is_error", ColumnTypeSpec::Scalar(ScalarType::Bool)), + col( + "tags", + ColumnTypeSpec::Array { + array: StringOnly::String, + }, + ), + col( + "attributes", + ColumnTypeSpec::Map { + map: (StringOnly::String, StringOnly::String), + }, + ), + col("payload", ColumnTypeSpec::Scalar(ScalarType::Json)), + ColumnSpec { + name: "ingested_at".into(), + type_spec: ColumnTypeSpec::Scalar(ScalarType::DateTime), + default: Some("now()".into()), + }, + ], + engine: "MergeTree()".into(), + order_by: vec!["id".into()], + partition_by: None, + ttl: None, + indexes: vec![], + settings: vec![], + } + } + + #[test] + fn names_are_derived_from_table_name() { + let t = TableSpec { + name: "observability_traces".into(), + ..sample() + }; + assert_eq!(row_type_name(&t), "ObservabilityTracesRow"); + assert_eq!(select_schema_name(&t), "observabilityTracesSelectSchema"); + assert_eq!(insert_schema_name(&t), "observabilityTracesInsertSchema"); + } + + #[test] + fn golden_row_interface() { + let expected = "\ +export interface EventsRow { + id: string; + occurredAt: string; + status: string; + region?: string | null; + score: number; + retryCount: number; + isError: boolean; + tags: string[]; + attributes: Record; + payload: unknown; + ingestedAt: string; +}"; + assert_eq!(emit_row_interface(&sample()), expected); + } + + #[test] + fn golden_select_schema() { + let expected = "\ +export const eventsSelectSchema = z.object({ + id: z.string(), + occurredAt: z.string(), + status: z.string(), + region: z.string().nullable(), + score: z.number(), + retryCount: z.number(), + isError: z.boolean(), + tags: z.array(z.string()), + attributes: z.record(z.string(), z.string()), + payload: z.unknown(), + ingestedAt: z.string(), +});"; + assert_eq!(emit_select_schema(&sample()), expected); + } + + #[test] + fn golden_insert_schema_makes_default_columns_optional() { + let expected = "\ +export const eventsInsertSchema = z.object({ + id: z.string(), + occurredAt: z.string(), + status: z.string(), + region: z.string().nullable(), + score: z.number(), + retryCount: z.number(), + isError: z.boolean(), + tags: z.array(z.string()), + attributes: z.record(z.string(), z.string()), + payload: z.unknown(), + ingestedAt: z.string().optional(), +});"; + assert_eq!(emit_insert_schema(&sample()), expected); + } + + #[test] + fn golden_full_module() { + let expected = "\ +import { z } from \"zod\"; + +export interface EventsRow { + id: string; + occurredAt: string; + status: string; + region?: string | null; + score: number; + retryCount: number; + isError: boolean; + tags: string[]; + attributes: Record; + payload: unknown; + ingestedAt: string; +} + +export const eventsSelectSchema = z.object({ + id: z.string(), + occurredAt: z.string(), + status: z.string(), + region: z.string().nullable(), + score: z.number(), + retryCount: z.number(), + isError: z.boolean(), + tags: z.array(z.string()), + attributes: z.record(z.string(), z.string()), + payload: z.unknown(), + ingestedAt: z.string(), +}); + +export const eventsInsertSchema = z.object({ + id: z.string(), + occurredAt: z.string(), + status: z.string(), + region: z.string().nullable(), + score: z.number(), + retryCount: z.number(), + isError: z.boolean(), + tags: z.array(z.string()), + attributes: z.record(z.string(), z.string()), + payload: z.unknown(), + ingestedAt: z.string().optional(), +}); +"; + assert_eq!(emit_ts_module(&sample()), expected); + } + + #[test] + fn parametrised_datetime64_maps_to_string() { + let dt: ColumnTypeSpec = + serde_json::from_str(r#"{"datetime64":{"precision":6,"timezone":"UTC"}}"#).unwrap(); + let t = TableSpec { + name: "t".into(), + columns: vec![col("occurred_at", dt)], + ..sample() + }; + assert!(emit_row_interface(&t).contains("occurredAt: string;")); + assert!(emit_select_schema(&t).contains("occurredAt: z.string()")); + } + + #[test] + fn nullable_scalar_without_low_cardinality_is_optional_and_nullable() { + let t = TableSpec { + name: "t".into(), + columns: vec![col( + "note", + nullable(ColumnTypeSpec::Scalar(ScalarType::String)), + )], + ..sample() + }; + assert!(emit_row_interface(&t).contains("note?: string | null;")); + assert!(emit_select_schema(&t).contains("note: z.string().nullable(),")); + } + + #[test] + fn camel_case_helper() { + assert_eq!(to_camel_case("organization_id"), "organizationId"); + assert_eq!(to_camel_case("started_at"), "startedAt"); + assert_eq!(to_camel_case("id"), "id"); + assert_eq!(to_camel_case("_leading"), "leading"); + assert_eq!( + to_pascal_case("observability_traces"), + "ObservabilityTraces" + ); + } +} diff --git a/crates/clickhouse-kit/src/drift.rs b/crates/clickhouse-kit/src/drift.rs index fc76e78..8ffe5e7 100644 --- a/crates/clickhouse-kit/src/drift.rs +++ b/crates/clickhouse-kit/src/drift.rs @@ -143,6 +143,10 @@ mod tests { columns: vec![col("id", ScalarType::Uuid), col("name", ScalarType::String)], engine: "MergeTree()".into(), order_by: vec!["id".into()], + partition_by: None, + ttl: None, + indexes: vec![], + settings: vec![], } } diff --git a/crates/clickhouse-kit/src/evolve.rs b/crates/clickhouse-kit/src/evolve.rs index ac7fd7f..83e826e 100644 --- a/crates/clickhouse-kit/src/evolve.rs +++ b/crates/clickhouse-kit/src/evolve.rs @@ -87,6 +87,10 @@ mod tests { ], engine: "MergeTree()".into(), order_by: vec!["id".into()], + partition_by: None, + ttl: None, + indexes: vec![], + settings: vec![], } } diff --git a/crates/clickhouse-kit/src/flatten.rs b/crates/clickhouse-kit/src/flatten.rs index 25a0aca..7ed198d 100644 --- a/crates/clickhouse-kit/src/flatten.rs +++ b/crates/clickhouse-kit/src/flatten.rs @@ -185,6 +185,10 @@ mod tests { ], engine: "MergeTree()".into(), order_by: vec!["id".into()], + partition_by: None, + ttl: None, + indexes: vec![], + settings: vec![], } } @@ -300,6 +304,10 @@ mod tests { columns: vec![col("id", ColumnTypeSpec::Scalar(ScalarType::String))], engine: "MergeTree()".into(), order_by: vec!["id".into()], + partition_by: None, + ttl: None, + indexes: vec![], + settings: vec![], }; let input = json!({ "id": "abc", "extra": "x" }); let res = coerce_to_table(input, &table, &FlattenOptions::default()); diff --git a/crates/clickhouse-kit/src/flexible.rs b/crates/clickhouse-kit/src/flexible.rs index 23d551b..d2c356c 100644 --- a/crates/clickhouse-kit/src/flexible.rs +++ b/crates/clickhouse-kit/src/flexible.rs @@ -12,7 +12,7 @@ use crate::safety::{ assert_not_reserved, validate_identifier, ColumnTypeSpec, ScalarType, SchemaError, SchemaLimits, StringOnly, DEFAULT_RESERVED_COLUMNS, }; -use crate::table::{ColumnSpec, TableSpec}; +use crate::table::{ColumnSpec, IndexSpec, TableSpec, TtlSpec}; /// Configuration for a flexible/hybrid table. /// @@ -26,6 +26,15 @@ pub struct FlexibleConfig { pub engine: String, pub order_by: Vec, pub reserved: Option>, + /// App-controlled raw `PARTITION BY` expression — copied verbatim into the + /// produced [`TableSpec`]. See [`TableSpec`] for the safety posture. + pub partition_by: Option, + /// Optional table TTL policy — copied verbatim into the produced [`TableSpec`]. + pub ttl: Option, + /// Secondary data-skipping indexes — copied verbatim into the produced [`TableSpec`]. + pub indexes: Vec, + /// App-controlled `SETTINGS` pairs — copied verbatim into the produced [`TableSpec`]. + pub settings: Vec<(String, String)>, } /// Build a [`TableSpec`] for the flexible/hybrid table shape: the mandatory + promoted @@ -74,6 +83,10 @@ pub fn flexible_table( columns, engine: config.engine, order_by: config.order_by, + partition_by: config.partition_by, + ttl: config.ttl, + indexes: config.indexes, + settings: config.settings, }) } @@ -100,6 +113,10 @@ mod tests { engine: "MergeTree()".into(), order_by: vec!["org_id".into(), "ts".into()], reserved: None, + partition_by: None, + ttl: None, + indexes: vec![], + settings: vec![], } } @@ -117,6 +134,43 @@ mod tests { assert!(ddl.contains("ORDER BY (org_id, ts)")); } + #[test] + fn carries_partition_ttl_indexes_settings_into_ddl() { + use crate::table::{IndexSpec, TtlMove, TtlSpec}; + + let mut cfg = config(); + cfg.partition_by = Some("(org_id, toDate(ts))".into()); + cfg.ttl = Some(TtlSpec { + column: "ts".into(), + move_to_volume_after: Some(TtlMove { + interval: "7 DAY".into(), + volume: "cold".into(), + }), + delete_after: Some("90 DAY".into()), + }); + cfg.indexes = vec![IndexSpec { + name: "idx_status".into(), + expression: "status".into(), + type_def: "bloom_filter(0.01)".into(), + granularity: 1, + }]; + cfg.settings = vec![("index_granularity".into(), "8192".into())]; + + let spec = flexible_table("events", cfg, &SchemaLimits::default()).unwrap(); + let ddl = to_create_table_sql(&spec, &SchemaLimits::default()).unwrap(); + + assert!(ddl.contains("PARTITION BY (org_id, toDate(ts))"), "{ddl}"); + assert!( + ddl.contains(" INDEX idx_status status TYPE bloom_filter(0.01) GRANULARITY 1"), + "{ddl}" + ); + assert!( + ddl.contains("TTL toDateTime(ts) + INTERVAL 7 DAY TO VOLUME 'cold', toDateTime(ts) + INTERVAL 90 DAY DELETE"), + "{ddl}" + ); + assert!(ddl.contains("SETTINGS index_granularity = 8192"), "{ddl}"); + } + #[test] fn rejects_promoted_column_colliding_with_reserved() { let mut cfg = config(); diff --git a/crates/clickhouse-kit/src/lib.rs b/crates/clickhouse-kit/src/lib.rs index 45ed75b..9473afa 100644 --- a/crates/clickhouse-kit/src/lib.rs +++ b/crates/clickhouse-kit/src/lib.rs @@ -11,6 +11,8 @@ //! input. See the repo `ROADMAP.md`. pub mod client; +#[cfg(feature = "codegen")] +pub mod codegen; pub mod drift; pub mod evolve; pub mod flatten; @@ -27,6 +29,7 @@ pub use flexible::{flexible_table, FlexibleConfig}; pub use migrate::{run_migrations, split_sql_statements, MigrationRunResult}; pub use safety::{ assert_column_count, assert_not_reserved, quote_identifier, validate_identifier, - ColumnTypeSpec, ScalarType, SchemaError, SchemaLimits, StringOnly, DEFAULT_RESERVED_COLUMNS, + ColumnTypeSpec, DateTime64Spec, ScalarType, SchemaError, SchemaLimits, StringOnly, + DEFAULT_RESERVED_COLUMNS, }; -pub use table::{to_create_table_sql, ColumnSpec, TableSpec}; +pub use table::{to_create_table_sql, ColumnSpec, IndexSpec, TableSpec, TtlMove, TtlSpec}; diff --git a/crates/clickhouse-kit/src/safety.rs b/crates/clickhouse-kit/src/safety.rs index f77824f..5fce196 100644 --- a/crates/clickhouse-kit/src/safety.rs +++ b/crates/clickhouse-kit/src/safety.rs @@ -30,6 +30,8 @@ pub enum SchemaError { ReservedColumn(String), #[error("duplicate column name {0:?}")] DuplicateColumn(String), + #[error("invalid DateTime64 precision: {precision} (must be 0..=9)")] + InvalidDateTime64Precision { precision: u8 }, } /// Size bounds for a schema. @@ -60,6 +62,18 @@ fn is_valid_identifier(name: &str) -> bool { chars.all(|c| c.is_ascii_alphanumeric() || c == '_') } +/// Whether `tz` is a plausible IANA timezone name: 1..=64 chars from the +/// `[A-Za-z0-9_+/-]` charset (covers names like `UTC`, `America/New_York`, +/// `Etc/GMT+5`). Anything outside this charset (quotes, semicolons, spaces) is +/// rejected, so an untrusted timezone string cannot inject SQL. +fn is_valid_timezone(tz: &str) -> bool { + !tz.is_empty() + && tz.len() <= 64 + && tz + .chars() + .all(|c| c.is_ascii_alphanumeric() || matches!(c, '_' | '+' | '/' | '-')) +} + /// Validate a table/column identifier against the strict ASCII allowlist + length /// bound. `kind` is `"table"` / `"column"` / `"identifier"` for error messages. pub fn validate_identifier<'a>( @@ -171,6 +185,46 @@ pub enum StringOnly { String, } +fn default_dt64_precision() -> u8 { + 3 +} + +/// A parametrised `DateTime64(precision[, 'timezone'])` column type. +/// +/// **Safety posture:** `precision` and `timezone` may come from untrusted JSON, so +/// they are **validated before rendering** (via [`DateTime64Spec::validate`], called +/// from the table builder's per-column loop): `precision` must be `0..=9` and +/// `timezone` must match the IANA charset `^[A-Za-z0-9_+/-]{1,64}$`. The default +/// (bare `{"datetime64":{}}`) is `DateTime64(3)`, matching the legacy +/// [`ScalarType::DateTime64`] rendering. +#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +pub struct DateTime64Spec { + #[serde(default = "default_dt64_precision")] + pub precision: u8, + #[serde(default)] + pub timezone: Option, +} + +impl DateTime64Spec { + /// Validate the (possibly untrusted) precision + timezone before they reach SQL. + pub fn validate(&self) -> Result<(), SchemaError> { + if self.precision > 9 { + return Err(SchemaError::InvalidDateTime64Precision { + precision: self.precision, + }); + } + if let Some(tz) = &self.timezone { + if !is_valid_timezone(tz) { + return Err(SchemaError::InvalidIdentifier { + kind: "timezone", + name: tz.clone(), + }); + } + } + Ok(()) + } +} + /// A column type as supplied by untrusted input — the allowlisted recursive shape. /// Mirrors the TS `ColumnTypeSpec`: a bare scalar string, or a single-key wrapper /// object (`nullable` / `lowCardinality` / `array` / `map`). @@ -178,6 +232,12 @@ pub enum StringOnly { #[serde(untagged)] pub enum ColumnTypeSpec { Scalar(ScalarType), + /// Parametrised `DateTime64(precision[, 'timezone'])`. JSON: + /// `{"datetime64": {"precision": 3, "timezone": "UTC"}}`. The single `datetime64` + /// key keeps the untagged match unambiguous against the other wrappers. + DateTime64 { + datetime64: DateTime64Spec, + }, Nullable { nullable: Box, }, @@ -195,9 +255,17 @@ pub enum ColumnTypeSpec { impl ColumnTypeSpec { /// The ClickHouse type string for this spec. + /// + /// For [`ColumnTypeSpec::DateTime64`] this trusts the spec to be valid; untrusted + /// precision/timezone must be checked first via [`ColumnTypeSpec::validate`] (the + /// table builder does this in its per-column loop). pub fn to_ch_type(&self) -> String { match self { ColumnTypeSpec::Scalar(s) => s.ch_type().to_string(), + ColumnTypeSpec::DateTime64 { datetime64 } => match &datetime64.timezone { + Some(tz) => format!("DateTime64({}, '{}')", datetime64.precision, tz), + None => format!("DateTime64({})", datetime64.precision), + }, ColumnTypeSpec::Nullable { nullable } => format!("Nullable({})", nullable.to_ch_type()), ColumnTypeSpec::LowCardinality { low_cardinality } => { format!("LowCardinality({})", low_cardinality.to_ch_type()) @@ -208,15 +276,31 @@ impl ColumnTypeSpec { } /// Whether a `DateTime64` is at the core (so a TTL move expression must wrap it - /// in `toDateTime(...)`). Propagates through `Nullable`/`LowCardinality`. + /// in `toDateTime(...)`). Propagates through `Nullable`/`LowCardinality` and covers + /// both the bare [`ScalarType::DateTime64`] and the parametrised + /// [`ColumnTypeSpec::DateTime64`] variant. pub fn is_datetime64(&self) -> bool { match self { ColumnTypeSpec::Scalar(ScalarType::DateTime64) => true, + ColumnTypeSpec::DateTime64 { .. } => true, ColumnTypeSpec::Nullable { nullable } => nullable.is_datetime64(), ColumnTypeSpec::LowCardinality { low_cardinality } => low_cardinality.is_datetime64(), _ => false, } } + + /// Validate any embedded untrusted parameters (currently the parametrised + /// `DateTime64` precision + timezone) before this type is rendered to SQL. + /// Recurses through `Nullable`/`LowCardinality`. Identifier-shaped scalars/arrays/ + /// maps have nothing to validate here. + pub fn validate(&self) -> Result<(), SchemaError> { + match self { + ColumnTypeSpec::DateTime64 { datetime64 } => datetime64.validate(), + ColumnTypeSpec::Nullable { nullable } => nullable.validate(), + ColumnTypeSpec::LowCardinality { low_cardinality } => low_cardinality.validate(), + _ => Ok(()), + } + } } #[cfg(test)] @@ -327,4 +411,77 @@ mod tests { ); } } + + #[test] + fn parametrised_datetime64_renders_and_validates() { + // Full precision + timezone. + let utc: ColumnTypeSpec = + serde_json::from_str(r#"{"datetime64":{"precision":3,"timezone":"UTC"}}"#).unwrap(); + assert_eq!(utc.to_ch_type(), "DateTime64(3, 'UTC')"); + assert!(utc.is_datetime64()); + assert!(utc.validate().is_ok()); + + // Precision only, no timezone. + let p6: ColumnTypeSpec = serde_json::from_str(r#"{"datetime64":{"precision":6}}"#).unwrap(); + assert_eq!(p6.to_ch_type(), "DateTime64(6)"); + assert!(p6.validate().is_ok()); + + // Empty object → defaults to DateTime64(3), matching the legacy scalar. + let def: ColumnTypeSpec = serde_json::from_str(r#"{"datetime64":{}}"#).unwrap(); + assert_eq!(def.to_ch_type(), "DateTime64(3)"); + assert!(def.is_datetime64()); + assert!(def.validate().is_ok()); + + // The bare string still deserializes to the legacy scalar variant. + let bare: ColumnTypeSpec = serde_json::from_str("\"DateTime64\"").unwrap(); + assert!(matches!( + bare, + ColumnTypeSpec::Scalar(ScalarType::DateTime64) + )); + + // A real IANA name with a slash + plus is accepted. + let tz: ColumnTypeSpec = + serde_json::from_str(r#"{"datetime64":{"precision":9,"timezone":"America/New_York"}}"#) + .unwrap(); + assert_eq!(tz.to_ch_type(), "DateTime64(9, 'America/New_York')"); + assert!(tz.validate().is_ok()); + } + + #[test] + fn parametrised_datetime64_rejects_bad_params() { + // Injection attempt in the timezone string. + let bad_tz: ColumnTypeSpec = + serde_json::from_str(r#"{"datetime64":{"precision":3,"timezone":"UTC'; DROP"}}"#) + .unwrap(); + assert!(matches!( + bad_tz.validate(), + Err(SchemaError::InvalidIdentifier { + kind: "timezone", + .. + }) + )); + + // Out-of-range precision. + let bad_p: ColumnTypeSpec = + serde_json::from_str(r#"{"datetime64":{"precision":12}}"#).unwrap(); + assert!(matches!( + bad_p.validate(), + Err(SchemaError::InvalidDateTime64Precision { precision: 12 }) + )); + } + + #[test] + fn parametrised_datetime64_is_datetime64_through_nullable() { + let n: ColumnTypeSpec = + serde_json::from_str(r#"{"nullable":{"datetime64":{"precision":3,"timezone":"UTC"}}}"#) + .unwrap(); + assert!(n.is_datetime64()); + assert_eq!(n.to_ch_type(), "Nullable(DateTime64(3, 'UTC'))"); + assert!(n.validate().is_ok()); + + // Validation propagates through the wrapper too. + let bad: ColumnTypeSpec = + serde_json::from_str(r#"{"nullable":{"datetime64":{"precision":12}}}"#).unwrap(); + assert!(bad.validate().is_err()); + } } diff --git a/crates/clickhouse-kit/src/table.rs b/crates/clickhouse-kit/src/table.rs index 360de8c..b0db60a 100644 --- a/crates/clickhouse-kit/src/table.rs +++ b/crates/clickhouse-kit/src/table.rs @@ -16,14 +16,70 @@ pub struct ColumnSpec { pub default: Option, } +/// A secondary data-skipping index. +/// +/// **Safety posture:** `name` is identifier-validated. `expression` and `type_def` +/// are **app-controlled raw SQL** (like [`TableSpec::engine`]) — they are emitted +/// verbatim, so never build them from untrusted input. +#[derive(Debug, Clone)] +pub struct IndexSpec { + pub name: String, + /// Raw, app-controlled index expression, e.g. `"trace_id"` or a real expression. + pub expression: String, + /// Raw, app-controlled index type, e.g. `"bloom_filter(0.01)"` or + /// `"tokenbf_v1(8192, 3, 0)"`. + pub type_def: String, + pub granularity: u32, +} + +/// A move-to-volume TTL tier. +/// +/// **Safety posture:** both fields are app-controlled raw fragments emitted verbatim. +#[derive(Debug, Clone)] +pub struct TtlMove { + /// Raw INTERVAL fragment, e.g. `"14 DAY"`. + pub interval: String, + /// Volume name, e.g. `"cold"`. + pub volume: String, +} + +/// Table TTL policy. +/// +/// **Safety posture:** `column` is identifier-validated **and** must be a real column +/// in the table. `interval`/`volume`/`delete_after` are app-controlled raw fragments +/// emitted verbatim — never build them from untrusted input. +#[derive(Debug, Clone)] +pub struct TtlSpec { + pub column: String, + pub move_to_volume_after: Option, + /// Raw INTERVAL fragment for the DELETE tier, e.g. `"180 DAY"`. + pub delete_after: Option, +} + /// A table built from a runtime spec. `engine` is app-controlled (not user input); /// `order_by` entries are validated as column identifiers. +/// +/// **Safety posture for the production-DDL knobs** (`partition_by`, `ttl`, `indexes`, +/// `settings`): these are **app-controlled raw fragments** emitted verbatim, with the +/// sole exception that identifiers (`ttl.column`, `indexes[].name`) are validated and +/// `ttl.column` must be a real column. Never build the raw fragments from untrusted +/// input. #[derive(Debug, Clone)] pub struct TableSpec { pub name: String, pub columns: Vec, pub engine: String, pub order_by: Vec, + /// App-controlled raw `PARTITION BY` expression, e.g. + /// `"(organization_id, toDate(started_at))"`. + pub partition_by: Option, + /// Optional table TTL policy. + pub ttl: Option, + /// Secondary data-skipping indexes rendered inside the column parens. + pub indexes: Vec, + /// App-controlled `SETTINGS` pairs (key, raw-value RHS), e.g. + /// `("storage_policy", "'hot_cold'")`, `("index_granularity", "8192")`. + pub settings: Vec<(String, String)>, } /// Render the `CREATE TABLE IF NOT EXISTS` DDL for a runtime spec, enforcing @@ -42,6 +98,9 @@ pub fn to_create_table_sql( if !seen.insert(c.name.as_str()) { return Err(SchemaError::DuplicateColumn(c.name.clone())); } + // Validate any untrusted type parameters (e.g. parametrised DateTime64 + // precision/timezone) before they reach the rendered SQL. + c.type_spec.validate()?; let default = c .default .as_deref() @@ -68,13 +127,76 @@ pub fn to_create_table_sql( } } - Ok(format!( - "CREATE TABLE IF NOT EXISTS {} (\n{}\n)\nENGINE = {}\nORDER BY ({})", + // Secondary indexes render inside the column parens. `name` is identifier-validated; + // `expression`/`type_def` are app-controlled raw SQL emitted verbatim. + let mut paren_lines = col_lines; + for idx in &table.indexes { + validate_identifier(&idx.name, "index", limits)?; + paren_lines.push(format!( + " INDEX {} {} TYPE {} GRANULARITY {}", + idx.name, idx.expression, idx.type_def, idx.granularity + )); + } + + let mut sql = format!( + "CREATE TABLE IF NOT EXISTS {} (\n{}\n)\nENGINE = {}", table.name, - col_lines.join(",\n"), + paren_lines.join(",\n"), table.engine, - table.order_by.join(", "), - )) + ); + + // PARTITION BY sits between ENGINE and ORDER BY. + if let Some(partition_by) = &table.partition_by { + sql.push_str(&format!("\nPARTITION BY {partition_by}")); + } + + sql.push_str(&format!("\nORDER BY ({})", table.order_by.join(", "))); + + // TTL: the column must be a real, validated column. DateTime64 columns are wrapped + // in `toDateTime(...)` for the TTL expression; everything else uses the bare column. + if let Some(ttl) = &table.ttl { + validate_identifier(&ttl.column, "column", limits)?; + if !known.contains(ttl.column.as_str()) { + return Err(SchemaError::InvalidIdentifier { + kind: "ttl column", + name: ttl.column.clone(), + }); + } + let type_spec = table + .columns + .iter() + .find(|c| c.name == ttl.column) + .map(|c| &c.type_spec); + let base = match type_spec { + Some(ts) if ts.is_datetime64() => format!("toDateTime({})", ttl.column), + _ => ttl.column.clone(), + }; + let mut parts = Vec::new(); + if let Some(mv) = &ttl.move_to_volume_after { + parts.push(format!( + "{base} + INTERVAL {} TO VOLUME '{}'", + mv.interval, mv.volume + )); + } + if let Some(after) = &ttl.delete_after { + parts.push(format!("{base} + INTERVAL {after} DELETE")); + } + if !parts.is_empty() { + sql.push_str(&format!("\nTTL {}", parts.join(", "))); + } + } + + // SETTINGS render last. Values are app-controlled raw RHS fragments. + if !table.settings.is_empty() { + let rendered: Vec = table + .settings + .iter() + .map(|(k, v)| format!("{k} = {v}")) + .collect(); + sql.push_str(&format!("\nSETTINGS {}", rendered.join(", "))); + } + + Ok(sql) } #[cfg(test)] @@ -107,6 +229,10 @@ mod tests { ], engine: "MergeTree()".into(), order_by: vec!["id".into()], + partition_by: None, + ttl: None, + indexes: vec![], + settings: vec![], } } @@ -142,4 +268,247 @@ mod tests { t.order_by = vec!["nope".into()]; assert!(to_create_table_sql(&t, &SchemaLimits::default()).is_err()); } + + /// The live `observability_traces` table — a real production DDL with partitioning, + /// two data-skipping indexes, a two-tier TTL, and settings. + fn observability_traces() -> TableSpec { + TableSpec { + name: "observability_traces".into(), + columns: vec![ + col("started_at", ColumnTypeSpec::Scalar(ScalarType::DateTime64)), + col( + "organization_id", + ColumnTypeSpec::LowCardinality { + low_cardinality: Box::new(ColumnTypeSpec::Scalar(ScalarType::String)), + }, + ), + col("trace_id", ColumnTypeSpec::Scalar(ScalarType::String)), + col("name", ColumnTypeSpec::Scalar(ScalarType::String)), + col( + "service_name", + ColumnTypeSpec::LowCardinality { + low_cardinality: Box::new(ColumnTypeSpec::Scalar(ScalarType::String)), + }, + ), + col("has_error", ColumnTypeSpec::Scalar(ScalarType::UInt8)), + col( + "attributes", + ColumnTypeSpec::Map { + map: ( + crate::safety::StringOnly::String, + crate::safety::StringOnly::String, + ), + }, + ), + ColumnSpec { + name: "ingested_at".into(), + type_spec: ColumnTypeSpec::Scalar(ScalarType::DateTime), + default: Some("now()".into()), + }, + ], + engine: "MergeTree()".into(), + order_by: vec![ + "organization_id".into(), + "service_name".into(), + "started_at".into(), + "trace_id".into(), + ], + partition_by: Some("(organization_id, toDate(started_at))".into()), + ttl: Some(TtlSpec { + column: "started_at".into(), + move_to_volume_after: Some(TtlMove { + interval: "14 DAY".into(), + volume: "cold".into(), + }), + delete_after: Some("180 DAY".into()), + }), + indexes: vec![ + IndexSpec { + name: "idx_trace_id".into(), + expression: "trace_id".into(), + type_def: "bloom_filter(0.01)".into(), + granularity: 1, + }, + IndexSpec { + name: "idx_name".into(), + expression: "name".into(), + type_def: "tokenbf_v1(8192, 3, 0)".into(), + granularity: 1, + }, + ], + settings: vec![ + ("storage_policy".into(), "'hot_cold'".into()), + ("index_granularity".into(), "8192".into()), + ], + } + } + + #[test] + fn reproduces_observability_traces_production_ddl() { + let ddl = to_create_table_sql(&observability_traces(), &SchemaLimits::default()).unwrap(); + + // Partitioning between ENGINE and ORDER BY. + assert!( + ddl.contains("PARTITION BY (organization_id, toDate(started_at))"), + "{ddl}" + ); + // Both INDEX lines, verbatim, inside the column parens. + assert!( + ddl.contains(" INDEX idx_trace_id trace_id TYPE bloom_filter(0.01) GRANULARITY 1"), + "{ddl}" + ); + assert!( + ddl.contains(" INDEX idx_name name TYPE tokenbf_v1(8192, 3, 0) GRANULARITY 1"), + "{ddl}" + ); + // TTL line, verbatim — started_at is DateTime64 → wrapped in toDateTime(...). + assert!( + ddl.contains("TTL toDateTime(started_at) + INTERVAL 14 DAY TO VOLUME 'cold', toDateTime(started_at) + INTERVAL 180 DAY DELETE"), + "{ddl}" + ); + // SETTINGS line, verbatim, last. + assert!( + ddl.contains("SETTINGS storage_policy = 'hot_cold', index_granularity = 8192"), + "{ddl}" + ); + + // Clause ordering sanity: ENGINE < PARTITION BY < ORDER BY < TTL < SETTINGS. + let pos = |needle: &str| ddl.find(needle).unwrap(); + assert!(pos("ENGINE = MergeTree()") < pos("PARTITION BY")); + assert!(pos("PARTITION BY") < pos("ORDER BY (")); + assert!(pos("ORDER BY (") < pos("TTL ")); + assert!(pos("TTL ") < pos("SETTINGS ")); + } + + #[test] + fn ttl_on_plain_datetime_is_not_wrapped() { + let mut t = sample(); + t.columns + .push(col("created", ColumnTypeSpec::Scalar(ScalarType::DateTime))); + t.ttl = Some(TtlSpec { + column: "created".into(), + move_to_volume_after: None, + delete_after: Some("30 DAY".into()), + }); + let ddl = to_create_table_sql(&t, &SchemaLimits::default()).unwrap(); + assert!( + ddl.contains("TTL created + INTERVAL 30 DAY DELETE"), + "{ddl}" + ); + assert!(!ddl.contains("toDateTime(created)"), "{ddl}"); + } + + #[test] + fn ttl_delete_only_renders_just_delete() { + let mut t = sample(); + // `ts` is DateTime64 → wrapped. + t.ttl = Some(TtlSpec { + column: "ts".into(), + move_to_volume_after: None, + delete_after: Some("90 DAY".into()), + }); + let ddl = to_create_table_sql(&t, &SchemaLimits::default()).unwrap(); + assert!( + ddl.contains("TTL toDateTime(ts) + INTERVAL 90 DAY DELETE"), + "{ddl}" + ); + assert!(!ddl.contains("TO VOLUME"), "{ddl}"); + } + + #[test] + fn ttl_unknown_column_is_rejected() { + let mut t = sample(); + t.ttl = Some(TtlSpec { + column: "nope".into(), + move_to_volume_after: None, + delete_after: Some("1 DAY".into()), + }); + assert!(matches!( + to_create_table_sql(&t, &SchemaLimits::default()), + Err(SchemaError::InvalidIdentifier { + kind: "ttl column", + .. + }) + )); + } + + #[test] + fn index_with_invalid_name_is_rejected() { + let mut t = sample(); + t.indexes = vec![IndexSpec { + name: "bad name".into(), + expression: "name".into(), + type_def: "bloom_filter(0.01)".into(), + granularity: 1, + }]; + assert!(matches!( + to_create_table_sql(&t, &SchemaLimits::default()), + Err(SchemaError::InvalidIdentifier { kind: "index", .. }) + )); + } + + #[test] + fn backward_compat_no_extra_clauses() { + // With all the new knobs absent, the output is exactly the legacy shape: + // no PARTITION BY / TTL / SETTINGS lines, no trailing INDEX lines. + let ddl = to_create_table_sql(&sample(), &SchemaLimits::default()).unwrap(); + let expected = "CREATE TABLE IF NOT EXISTS events (\n id UUID,\n ts DateTime64(3),\n name String,\n value Float64,\n tags Array(String)\n)\nENGINE = MergeTree()\nORDER BY (id)"; + assert_eq!(ddl, expected); + } + + #[test] + fn parametrised_datetime64_column_renders_with_timezone() { + let mut t = sample(); + let dt: ColumnTypeSpec = + serde_json::from_str(r#"{"datetime64":{"precision":3,"timezone":"UTC"}}"#).unwrap(); + t.columns.push(col("occurred_at", dt)); + let ddl = to_create_table_sql(&t, &SchemaLimits::default()).unwrap(); + assert!(ddl.contains("occurred_at DateTime64(3, 'UTC')"), "{ddl}"); + } + + #[test] + fn parametrised_datetime64_bad_params_rejected_at_ddl_boundary() { + // Bad timezone is caught in the per-column loop, before reaching SQL. + let mut t = sample(); + let bad_tz: ColumnTypeSpec = + serde_json::from_str(r#"{"datetime64":{"precision":3,"timezone":"UTC'; DROP"}}"#) + .unwrap(); + t.columns.push(col("occurred_at", bad_tz)); + assert!(matches!( + to_create_table_sql(&t, &SchemaLimits::default()), + Err(SchemaError::InvalidIdentifier { + kind: "timezone", + .. + }) + )); + + // Out-of-range precision is also caught. + let mut t2 = sample(); + let bad_p: ColumnTypeSpec = + serde_json::from_str(r#"{"datetime64":{"precision":12}}"#).unwrap(); + t2.columns.push(col("occurred_at", bad_p)); + assert!(matches!( + to_create_table_sql(&t2, &SchemaLimits::default()), + Err(SchemaError::InvalidDateTime64Precision { precision: 12 }) + )); + } + + #[test] + fn ttl_wraps_parametrised_datetime64_column() { + let mut t = sample(); + let dt: ColumnTypeSpec = + serde_json::from_str(r#"{"datetime64":{"precision":3,"timezone":"UTC"}}"#).unwrap(); + t.columns.push(col("occurred_at", dt)); + t.ttl = Some(TtlSpec { + column: "occurred_at".into(), + move_to_volume_after: None, + delete_after: Some("30 DAY".into()), + }); + let ddl = to_create_table_sql(&t, &SchemaLimits::default()).unwrap(); + // The parametrised DateTime64 column is still wrapped in toDateTime(...). + assert!( + ddl.contains("TTL toDateTime(occurred_at) + INTERVAL 30 DAY DELETE"), + "{ddl}" + ); + } } diff --git a/crates/clickhouse-kit/tests/integration.rs b/crates/clickhouse-kit/tests/integration.rs index d190bc2..d605303 100644 --- a/crates/clickhouse-kit/tests/integration.rs +++ b/crates/clickhouse-kit/tests/integration.rs @@ -41,6 +41,10 @@ fn events_table() -> TableSpec { ], engine: "MergeTree()".into(), order_by: vec!["id".into()], + partition_by: None, + ttl: None, + indexes: vec![], + settings: vec![], } } diff --git a/crates/clickhouse-kit/tests/integration_io.rs b/crates/clickhouse-kit/tests/integration_io.rs index e75055b..5fed1d6 100644 --- a/crates/clickhouse-kit/tests/integration_io.rs +++ b/crates/clickhouse-kit/tests/integration_io.rs @@ -102,6 +102,10 @@ fn events_spec() -> TableSpec { ], engine: "MergeTree()".into(), order_by: vec!["id".into()], + partition_by: None, + ttl: None, + indexes: vec![], + settings: vec![], } } diff --git a/package.json b/package.json index f65256a..8d430c0 100644 --- a/package.json +++ b/package.json @@ -1,11 +1,10 @@ { "name": "@smooai/clickhouse-kit", "version": "0.2.0", - "description": "Drizzle-shaped, TS-only schema toolkit for ClickHouse — schema-as-code, drizzle-zod-style schemas, forward-only migrations, and drift detection.", + "description": "TS-only schema-as-code toolkit for ClickHouse — declarative schemas, Zod schema emitters, forward-only migrations, and drift detection.", "keywords": [ "clickhouse", "ddl", - "drizzle", "migrations", "olap", "schema", diff --git a/src/__tests__/kit.test.ts b/src/__tests__/kit.test.ts index 5cb3e4e..03d5da7 100644 --- a/src/__tests__/kit.test.ts +++ b/src/__tests__/kit.test.ts @@ -76,7 +76,7 @@ describe("materialized view → DDL", () => { }); }); -describe("drizzle-zod ergonomics", () => { +describe("Zod schema emitters", () => { it("createSelectSchema parses a row and rejects a wrong type", () => { expect(selectEventSchema.safeParse(validEventRow).success).toBe(true); expect(selectEventSchema.safeParse({ ...validEventRow, value: "nope" }).success).toBe(false); diff --git a/src/generate.ts b/src/generate.ts index 0c784f0..979cd6b 100644 --- a/src/generate.ts +++ b/src/generate.ts @@ -5,7 +5,7 @@ // `CREATE` DDL for any object not yet captured. There is NO auto-diff engine: // this only ever appends a brand-new CREATE for a not-yet-migrated table/MV. // Schema *changes* to an already-migrated object are hand-authored as a fresh -// migration, exactly like Drizzle custom migrations. +// migration, as custom SQL. import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; import path from "node:path"; diff --git a/src/index.ts b/src/index.ts index de58b9a..4a9ff86 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,7 +1,7 @@ // @smooai/clickhouse-kit — public API. // -// "Drizzle for ClickHouse": define a table once, get the DDL, the inferred row -// type, and drizzle-zod-style select/insert schemas. Forward-only migrations +// Schema-as-code for ClickHouse: define a table once, get the DDL, the inferred +// row type, and select/insert Zod schemas. Forward-only migrations // (no auto-diff engine) that ride your own ClickHouse client + a drift gate. export { diff --git a/src/kit.ts b/src/kit.ts index 125594b..d529ce4 100644 --- a/src/kit.ts +++ b/src/kit.ts @@ -1,10 +1,10 @@ -// @smooai/clickhouse-kit — TS-only, Drizzle-shaped schema kit for ClickHouse. +// @smooai/clickhouse-kit — TS-only, schema-as-code kit for ClickHouse. // // Define a table ONCE with `clickhouseTable(...)` and get: the `CREATE TABLE` DDL // (`toCreateTableSql`), the inferred TS row type (`typeof table.$inferSelect`), -// and select/insert Zod (`createSelectSchema`/`createInsertSchema`) — mirroring -// the Drizzle + drizzle-zod ergonomics. Forward-only by design: there is NO schema -// differ (schema changes are hand-authored migrations, like Drizzle custom SQL). +// and select/insert Zod (`createSelectSchema`/`createInsertSchema`). Forward-only +// by design: there is NO schema differ (schema changes are hand-authored +// migrations as custom SQL). import { z } from "zod"; @@ -175,12 +175,12 @@ export function toCreateTableSql(table: ChTable): string return parts.join("\n"); } -// ── Zod (drizzle-zod ergonomics) ───────────────────────────────────────────── +// ── Zod schema emitters ────────────────────────────────────────────────────── type ShapeOf = { [K in keyof C]: C[K] extends ChColumn ? z.ZodType : never; }; -/** Zod schema for a row as read from ClickHouse. Pass `overrides` to refine columns (like drizzle-zod). */ +/** Zod schema for a row as read from ClickHouse. Pass `overrides` to refine columns. */ export function createSelectSchema( table: ChTable, overrides?: Partial>,