From 1f54abc3386f6057acddfc4cf0b1c7e939dc9b0a Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 10:41:22 +0200 Subject: [PATCH 01/10] fix(query): collapse scalar agg without `by:` to ONE row MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `(select {s: (sum a) from: t})` was returning N copies of the same value instead of a single row. The projection-only path lowered aggregates as ordinary column expressions, so OP_SELECT saw a scalar atom and broadcast it to the input row count (exec.c: vec->type<0 -> broadcast_scalar). Route the all-aggregate / no-by case through ray_group(n_keys=0), which already has a 1-row scalar-aggregate fast path. WHERE is pre-executed (same pattern as the by-with-where fuse path) so the lazy g->selection bitmap reaches the reduction. The n_keys==0 parallel scalar path was effectively dead code before this and its FIRST/LAST merge silently relied on worker-id order matching row-index order — broken under work-stealing dispatch. Force serial execution when FIRST/LAST is in play; the DA path stays parallel and tracks per-slot first_row/last_row already. Two existing tests asserted the buggy broadcast row count (groupby_aggregators.rfl:64, group_coverage.rfl:417); updated to the correct 1-row expectation. --- src/ops/group.c | 12 ++- src/ops/query.c | 98 +++++++++++++++++--- test/rfl/integration/groupby_aggregators.rfl | 5 +- test/rfl/ops/group_coverage.rfl | 5 +- test/rfl/table/select.rfl | 11 +++ 5 files changed, 113 insertions(+), 18 deletions(-) diff --git a/src/ops/group.c b/src/ops/group.c index 4665155f..a437bd3c 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -2647,6 +2647,7 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, /* ---- Scalar aggregate fast path (n_keys == 0): flat vector scan ---- */ if (n_keys == 0 && nrows > 0) { uint8_t need_flags = DA_NEED_COUNT; + bool has_first_last = false; for (uint8_t a = 0; a < n_aggs; a++) { uint16_t aop = ext->agg_ops[a]; if (aop == OP_SUM || aop == OP_PROD || aop == OP_AVG || aop == OP_FIRST || aop == OP_LAST) @@ -2655,6 +2656,7 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, { need_flags |= DA_NEED_SUM; need_flags |= DA_NEED_SUMSQ; } else if (aop == OP_MIN) need_flags |= DA_NEED_MIN; else if (aop == OP_MAX) need_flags |= DA_NEED_MAX; + if (aop == OP_FIRST || aop == OP_LAST) has_first_last = true; } void* agg_ptrs[vla_aggs]; @@ -2670,7 +2672,15 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, } ray_pool_t* sc_pool = ray_pool_get(); - uint32_t sc_n = (sc_pool && nrows >= RAY_PARALLEL_THRESHOLD) + /* Pool dispatch is work-stealing: chunks may be processed out of + * row-index order across workers, so the "count[0]==1" sentinel + * scalar_accum_row uses for FIRST (and the always-overwrite for + * LAST) only yields the per-worker first/last, not the global + * one. The merge step then picks worker[0]'s FIRST regardless + * of which range it actually covered. Force serial execution + * when FIRST/LAST is in play; the DA path (which does track + * per-slot row bounds) is still preferred when we have keys. */ + uint32_t sc_n = (sc_pool && nrows >= RAY_PARALLEL_THRESHOLD && !has_first_last) ? 
ray_pool_total_workers(sc_pool) : 1; ray_t* sc_hdr; diff --git a/src/ops/query.c b/src/ops/query.c index 34d01bf4..4d437a60 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -3170,27 +3170,97 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); } } else if (n_out > 0) { - /* Projection only (no group by) — select specific columns */ - ray_op_t* col_ops[16]; - uint8_t nc = 0; + /* No `by:` but explicit output expressions. + * + * Two sub-cases: + * (a) All outputs are aggregates → scalar reduction. Route + * through ray_group(n_keys=0) so the result is ONE row, + * not the input row count broadcast. The naive ray_select + * path lowers `(sum c)` to OP_SUM as a column expression; + * OP_SELECT then broadcasts the scalar atom to nrows + * (exec.c: vec->type < 0 → broadcast_scalar), producing + * N copies of the same value. + * (b) At least one non-agg output → keep the existing + * projection (broadcast-as-column), matching q's + * per-row evaluation semantics. + * + * Mixed agg+non-agg without `by:` continues to flow through (b); + * q's semantics there imply LIST/scalar mixing that is out of + * scope for this fix. */ + int has_agg = 0; + int has_nonagg_out = 0; for (int64_t i = 0; i + 1 < dict_n; i += 2) { int64_t kid = dict_elems[i]->i64; - if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id || kid == nearest_id) continue; - if (nc < 16) { - col_ops[nc] = compile_expr_dag(g, dict_elems[i + 1]); - if (!col_ops[nc]) { - /* Nearest-path resources must be freed here too — the - * rerank handle/query buffers are held across the whole - * ray_select_fn body, not just inside the nearest block. */ - if (nearest_handle_owned) ray_release(nearest_handle_owned); - if (nearest_query_owned) ray_sys_free(nearest_query_owned); + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == asc_id || kid == desc_id || kid == nearest_id) continue; + if (is_agg_expr(dict_elems[i + 1])) has_agg = 1; + else has_nonagg_out = 1; + } + + if (has_agg && !has_nonagg_out && !nearest_expr) { + /* Scalar reduction. Pre-execute the WHERE filter (already + * wired as ray_filter at the top) so OP_FILTER on the table + * input populates g->selection, which exec_group then + * honours in its n_keys==0 fast path. */ + if (where_expr) { + root = ray_optimize(g, root); + ray_t* fres = exec_node(g, root); + if (!fres || RAY_IS_ERR(fres)) { + if (g->selection) { + ray_release(g->selection); + g->selection = NULL; + } + ray_graph_free(g); ray_release(tbl); + return fres ? 
fres : ray_error("domain", NULL); + } + ray_release(fres); + } + + uint16_t s_agg_ops[16]; + ray_op_t* s_agg_ins[16]; + uint8_t s_n_aggs = 0; + for (int64_t i = 0; i + 1 < dict_n && s_n_aggs < 16; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == asc_id || kid == desc_id || kid == nearest_id) continue; + ray_t* val_expr = dict_elems[i + 1]; + ray_t** agg_elems = (ray_t**)ray_data(val_expr); + s_agg_ops[s_n_aggs] = resolve_agg_opcode(agg_elems[0]->i64); + s_agg_ins[s_n_aggs] = compile_expr_dag(g, agg_elems[1]); + if (!s_agg_ins[s_n_aggs]) { + if (g->selection) { + ray_release(g->selection); + g->selection = NULL; + } ray_graph_free(g); ray_release(tbl); return ray_error("domain", NULL); } - nc++; + s_n_aggs++; + } + root = ray_group(g, NULL, 0, s_agg_ops, s_agg_ins, s_n_aggs); + } else { + /* Projection only (no group by) — select specific columns */ + ray_op_t* col_ops[16]; + uint8_t nc = 0; + for (int64_t i = 0; i + 1 < dict_n; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id || kid == nearest_id) continue; + if (nc < 16) { + col_ops[nc] = compile_expr_dag(g, dict_elems[i + 1]); + if (!col_ops[nc]) { + /* Nearest-path resources must be freed here too — the + * rerank handle/query buffers are held across the whole + * ray_select_fn body, not just inside the nearest block. */ + if (nearest_handle_owned) ray_release(nearest_handle_owned); + if (nearest_query_owned) ray_sys_free(nearest_query_owned); + ray_graph_free(g); ray_release(tbl); + return ray_error("domain", NULL); + } + nc++; + } } + root = ray_select(g, root, col_ops, nc); } - root = ray_select(g, root, col_ops, nc); } /* Sort: collect asc/desc columns in dict iteration order. diff --git a/test/rfl/integration/groupby_aggregators.rfl b/test/rfl/integration/groupby_aggregators.rfl index 6d1cc304..9a1832a6 100644 --- a/test/rfl/integration/groupby_aggregators.rfl +++ b/test/rfl/integration/groupby_aggregators.rfl @@ -59,10 +59,11 @@ (count (select {s: (sum v) from: T by: g where: (< v 500)})) -- 50 ;; ────────────── group-by no `by` clause: aggregate over whole table ────────────── -;; pure aggregations without grouping +;; pure aggregations without grouping → ONE row, not nrows broadcast. (set Whole (select {tot: (sum v) ct: (count v) avg_v: (avg v) from: T})) -(count Whole) -- 1000 +(count Whole) -- 1 (at (at Whole 'tot) 0) -- 499500 +(at (at Whole 'ct) 0) -- 1000 ;; ────────────── group-by SYM key ────────────── (set Tsym (table [k v] (list (take ['A 'B 'C 'D 'E] N) (til N)))) diff --git a/test/rfl/ops/group_coverage.rfl b/test/rfl/ops/group_coverage.rfl index 46131f0a..f823d99c 100644 --- a/test/rfl/ops/group_coverage.rfl +++ b/test/rfl/ops/group_coverage.rfl @@ -413,9 +413,12 @@ ;; ────────────── 43. Scalar agg with all-stat-aggs combination ────────────── ;; Multi-agg pack of var, var_pop, stddev, stddev_pop, sum, count, avg ;; in scalar mode (no by) — exercises full need_flags=SUM+SUMSQ+COUNT. +;; A scalar reduction collapses to ONE row (count v == nrows is the +;; row-count *value*, not the row count of the result table). 
(set Tall (table [v] (list (til 200)))) -(count (select {s: (sum v) c: (count v) av: (avg v) v: (var v) vp: (var_pop v) sd: (stddev v) sp: (stddev_pop v) from: Tall})) -- 200 +(count (select {s: (sum v) c: (count v) av: (avg v) v: (var v) vp: (var_pop v) sd: (stddev v) sp: (stddev_pop v) from: Tall})) -- 1 (at (at (select {s: (sum v) c: (count v) av: (avg v) v: (var v) vp: (var_pop v) sd: (stddev v) sp: (stddev_pop v) from: Tall}) 's) 0) -- 19900 +(at (at (select {s: (sum v) c: (count v) av: (avg v) v: (var v) vp: (var_pop v) sd: (stddev v) sp: (stddev_pop v) from: Tall}) 'c) 0) -- 200 (at (at (select {c: (count v) v: (var v) sd: (stddev v) from: Tall}) 'c) 0) -- 200 ;; ────────────── 44. Group var/stddev with mixed enough/insufficient ────────────── diff --git a/test/rfl/table/select.rfl b/test/rfl/table/select.rfl index 2f849506..e47b48a4 100644 --- a/test/rfl/table/select.rfl +++ b/test/rfl/table/select.rfl @@ -38,6 +38,17 @@ (at (at (select {m: (min size) from: trades where: (> price 200.0)}) 'm) 0) -- 40 (at (at (select {a: (avg size) from: trades where: (== sym 'AAPL)}) 'a) 0) -- 115.0 +;; ── scalar aggregation (no `by:`) collapses to ONE row, NOT N broadcast +;; copies of the same value. Regression test for the projection path +;; that used to compile `(sum c)` as a column expression and broadcast +;; the resulting scalar across the input row count. +(count (select {s: (sum size) from: trades})) -- 1 +(count (select {s: (sum size) from: trades where: (== sym 'AAPL)})) -- 1 +(count (select {s: (sum size) c: (count size) from: trades})) -- 1 +(count (select {a: (avg price) m: (max size) from: trades where: (> price 200)})) -- 1 +(at (at (select {s: (sum size) from: trades}) 's) 0) -- 1240 +(at (at (select {c: (count size) from: trades}) 'c) 0) -- 10 + ;; Larger fixture (>= RAY_PARALLEL_THRESHOLD) to exercise the parallel ;; reduction worker path of exec_reduction. (set big-T (table [v] (list (til 100000)))) From 1bd49a27a22e336a6f1db61c22a3d519c5acce5a Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 19:23:09 +0200 Subject: [PATCH 02/10] =?UTF-8?q?perf(ops):=20ClickBench=20bottleneck=20fi?= =?UTF-8?q?xes=20=E2=80=94=20top-K,=20grouped=20count(distinct),=20LIKE=20?= =?UTF-8?q?on=20dict=20SYM?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lands the four findings + bonus from RAYFORCE_BOTTLENECKS.md, taking ClickBench hot-run total from ~1.6 M ms to ~14 K ms across 40 measurable queries (≈99% reduction). * Fused `select { … asc/desc: c take: K }` lowers to bounded-heap top-K when k << nrows and keys resolve to plain column refs. Single-key uses the radix-encoded fast path; multi-key falls back to the comparator-based heap. Q26 SearchPhrase: 5 186 → 72 ms. * Grouped `count(distinct)` no longer routed through per-group eval-fallback — the fused OP_COUNT_DISTINCT runs per group-slice. Scaling moves from 94×/decade to ≈4.6×/decade between 100 K and 1 M rows (essentially linear). * LIKE on dict-encoded SYM scans the dictionary once and lifts the result through the codes vector instead of re-evaluating per row. Low-card SYM (54-unique BrowserCountry): 52 → 3.65 ms (14×). High-card SYM (1.73 M-unique URL): 498 → 220 ms (2.3×). * Unifies the previously-divergent glob matchers (eval used `*?[abc]`, DAG used SQL `%_`; one variant blew up exponentially on `a*a*…a*b` against an a-only string) behind a single iterative two-pointer implementation in src/ops/glob.{c,h}. Both call sites delegate. 
* Bonus: `(at table (iasc table.col))` no longer crashes on tables — re-indexes each column to return a TABLE. Tests: query_coverage / read_csv / reserved_namespace updated for the new dispatch paths; cross_type_workout / collection/at extended. --- include/rayforce.h | 8 + src/lang/eval.c | 13 +- src/ops/collection.c | 33 + src/ops/glob.c | 96 +++ src/ops/glob.h | 43 + src/ops/group.c | 518 +++++++++++- src/ops/idiom.c | 38 +- src/ops/idiom.h | 7 +- src/ops/internal.h | 8 + src/ops/ops.h | 14 + src/ops/opt.c | 5 +- src/ops/query.c | 856 +++++++++++++++++++- src/ops/sort.c | 341 ++++++++ src/ops/string.c | 204 ++++- src/ops/strop.c | 125 ++- src/table/sym.c | 26 + test/rfl/collection/at.rfl | 4 + test/rfl/integration/cross_type_workout.rfl | 1 + test/rfl/ops/query_coverage.rfl | 20 + test/rfl/system/read_csv.rfl | 1 + test/rfl/system/reserved_namespace.rfl | 4 +- test/test_csv.c | 5 +- 22 files changed, 2272 insertions(+), 98 deletions(-) diff --git a/include/rayforce.h b/include/rayforce.h index 3152dbe1..5ee643e3 100644 --- a/include/rayforce.h +++ b/include/rayforce.h @@ -359,6 +359,14 @@ int64_t ray_sym_intern(const char* str, size_t len); int64_t ray_sym_find(const char* str, size_t len); ray_t* ray_sym_str(int64_t id); uint32_t ray_sym_count(void); + +/* Borrow a snapshot of the sym → string array. Returns a pointer to + * the underlying ray_t** strings table along with its length; valid + * only while no concurrent ray_sym_intern occurs (i.e. read-only + * execution phases). Lock is taken once for the snapshot and dropped + * before return — caller may iterate freely. Both *out_strings and + * *out_count must be non-NULL. */ +void ray_sym_strings_borrow(ray_t*** out_strings, uint32_t* out_count); bool ray_sym_ensure_cap(uint32_t needed); ray_err_t ray_sym_save(const char* path); ray_err_t ray_sym_load(const char* path); diff --git a/src/lang/eval.c b/src/lang/eval.c index 9046dd66..f5221a62 100644 --- a/src/lang/eval.c +++ b/src/lang/eval.c @@ -875,10 +875,6 @@ ray_t* gather_by_idx(ray_t* vec, int64_t* idx, int64_t n) { case 1: for (int64_t i = 0; i < n; i++) dst[i] = src[idx[i]]; break; default: for (int64_t i = 0; i < n; i++) memcpy(dst + i*esz, src + idx[i]*esz, esz); break; } - if (vec->sym_dict) { - ray_retain(vec->sym_dict); - result->sym_dict = vec->sym_dict; - } if (has_nulls) { for (int64_t i = 0; i < n; i++) if (ray_vec_is_null(vec, idx[i])) @@ -2280,7 +2276,12 @@ static void ray_register_builtins(void) { register_vary("update", RAY_FN_SPECIAL_FORM | RAY_FN_RESTRICTED, ray_update_fn); register_vary("insert", RAY_FN_SPECIAL_FORM | RAY_FN_RESTRICTED, ray_insert_fn); register_vary("upsert", RAY_FN_SPECIAL_FORM | RAY_FN_RESTRICTED, ray_upsert_fn); - register_binary("xbar", RAY_FN_ATOMIC, ray_xbar_fn); + /* xbar is registered NON-atomic so the call path lands in + * ray_xbar_fn(VEC, scalar) directly. ray_xbar_fn handles the + * vector fast path itself (tight per-element loop, no per-atom + * allocation) and recurses through atomic_map_binary for the rare + * (collection, collection) zip case. 
*/
+    register_binary("xbar", RAY_FN_NONE, ray_xbar_fn);
 
     /* Join operations */
     register_vary("left-join", RAY_FN_NONE, ray_left_join_fn);
@@ -2294,6 +2295,8 @@ static void ray_register_builtins(void) {
     register_vary("println", RAY_FN_NONE, ray_println_fn);
     register_vary("show", RAY_FN_NONE, ray_show_fn);
     register_vary("format", RAY_FN_NONE, ray_format_fn);
+    register_vary("read-csv", RAY_FN_RESTRICTED, ray_read_csv_fn);
+    register_vary("write-csv", RAY_FN_RESTRICTED, ray_write_csv_fn);
     register_vary(".csv.read", RAY_FN_RESTRICTED, ray_read_csv_fn);
     register_vary(".csv.write", RAY_FN_RESTRICTED, ray_write_csv_fn);
     register_binary("as", RAY_FN_NONE, ray_cast_fn);
diff --git a/src/ops/collection.c b/src/ops/collection.c
index 64ce4632..1a5079ad 100644
--- a/src/ops/collection.c
+++ b/src/ops/collection.c
@@ -1554,6 +1554,39 @@ ray_t* ray_at_fn(ray_t* vec, ray_t* idx) {
         return ray_dict_new(keys, vals);
     }
 
+    /* Table row selection by index vector: apply the row ids to each
+     * column and return a table. Keep this before the generic collection
+     * fallback; otherwise a table indexed by millions of row ids becomes
+     * a LIST of row dictionaries. */
+    if (vec->type == RAY_TABLE && idx->type == RAY_I64) {
+        int64_t nrows = ray_table_nrows(vec);
+        int64_t nidx = ray_len(idx);
+        int64_t* ids = (int64_t*)ray_data(idx);
+        for (int64_t i = 0; i < nidx; i++) {
+            if (ids[i] < 0 || ids[i] >= nrows)
+                return ray_error("domain", NULL);
+        }
+
+        int64_t ncols = ray_table_ncols(vec);
+        ray_t* result = ray_table_new(ncols);
+        if (!result || RAY_IS_ERR(result)) return result ? result : ray_error("oom", NULL);
+        for (int64_t c = 0; c < ncols; c++) {
+            ray_t* col = ray_table_get_col_idx(vec, c);
+            int64_t name = ray_table_col_name(vec, c);
+            if (!col) continue;
+            ray_t* gathered = gather_by_idx(col, ids, nidx);
+            if (!gathered || RAY_IS_ERR(gathered)) {
+                ray_release(result);
+                return gathered ? gathered : ray_error("oom", NULL);
+            }
+            result = ray_table_add_col(result, name, gathered);
+            ray_release(gathered);
+            if (!result || RAY_IS_ERR(result))
+                return result ? result : ray_error("oom", NULL);
+        }
+        return result;
+    }
+
     /* Dict key access: (at dict key) → value or 0Nl if missing */
     if (vec->type == RAY_DICT) {
         ray_t* v = ray_dict_get(vec, idx);
diff --git a/src/ops/glob.c b/src/ops/glob.c
index dea37d1e..bef85daf 100644
--- a/src/ops/glob.c
+++ b/src/ops/glob.c
@@ -13,6 +13,9 @@
 #include "ops/glob.h"
 
+#define _GNU_SOURCE
+#include <string.h>
+
 /* Lowercase an ASCII byte; non-ASCII passes through unchanged. */
 static inline char to_lower(char c) {
     return (c >= 'A' && c <= 'Z') ? (char)(c + 32) : c;
 }
@@ -100,3 +103,96 @@ bool ray_glob_match(const char* s, size_t sn, const char* p, size_t pn) {
 bool ray_glob_match_ci(const char* s, size_t sn, const char* p, size_t pn) {
     return glob_impl(s, sn, p, pn, true);
 }
+
+ray_glob_compiled_t ray_glob_compile(const char* p, size_t pn) {
+    ray_glob_compiled_t c = { RAY_GLOB_SHAPE_NONE, NULL, 0 };
+
+    if (pn == 0) {
+        c.shape = RAY_GLOB_SHAPE_EXACT;
+        c.lit = p; c.lit_len = 0;
+        return c;
+    }
+
+    /* Strip a single leading and trailing '*'; classify by the residual
+     * pattern. Any other glob metachar (`?`, `[`, or interior `*`)
+     * forces the general matcher. */
+    size_t lo = 0, hi = pn;
+    bool leading_star = (p[0] == '*');
+    bool trailing_star = (pn > 0 && p[pn - 1] == '*' &&
+                          /* don't double-count single '*' as both */
+                          (pn > 1 || !leading_star));
+    if (leading_star) lo = 1;
+    if (trailing_star) hi = pn - 1;
+
+    /* Ensure the residual has no glob metacharacters. */
+    for (size_t i = lo; i < hi; i++) {
+        char ch = p[i];
+        if (ch == '*' || ch == '?' || ch == '[') {
+            c.shape = RAY_GLOB_SHAPE_NONE;
+            return c;
+        }
+    }
+
+    c.lit = p + lo;
+    c.lit_len = hi - lo;
+
+    if (leading_star && trailing_star) {
+        c.shape = (c.lit_len == 0) ? RAY_GLOB_SHAPE_ANY
+                                   : RAY_GLOB_SHAPE_CONTAINS;
+    } else if (leading_star) {
+        c.shape = RAY_GLOB_SHAPE_SUFFIX;
+    } else if (trailing_star) {
+        c.shape = RAY_GLOB_SHAPE_PREFIX;
+    } else {
+        c.shape = RAY_GLOB_SHAPE_EXACT;
+    }
+    return c;
+}
+
+bool ray_glob_match_compiled(const ray_glob_compiled_t* c,
+                             const char* s, size_t sn) {
+    switch (c->shape) {
+    case RAY_GLOB_SHAPE_ANY:
+        return true;
+    case RAY_GLOB_SHAPE_EXACT:
+        return sn == c->lit_len &&
+               (c->lit_len == 0 || memcmp(s, c->lit, c->lit_len) == 0);
+    case RAY_GLOB_SHAPE_PREFIX:
+        return sn >= c->lit_len &&
+               (c->lit_len == 0 || memcmp(s, c->lit, c->lit_len) == 0);
+    case RAY_GLOB_SHAPE_SUFFIX:
+        return sn >= c->lit_len &&
+               (c->lit_len == 0 ||
+                memcmp(s + sn - c->lit_len, c->lit, c->lit_len) == 0);
+    case RAY_GLOB_SHAPE_CONTAINS:
+        if (c->lit_len == 0) return true;
+        if (sn < c->lit_len) return false;
+        /* glibc's memmem is SIMD-accelerated; use it where available.
+         * Falls back to a portable memchr/memcmp scan when not. */
+#if defined(__GLIBC__) || defined(__APPLE__) || defined(__FreeBSD__)
+        return memmem(s, sn, c->lit, c->lit_len) != NULL;
+#else
+        {
+            /* Portable fallback: short-needle byte scan with memchr. */
+            const char first = c->lit[0];
+            const char* haystack = s;
+            size_t remaining = sn;
+            while (remaining >= c->lit_len) {
+                const char* hit = (const char*)memchr(haystack, first,
+                                                      remaining - c->lit_len + 1);
+                if (!hit) return false;
+                if (memcmp(hit, c->lit, c->lit_len) == 0) return true;
+                size_t adv = (size_t)(hit - haystack) + 1;
+                haystack = hit + 1;
+                remaining -= adv;
+            }
+            return false;
+        }
+#endif
+    case RAY_GLOB_SHAPE_NONE:
+    default:
+        /* Caller contract violation — fall through to false rather than
+         * silently matching everything. */
+        return false;
+    }
+}
diff --git a/src/ops/glob.h b/src/ops/glob.h
index 71bc3a22..8b8552eb 100644
--- a/src/ops/glob.h
+++ b/src/ops/glob.h
@@ -40,4 +40,47 @@ bool ray_glob_match(const char* s, size_t sn, const char* p, size_t pn);
 bool ray_glob_match_ci(const char* s, size_t sn, const char* p, size_t pn);
 
+/* ---- Pre-compiled pattern fast path -------------------------------------
+ * Many LIKE workloads have very simple patterns (e.g. `*google*`). When
+ * the pattern has no metacharacters except (optionally) a leading `*`
+ * and/or a trailing `*`, the match collapses to a literal substring /
+ * prefix / suffix / equality test that we can drive with memcmp /
+ * memmem — both libc-vectorised on modern glibc. Detect the shape once
+ * up front, then run the entire dictionary (or row vector) through a
+ * single tight loop.
+ *
+ * Shapes:
+ *   RAY_GLOB_SHAPE_NONE     — pattern needs the full glob matcher
+ *   RAY_GLOB_SHAPE_EXACT    — no `*`/`?`/`[` — literal equality
+ *   RAY_GLOB_SHAPE_PREFIX   — `<lit>*` — prefix memcmp
+ *   RAY_GLOB_SHAPE_SUFFIX   — `*<lit>` — tail equality
+ *   RAY_GLOB_SHAPE_CONTAINS — `*<lit>*` — memmem
+ *   RAY_GLOB_SHAPE_ANY      — pattern is "*" — always true
+ * The compiled struct caches a pointer/length into the original
+ * pattern buffer, so the caller must keep the pattern alive while the
+ * compiled view is in use.
*/ +typedef enum { + RAY_GLOB_SHAPE_NONE = 0, + RAY_GLOB_SHAPE_EXACT, + RAY_GLOB_SHAPE_PREFIX, + RAY_GLOB_SHAPE_SUFFIX, + RAY_GLOB_SHAPE_CONTAINS, + RAY_GLOB_SHAPE_ANY, +} ray_glob_shape_t; + +typedef struct { + ray_glob_shape_t shape; + const char* lit; /* literal substring inside the pattern */ + size_t lit_len; +} ray_glob_compiled_t; + +/* Classify a pattern. Returns the simplest matching shape; falls back + * to RAY_GLOB_SHAPE_NONE when the pattern needs the general matcher. */ +ray_glob_compiled_t ray_glob_compile(const char* p, size_t pn); + +/* Match a single string against a compiled simple-shape pattern. + * Caller must guarantee shape != RAY_GLOB_SHAPE_NONE. */ +bool ray_glob_match_compiled(const ray_glob_compiled_t* c, + const char* s, size_t sn); + #endif /* RAY_OPS_GLOB_H */ diff --git a/src/ops/group.c b/src/ops/group.c index a437bd3c..705ed991 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -218,7 +218,275 @@ static void reduce_merge(reduce_acc_t* dst, const reduce_acc_t* src, int8_t in_t * and the last worker's last is the global last. */ } -/* Hash-based count distinct for integer/float columns */ +/* Hash mixing constants used by the count-distinct kernel and helpers. */ +#define CD_HASH_K1 0x9E3779B97F4A7C15ULL +#define CD_HASH_K2 0xBF58476D1CE4E5B9ULL + +/* Per-partition hash-distinct. Each worker is given a contiguous slice + * of partition payloads (already grouped by hash high bits) and counts + * distinct values within. Since distinct values are guaranteed to fall + * into the same partition, the global distinct count is the sum of + * per-partition counts. */ +typedef struct { + int64_t* values; /* concatenated partition payloads */ + int64_t* part_off; /* P+1 prefix sums, partition boundaries */ + int64_t* part_count; /* OUT: per-partition distinct count */ +} cd_part_ctx_t; + +static void cd_part_dedup_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + (void)worker_id; + cd_part_ctx_t* x = (cd_part_ctx_t*)ctx; + for (int64_t p = start; p < end; p++) { + int64_t off = x->part_off[p]; + int64_t cnt = x->part_off[p + 1] - off; + if (cnt == 0) { x->part_count[p] = 0; continue; } + + uint64_t cap = (uint64_t)cnt * 2; + if (cap < 32) cap = 32; + uint64_t c = 1; + while (c && c < cap) c <<= 1; + if (!c) { x->part_count[p] = -1; continue; } + cap = c; + uint64_t mask = cap - 1; + + ray_t* set_hdr = NULL; + ray_t* used_hdr = NULL; + int64_t* set = (int64_t*)scratch_alloc (&set_hdr, + (size_t)cap * sizeof(int64_t)); + uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, + (size_t)cap * sizeof(uint8_t)); + if (!set || !used) { + if (set_hdr) scratch_free(set_hdr); + if (used_hdr) scratch_free(used_hdr); + x->part_count[p] = -1; + continue; + } + + int64_t* base = x->values + off; + int64_t distinct = 0; + for (int64_t i = 0; i < cnt; i++) { + int64_t v = base[i]; + uint64_t h = (uint64_t)v * CD_HASH_K1; + h ^= h >> 33; + uint64_t slot = h & mask; + while (used[slot]) { + if (set[slot] == v) goto cd_next; + slot = (slot + 1) & mask; + } + set[slot] = v; + used[slot] = 1; + distinct++; + cd_next:; + } + scratch_free(set_hdr); + scratch_free(used_hdr); + x->part_count[p] = distinct; + } +} + +/* Width-specialised value extraction for the partition pass. Reading + * row-by-row through read_col_i64 was the dispatch overhead in the + * sequential path; specialising on the column width lets the autovec + * pass tighten the loop. 
*/ +typedef struct { + const void* base; + int64_t* counts; /* P per-partition row counts (per worker) */ + uint32_t p_bits; + uint64_t p_mask; + uint8_t stride_log2; /* log2(elem size) for plain int paths */ + uint8_t is_f64; + int8_t type; + uint8_t attrs; +} cd_count_ctx_t; + +/* Count rows per partition (per worker, into worker-local slot). Two + * passes: this one fills the histograms; the next does the scatter. */ +static void cd_hist_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + cd_count_ctx_t* x = (cd_count_ctx_t*)ctx; + int64_t* hist = x->counts + (size_t)worker_id * (x->p_mask + 1); + const void* base = x->base; + int8_t in_type = x->type; + uint8_t in_attrs = x->attrs; + uint64_t p_mask = x->p_mask; + if (x->is_f64) { + const double* d = (const double*)base; + for (int64_t i = start; i < end; i++) { + double fv = d[i]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; + int64_t val; + memcpy(&val, &fv, sizeof(int64_t)); + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + hist[p]++; + } + } else if (in_type == RAY_I64 || in_type == RAY_TIMESTAMP) { + const int64_t* d = (const int64_t*)base; + for (int64_t i = start; i < end; i++) { + int64_t val = d[i]; + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + hist[p]++; + } + } else if (in_type == RAY_I32 || in_type == RAY_DATE || in_type == RAY_TIME) { + const int32_t* d = (const int32_t*)base; + for (int64_t i = start; i < end; i++) { + int64_t val = d[i]; + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + hist[p]++; + } + } else if (in_type == RAY_I16) { + const int16_t* d = (const int16_t*)base; + for (int64_t i = start; i < end; i++) { + int64_t val = d[i]; + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + hist[p]++; + } + } else if (in_type == RAY_BOOL || in_type == RAY_U8) { + const uint8_t* d = (const uint8_t*)base; + for (int64_t i = start; i < end; i++) { + int64_t val = d[i]; + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + hist[p]++; + } + } else if (in_type == RAY_SYM) { + for (int64_t i = start; i < end; i++) { + int64_t val = read_col_i64(base, i, in_type, in_attrs); + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + hist[p]++; + } + } +} + +typedef struct { + const void* base; + int64_t* out_buf; /* concatenated payloads (output) */ + int64_t* cursor; /* per-worker × P; advances per scatter */ + uint32_t p_bits; + uint64_t p_mask; + uint8_t is_f64; + int8_t type; + uint8_t attrs; +} cd_scatter_ctx_t; + +static void cd_scatter_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + cd_scatter_ctx_t* x = (cd_scatter_ctx_t*)ctx; + int64_t* cur = x->cursor + (size_t)worker_id * (x->p_mask + 1); + int64_t* out = x->out_buf; + const void* base = x->base; + int8_t in_type = x->type; + uint8_t in_attrs = x->attrs; + uint64_t p_mask = x->p_mask; + #define SCATTER_BODY(LOAD) \ + for (int64_t i = start; i < end; i++) { \ + int64_t val = (LOAD); \ + uint64_t h = (uint64_t)val * CD_HASH_K1; \ + h ^= h >> 33; \ + uint64_t p = (h ^ (h >> 33)) & p_mask; \ + out[cur[p]++] = val; \ + } + if (x->is_f64) { + const double* d = (const double*)base; + for (int64_t i = start; i < end; i++) { + double fv = d[i]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; + int64_t val; + 
memcpy(&val, &fv, sizeof(int64_t)); + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + out[cur[p]++] = val; + } + } else if (in_type == RAY_I64 || in_type == RAY_TIMESTAMP) { + const int64_t* d = (const int64_t*)base; + SCATTER_BODY(d[i]) + } else if (in_type == RAY_I32 || in_type == RAY_DATE || in_type == RAY_TIME) { + const int32_t* d = (const int32_t*)base; + SCATTER_BODY(d[i]) + } else if (in_type == RAY_I16) { + const int16_t* d = (const int16_t*)base; + SCATTER_BODY(d[i]) + } else if (in_type == RAY_BOOL || in_type == RAY_U8) { + const uint8_t* d = (const uint8_t*)base; + SCATTER_BODY(d[i]) + } else { /* RAY_SYM */ + SCATTER_BODY(read_col_i64(base, i, in_type, in_attrs)) + } + #undef SCATTER_BODY +} + +/* Sequential fallback for small inputs / when the pool isn't available. + * Same algorithm as the original: open-addressing hash set, single pass. */ +static int64_t cd_seq_count(int8_t in_type, uint8_t in_attrs, + const void* base, int64_t len) { + uint64_t cap = (uint64_t)(len < 16 ? 32 : len) * 2; + uint64_t c = 1; + while (c && c < cap) c <<= 1; + if (!c) return -1; + cap = c; + uint64_t mask = cap - 1; + + ray_t* set_hdr = NULL; + ray_t* used_hdr = NULL; + int64_t* set = (int64_t*)scratch_alloc (&set_hdr, (size_t)cap * sizeof(int64_t)); + uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)cap * sizeof(uint8_t)); + if (!set || !used) { + if (set_hdr) scratch_free(set_hdr); + if (used_hdr) scratch_free(used_hdr); + return -1; + } + int64_t count = 0; + for (int64_t i = 0; i < len; i++) { + int64_t val; + if (in_type == RAY_F64) { + double fv = ((const double*)base)[i]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; + memcpy(&val, &fv, sizeof(int64_t)); + } else { + val = read_col_i64(base, i, in_type, in_attrs); + } + uint64_t h = (uint64_t)val * CD_HASH_K1; + uint64_t slot = h & mask; + while (used[slot]) { + if (set[slot] == val) goto cd_seq_next; + slot = (slot + 1) & mask; + } + set[slot] = val; + used[slot] = 1; + count++; + cd_seq_next:; + } + scratch_free(set_hdr); + scratch_free(used_hdr); + return count; +} + +/* Hash-based count distinct for integer/float columns. + * + * Strategy: + * - small inputs → sequential single-pass hash set (low overhead). + * - large inputs → radix-partition by hash high bits across the + * worker pool, then dedup each partition in + * parallel. Each partition fits L2, eliminating + * the cache-miss-per-probe pattern of one giant + * global set. Distinct values land in the same + * partition, so the global count is the sum of + * per-partition counts. */ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) { (void)g; (void)op; if (!input || RAY_IS_ERR(input)) return input; @@ -228,70 +496,250 @@ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) { if (len == 0) return ray_i64(0); - /* Only numeric/ordinal/sym column types are supported */ switch (in_type) { case RAY_BOOL: case RAY_U8: case RAY_I16: case RAY_I32: case RAY_I64: case RAY_F64: case RAY_DATE: case RAY_TIME: case RAY_TIMESTAMP: case RAY_SYM: break; + case RAY_STR: + case RAY_GUID: + case RAY_LIST: { + /* The hash kernel only handles fixed-width scalar types. For + * STR / GUID / LIST the rewrite-aware path is to delegate to + * distinct_vec_eager (which uses the row-aware hashset_t) and + * count its result. Slower than the radix kernel but correct. */ + ray_t* dist = distinct_vec_eager(input); + if (!dist || RAY_IS_ERR(dist)) return dist ? 
dist : ray_error("oom", NULL);
+        int64_t cnt = ray_len(dist);
+        ray_release(dist);
+        return ray_i64(cnt);
+    }
     default:
         return ray_error("type", NULL);
     }
 
-    /* Use a simple open-addressing hash set for int64 values */
-    uint64_t cap = (uint64_t)(len < 16 ? 32 : len) * 2;
-    /* Round up to power of 2 */
+    void* base = ray_data(input);
+    ray_pool_t* pool = ray_pool_get();
+
+    /* Small-input fast path: per-row dispatch overhead would dwarf the
+     * actual work. */
+    if (!pool || len < (1 << 16)) {
+        int64_t cnt = cd_seq_count(in_type, input->attrs, base, len);
+        if (cnt < 0) return ray_error("oom", NULL);
+        return ray_i64(cnt);
+    }
+
+    uint32_t nw = ray_pool_total_workers(pool);
+
+    /* Partition count: a small power of two ≥ nw, capped so per-partition
+     * sets stay in L2. 16 suffices for pools of ≤8 workers; 32 covers
+     * pools up to 32 workers; 64 beyond that. */
+    uint32_t p_bits;
+    if (nw <= 8)       p_bits = 4;  /* 16 partitions */
+    else if (nw <= 32) p_bits = 5;  /* 32 partitions */
+    else               p_bits = 6;  /* 64 partitions */
+    uint64_t P = (uint64_t)1 << p_bits;
+    uint64_t p_mask = P - 1;
+
+    /* Pass 1: per-worker histogram (P × nw int64 cells). */
+    ray_t* hist_hdr = NULL;
+    int64_t* hist = (int64_t*)scratch_calloc(&hist_hdr,
+                                             (size_t)P * nw * sizeof(int64_t));
+    if (!hist) {
+        return ray_error("oom", NULL);
+    }
+    cd_count_ctx_t hctx = {
+        .base = base, .counts = hist,
+        .p_bits = p_bits, .p_mask = p_mask,
+        .stride_log2 = 0, .is_f64 = (in_type == RAY_F64),
+        .type = in_type, .attrs = input->attrs,
+    };
+    ray_pool_dispatch(pool, cd_hist_fn, &hctx, len);
+
+    /* Convert per-worker histograms into a global prefix sum. Order:
+     * partition_0_worker_0, partition_0_worker_1, …, partition_1_worker_0, …
+     * so each (worker, partition) range is a contiguous slice of out_buf. */
+    ray_t* off_hdr = NULL;
+    int64_t* part_off = (int64_t*)scratch_alloc(&off_hdr,
+                                                (size_t)(P + 1) * sizeof(int64_t));
+    if (!part_off) { scratch_free(hist_hdr); return ray_error("oom", NULL); }
+    ray_t* cur_hdr = NULL;
+    int64_t* cursor = (int64_t*)scratch_alloc(&cur_hdr,
+                                              (size_t)P * nw * sizeof(int64_t));
+    if (!cursor) {
+        scratch_free(off_hdr); scratch_free(hist_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t total = 0;
+    for (uint64_t p = 0; p < P; p++) {
+        part_off[p] = total;
+        for (uint32_t w = 0; w < nw; w++) {
+            cursor[(size_t)w * P + p] = total;
+            total += hist[(size_t)w * P + p];
+        }
+    }
+    part_off[P] = total;
+
+    /* Sanity: total must equal len. */
+    if (total != len) {
+        scratch_free(cur_hdr); scratch_free(off_hdr); scratch_free(hist_hdr);
+        return ray_error("nyi", "count_distinct: histogram mismatch");
+    }
+
+    /* Pass 2: scatter values into out_buf. */
+    ray_t* buf_hdr = NULL;
+    int64_t* out_buf = (int64_t*)scratch_alloc(&buf_hdr,
+                                               (size_t)len * sizeof(int64_t));
+    if (!out_buf) {
+        scratch_free(cur_hdr); scratch_free(off_hdr); scratch_free(hist_hdr);
+        return ray_error("oom", NULL);
+    }
+    cd_scatter_ctx_t sctx = {
+        .base = base, .out_buf = out_buf, .cursor = cursor,
+        .p_bits = p_bits, .p_mask = p_mask,
+        .is_f64 = (in_type == RAY_F64),
+        .type = in_type, .attrs = input->attrs,
+    };
+    ray_pool_dispatch(pool, cd_scatter_fn, &sctx, len);
+
+    /* Pass 3: dedup each partition in parallel. Each partition gets one
+     * task — distinct values land in the same partition, so per-partition
+     * sums give the global distinct count.
*/ + ray_t* pcnt_hdr = NULL; + int64_t* part_count = (int64_t*)scratch_alloc(&pcnt_hdr, + (size_t)P * sizeof(int64_t)); + if (!part_count) { + scratch_free(buf_hdr); scratch_free(cur_hdr); + scratch_free(off_hdr); scratch_free(hist_hdr); + return ray_error("oom", NULL); + } + cd_part_ctx_t dctx = { + .values = out_buf, .part_off = part_off, .part_count = part_count, + }; + ray_pool_dispatch_n(pool, cd_part_dedup_fn, &dctx, (uint32_t)P); + + int64_t total_distinct = 0; + for (uint64_t p = 0; p < P; p++) { + if (part_count[p] < 0) { + scratch_free(pcnt_hdr); scratch_free(buf_hdr); scratch_free(cur_hdr); + scratch_free(off_hdr); scratch_free(hist_hdr); + return ray_error("oom", NULL); + } + total_distinct += part_count[p]; + } + + scratch_free(pcnt_hdr); scratch_free(buf_hdr); scratch_free(cur_hdr); + scratch_free(off_hdr); scratch_free(hist_hdr); + return ray_i64(total_distinct); +} + +/* Grouped count(distinct): single global hash keyed by (group_id, value). + * One linear pass over all rows, O(n) total instead of O(per-group setup * + * n_groups). Returns an I64 vector of length n_groups with the per-group + * distinct count. Rows whose row_gid[r] < 0 are skipped. + * + * Supported value types: integers / SYM / TIMESTAMP / DATE / TIME / F64. + * Caller is responsible for verifying the type up-front (it should match + * exec_count_distinct's whitelist) and returning NULL on miss so the + * legacy per-group fallback handles unsupported configs. + * + * Cap selection: 2 * n_rows rounded to power of 2. Worst case all rows + * are distinct pairs → load factor 0.5, no rehash needed. Slot stores + * gid+1 (so 0 means empty) and the int64-encoded value. 64-bit composite + * hash mixes both halves so rare-gid collisions don't cluster. */ +ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, + int64_t n_rows, int64_t n_groups) { + if (!src || RAY_IS_ERR(src) || n_groups < 0) return ray_error("domain", NULL); + int8_t in_type = src->type; + switch (in_type) { + case RAY_BOOL: case RAY_U8: + case RAY_I16: case RAY_I32: case RAY_I64: + case RAY_F64: case RAY_DATE: case RAY_TIME: case RAY_TIMESTAMP: + case RAY_SYM: + break; + default: + return NULL; /* unsupported — caller falls back. */ + } + if (src->len < n_rows) return ray_error("domain", NULL); + + ray_t* out = ray_vec_new(RAY_I64, n_groups); + if (!out || RAY_IS_ERR(out)) return out ? out : ray_error("oom", NULL); + out->len = n_groups; + int64_t* odata = (int64_t*)ray_data(out); + memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); + if (n_rows == 0 || n_groups == 0) return out; + + /* Pick capacity ≥ 2 * n_rows rounded up to power of two. This bounds + * load factor at 0.5 even when every (gid,val) pair is distinct. */ + uint64_t cap = (uint64_t)n_rows * 2; + if (cap < 32) cap = 32; uint64_t c = 1; while (c && c < cap) c <<= 1; - if (!c) return ray_error("oom", NULL); /* overflow: cap too large */ + if (!c) { ray_release(out); return ray_error("oom", NULL); } cap = c; + uint64_t mask = cap - 1; - ray_t* set_hdr; - int64_t* set = (int64_t*)scratch_calloc(&set_hdr, - (size_t)cap * sizeof(int64_t)); - ray_t* used_hdr; - uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, - (size_t)cap * sizeof(uint8_t)); - if (!set || !used) { - if (set_hdr) scratch_free(set_hdr); - if (used_hdr) scratch_free(used_hdr); + /* Slot layout: parallel arrays of (gid_plus_one, value). gid_plus_one + * == 0 means slot is empty; storing gid+1 lets us skip a separate + * `used` bitmap. 
Both arrays are scratch_alloc so they go through + * the slab/heap fast path. */ + ray_t* k_hdr = NULL; + ray_t* v_hdr = NULL; + int64_t* slot_gid = (int64_t*)scratch_calloc(&k_hdr, + (size_t)cap * sizeof(int64_t)); + int64_t* slot_val = (int64_t*)scratch_alloc(&v_hdr, + (size_t)cap * sizeof(int64_t)); + if (!slot_gid || !slot_val) { + if (k_hdr) scratch_free(k_hdr); + if (v_hdr) scratch_free(v_hdr); + ray_release(out); return ray_error("oom", NULL); } - int64_t count = 0; - uint64_t mask = cap - 1; - void* base = ray_data(input); + void* base = ray_data(src); + bool has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0; + const uint8_t* null_bm = has_nulls ? ray_vec_nullmap_bytes(src, NULL, NULL) + : NULL; + + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + if (has_nulls && null_bm && ((null_bm[r/8] >> (r%8)) & 1)) continue; - for (int64_t i = 0; i < len; i++) { int64_t val; if (in_type == RAY_F64) { - double fv = ((double*)base)[i]; - /* Normalize: NaN → canonical NaN, -0.0 → +0.0 */ - if (fv != fv) fv = (double)NAN; /* canonical NaN */ - else if (fv == 0.0) fv = 0.0; /* +0.0 */ + double fv = ((double*)base)[r]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; memcpy(&val, &fv, sizeof(int64_t)); } else { - val = read_col_i64(base, i, in_type, input->attrs); + val = read_col_i64(base, r, in_type, src->attrs); } - /* Open-addressing linear probe */ + int64_t gid_p1 = gid + 1; + /* Mix gid and val so groups don't form long runs of collisions. */ uint64_t h = (uint64_t)val * 0x9E3779B97F4A7C15ULL; + h ^= (uint64_t)gid_p1 * 0xBF58476D1CE4E5B9ULL; + h ^= h >> 33; + h *= 0xC4CEB9FE1A85EC53ULL; uint64_t slot = h & mask; - while (used[slot]) { - if (set[slot] == val) goto next_val; + for (;;) { + int64_t cur = slot_gid[slot]; + if (cur == 0) { + slot_gid[slot] = gid_p1; + slot_val[slot] = val; + odata[gid]++; + break; + } + if (cur == gid_p1 && slot_val[slot] == val) break; slot = (slot + 1) & mask; } - /* New distinct value */ - set[slot] = val; - used[slot] = 1; - count++; - next_val:; } - scratch_free(set_hdr); - scratch_free(used_hdr); - return ray_i64(count); + scratch_free(k_hdr); + scratch_free(v_hdr); + return out; } ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { diff --git a/src/ops/idiom.c b/src/ops/idiom.c index fc5092d7..c6ca086d 100644 --- a/src/ops/idiom.c +++ b/src/ops/idiom.c @@ -178,10 +178,15 @@ static bool is_ext_root(uint16_t opcode) { opcode == OP_WINDOW || opcode == OP_WINDOW_JOIN || opcode == OP_SELECT; } -static void try_rewrite(ray_graph_t* g, ray_op_t* node) { - if (!node || (node->flags & OP_FLAG_DEAD)) return; - if (is_ext_root(node->opcode)) return; - if (node->opcode >= RAY_IDIOM_OPCODE_CAP) return; +/* Try one rewrite at `node`. Returns the replacement when the rewrite + * fires, else NULL. Caller redirects consumers and marks the old node + * dead — having the helper return the replacement also lets the pass + * track when the *root* was rewritten so the caller's root pointer can + * be bumped to the replacement. 
*/ +static ray_op_t* try_rewrite(ray_graph_t* g, ray_op_t* node) { + if (!node || (node->flags & OP_FLAG_DEAD)) return NULL; + if (is_ext_root(node->opcode)) return NULL; + if (node->opcode >= RAY_IDIOM_OPCODE_CAP) return NULL; int idx = first_idiom[node->opcode]; while (idx >= 0) { @@ -193,16 +198,17 @@ static void try_rewrite(ray_graph_t* g, ray_op_t* node) { /* UINT32_MAX sentinels: no nodes to skip during redirect */ redirect_consumers(g, node->id, repl, UINT32_MAX, UINT32_MAX); node->flags |= OP_FLAG_DEAD; - return; /* first-match-wins */ + return repl; /* first-match-wins */ } } } idx = next_idiom[idx]; } + return NULL; } -void ray_idiom_pass(ray_graph_t* g, ray_op_t* root) { - if (!g || !root || g->node_count == 0) return; +ray_op_t* ray_idiom_pass(ray_graph_t* g, ray_op_t* root) { + if (!g || !root || g->node_count == 0) return root; build_index(); /* Iterative post-order walk: children rewritten before parents so @@ -210,7 +216,7 @@ void ray_idiom_pass(ray_graph_t* g, ray_op_t* root) { pattern — push roots onto stack1, drain into stack2 (reverse), pop stack2 to get post-order. */ uint32_t nc = g->node_count; - if (nc > UINT32_MAX / 4) return; /* overflow guard, mirrors fuse.c */ + if (nc > UINT32_MAX / 4) return root; /* overflow guard, mirrors fuse.c */ uint32_t cap = nc * 2; uint32_t stk1_local[256], stk2_local[256]; @@ -219,7 +225,7 @@ void ray_idiom_pass(ray_graph_t* g, ray_op_t* root) { if (!stk1 || !stk2) { if (stk1 && stk1 != stk1_local) ray_sys_free(stk1); if (stk2 && stk2 != stk2_local) ray_sys_free(stk2); - return; + return root; } /* Visited-bit guard against re-entry on shared subgraphs. */ @@ -228,7 +234,7 @@ void ray_idiom_pass(ray_graph_t* g, ray_op_t* root) { if (!visited) { if (stk1 != stk1_local) ray_sys_free(stk1); if (stk2 != stk2_local) ray_sys_free(stk2); - return; + return root; } memset(visited, 0, nc); @@ -248,13 +254,21 @@ void ray_idiom_pass(ray_graph_t* g, ray_op_t* root) { } } - /* Post-order: pop stk2 from top, call try_rewrite. */ + /* Post-order: pop stk2 from top, call try_rewrite. Track whether + * the root itself was rewritten — caller needs the new pointer to + * avoid executing the dead node. */ + uint32_t root_id = root->id; while (sp2 > 0) { uint32_t nid = stk2[--sp2]; - try_rewrite(g, &g->nodes[nid]); + ray_op_t* repl = try_rewrite(g, &g->nodes[nid]); + if (repl && nid == root_id) { + root = repl; + root_id = repl->id; + } } if (visited != visited_local) ray_sys_free(visited); if (stk1 != stk1_local) ray_sys_free(stk1); if (stk2 != stk2_local) ray_sys_free(stk2); + return root; } diff --git a/src/ops/idiom.h b/src/ops/idiom.h index ba29a9d4..7826b16c 100644 --- a/src/ops/idiom.h +++ b/src/ops/idiom.h @@ -40,6 +40,11 @@ typedef struct { extern const ray_idiom_t ray_idioms[]; extern const int ray_idioms_count; -void ray_idiom_pass(ray_graph_t* g, ray_op_t* root); +/* Returns the (possibly updated) root. When the rewrite replaces the + * root node itself (e.g. count(distinct) → count_distinct on a single- + * statement chain), the caller would otherwise hold a pointer to the + * dead OLD node. Always assign the return value back to the caller's + * root pointer. 
*/
-void ray_idiom_pass(ray_graph_t* g, ray_op_t* root);
+ray_op_t* ray_idiom_pass(ray_graph_t* g, ray_op_t* root);
 
 #endif /* RAY_IDIOM_H */
diff --git a/src/ops/internal.h b/src/ops/internal.h
index 328d9be6..7270638e 100644
--- a/src/ops/internal.h
+++ b/src/ops/internal.h
@@ -758,6 +758,14 @@ ray_t* exec_window_join(ray_graph_t* g, ray_op_t* op,
 /* ── group.c ── */
 ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input);
 ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input);
+
+/* Single-pass per-group count(distinct). Returns I64 vec of length
+ * n_groups, or NULL if `src->type` isn't a supported scalar/SYM type
+ * (caller falls back to per-group exec_count_distinct). Errors are
+ * returned as RAY_IS_ERR ray_t*. */
+ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid,
+                                    int64_t n_rows, int64_t n_groups);
+
 ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl,
                   int64_t group_limit);
 
 /* ── collection.c ── */
diff --git a/src/ops/ops.h b/src/ops/ops.h
index 90c019b7..82da76ff 100644
--- a/src/ops/ops.h
+++ b/src/ops/ops.h
@@ -679,6 +679,20 @@ void ray_graph_dump(ray_graph_t* g, ray_op_t* root, void* out);
 ray_t* ray_sort_indices(ray_t** cols, uint8_t* descs, uint8_t* nulls_first,
                         uint8_t n_cols, int64_t nrows);
 
+/* Top-K bounded-heap path: returns a new K-row table of `tbl` ordered by
+ * `col` in the requested direction. Returns NULL when the input doesn't
+ * fit the single-key fast path (unsupported type, K ≥ nrows, etc.) so
+ * the caller can fall back to a full sort. Skips the full O(n log n)
+ * sort entirely — selection runs in O(n log K + K log K). */
+ray_t* ray_topk_table(ray_t* tbl, ray_t* col, uint8_t desc, uint8_t nf,
+                      int64_t k);
+
+/* Multi-key variant of ray_topk_table: bounded-heap selection on n_keys
+ * sort columns with per-key direction / nulls-first. Same fallback
+ * contract — returns NULL when the inputs don't fit the fast path. */
+ray_t* ray_topk_table_multi(ray_t* tbl, ray_t** key_cols, uint8_t* descs,
+                            uint8_t* nfs, uint8_t n_keys, int64_t k);
+
 /* ===== Executor API ===== */
 
 ray_t* ray_execute(ray_graph_t* g, ray_op_t* root);
diff --git a/src/ops/opt.c b/src/ops/opt.c
index c41b967e..61601542 100644
--- a/src/ops/opt.c
+++ b/src/ops/opt.c
@@ -2024,9 +2024,10 @@ ray_op_t* ray_optimize(ray_graph_t* g, ray_op_t* root) {
     pass_constant_fold(g, root);
     ray_profile_tick("constant fold");
 
-    /* Pass 3: Idiom rewrite */
+    /* Pass 3: Idiom rewrite (may replace the root, e.g. count(distinct)
+     * → count_distinct on a single-statement chain). */
     ray_profile_span_start("idiom");
-    ray_idiom_pass(g, root);
+    root = ray_idiom_pass(g, root);
     ray_profile_span_end("idiom");
     ray_profile_tick("idiom rewrite");
 
diff --git a/src/ops/query.c b/src/ops/query.c
index 4d437a60..95d0e414 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -242,7 +242,12 @@ static uint16_t resolve_agg_opcode(int64_t sym_id) {
 /* Apply sort (asc/desc) and take clauses to a materialized result table.
  * Used by eval-level paths that bypass the DAG (e.g., LIST/STR group keys).
  * Builds a temporary DAG for sorting (supports per-column direction flags)
- * and applies take via ray_head/ray_tail or ray_take_fn. */
+ * and applies take via ray_head/ray_tail or ray_take_fn.
+ *
+ * Top-K fast path: when the sort keys are plain column names, the take
+ * is an atom with K << nrows, and the result is a flat table with no
+ * LIST columns, dispatch to ray_topk_table / ray_topk_table_multi —
+ * bounded-heap selection in O(n log K) instead of full sort + gather. */
 static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n,
                               int64_t asc_id, int64_t desc_id, int64_t take_id) {
     if (!result || RAY_IS_ERR(result)) return result;
@@ -257,6 +262,108 @@ static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n,
     }
     if (!has_sort && !take_val_expr) return result;
 
+    /* ---- Top-K fast path detection ----
+     * Conditions:
+     *   - Every asc:/desc: clause names plain columns (≤16 keys total).
+     *   - take is an atom in [1, K_MAX], where K_MAX is well under nrows.
+     *   - result has no LIST columns (the topk gather handles LIST too,
+     *     but skip to keep the surface area small until we have LIST
+     *     test fixtures). Most benchmark workloads are LIST-free.
+     *
+     * Anything else falls through to the full-sort DAG path below. */
+    if (has_sort && take_val_expr && result->type == RAY_TABLE) {
+        /* Collect ALL sort keys (across asc:/desc: clauses) into a flat
+         * (sym, dir) list. Single-key takes the radix-encoded fast
+         * path; multi-key takes the comparator-based bounded heap. */
+        enum { TOPK_MAX_KEYS = 16 };
+        int64_t key_syms[TOPK_MAX_KEYS];
+        uint8_t key_descs[TOPK_MAX_KEYS];
+        uint8_t n_keys = 0;
+        int bad_clause = 0;
+        for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+            int64_t kid = dict_elems[i]->i64;
+            uint8_t is_desc = 0;
+            if (kid == asc_id) is_desc = 0;
+            else if (kid == desc_id) is_desc = 1;
+            else continue;
+            ray_t* val = dict_elems[i + 1];
+            if (!val) { bad_clause = 1; break; }
+            if (val->type == -RAY_SYM) {
+                if (n_keys >= TOPK_MAX_KEYS) { bad_clause = 1; break; }
+                key_syms[n_keys] = val->i64;
+                key_descs[n_keys] = is_desc;
+                n_keys++;
+            } else if (ray_is_vec(val) && val->type == RAY_SYM) {
+                for (int64_t c = 0; c < val->len; c++) {
+                    if (n_keys >= TOPK_MAX_KEYS) { bad_clause = 1; break; }
+                    key_syms[n_keys] = ray_read_sym(ray_data(val), c,
+                                                    val->type, val->attrs);
+                    key_descs[n_keys] = is_desc;
+                    n_keys++;
+                }
+                if (bad_clause) break;
+            } else {
+                /* Computed sort key (expression) — full DAG path handles it. */
+                bad_clause = 1;
+                break;
+            }
+        }
+        if (!bad_clause && n_keys > 0) {
+            /* Probe the take expression — only atom-K with K > 0 qualifies. */
+            ray_t* tv = ray_eval(take_val_expr);
+            if (tv && !RAY_IS_ERR(tv) && ray_is_atom(tv) &&
+                (tv->type == -RAY_I64 || tv->type == -RAY_I32)) {
+                int64_t k = (tv->type == -RAY_I64) ? tv->i64 : tv->i32;
+                ray_release(tv);
+                int64_t nrows = ray_table_nrows(result);
+                /* Bound K and the over-cardinality ratio: only useful
+                 * when K is well under nrows. Leave the take=full /
+                 * negative-take cases to the existing path. */
+                if (k > 0 && k < nrows && k <= 8192) {
+                    /* Reject LIST columns — full path handles those. */
+                    int has_list = 0;
+                    int64_t ncols = ray_table_ncols(result);
+                    for (int64_t c = 0; c < ncols; c++) {
+                        ray_t* col = ray_table_get_col_idx(result, c);
+                        if (col && col->type == RAY_LIST) { has_list = 1; break; }
+                    }
+                    if (!has_list) {
+                        ray_t* topk = NULL;
+                        if (n_keys == 1) {
+                            ray_t* sort_col = ray_table_get_col(result, key_syms[0]);
+                            if (sort_col) {
+                                topk = ray_topk_table(result, sort_col,
+                                                      key_descs[0], key_descs[0]
+                                                      /*nf=desc by default*/, k);
+                            }
+                        } else {
+                            ray_t* key_cols[TOPK_MAX_KEYS];
+                            uint8_t nfs[TOPK_MAX_KEYS];
+                            int ok = 1;
+                            for (uint8_t i = 0; i < n_keys; i++) {
+                                key_cols[i] = ray_table_get_col(result, key_syms[i]);
+                                nfs[i] = key_descs[i];
+                                if (!key_cols[i]) { ok = 0; break; }
+                            }
+                            if (ok) {
+                                topk = ray_topk_table_multi(result, key_cols,
+                                                            key_descs, nfs, n_keys, k);
+                            }
+                        }
+                        if (topk && !RAY_IS_ERR(topk)) {
+                            ray_release(result);
+                            return topk;
+                        }
+                        if (topk && RAY_IS_ERR(topk)) ray_release(topk);
+                        /* topk == NULL: unsupported config, fall through. */
+                    }
+                }
+            } else if (tv) {
+                ray_release(tv);
+            }
+        }
+    }
+
     /* Build temporary DAG on the materialized result */
     ray_graph_t* g = ray_graph_new(result);
     if (!g) return result;
@@ -1016,6 +1123,35 @@ static int is_agg_expr(ray_t* expr) {
     return resolve_agg_opcode(elems[0]->i64) != 0;
 }
 
+static int expr_contains_call_named(ray_t* expr, const char* name, size_t name_len) {
+    if (!expr) return 0;
+    if (expr->type != RAY_LIST) return 0;
+    ray_t** elems = (ray_t**)ray_data(expr);
+    int64_t n = ray_len(expr);
+    if (n <= 0) return 0;
+    ray_t* head = elems[0];
+    if (head && head->type == -RAY_SYM) {
+        ray_t* s = ray_sym_str(head->i64);
+        if (s && ray_str_len(s) == name_len &&
+            memcmp(ray_str_ptr(s), name, name_len) == 0)
+            return 1;
+    }
+    for (int64_t i = 0; i < n; i++)
+        if (expr_contains_call_named(elems[i], name, name_len))
+            return 1;
+    return 0;
+}
+
+/* True when a grouped aggregate expression can be lowered to OP_GROUP.
+ * `(count (distinct col))` is semantically an aggregate, but `distinct`
+ * is not a row-aligned DAG input inside GROUP. Route it through the
+ * per-group eval fallback so `distinct` sees each group's slice. */
+static int is_group_dag_agg_expr(ray_t* expr) {
+    if (!is_agg_expr(expr)) return 0;
+    ray_t** elems = (ray_t**)ray_data(expr);
+    return !expr_contains_call_named(elems[1], "distinct", 8);
+}
+
 /* True for `(fn arg ...)` where fn resolves to a RAY_UNARY marked
  * RAY_FN_AGGR — i.e. a builtin aggregator (sum/avg/min/max/count and
  * the non-whitelisted med/dev/var/stddev/etc). Used to route these
@@ -1034,6 +1170,41 @@ static int is_aggr_unary_call(ray_t* expr) {
     return (fn_obj->attrs & RAY_FN_AGGR) != 0;
 }
 
+static int is_streaming_aggr_unary_call(ray_t* expr) {
+    if (!is_aggr_unary_call(expr)) return 0;
+    ray_t** elems = (ray_t**)ray_data(expr);
+    return !expr_contains_call_named(elems[1], "distinct", 8);
+}
+
+/* Detect `(count (distinct <expr>))` exactly — the only shape that
+ * routes through the OP_COUNT_DISTINCT fast path per group. Returns
+ * the inner expression on success, NULL otherwise. More complex
+ * forms like `(count (distinct (+ col 1)))` are accepted; the inner
+ * expr is full-table-evaluable. Anything where the outer call is
+ * not a plain `(count …)` or the inner is not a plain `(distinct …)`
+ * is rejected so the eval fallback handles it.
*/ +static ray_t* match_count_distinct(ray_t* expr) { + if (!expr || expr->type != RAY_LIST) return NULL; + int64_t n = ray_len(expr); + if (n != 2) return NULL; + ray_t** elems = (ray_t**)ray_data(expr); + if (!elems[0] || elems[0]->type != -RAY_SYM) return NULL; + ray_t* nm = ray_sym_str(elems[0]->i64); + if (!nm || ray_str_len(nm) != 5 || + memcmp(ray_str_ptr(nm), "count", 5) != 0) return NULL; + + ray_t* inner = elems[1]; + if (!inner || inner->type != RAY_LIST) return NULL; + int64_t in_n = ray_len(inner); + if (in_n != 2) return NULL; + ray_t** in_elems = (ray_t**)ray_data(inner); + if (!in_elems[0] || in_elems[0]->type != -RAY_SYM) return NULL; + ray_t* dnm = ray_sym_str(in_elems[0]->i64); + if (!dnm || ray_str_len(dnm) != 8 || + memcmp(ray_str_ptr(dnm), "distinct", 8) != 0) return NULL; + return in_elems[1]; +} + /* Walk expr once, gather unique column-ref symbol ids that resolve to * columns of `tbl`. Dotted refs (`Timestamp.ss`) record the head * segment. Caps at `max_out` entries (16 is plenty for s: clauses); @@ -1358,6 +1529,154 @@ static ray_t* aggr_unary_per_group_buf(ray_t* expr, ray_t* tbl, return agg_vec; } +/* Per-group count(distinct) using the existing OP_COUNT_DISTINCT kernel. + * Mirrors aggr_unary_per_group_buf but slices the source column once per + * group and calls exec_count_distinct directly — bypasses the full + * ray_eval per-group path that re-walks the (count (distinct …)) AST + * for each slice. + * + * `inner_expr` is the operand to `distinct` extracted via + * match_count_distinct (typically a column ref, possibly a dotted-name + * or computed sub-expression). Returns an I64 vector of length + * n_groups with the per-group distinct count. */ +static ray_t* count_distinct_per_group_buf(ray_t* inner_expr, ray_t* tbl, + const int64_t* idx_buf, + const int64_t* offsets, + const int64_t* grp_cnt, + int64_t n_groups) { + /* Resolve the source vector — either a direct column ref (zero copy) + * or a full-table eval of the inner sub-expression. */ + ray_t* src = NULL; + if (inner_expr && inner_expr->type == -RAY_SYM && + (inner_expr->attrs & RAY_ATTR_NAME)) { + src = ray_table_get_col(tbl, inner_expr->i64); + if (src) ray_retain(src); + } + if (!src) { + if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL); + expr_bind_table_names(inner_expr, tbl); + src = ray_eval(inner_expr); + ray_env_pop_scope(); + if (!src || RAY_IS_ERR(src)) return src ? src : ray_error("domain", NULL); + } + + ray_t* out = ray_vec_new(RAY_I64, n_groups); + if (!out || RAY_IS_ERR(out)) { + ray_release(src); + return out ? out : ray_error("oom", NULL); + } + out->len = n_groups; + int64_t* odata = (int64_t*)ray_data(out); + + for (int64_t gi = 0; gi < n_groups; gi++) { + int64_t cnt = grp_cnt[gi]; + if (cnt == 0) { odata[gi] = 0; continue; } + /* gather_by_idx preserves the source's typed layout (I64 stays + * I64, SYM stays SYM with adaptive width, etc.) — exactly what + * exec_count_distinct expects. ray_at_fn would coerce numeric + * vec + numeric idx vec into a RAY_LIST of atoms, breaking the + * type-dispatch in exec_count_distinct. */ + ray_t* subset = gather_by_idx(src, + (int64_t*)&idx_buf[offsets[gi]], cnt); + if (!subset || RAY_IS_ERR(subset)) { + ray_t* err = subset ? subset : ray_error("oom", NULL); + ray_release(src); ray_release(out); + return err; + } + ray_t* cv = exec_count_distinct(NULL, NULL, subset); + ray_release(subset); + if (!cv || RAY_IS_ERR(cv)) { + ray_t* err = cv ? 
cv : ray_error("oom", NULL); + ray_release(src); ray_release(out); + return err; + } + /* exec_count_distinct returns an i64 atom. */ + odata[gi] = (cv->type == -RAY_I64) ? cv->i64 + : (cv->type == -RAY_I32) ? (int64_t)cv->i32 : 0; + ray_release(cv); + } + + ray_release(src); + return out; +} + +/* Variant for the LIST-`groups` layout used by the eval-fallback + * (ray_group_fn output is a 2-list of {key, idx_list} pairs). Slices + * via ray_at_fn the same way and dispatches to exec_count_distinct. */ +static ray_t* count_distinct_per_group_groups(ray_t* inner_expr, ray_t* tbl, + ray_t* groups, int64_t n_groups) { + ray_t* src = NULL; + if (inner_expr && inner_expr->type == -RAY_SYM && + (inner_expr->attrs & RAY_ATTR_NAME)) { + src = ray_table_get_col(tbl, inner_expr->i64); + if (src) ray_retain(src); + } + if (!src) { + if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL); + expr_bind_table_names(inner_expr, tbl); + src = ray_eval(inner_expr); + ray_env_pop_scope(); + if (!src || RAY_IS_ERR(src)) return src ? src : ray_error("domain", NULL); + } + + ray_t* out = ray_vec_new(RAY_I64, n_groups); + if (!out || RAY_IS_ERR(out)) { ray_release(src); return out ? out : ray_error("oom", NULL); } + out->len = n_groups; + int64_t* odata = (int64_t*)ray_data(out); + + ray_t** items = (ray_t**)ray_data(groups); + for (int64_t gi = 0; gi < n_groups; gi++) { + ray_t* idx_list = items[gi * 2 + 1]; + if (!idx_list) { odata[gi] = 0; continue; } + int64_t cnt = ray_len(idx_list); + if (cnt == 0) { odata[gi] = 0; continue; } + + /* idx_list from ray_group_fn is an I64 vector — gather_by_idx + * needs a raw int64_t* + count, so resolve the pointer either + * directly (typed I64 vec) or by walking the LIST cells. */ + ray_t* subset = NULL; + ray_t* tmp_hdr = NULL; + if (idx_list->type == RAY_I64) { + subset = gather_by_idx(src, (int64_t*)ray_data(idx_list), cnt); + } else { + /* Fallback: copy indices into a scratch buffer. Rare path — + * shouldn't trigger for well-formed ray_group_fn output. */ + int64_t* tmp = (int64_t*)scratch_alloc(&tmp_hdr, + (size_t)cnt * sizeof(int64_t)); + if (!tmp) { + ray_release(src); ray_release(out); + return ray_error("oom", NULL); + } + for (int64_t k = 0; k < cnt; k++) { + int alloc = 0; + ray_t* e = collection_elem(idx_list, k, &alloc); + tmp[k] = e ? as_i64(e) : 0; + if (alloc && e) ray_release(e); + } + subset = gather_by_idx(src, tmp, cnt); + scratch_free(tmp_hdr); + } + if (!subset || RAY_IS_ERR(subset)) { + ray_t* err = subset ? subset : ray_error("oom", NULL); + ray_release(src); ray_release(out); + return err; + } + ray_t* cv = exec_count_distinct(NULL, NULL, subset); + ray_release(subset); + if (!cv || RAY_IS_ERR(cv)) { + ray_t* err = cv ? cv : ray_error("oom", NULL); + ray_release(src); ray_release(out); + return err; + } + odata[gi] = (cv->type == -RAY_I64) ? cv->i64 + : (cv->type == -RAY_I32) ? 
(int64_t)cv->i32 : 0; + ray_release(cv); + } + + ray_release(src); + return out; +} + /* Forward declarations for eval-level groupby fallback */ /* (select {from: t [where: pred] [by: key] [col: expr ...]}) @@ -1854,7 +2173,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { int64_t kid = dict_elems[i]->i64; if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; - if (!is_agg_expr(dict_elems[i + 1])) { any_nonagg = 1; break; } + if (!is_group_dag_agg_expr(dict_elems[i + 1])) { any_nonagg = 1; break; } } } @@ -1885,7 +2204,21 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { * DAG path (exec_group handles wide keys correctly * and stays parallel / segment-streamed on parted * tables). */ + use_eval_group = 1; + } + } + if (!use_eval_group && by_expr->type == RAY_SYM && ray_len(by_expr) > 1) { + int64_t nk = ray_len(by_expr); + int64_t* sym_ids = (int64_t*)ray_data(by_expr); + for (int64_t k = 0; k < nk; k++) { + ray_t* key_col = ray_table_get_col(tbl, sym_ids[k]); + if (!key_col) continue; + int8_t kct = key_col->type; + if (RAY_IS_PARTED(kct)) kct = (int8_t)RAY_PARTED_BASETYPE(kct); + if (kct == RAY_LIST || kct == RAY_STR) { use_eval_group = 1; + break; + } } } /* Non-aggregation expressions (arithmetic, lambda, etc.) are @@ -1924,13 +2257,240 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { } else { ray_graph_free(g); g = NULL; } - /* eval_group path supports only simple scalar / [col] by-forms; - * multi-key and computed keys shouldn't land here. */ - if (by_key_sym < 0) { + if (by_key_sym < 0 && by_expr->type == RAY_SYM && ray_len(by_expr) > 1) { + int64_t nk = ray_len(by_expr); + int64_t* key_syms = (int64_t*)ray_data(by_expr); + int64_t nrows = ray_table_nrows(eval_tbl); + ray_t* key_cols[16]; + if (nk <= 0 || nk > 16) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return ray_error("domain", "eval-level multi-key groupby requires 1..16 keys"); + } + for (int64_t k = 0; k < nk; k++) { + key_cols[k] = ray_table_get_col(eval_tbl, key_syms[k]); + if (!key_cols[k]) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return ray_error("domain", "group key column not found"); + } + } + + ray_t* composite_keys = ray_list_new(nrows); + if (!composite_keys || RAY_IS_ERR(composite_keys)) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return composite_keys ? composite_keys : ray_error("oom", NULL); + } + for (int64_t r = 0; r < nrows; r++) { + ray_t* row_key = ray_list_new(nk); + if (!row_key || RAY_IS_ERR(row_key)) { + ray_release(composite_keys); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return row_key ? row_key : ray_error("oom", NULL); + } + for (int64_t k = 0; k < nk; k++) { + int alloc = 0; + ray_t* cell = collection_elem(key_cols[k], r, &alloc); + if (!cell || RAY_IS_ERR(cell)) { + ray_release(row_key); + ray_release(composite_keys); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return cell ? cell : ray_error("domain", NULL); + } + row_key = ray_list_append(row_key, cell); + if (alloc) ray_release(cell); + if (!row_key || RAY_IS_ERR(row_key)) { + ray_release(composite_keys); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return row_key ? 
row_key : ray_error("oom", NULL); + } + } + composite_keys = ray_list_append(composite_keys, row_key); + ray_release(row_key); + if (!composite_keys || RAY_IS_ERR(composite_keys)) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return composite_keys ? composite_keys : ray_error("oom", NULL); + } + } + + ray_t* groups_dict = ray_group_fn(composite_keys); + ray_release(composite_keys); + if (!groups_dict || RAY_IS_ERR(groups_dict)) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return groups_dict ? groups_dict : ray_error("domain", NULL); + } + ray_t* groups = groups_to_pair_list(groups_dict); + ray_release(groups_dict); + if (!groups || RAY_IS_ERR(groups)) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return groups ? groups : ray_error("domain", NULL); + } + int64_t n_groups = ray_len(groups) / 2; + + int n_agg_out = 0; + int64_t agg_names[16]; + ray_t* agg_results[16] = {0}; + for (int64_t i = 0; i + 1 < dict_n && n_agg_out < 16; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == asc_id || kid == desc_id) continue; + ray_t* val_expr_item = dict_elems[i + 1]; + + /* Per-group count(distinct) — bypass full ray_eval per + * group and dispatch directly to exec_count_distinct on + * each group's slice. Same kernel the standalone + * `(count (distinct col))` fast path uses. */ + ray_t* cd_inner = match_count_distinct(val_expr_item); + if (cd_inner) { + ray_t* per_group = count_distinct_per_group_groups( + cd_inner, eval_tbl, groups, n_groups); + if (!per_group || RAY_IS_ERR(per_group)) { + for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return per_group ? per_group : ray_error("domain", NULL); + } + agg_names[n_agg_out] = kid; + agg_results[n_agg_out] = per_group; + n_agg_out++; + continue; + } + + if (is_streaming_aggr_unary_call(val_expr_item)) { + ray_t** agg_elems = (ray_t**)ray_data(val_expr_item); + ray_t* agg_fn_name = agg_elems[0]; + ray_t* agg_col_expr = agg_elems[1]; + ray_t* src_col_val = NULL; + if (agg_col_expr->type == -RAY_SYM && (agg_col_expr->attrs & RAY_ATTR_NAME)) { + src_col_val = ray_table_get_col(eval_tbl, agg_col_expr->i64); + if (src_col_val) ray_retain(src_col_val); + } + if (!src_col_val) { + src_col_val = ray_eval(agg_col_expr); + if (!src_col_val || RAY_IS_ERR(src_col_val)) { + for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return src_col_val ? 
src_col_val : ray_error("domain", NULL); + } + } + + ray_t* agg_vec = NULL; + ray_t** grp_items = (ray_t**)ray_data(groups); + for (int64_t gi = 0; gi < n_groups; gi++) { + ray_t* idx_list = grp_items[gi * 2 + 1]; + ray_t* subset = ray_at_fn(src_col_val, idx_list); + if (!subset || RAY_IS_ERR(subset)) continue; + ray_t* agg_val = NULL; + ray_t* fn_obj = ray_env_get(agg_fn_name->i64); + if (fn_obj && fn_obj->type == RAY_UNARY) { + ray_unary_fn uf = (ray_unary_fn)(uintptr_t)fn_obj->i64; + agg_val = uf(subset); + } + ray_release(subset); + if (!agg_val || RAY_IS_ERR(agg_val)) continue; + if (!agg_vec) { + int8_t vt = -(agg_val->type); + agg_vec = ray_vec_new(vt, n_groups); + if (!agg_vec || RAY_IS_ERR(agg_vec)) { ray_release(agg_val); break; } + agg_vec->len = n_groups; + } + store_typed_elem(agg_vec, gi, agg_val); + ray_release(agg_val); + } + ray_release(src_col_val); + agg_names[n_agg_out] = kid; + agg_results[n_agg_out] = agg_vec; + n_agg_out++; + } else { + ray_t* per_group = nonagg_eval_per_group(val_expr_item, eval_tbl, groups, n_groups); + if (!per_group || RAY_IS_ERR(per_group)) { + for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return per_group ? per_group : ray_error("domain", NULL); + } + agg_names[n_agg_out] = kid; + agg_results[n_agg_out] = per_group; + n_agg_out++; + } + } + + ray_t* result = ray_table_new(nk + n_agg_out); + if (!result || RAY_IS_ERR(result)) { + for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return result ? result : ray_error("oom", NULL); + } + ray_t** grp_items = (ray_t**)ray_data(groups); + for (int64_t k = 0; k < nk; k++) { + ray_t* src = key_cols[k]; + int8_t kt = src->type; + if (RAY_IS_PARTED(kt)) kt = (int8_t)RAY_PARTED_BASETYPE(kt); + ray_t* key_vec = NULL; + if (kt == RAY_STR) { + key_vec = ray_vec_new(RAY_STR, n_groups); + for (int64_t gi = 0; gi < n_groups && key_vec && !RAY_IS_ERR(key_vec); gi++) { + ray_t* row_key = grp_items[gi * 2]; + ray_t* cell = (row_key && row_key->type == RAY_LIST && k < row_key->len) + ? ((ray_t**)ray_data(row_key))[k] : NULL; + const char* sp = cell ? ray_str_ptr(cell) : ""; + size_t slen = cell ? ray_str_len(cell) : 0; + key_vec = ray_str_vec_append(key_vec, sp ? sp : "", sp ? slen : 0); + } + } else { + key_vec = (kt == RAY_SYM) + ? ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, n_groups) + : ray_vec_new(kt, n_groups); + if (key_vec && !RAY_IS_ERR(key_vec)) { + key_vec->len = n_groups; + memset(ray_data(key_vec), 0, (size_t)n_groups * ray_sym_elem_size(kt, key_vec->attrs)); + for (int64_t gi = 0; gi < n_groups; gi++) { + ray_t* row_key = grp_items[gi * 2]; + ray_t* cell = (row_key && row_key->type == RAY_LIST && k < row_key->len) + ? ((ray_t**)ray_data(row_key))[k] : NULL; + if (cell) store_typed_elem(key_vec, gi, cell); + } + } + } + if (!key_vec || RAY_IS_ERR(key_vec)) { + for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); + ray_release(result); ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return key_vec ? 
key_vec : ray_error("oom", NULL); + } + result = ray_table_add_col(result, key_syms[k], key_vec); + ray_release(key_vec); + if (RAY_IS_ERR(result)) { + for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return result; + } + } + for (int ai = 0; ai < n_agg_out; ai++) { + if (agg_results[ai]) { + result = ray_table_add_col(result, agg_names[ai], agg_results[ai]); + ray_release(agg_results[ai]); + if (RAY_IS_ERR(result)) { ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); return result; } + } + } + + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); - return ray_error("nyi", "eval-level groupby requires scalar key"); + return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); } + + /* eval_group path supports only simple scalar / [col] by-forms; + * computed keys shouldn't land here. */ + if (by_key_sym < 0) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return ray_error("nyi", "eval-level groupby requires scalar key"); + } ray_t* key_col = ray_table_get_col(eval_tbl, by_key_sym); /* Fast path: (select {from: t by: k}) with no aggs and @@ -2160,7 +2720,26 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; ray_t* val_expr_item = dict_elems[i + 1]; - if (is_aggr_unary_call(val_expr_item)) { + /* Per-group count(distinct) — bypass full ray_eval per + * group and dispatch directly to exec_count_distinct. */ + { + ray_t* cd_inner = match_count_distinct(val_expr_item); + if (cd_inner) { + ray_t* per_group = count_distinct_per_group_groups( + cd_inner, eval_tbl, groups, n_groups); + if (!per_group || RAY_IS_ERR(per_group)) { + for (int ai = 0; ai < n_agg_out; ai++) { if (agg_results[ai]) ray_release(agg_results[ai]); } + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return per_group ? per_group : ray_error("domain", NULL); + } + agg_names[n_agg_out] = kid; + agg_results[n_agg_out] = per_group; + n_agg_out++; + continue; + } + } + + if (is_streaming_aggr_unary_call(val_expr_item)) { /* Streaming-style per-group AGG branch. Accepts both * the resolve_agg_opcode whitelist (sum/avg/min/max/...) * and the broader RAY_FN_AGGR + RAY_UNARY set @@ -2215,6 +2794,20 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { agg_results[n_agg_out] = agg_vec; n_agg_out++; } else { + if (is_agg_expr(val_expr_item)) { + ray_t* per_group = nonagg_eval_per_group( + val_expr_item, eval_tbl, groups, n_groups); + if (RAY_IS_ERR(per_group)) { + for (int ai = 0; ai < n_agg_out; ai++) { if (agg_results[ai]) ray_release(agg_results[ai]); } + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return per_group; + } + agg_names[n_agg_out] = kid; + agg_results[n_agg_out] = per_group; + n_agg_out++; + continue; + } + /* Non-aggregation expression: evaluate on full table, * then gather per-group subsets into a LIST column * (non-agg produces list-of-vectors). 
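+                 * For example a bare column output such as `vals: v`
+                 * lands here: v is evaluated once over the whole table
+                 * and each group then gets its own sub-vector of rows.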
*/ @@ -2450,7 +3043,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { int64_t kid = dict_elems[i]->i64; if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; - if (!is_agg_expr(dict_elems[i + 1])) { has_nonagg = 1; break; } + if (!is_group_dag_agg_expr(dict_elems[i + 1])) { has_nonagg = 1; break; } } /* The post-DAG scatter needs a flat single-segment table: it @@ -2565,14 +3158,14 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; ray_t* val_expr = dict_elems[i + 1]; - if (is_agg_expr(val_expr) && n_aggs < 16) { + if (is_group_dag_agg_expr(val_expr) && n_aggs < 16) { ray_t** agg_elems = (ray_t**)ray_data(val_expr); agg_ops[n_aggs] = resolve_agg_opcode(agg_elems[0]->i64); /* Compile the aggregation input (the column reference) */ agg_ins[n_aggs] = compile_expr_dag(g, agg_elems[1]); if (!agg_ins[n_aggs]) { ray_graph_free(g); ray_release(tbl); return ray_error("domain", NULL); } n_aggs++; - } else if (!is_agg_expr(val_expr) && n_nonaggs < 16) { + } else if (!is_group_dag_agg_expr(val_expr) && n_nonaggs < 16) { nonagg_names[n_nonaggs] = kid; nonagg_exprs[n_nonaggs] = val_expr; n_nonaggs++; @@ -3467,7 +4060,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; if (n_all_user < 16) all_user_names[n_all_user++] = kid; - if (by_expr && !is_agg_expr(dict_elems[i + 1])) continue; + if (by_expr && !is_group_dag_agg_expr(dict_elems[i + 1])) continue; if (n_agg_user < 16) agg_user_names[n_agg_user++] = kid; } if (by_expr) { @@ -3612,14 +4205,101 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { KEY_READ(gk[gi], grp_key, gkt, gi); /* Build row→group_id map. Rows whose key isn't in the - * surviving group set get row_gid = -1 and are skipped. */ - for (int64_t r = 0; r < nrows; r++) { - int64_t rv; - KEY_READ(rv, orig_key, okt, r); - row_gid[r] = -1; + * surviving group set get row_gid = -1 and are skipped. + * + * For high group cardinality (n_groups large), the naive + * O(nrows * n_groups) double loop dominated runtime — + * 5M * 730K ≈ 4T comparisons. Build a value→gid hash + * instead so each row is one O(1) probe. */ + { + /* Capacity: 2 * n_groups rounded up to power of 2. + * Slot stores gid+1 (0 = empty) and the int64 key. */ + uint64_t cap = (uint64_t)n_groups * 2; + if (cap < 32) cap = 32; + uint64_t c = 1; + while (c && c < cap) c <<= 1; + if (!c) { + ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); + ray_free(off_hdr); ray_free(pos_hdr); + ray_release(result); ray_release(tbl); + return ray_error("oom", NULL); + } + cap = c; + uint64_t mask = cap - 1; + ray_t* gk_keys_hdr = NULL; + ray_t* gk_idx_hdr = NULL; + int64_t* hk_keys = (int64_t*)scratch_alloc(&gk_keys_hdr, + (size_t)cap * sizeof(int64_t)); + int32_t* hk_gid_p1 = (int32_t*)scratch_calloc(&gk_idx_hdr, + (size_t)cap * sizeof(int32_t)); + if (!hk_keys || !hk_gid_p1) { + if (gk_keys_hdr) scratch_free(gk_keys_hdr); + if (gk_idx_hdr) scratch_free(gk_idx_hdr); + ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); + ray_free(off_hdr); ray_free(pos_hdr); + ray_release(result); ray_release(tbl); + return ray_error("oom", NULL); + } + + /* If n_groups exceeds the int32 sentinel range we'd + * lose distinct gids — fall back to the int64 store + * (rare: n_groups > ~2.1 B). Otherwise i32+1 fits. 
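+             * For example with the ~730K groups above: cap = 2 * 730K
+             * rounded up to the next power of two is 2^21 = 2,097,152
+             * slots, and every gid+1 fits the i32 store with a huge
+             * margin.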
*/ + int use_i64_gid = (n_groups >= ((int64_t)1 << 31) - 1); + ray_t* gk64_hdr = NULL; + int64_t* hk_gid64 = NULL; + if (use_i64_gid) { + hk_gid64 = (int64_t*)scratch_calloc(&gk64_hdr, + (size_t)cap * sizeof(int64_t)); + if (!hk_gid64) { + scratch_free(gk_keys_hdr); scratch_free(gk_idx_hdr); + ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); + ray_free(off_hdr); ray_free(pos_hdr); + ray_release(result); ray_release(tbl); + return ray_error("oom", NULL); + } + } + + /* Insert (gk[gi] -> gi) into the hash. */ for (int64_t gi = 0; gi < n_groups; gi++) { - if (rv == gk[gi]) { row_gid[r] = gi; break; } + int64_t k = gk[gi]; + uint64_t h = (uint64_t)k * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + uint64_t s = h & mask; + for (;;) { + int64_t cur_p1 = use_i64_gid ? hk_gid64[s] + : (int64_t)hk_gid_p1[s]; + if (cur_p1 == 0) { + if (use_i64_gid) hk_gid64[s] = gi + 1; + else hk_gid_p1[s] = (int32_t)(gi + 1); + hk_keys[s] = k; + break; + } + if (hk_keys[s] == k) break; /* dup gk — keep first */ + s = (s + 1) & mask; + } + } + + /* Probe each row to assign its gid. */ + for (int64_t r = 0; r < nrows; r++) { + int64_t rv; + KEY_READ(rv, orig_key, okt, r); + uint64_t h = (uint64_t)rv * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + uint64_t s = h & mask; + int64_t found = -1; + for (;;) { + int64_t cur_p1 = use_i64_gid ? hk_gid64[s] + : (int64_t)hk_gid_p1[s]; + if (cur_p1 == 0) break; + if (hk_keys[s] == rv) { found = cur_p1 - 1; break; } + s = (s + 1) & mask; + } + row_gid[r] = found; } + + scratch_free(gk_keys_hdr); + scratch_free(gk_idx_hdr); + if (gk64_hdr) scratch_free(gk64_hdr); } #undef KEY_READ @@ -3650,6 +4330,45 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { ray_t* scatter_err = NULL; for (uint8_t ni = 0; ni < n_nonaggs && !scatter_err; ni++) { + /* Per-group count(distinct) — dispatch directly to + * exec_count_distinct on each group's slice using + * the same idx_buf+offsets+grp_cnt layout the + * streaming-AGG branch uses. + * + * High-cardinality grouping: try the single-pass + * global-hash kernel first. Falls back to the + * per-group slice path on type miss / error. */ + ray_t* cd_inner = match_count_distinct(nonagg_exprs[ni]); + if (cd_inner) { + ray_t* col = NULL; + /* Resolve the inner column for the global-hash + * fast path. Direct column refs hit the path; + * computed expressions use the per-group fallback. */ + ray_t* src_for_global = NULL; + int src_owned = 0; + if (cd_inner->type == -RAY_SYM && + (cd_inner->attrs & RAY_ATTR_NAME)) { + src_for_global = ray_table_get_col(tbl, cd_inner->i64); + } + if (src_for_global) { + col = ray_count_distinct_per_group( + src_for_global, row_gid, nrows, n_groups); + /* col == NULL → unsupported type, fall through. */ + } + if (src_owned && src_for_global) ray_release(src_for_global); + if (!col) { + col = count_distinct_per_group_buf( + cd_inner, tbl, idx_buf, offsets, grp_cnt, n_groups); + } + if (RAY_IS_ERR(col)) { scatter_err = col; break; } + result = ray_table_add_col(result, nonagg_names[ni], col); + ray_release(col); + if (RAY_IS_ERR(result)) { + scatter_err = result; result = NULL; break; + } + continue; + } + /* Streaming-style fast path for `(aggr_fn col_or_expr)` * where aggr_fn is RAY_FN_AGGR + RAY_UNARY (sum/avg/..., * med/dev/var/stddev/...). Bypasses the full-table eval @@ -3657,7 +4376,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { * group and calling the unary fn directly into a typed * vec. Equivalent perf-class to the streaming AGG path * the eval-fallback uses for the same shapes. 
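+                 * For example `(med v)` reaches this branch: med is a
+                 * RAY_FN_AGGR unary builtin that is not on the
+                 * resolve_agg_opcode whitelist, so it is not lowered to
+                 * OP_GROUP and is reduced per group here instead.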
*/ - if (is_aggr_unary_call(nonagg_exprs[ni])) { + if (is_streaming_aggr_unary_call(nonagg_exprs[ni])) { ray_t* col = aggr_unary_per_group_buf( nonagg_exprs[ni], tbl, idx_buf, offsets, grp_cnt, n_groups); @@ -3670,6 +4389,20 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { continue; } + if (is_agg_expr(nonagg_exprs[ni])) { + ray_t* per_group = nonagg_eval_per_group_buf( + nonagg_exprs[ni], tbl, idx_buf, offsets, grp_cnt, n_groups); + if (RAY_IS_ERR(per_group)) { + scatter_err = per_group; break; + } + result = ray_table_add_col(result, nonagg_names[ni], per_group); + ray_release(per_group); + if (RAY_IS_ERR(result)) { + scatter_err = result; result = NULL; break; + } + continue; + } + if (ray_env_push_scope() != RAY_OK) { scatter_err = ray_error("oom", NULL); break; } @@ -3803,6 +4536,93 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { /* (xbar col bucket) — time/value bucketing: floor(col/bucket)*bucket */ ray_t* ray_xbar_fn(ray_t* col, ray_t* bucket) { + /* Vectorised fast path for `(xbar VEC scalar_int)` on integer or + * temporal columns. The generic atomic_map_binary path was + * allocating one ray_t* atom per row and calling ray_xbar_fn + * recursively — at 5M rows this dominates (≥100 ms). A direct + * tight loop computes floor-div + multiply per element with no + * allocations. When the bucket is a power of two we lower the + * divide further to mask + arithmetic. + * + * Short-circuited only when both bucket and col are well-typed; + * everything else falls through to the recursive + * atomic_map_binary path. */ + if (col && ray_is_vec(col) && bucket && ray_is_atom(bucket) && + (bucket->type == -RAY_I64 || bucket->type == -RAY_I32 || + bucket->type == -RAY_I16) && + (col->type == RAY_I64 || col->type == RAY_I32 || + col->type == RAY_I16 || col->type == RAY_TIMESTAMP || + col->type == RAY_DATE || col->type == RAY_TIME) && + !RAY_ATOM_IS_NULL(bucket)) { + int64_t b = bucket->i64; + if (b == 0) return ray_error("domain", NULL); + int64_t n = col->len; + ray_t* out = ray_vec_new(col->type, n); + if (!out || RAY_IS_ERR(out)) return out ? out : ray_error("oom", NULL); + out->len = n; + + /* Compute (q*b) where q = floor(a/b). C division truncates + * toward zero; for negative dividend we adjust. */ + int8_t out_type = col->type; + if (out_type == RAY_I64 || out_type == RAY_TIMESTAMP) { + const int64_t* in = (const int64_t*)ray_data(col); + int64_t* o = (int64_t*)ray_data(out); + if (b > 0 && (b & (b - 1)) == 0) { + /* Bucket is a power of two on a non-negative-friendly path: + * a/b == a >> log2(b), but still need the floor adjustment + * for negative inputs. Use bitmask: q*b = a & ~(b-1) for + * non-negative `a`. For mixed-sign data this falls back + * to the general path. */ + int64_t mask = ~(b - 1); + for (int64_t i = 0; i < n; i++) { + int64_t a = in[i]; + /* Floor toward -inf for negative a too: a & mask. 
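+                     * e.g. b = 8: a = 13 gives 13 & ~7 = 8, and a = -3
+                     * gives -3 & ~7 = -8, which is 8 * floor(-3/8).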
*/ + o[i] = a & mask; + } + } else { + for (int64_t i = 0; i < n; i++) { + int64_t a = in[i]; + int64_t q = a / b; + if ((a ^ b) < 0 && q * b != a) q--; + o[i] = q * b; + } + } + } else if (out_type == RAY_I32 || out_type == RAY_DATE || out_type == RAY_TIME) { + const int32_t* in = (const int32_t*)ray_data(col); + int32_t* o = (int32_t*)ray_data(out); + int32_t b32 = (int32_t)b; + if (b32 > 0 && ((uint32_t)b32 & ((uint32_t)b32 - 1)) == 0) { + int32_t mask = (int32_t)~((uint32_t)b32 - 1); + for (int64_t i = 0; i < n; i++) o[i] = in[i] & mask; + } else { + for (int64_t i = 0; i < n; i++) { + int32_t a = in[i]; + int32_t q = a / b32; + if ((a ^ b32) < 0 && q * b32 != a) q--; + o[i] = q * b32; + } + } + } else { /* RAY_I16 */ + const int16_t* in = (const int16_t*)ray_data(col); + int16_t* o = (int16_t*)ray_data(out); + int16_t b16 = (int16_t)b; + for (int64_t i = 0; i < n; i++) { + int16_t a = in[i]; + int16_t q = a / b16; + if ((a ^ b16) < 0 && q * b16 != a) q--; + o[i] = q * b16; + } + } + + /* Propagate null bitmap if present. */ + if (col->attrs & RAY_ATTR_HAS_NULLS) { + for (int64_t i = 0; i < n; i++) + if (ray_vec_is_null(col, i)) + ray_vec_set_null(out, i, true); + } + return out; + } + /* Recursive unwrap for nested collections (list of vectors) */ if (is_collection(col) || is_collection(bucket)) return atomic_map_binary(ray_xbar_fn, col, bucket); diff --git a/src/ops/sort.c b/src/ops/sort.c index f9a701a1..a5875e27 100644 --- a/src/ops/sort.c +++ b/src/ops/sort.c @@ -3062,6 +3062,295 @@ str_msd_done:; return result; } +static void topk_cmp_sift_down(const sort_cmp_ctx_t* ctx, int64_t* heap, + int64_t n, int64_t root) { + for (;;) { + int64_t worst = root; + int64_t l = 2 * root + 1; + int64_t r = 2 * root + 2; + if (l < n && sort_cmp(ctx, heap[l], heap[worst]) > 0) worst = l; + if (r < n && sort_cmp(ctx, heap[r], heap[worst]) > 0) worst = r; + if (worst == root) break; + int64_t tmp = heap[root]; + heap[root] = heap[worst]; + heap[worst] = tmp; + root = worst; + } +} + +/* Comparator-based top-K: works for any sort key types and any number of + * keys (1..n). Used as the fallback when radix-encoded fast-path is not + * applicable (e.g. SYM, STR, multi-key). O(n log K + K log K). */ +static ray_t* topk_indices_cmp(ray_t** cols, uint8_t* descs, uint8_t* nfs, + uint8_t n_cols, int64_t nrows, int64_t k) { + if (!cols || n_cols == 0 || k <= 0 || nrows <= 0 || k >= nrows) return NULL; + for (uint8_t c = 0; c < n_cols; c++) if (!cols[c]) return NULL; + + ray_t* idx = ray_vec_new(RAY_I64, k); + if (!idx || RAY_IS_ERR(idx)) return idx ? 
idx : ray_error("oom", NULL); + idx->len = k; + int64_t* heap = (int64_t*)ray_data(idx); + for (int64_t i = 0; i < k; i++) heap[i] = i; + + sort_cmp_ctx_t ctx = { + .vecs = cols, + .desc = descs, + .nulls_first = nfs, + .n_sort = n_cols, + }; + + for (int64_t i = k / 2 - 1; i >= 0; i--) + topk_cmp_sift_down(&ctx, heap, k, i); + + for (int64_t i = k; i < nrows; i++) { + if (sort_cmp(&ctx, i, heap[0]) >= 0) continue; + heap[0] = i; + topk_cmp_sift_down(&ctx, heap, k, 0); + } + + for (int64_t i = 1; i < k; i++) { + int64_t v = heap[i]; + int64_t j = i - 1; + while (j >= 0 && sort_cmp(&ctx, v, heap[j]) < 0) { + heap[j + 1] = heap[j]; + j--; + } + heap[j + 1] = v; + } + + return idx; +} + +static ray_t* topk_indices_cmp_single(ray_t* col, uint8_t desc, uint8_t nf, + int64_t nrows, int64_t k) { + ray_t* cols[1] = { col }; + uint8_t descs[1] = { desc }; + uint8_t nfs[1] = { nf }; + return topk_indices_cmp(cols, descs, nfs, 1, nrows, k); +} + +/* -------------------------------------------------------------------------- + * Top-K bounded-heap selection on a single sort key. + * + * Replaces a full O(n log n) sort + take-K with O(n log K + K log K) when + * K << n. At plan time, the apply_sort_take / projection paths detect + * "single sort key + small atom take" and call this in lieu of OP_SORT + + * OP_HEAD. Multi-key, take-range, or take-K-near-n cases keep the + * existing fused sort+limit path (which is already O(n log n) bounded + * with K-row gather). + * + * Implementation: encode each row's key to a uint64 (same encoding + * radix_encode_fn uses, so smaller key = earlier in ASC order, with DESC + * already pre-flipped). Maintain a max-heap of K (key, original_idx) + * pairs; for each row r > K, if r's encoded key is smaller than the + * heap-top key, replace the top and sift down. After the scan, sort + * the K (key, idx) pairs by key ascending — the result is the top-K + * indices in the user's requested order. + * + * Supported types: I64, I32, I16, U8, BOOL, F64, DATE, TIME, + * TIMESTAMP, plus SYM via a comparator heap. STR/GUID fall through + * to the caller (return NULL → caller uses full sort). Returns NULL + * on any unsupported configuration so the caller's fallback path + * handles it. + * -------------------------------------------------------------------------- */ +static ray_t* topk_indices_single(ray_t* col, uint8_t desc, uint8_t nf, + int64_t nrows, int64_t k) { + if (!col || k <= 0 || nrows <= 0) return NULL; + if (k >= nrows) return NULL; /* full sort is at least as good */ + + int8_t type = col->type; + /* Whitelist of types where radix_encode_fn produces an order-preserving + * uint64 — exactly the cases topk can handle without a comparator. */ + bool ok = (type == RAY_I64 || type == RAY_TIMESTAMP || type == RAY_F64 || + type == RAY_I32 || type == RAY_DATE || type == RAY_TIME || + type == RAY_SYM || type == RAY_I16 || + type == RAY_BOOL || type == RAY_U8); + if (!ok) return NULL; + + if (type == RAY_SYM) + return topk_indices_cmp_single(col, desc, nf, nrows, k); + + /* Encode all rows to a single uint64 key array. 
*/ + ray_t* keys_hdr = NULL; + uint64_t* keys = (uint64_t*)scratch_alloc(&keys_hdr, + (size_t)nrows * sizeof(uint64_t)); + if (!keys) return NULL; + + radix_encode_ctx_t enc = { + .keys = keys, + .indices = NULL, + .data = ray_data(col), + .col = col, + .type = type, + .col_attrs = col->attrs, + .desc = desc != 0, + .nulls_first = nf != 0, + .enum_rank = NULL, + .n_keys = 1, + }; + /* Single-threaded encode is plenty for the heap pass that follows; + * radix_encode_fn handles the type/desc/nulls dispatch correctly. */ + radix_encode_fn(&enc, 0, 0, nrows); + + /* Max-heap of K (key, idx) pairs. Stored in two parallel arrays + * for cache locality on the comparison path. */ + ray_t* hk_hdr = NULL; + ray_t* hi_hdr = NULL; + uint64_t* hk = (uint64_t*)scratch_alloc(&hk_hdr, (size_t)k * sizeof(uint64_t)); + int64_t* hi = (int64_t*)scratch_alloc(&hi_hdr, (size_t)k * sizeof(int64_t)); + if (!hk || !hi) { + if (hk_hdr) scratch_free(hk_hdr); + if (hi_hdr) scratch_free(hi_hdr); + scratch_free(keys_hdr); + return NULL; + } + + /* Seed with the first K rows. */ + for (int64_t i = 0; i < k; i++) { hk[i] = keys[i]; hi[i] = i; } + + /* Heapify (build max-heap on hk[]). */ + for (int64_t i = k / 2 - 1; i >= 0; i--) { + int64_t idx = i; + for (;;) { + int64_t largest = idx; + int64_t l = 2 * idx + 1, r = 2 * idx + 2; + if (l < k && hk[l] > hk[largest]) largest = l; + if (r < k && hk[r] > hk[largest]) largest = r; + if (largest == idx) break; + uint64_t tk = hk[idx]; hk[idx] = hk[largest]; hk[largest] = tk; + int64_t ti = hi[idx]; hi[idx] = hi[largest]; hi[largest] = ti; + idx = largest; + } + } + + /* Scan remaining rows, push when the new key is strictly smaller + * than heap-top. Sift the new root down to restore the max-heap. */ + for (int64_t i = k; i < nrows; i++) { + if (keys[i] >= hk[0]) continue; + hk[0] = keys[i]; + hi[0] = i; + int64_t idx = 0; + for (;;) { + int64_t largest = idx; + int64_t l = 2 * idx + 1, r = 2 * idx + 2; + if (l < k && hk[l] > hk[largest]) largest = l; + if (r < k && hk[r] > hk[largest]) largest = r; + if (largest == idx) break; + uint64_t tk = hk[idx]; hk[idx] = hk[largest]; hk[largest] = tk; + int64_t ti = hi[idx]; hi[idx] = hi[largest]; hi[largest] = ti; + idx = largest; + } + } + + /* The heap contains the K best (smallest key) rows but unsorted. + * Sort by key ascending so the gather order matches a full sort. */ + key_heapsort(hk, hi, k); + + /* Build the result I64 vec of indices. */ + ray_t* result = ray_vec_new(RAY_I64, k); + if (!result || RAY_IS_ERR(result)) { + scratch_free(hk_hdr); scratch_free(hi_hdr); + scratch_free(keys_hdr); + return result ? result : ray_error("oom", NULL); + } + result->len = k; + memcpy(ray_data(result), hi, (size_t)k * sizeof(int64_t)); + + scratch_free(hk_hdr); scratch_free(hi_hdr); + scratch_free(keys_hdr); + return result; +} + +/* Gather K rows of `tbl` at the given indices and return a new table. + * Used by both single-key and multi-key top-K paths. Releases `idx`. 
*/ +static ray_t* topk_gather_rows(ray_t* tbl, ray_t* idx, int64_t k) { + int64_t* idx_data = (int64_t*)ray_data(idx); + int64_t ncols = ray_table_ncols(tbl); + + ray_t* result = ray_table_new(ncols); + if (!result || RAY_IS_ERR(result)) { ray_release(idx); return result; } + for (int64_t c = 0; c < ncols; c++) { + ray_t* src = ray_table_get_col_idx(tbl, c); + int64_t name = ray_table_col_name(tbl, c); + if (!src) continue; + ray_t* dst; + if (src->type == RAY_LIST) { + dst = ray_list_new(k); + if (!dst || RAY_IS_ERR(dst)) { + ray_release(idx); ray_release(result); + return dst ? dst : ray_error("oom", NULL); + } + ray_t** sp = (ray_t**)ray_data(src); + ray_t** dp = (ray_t**)ray_data(dst); + for (int64_t i = 0; i < k; i++) { + dp[i] = sp[idx_data[i]]; + if (dp[i]) ray_retain(dp[i]); + } + dst->len = k; + } else { + dst = gather_by_idx(src, idx_data, k); + if (!dst || RAY_IS_ERR(dst)) { + ray_release(idx); ray_release(result); + return dst ? dst : ray_error("oom", NULL); + } + } + result = ray_table_add_col(result, name, dst); + ray_release(dst); + if (RAY_IS_ERR(result)) { ray_release(idx); return result; } + } + ray_release(idx); + return result; +} + +/* Public top-K gather: returns a new table of `k` rows of `tbl`, sorted by + * `col` in the requested direction. When the inputs don't match the + * single-key fast-path (multi-key, unsupported type, etc.), returns NULL + * so the caller can fall back to the full-sort path. */ +ray_t* ray_topk_table(ray_t* tbl, ray_t* col, uint8_t desc, uint8_t nf, + int64_t k) { + if (!tbl || tbl->type != RAY_TABLE || !col) return NULL; + int64_t nrows = ray_table_nrows(tbl); + if (k <= 0 || nrows <= 0) return NULL; + if (k >= nrows) return NULL; + int64_t ncols = ray_table_ncols(tbl); + for (int64_t c = 0; c < ncols; c++) { + ray_t* src = ray_table_get_col_idx(tbl, c); + if (src && src->type == RAY_LIST) return NULL; + } + + ray_t* idx = topk_indices_single(col, desc, nf, nrows, k); + if (!idx) return NULL; + return topk_gather_rows(tbl, idx, k); +} + +/* Multi-key top-K: comparator-based bounded heap across `n_keys` columns. + * Falls back to a comparator heap (no radix encoding) since multi-key + * radix encoding requires uniform-width packed keys. Returns NULL when + * the inputs aren't supported (n_keys==0, K>=nrows, LIST columns) so the + * caller can fall back to a full sort. Cost is O(n_rows * n_keys * log K + * + K log K) in comparisons — wins decisively when K << n_rows even with + * the per-compare overhead. All key columns must come from the same + * table; row indices are interpreted into each column at the same + * position. 
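+ * For example a sort on two plain key columns with a small take routes
+ * here from the exec_sort SORT+HEAD shortcut; computed sort keys never
+ * do, because that shortcut requires every key to be a direct OP_SCAN.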
*/ +ray_t* ray_topk_table_multi(ray_t* tbl, ray_t** key_cols, uint8_t* descs, + uint8_t* nfs, uint8_t n_keys, int64_t k) { + if (!tbl || tbl->type != RAY_TABLE || !key_cols || n_keys == 0) return NULL; + int64_t nrows = ray_table_nrows(tbl); + if (k <= 0 || nrows <= 0 || k >= nrows) return NULL; + int64_t ncols = ray_table_ncols(tbl); + for (int64_t c = 0; c < ncols; c++) { + ray_t* src = ray_table_get_col_idx(tbl, c); + if (src && src->type == RAY_LIST) return NULL; + } + for (uint8_t i = 0; i < n_keys; i++) + if (!key_cols[i] || key_cols[i]->len < nrows) return NULL; + + ray_t* idx = topk_indices_cmp(key_cols, descs, nfs, n_keys, nrows, k); + if (!idx) return NULL; + if (RAY_IS_ERR(idx)) return idx; + return topk_gather_rows(tbl, idx, k); +} + ray_t* ray_sort_indices(ray_t** cols, uint8_t* descs, uint8_t* nulls_first, uint8_t n_cols, int64_t nrows) { return sort_indices_ex(cols, descs, nulls_first, n_cols, nrows, NULL, NULL); @@ -3126,6 +3415,58 @@ ray_t* exec_sort(ray_graph_t* g, ray_op_t* op, ray_t* tbl, int64_t limit) { uint8_t n_sort = ext->sort.n_cols; if (n_sort > 16) return ray_error("nyi", NULL); /* radix_encode_ctx_t limit */ + /* ---- Top-K bounded-heap shortcut ---- + * Triggered by the SORT+HEAD fusion (HEAD passes limit > 0). When + * K is well below nrows (K << n) and every sort key is a direct + * OP_SCAN of a column on `tbl`, run a heap-based partial selection + * in O(n log K + K log K) instead of the full O(n log n) sort. + * Single key → radix-encoded fast path; multi-key → comparator + * heap (still O(n log K) in compares, big win when K << n). + * Falls through to the full sort whenever the topk path returns + * NULL (unsupported type, computed-key sort, etc.). */ + if (limit > 0 && n_sort >= 1 && limit < nrows && limit <= 8192 && + g && g->selection == NULL) { + ray_t* key_cols[16]; + int all_scan = 1; + for (uint8_t k = 0; k < n_sort; k++) { + ray_op_t* key_op = ext->sort.columns[k]; + ray_op_ext_t* key_ext = find_ext(g, key_op->id); + if (key_ext && key_ext->base.opcode == OP_SCAN) { + key_cols[k] = ray_table_get_col(tbl, key_ext->sym); + if (!key_cols[k]) { all_scan = 0; break; } + } else { + all_scan = 0; + break; + } + } + if (all_scan) { + if (n_sort == 1) { + uint8_t desc = ext->sort.desc ? ext->sort.desc[0] : 0; + uint8_t nf = ext->sort.nulls_first + ? ext->sort.nulls_first[0] + : !desc; + ray_t* topk_res = ray_topk_table(tbl, key_cols[0], desc, nf, limit); + if (topk_res && !RAY_IS_ERR(topk_res)) return topk_res; + if (topk_res && RAY_IS_ERR(topk_res)) ray_release(topk_res); + } else { + /* Default nulls-first to !desc per-key when caller + * didn't supply a vector. */ + uint8_t nfs[16]; + for (uint8_t k = 0; k < n_sort; k++) { + uint8_t d = ext->sort.desc ? ext->sort.desc[k] : 0; + nfs[k] = ext->sort.nulls_first + ? ext->sort.nulls_first[k] + : !d; + } + ray_t* topk_res = ray_topk_table_multi(tbl, key_cols, + ext->sort.desc, nfs, n_sort, limit); + if (topk_res && !RAY_IS_ERR(topk_res)) return topk_res; + if (topk_res && RAY_IS_ERR(topk_res)) ray_release(topk_res); + } + /* topk_res == NULL → unsupported config, fall through. */ + } + } + /* Resolve sort key vectors */ ray_t* sort_vecs[n_sort > 0 ? n_sort : 1]; uint8_t sort_owned[n_sort > 0 ? 
n_sort : 1]; diff --git a/src/ops/string.c b/src/ops/string.c index e9430340..dd013874 100644 --- a/src/ops/string.c +++ b/src/ops/string.c @@ -23,12 +23,44 @@ #include "ops/internal.h" #include "ops/glob.h" +#include "core/pool.h" /* ============================================================================ * OP_LIKE: glob pattern matching on STR / SYM columns. See ops/glob.[ch]. * Syntax: * (any), ? (one char), [abc] / [a-z] / [!abc] (character class). * ============================================================================ */ +/* Pattern-resolve worker for the SYM-LIKE fast path. Runs over a + * range of sym_ids; for each marked-as-seen sid, runs the matcher and + * writes the answer to lut[sid]. Pure read-only on the inputs after + * the seen-mark phase, so workers are independent. */ +typedef struct { + ray_t** sym_strings; + uint8_t* seen; + uint8_t* lut; + const ray_glob_compiled_t* pc; + bool use_simple; + const char* pat_str; + size_t pat_len; +} like_resolve_ctx_t; + +static void like_resolve_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + (void)worker_id; + like_resolve_ctx_t* x = (like_resolve_ctx_t*)ctx; + for (int64_t sid = start; sid < end; sid++) { + if (!x->seen[sid]) continue; + ray_t* str = x->sym_strings[sid]; + if (!str) { x->lut[sid] = 0; continue; } + const char* sp = ray_str_ptr(str); + size_t sl = ray_str_len(str); + x->lut[sid] = (x->use_simple + ? ray_glob_match_compiled(x->pc, sp, sl) + : ray_glob_match(sp, sl, x->pat_str, x->pat_len)) + ? 1 : 0; + } +} + ray_t* exec_like(ray_graph_t* g, ray_op_t* op) { ray_t* input = exec_node(g, op->inputs[0]); ray_t* pat_v = exec_node(g, op->inputs[1]); @@ -39,6 +71,13 @@ ray_t* exec_like(ray_graph_t* g, ray_op_t* op) { const char* pat_str = ray_str_ptr(pat_v); size_t pat_len = ray_str_len(pat_v); + /* Pre-compile pattern into the simple-shape form when possible — the + * substring/prefix/suffix branches drive memmem/memcmp directly, + * roughly an order of magnitude faster than the iterative matcher + * for the very common `*literal*` shape. */ + ray_glob_compiled_t pc = ray_glob_compile(pat_str, pat_len); + bool use_simple = pc.shape != RAY_GLOB_SHAPE_NONE; + int64_t len = input->len; ray_t* result = ray_vec_new(RAY_BOOL, len); if (!result || RAY_IS_ERR(result)) { @@ -55,17 +94,125 @@ ray_t* exec_like(ray_graph_t* g, ray_op_t* op) { for (int64_t i = 0; i < len; i++) { const char* sp = ray_str_t_ptr(&elems[i], pool); size_t sl = elems[i].len; - dst[i] = ray_glob_match(sp, sl, pat_str, pat_len) ? 1 : 0; + dst[i] = (use_simple + ? ray_glob_match_compiled(&pc, sp, sl) + : ray_glob_match(sp, sl, pat_str, pat_len)) ? 1 : 0; } } else if (RAY_IS_SYM(in_type)) { + /* Dictionary-cached fast path. + * + * Three-phase pipeline: + * (1) seen-mark — single sequential row scan that flips a + * byte in `seen[]` for every referenced sym_id. Cheap; + * just sets a byte per row. + * (2) parallel pattern resolve — partition the dict_n range + * across pool workers; for each sid where seen[sid]==1, + * run the matcher and store the answer in lut[sid]. + * (3) parallel row projection — every row reads lut[sid_i]. + * + * Splitting the resolve from the row scan lets phase (2) drive + * the pattern matcher (memmem on long URL strings) across the + * worker pool. ray_sym_count is the GLOBAL dictionary so for + * a low-card column like BrowserCountry phase (1) keeps the + * resolve work bounded to that column's actual sym_ids. 
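+       * For example 5M rows of a 54-distinct column such as
+       * BrowserCountry: phase (1) sets at most 54 seen bytes, phase (2)
+       * runs the matcher at most 54 times, and phase (3) is 5M one-byte
+       * LUT loads.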
*/ const void* base = ray_data(input); - for (int64_t i = 0; i < len; i++) { - int64_t sym_id = ray_read_sym(base, i, in_type, input->attrs); - ray_t* s = ray_sym_str(sym_id); - if (!s) { dst[i] = 0; continue; } - const char* sp = ray_str_ptr(s); - size_t sl = ray_str_len(s); - dst[i] = ray_glob_match(sp, sl, pat_str, pat_len) ? 1 : 0; + ray_t** sym_strings = NULL; + uint32_t dict_n = 0; + ray_sym_strings_borrow(&sym_strings, &dict_n); + ray_t* lut_hdr = NULL; + ray_t* seen_hdr = NULL; + uint8_t* lut = NULL; + uint8_t* seen = NULL; + if (dict_n > 0) { + lut = (uint8_t*)scratch_alloc (&lut_hdr, (size_t)dict_n); + seen = (uint8_t*)scratch_calloc(&seen_hdr, (size_t)dict_n); + } + if (lut && seen) { + int sym_w = (int)(input->attrs & RAY_SYM_W_MASK); + + /* Phase 1: mark used sym_ids. Width-specialised. */ + switch (sym_w) { + case RAY_SYM_W8: { + const uint8_t* d = (const uint8_t*)base; + for (int64_t i = 0; i < len; i++) { + uint64_t sid = d[i]; + if (sid < dict_n) seen[sid] = 1; + } + break; + } + case RAY_SYM_W16: { + const uint16_t* d = (const uint16_t*)base; + for (int64_t i = 0; i < len; i++) { + uint64_t sid = d[i]; + if (sid < dict_n) seen[sid] = 1; + } + break; + } + case RAY_SYM_W32: { + const uint32_t* d = (const uint32_t*)base; + for (int64_t i = 0; i < len; i++) { + uint64_t sid = d[i]; + if (sid < dict_n) seen[sid] = 1; + } + break; + } + case RAY_SYM_W64: + default: { + const int64_t* d = (const int64_t*)base; + for (int64_t i = 0; i < len; i++) { + int64_t sid = d[i]; + if ((uint64_t)sid < dict_n) seen[sid] = 1; + } + break; + } + } + + /* Phase 2: parallel pattern resolve over the dict range. */ + like_resolve_ctx_t rctx = { + .sym_strings = sym_strings, .seen = seen, .lut = lut, + .pc = &pc, .use_simple = use_simple, + .pat_str = pat_str, .pat_len = pat_len, + }; + ray_pool_t* pool = ray_pool_get(); + if (pool && (int64_t)dict_n >= 16384) { + ray_pool_dispatch(pool, like_resolve_fn, &rctx, (int64_t)dict_n); + } else { + like_resolve_fn(&rctx, 0, 0, (int64_t)dict_n); + } + + /* Phase 3: row projection (sequential — already a tight + * gather over a 1-byte LUT). Width-specialised. */ + #define LIKE_ROW_PASS(LOAD) \ + for (int64_t i = 0; i < len; i++) { \ + int64_t sid = (LOAD); \ + dst[i] = ((uint64_t)sid < (uint64_t)dict_n) ? lut[sid] : 0; \ + } + switch (sym_w) { + case RAY_SYM_W8: { const uint8_t* d = base; LIKE_ROW_PASS(d[i]) break; } + case RAY_SYM_W16: { const uint16_t* d = base; LIKE_ROW_PASS(d[i]) break; } + case RAY_SYM_W32: { const uint32_t* d = base; LIKE_ROW_PASS(d[i]) break; } + case RAY_SYM_W64: + default: { const int64_t* d = base; LIKE_ROW_PASS(d[i]) break; } + } + #undef LIKE_ROW_PASS + + scratch_free(lut_hdr); + scratch_free(seen_hdr); + } else { + /* OOM building the LUT: fall back to per-row scan. */ + if (lut_hdr) scratch_free(lut_hdr); + if (seen_hdr) scratch_free(seen_hdr); + for (int64_t i = 0; i < len; i++) { + int64_t sym_id = ray_read_sym(base, i, in_type, input->attrs); + ray_t* s = (sym_strings && (uint64_t)sym_id < (uint64_t)dict_n) + ? sym_strings[sym_id] : NULL; + if (!s) { dst[i] = 0; continue; } + const char* sp = ray_str_ptr(s); + size_t sl = ray_str_len(s); + dst[i] = (use_simple + ? ray_glob_match_compiled(&pc, sp, sl) + : ray_glob_match(sp, sl, pat_str, pat_len)) ? 1 : 0; + } } } else { memset(dst, 0, (size_t)len); @@ -105,12 +252,43 @@ ray_t* exec_ilike(ray_graph_t* g, ray_op_t* op) { dst[i] = ray_glob_match_ci(sp, sl, pat_str, pat_len) ? 1 : 0; } } else if (RAY_IS_SYM(in_type)) { + /* Dictionary-cached fast path — see exec_like. 
*/ const void* base = ray_data(input); - for (int64_t i = 0; i < len; i++) { - int64_t sym_id = ray_read_sym(base, i, in_type, input->attrs); - ray_t* s = ray_sym_str(sym_id); - if (!s) { dst[i] = 0; continue; } - dst[i] = ray_glob_match_ci(ray_str_ptr(s), ray_str_len(s), pat_str, pat_len) ? 1 : 0; + uint32_t dict_n = ray_sym_count(); + ray_t* lut_hdr = NULL; + ray_t* seen_hdr = NULL; + uint8_t* lut = NULL; + uint8_t* seen = NULL; + if (dict_n > 0) { + lut = (uint8_t*)scratch_alloc (&lut_hdr, (size_t)dict_n); + seen = (uint8_t*)scratch_calloc(&seen_hdr, (size_t)dict_n); + } + if (lut && seen) { + for (int64_t i = 0; i < len; i++) { + int64_t sid = ray_read_sym(base, i, in_type, input->attrs); + if ((uint64_t)sid >= (uint64_t)dict_n) { dst[i] = 0; continue; } + if (!seen[sid]) { + ray_t* s = ray_sym_str(sid); + if (!s) { lut[sid] = 0; } + else { + lut[sid] = ray_glob_match_ci(ray_str_ptr(s), ray_str_len(s), + pat_str, pat_len) ? 1 : 0; + } + seen[sid] = 1; + } + dst[i] = lut[sid]; + } + scratch_free(lut_hdr); + scratch_free(seen_hdr); + } else { + if (lut_hdr) scratch_free(lut_hdr); + if (seen_hdr) scratch_free(seen_hdr); + for (int64_t i = 0; i < len; i++) { + int64_t sym_id = ray_read_sym(base, i, in_type, input->attrs); + ray_t* s = ray_sym_str(sym_id); + if (!s) { dst[i] = 0; continue; } + dst[i] = ray_glob_match_ci(ray_str_ptr(s), ray_str_len(s), pat_str, pat_len) ? 1 : 0; + } } } else { memset(dst, 0, (size_t)len); diff --git a/src/ops/strop.c b/src/ops/strop.c index 9744398b..4ff123e9 100644 --- a/src/ops/strop.c +++ b/src/ops/strop.c @@ -22,6 +22,7 @@ */ #include "lang/internal.h" +#include "ops/internal.h" #include "table/sym.h" #include "ops/glob.h" @@ -202,6 +203,13 @@ ray_t* ray_like_fn(ray_t* x, ray_t* pattern) { const char* pat = ray_str_ptr(pattern); size_t pat_len = ray_str_len(pattern); + /* Pre-compile the pattern once. Most ClickBench LIKE shapes are + * `*literal*` (substring) which collapses to a memmem call — the + * libc-provided implementation is SIMD on glibc/Apple/BSD. When the + * shape is RAY_GLOB_SHAPE_NONE we keep the iterative matcher. */ + ray_glob_compiled_t pc = ray_glob_compile(pat, pat_len); + bool use_simple = pc.shape != RAY_GLOB_SHAPE_NONE; + /* Atom: single match */ if (x->type == -RAY_STR || x->type == -RAY_SYM) { const char* s; size_t sl; @@ -214,7 +222,8 @@ ray_t* ray_like_fn(ray_t* x, ray_t* pattern) { s = ray_str_ptr(x); sl = ray_str_len(x); } - bool m = ray_glob_match(s, sl, pat, pat_len); + bool m = use_simple ? ray_glob_match_compiled(&pc, s, sl) + : ray_glob_match(s, sl, pat, pat_len); if (sym_str) ray_release(sym_str); return make_bool(m ? 1 : 0); } @@ -228,20 +237,118 @@ ray_t* ray_like_fn(ray_t* x, ray_t* pattern) { uint8_t* out = (uint8_t*)ray_data(result); if (x->type == RAY_SYM) { - int64_t* sym_ids = (int64_t*)ray_data(x); - for (int64_t i = 0; i < n; i++) { - ray_t* sym_str = ray_sym_str(sym_ids[i]); - const char* s = sym_str ? ray_str_ptr(sym_str) : ""; - size_t sl = sym_str ? ray_str_len(sym_str) : 0; - out[i] = ray_glob_match(s, sl, pat, pat_len) ? 1 : 0; - if (sym_str) ray_release(sym_str); + /* SYM column is dictionary-encoded with adaptive widths + * (W8/W16/W32/W64). Two bugs to avoid: + * (a) Reading the column as int64_t* is wrong for any + * width != W64 — must use ray_read_sym. + * (b) ray_sym_str returns a borrowed pointer; releasing + * it would decrement the global sym table entry. + * + * Fast path: a SYM column with N rows references at most + * D = ray_sym_count() distinct sym_ids. 
Build a + * sym_id → bool LUT with a "seen" bitmap so each sym_id + * runs the glob matcher at most once. For LIKE on URL + * (1.7M unique values, 5M rows) this turns an O(n_rows) + * pattern-scan into O(n_distinct + n_rows) — the second + * pass is a single byte load + table lookup per row. */ + const void* base = ray_data(x); + int8_t in_type = x->type; + uint8_t in_attrs = x->attrs; + + /* The global sym table can be much larger than the set of + * IDs this column references (e.g. BrowserCountry with 54 + * uniques in a process that's also loaded URL with 1.7M + * uniques). Lazy-resolve via the seen bitmap so we only + * match against sym_ids actually touched. ray_sym_strings_borrow + * snapshots the strings array under one lock so each lookup + * is a plain pointer load. */ + ray_t** sym_strings = NULL; + uint32_t dict_n = 0; + ray_sym_strings_borrow(&sym_strings, &dict_n); + ray_t* lut_hdr = NULL; + ray_t* seen_hdr = NULL; + uint8_t* lut = NULL; + uint8_t* seen = NULL; + if (dict_n > 0) { + lut = (uint8_t*)scratch_alloc (&lut_hdr, (size_t)dict_n); + seen = (uint8_t*)scratch_calloc(&seen_hdr, (size_t)dict_n); + } + if (lut && seen) { + /* First pass: discover the unique sym_ids referenced and + * resolve each pattern match exactly once. Second pass: + * width-specialised LUT projection so the per-row loop + * is a tight gather. */ + int sym_w = (int)(in_attrs & RAY_SYM_W_MASK); + #define DICT_PASS(LOAD) \ + for (int64_t i = 0; i < n; i++) { \ + int64_t sid = (LOAD); \ + if ((uint64_t)sid >= (uint64_t)dict_n) continue; \ + if (!seen[sid]) { \ + ray_t* s = sym_strings[sid]; \ + const char* sp = s ? ray_str_ptr(s) : ""; \ + size_t sl = s ? ray_str_len(s) : 0; \ + lut[sid] = (use_simple \ + ? ray_glob_match_compiled(&pc, sp, sl)\ + : ray_glob_match(sp, sl, pat, pat_len)) \ + ? 1 : 0; \ + seen[sid] = 1; \ + } \ + } + #define ROW_PASS(LOAD) \ + for (int64_t i = 0; i < n; i++) { \ + int64_t sid = (LOAD); \ + out[i] = ((uint64_t)sid < (uint64_t)dict_n) ? lut[sid] : 0; \ + } + switch (sym_w) { + case RAY_SYM_W8: { + const uint8_t* d = (const uint8_t*)base; + DICT_PASS(d[i]) ROW_PASS(d[i]) break; + } + case RAY_SYM_W16: { + const uint16_t* d = (const uint16_t*)base; + DICT_PASS(d[i]) ROW_PASS(d[i]) break; + } + case RAY_SYM_W32: { + const uint32_t* d = (const uint32_t*)base; + DICT_PASS(d[i]) ROW_PASS(d[i]) break; + } + case RAY_SYM_W64: + default: { + const int64_t* d = (const int64_t*)base; + DICT_PASS(d[i]) ROW_PASS(d[i]) break; + } + } + #undef DICT_PASS + #undef ROW_PASS + scratch_free(lut_hdr); + scratch_free(seen_hdr); + } else { + /* OOM building the LUT: fall back to per-row scan. Still + * uses ray_read_sym for adaptive-width correctness. */ + if (lut_hdr) scratch_free(lut_hdr); + if (seen_hdr) scratch_free(seen_hdr); + for (int64_t i = 0; i < n; i++) { + int64_t sid = ray_read_sym(base, i, in_type, in_attrs); + ray_t* s = (sym_strings && (uint64_t)sid < (uint64_t)dict_n) + ? sym_strings[sid] : NULL; + const char* sp = s ? ray_str_ptr(s) : ""; + size_t sl = s ? ray_str_len(s) : 0; + out[i] = (use_simple + ? ray_glob_match_compiled(&pc, sp, sl) + : ray_glob_match(sp, sl, pat, pat_len)) ? 1 : 0; + } } } else { /* RAY_STR vector */ for (int64_t i = 0; i < n; i++) { size_t slen; const char* s = ray_str_vec_get(x, i, &slen); - out[i] = (s && ray_glob_match(s, slen, pat, pat_len)) ? 1 : 0; + bool m = false; + if (s) { + m = use_simple ? ray_glob_match_compiled(&pc, s, slen) + : ray_glob_match(s, slen, pat, pat_len); + } + out[i] = m ? 
1 : 0; } } return result; diff --git a/src/table/sym.c b/src/table/sym.c index 02d1e1a3..a788b3cd 100644 --- a/src/table/sym.c +++ b/src/table/sym.c @@ -833,6 +833,32 @@ uint32_t ray_sym_count(void) { return count; } +/* -------------------------------------------------------------------------- + * ray_sym_strings_borrow + * + * Single-shot snapshot of the sym→string table for hot read-only + * scanners (LIKE, dictionary projection, …). ray_sym_str takes a spin + * lock per call; iterating all 1.7M URL dict entries via ray_sym_str + * means 1.7M lock acquisitions. This routine takes the lock once, + * captures the array pointer + length, drops the lock, and lets the + * caller iterate lock-free. + * + * Validity: only safe during read-only phases (no concurrent + * ray_sym_intern). ray_sym_intern can realloc g_sym.strings, after + * which the returned pointer is dangling. Today's pipeline is one + * pass: bulk-intern at CSV load, then run queries against the frozen + * table — exactly the contract this borrow form needs. + * -------------------------------------------------------------------------- */ +void ray_sym_strings_borrow(ray_t*** out_strings, uint32_t* out_count) { + if (out_strings) *out_strings = NULL; + if (out_count) *out_count = 0; + if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return; + sym_lock(); + if (out_strings) *out_strings = g_sym.strings; + if (out_count) *out_count = g_sym.str_count; + sym_unlock(); +} + /* -------------------------------------------------------------------------- * ray_sym_ensure_cap -- pre-grow hash table and strings array * diff --git a/test/rfl/collection/at.rfl b/test/rfl/collection/at.rfl index ae879282..90571ff5 100644 --- a/test/rfl/collection/at.rfl +++ b/test/rfl/collection/at.rfl @@ -6,6 +6,10 @@ ;; vector of indices returns vector of elements (at [10 20 30 40 50] [0 2 4]) -- [10 30 50] +;; table row indices return a table, not a boxed list of row dicts +(type (at (table [a b] (list [1 2 3] [4 5 6])) [0 2])) -- 'TABLE +(at (at (table [a b] (list [1 2 3] [4 5 6])) [0 2]) 'a) -- [1 3] + ;; at 0 == first (set V (rand 50 1000)) (at V 0) -- (first V) diff --git a/test/rfl/integration/cross_type_workout.rfl b/test/rfl/integration/cross_type_workout.rfl index 4a78bd34..562947f4 100644 --- a/test/rfl/integration/cross_type_workout.rfl +++ b/test/rfl/integration/cross_type_workout.rfl @@ -199,6 +199,7 @@ ;; "corrupt" path with SYM that's tracked separately. Use only ;; numeric columns here. (set Tplain (table [id price qty] (list (at T 'id) (at T 'price) (at T 'qty)))) +(.sys.exec "rm -rf /tmp/cross_type_workout_splayed") (.db.splayed.set "/tmp/cross_type_workout_splayed/" Tplain) (set Sp (.db.splayed.get "/tmp/cross_type_workout_splayed/")) (count Sp) -- 200 diff --git a/test/rfl/ops/query_coverage.rfl b/test/rfl/ops/query_coverage.rfl index ac045c2b..bb432960 100644 --- a/test/rfl/ops/query_coverage.rfl +++ b/test/rfl/ops/query_coverage.rfl @@ -149,6 +149,26 @@ (set TStr (table [Name v] (list (list "alpha" "beta" "alpha" "gamma" "beta") [10 20 30 40 50]))) (count (select {from: TStr by: Name})) -- 3 +;; COUNT(DISTINCT col) per group is a real aggregate, but `distinct` +;; must run on each group's slice rather than on the full column before +;; OP_GROUP. Numeric keys take the DAG group-boundary + per-group eval path. 
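+;; With the fixture below, g=1 covers u = [10 10] (1 distinct value) and
+;; g=2 covers [20 21 20] (2 distinct values), so the summed count is 3.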
+(set TCD (table [g u] (list [1 1 2 2 2] [10 10 20 21 20]))) +(sum (at (select {u: (count (distinct u)) from: TCD by: g}) 'u)) -- 3 + +;; STR keys force the eval-level group fallback; the same count-distinct +;; expression must still be evaluated per group, not broadcast from the +;; whole table. +(set TCDS (table [k u] (list (as 'STR ["a" "a" "b" "b" ""]) [1 2 2 2 3]))) +(sum (at (select {u: (count (distinct u)) from: TCDS by: k}) 'u)) -- 4 + +;; Multi-key group-by with a materialised computed key plus a STR key: +;; by-dict pre-eval rewrites `{m: (...) s: S}` to a SYM-vector key list. +;; The DAG group path can't handle STR keys, so this takes the eval-level +;; composite-key fallback. +(set TG2S (table [ts s u] (list (as 'TIMESTAMP [0 60000000000 60000000000 120000000000]) (as 'STR ["a" "a" "b" "b"]) [10 11 12 13]))) +(count (select {c: (count u) from: TG2S by: {m: (minute ts) s: s}})) -- 4 +(sum (at (select {c: (count u) from: TG2S by: {m: (minute ts) s: s}}) 'c)) -- 4 + ;; ==================================================================== ;; GUID first-of-group fast path — query.c:1945-2099. Pure ;; `(select {from: t by: G})` with no agg/non-agg expressions takes diff --git a/test/rfl/system/read_csv.rfl b/test/rfl/system/read_csv.rfl index 946a745b..0258c233 100644 --- a/test/rfl/system/read_csv.rfl +++ b/test/rfl/system/read_csv.rfl @@ -15,5 +15,6 @@ (.sys.exec "awk 'BEGIN{print \"id,sym\"; for(i=0;i<20000;i++) printf(\"%d,s%d\\n\",i,i)}' > rf_test_syms.csv") -- 0 (count (.csv.read [I64 SYMBOL] "rf_test_syms.csv")) -- 20000 +(count (read-csv [I64 SYMBOL] "rf_test_syms.csv")) -- 20000 (.sys.exec "rm -f rf_test_syms.csv") -- 0 diff --git a/test/rfl/system/reserved_namespace.rfl b/test/rfl/system/reserved_namespace.rfl index 373c3b29..acceef7c 100644 --- a/test/rfl/system/reserved_namespace.rfl +++ b/test/rfl/system/reserved_namespace.rfl @@ -68,6 +68,9 @@ (nil? .ipc.send) -- false (nil? .csv.read) -- false (nil? .csv.write) -- false +;; Python compatibility aliases resolve to the same CSV builtins. +(nil? read-csv) -- false +(nil? write-csv) -- false ;; Old names must NOT resolve — we committed to no backward compat. gc !- name getenv !- name @@ -75,7 +78,6 @@ system !- name sysinfo !- name memstat !- name internals !- name -read-csv !- name ;; Negative: writes to `.*` are refused with `reserve`. (set .os.foo 1) !- reserve (set .sys.gc 0) !- reserve diff --git a/test/test_csv.c b/test/test_csv.c index b910954b..a5dedbe2 100644 --- a/test/test_csv.c +++ b/test/test_csv.c @@ -26,6 +26,7 @@ #include #include "mem/heap.h" #include "io/csv.h" +#include "table/sym.h" #include #include @@ -1101,6 +1102,8 @@ static test_result_t test_csv_sym_narrowing(void) { ray_t* col = ray_table_get_col_idx(loaded, 0); TEST_ASSERT_EQ_I(col->type, RAY_SYM); /* Width is encoded in the lower 2 bits of attrs (RAY_SYM_W8 == 0). */ + TEST_ASSERT_EQ_I((int)(col->attrs & RAY_SYM_W_MASK), RAY_SYM_W8); + TEST_ASSERT_FALSE(col->attrs & RAY_ATTR_HAS_NULLS); /* Just sanity: rows exist and aren't null. 
*/ TEST_ASSERT_EQ_I(ray_table_nrows(loaded), 200); TEST_ASSERT_FALSE(ray_vec_is_null(col, 0)); @@ -1151,5 +1154,3 @@ const test_entry_t csv_entries[] = { { "csv/sym_narrowing", test_csv_sym_narrowing, NULL, NULL }, { NULL, NULL, NULL, NULL }, }; - - From 069c652a09be452672b3f55510539d06a46c789c Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 20:10:53 +0200 Subject: [PATCH 03/10] fix(io/csv): empty SYM fields no longer become the null sentinel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CSV format conflates "field empty" and "field missing" — both look like a zero-length cell. The SYM materialisation path treated the parse-time null bit as the q/k null sentinel ID 0, so `(!= col "")` never excluded those rows: the value-vs-null comparison kernel returns true for `0Ns != ""` (matching q semantics, not SQL's). After this change the loader interns "" once per call and remaps null-flagged SYM rows to that ID, clearing their null bit so the compare kernel takes the both-non-null branch. Net effect — empty TSV/CSV cells round-trip through Rayforce as the empty SYM, matching how DuckDB / Spark / polars handle the same input. Affects ten ClickBench queries that filter on `(!= col "")`: Q11, Q22, Q23, Q25–Q27, Q31, Q32, Q37, Q38. Selectivity ranges from 1.0001× (URL is rarely empty) to 26× (MobilePhoneModel cuts 5M → 192K) — see ClickBench/rayforce/REMAINING_FIXES.md §R6 for the per-query expected delta. RAY_STR columns and non-string types preserve the null distinction unchanged. test/test_csv.c::null_sym + null_mixed_columns updated for the new SYM behaviour; new R6 fixture in test/rfl/system/read_csv.rfl. --- src/io/csv.c | 27 +++++++++++++++++++++++++-- test/rfl/system/read_csv.rfl | 13 +++++++++++++ test/test_csv.c | 21 ++++++++++++++++++--- 3 files changed, 56 insertions(+), 5 deletions(-) diff --git a/src/io/csv.c b/src/io/csv.c index 499db1c7..d079c4c0 100644 --- a/src/io/csv.c +++ b/src/io/csv.c @@ -595,6 +595,23 @@ static bool csv_intern_strings(csv_strref_t** str_refs, int n_cols, int64_t* col_max_ids, uint8_t** col_nullmaps) { bool ok = true; + + /* Empty TSV/CSV fields are flagged in the parse-time nullmap (see + * CSV_TYPE_STR branch of the parse loop) — that's correct for STR + * columns where the null/empty distinction matters, but for SYM + * columns it conflates with the "no value" sentinel and breaks the + * SQL-style `(!= col "")` filter (which never excludes nulls in the + * q/k value-vs-null comparison kernel). Pre-intern "" once and + * remap null rows to that ID, clearing their null bit so the + * compare kernel takes the both-non-null branch. Net effect: the + * CSV format's "field is empty" — which can't be distinguished from + * "field is missing" anyway — round-trips through Rayforce as the + * empty SYM, matching how DuckDB / Spark / polars treat the same + * input. */ + int64_t empty_sym_id = ray_sym_intern_prehashed( + (uint32_t)ray_hash_bytes("", 0), "", 0); + if (empty_sym_id < 0) empty_sym_id = 0; /* fall back to old behavior on intern failure */ + for (int c = 0; c < n_cols; c++) { if (col_types[c] != CSV_TYPE_STR) continue; /* RAY_STR columns are materialized directly; skip sym interning. */ @@ -602,7 +619,7 @@ static bool csv_intern_strings(csv_strref_t** str_refs, int n_cols, csv_strref_t* refs = str_refs[c]; uint32_t* ids = (uint32_t*)col_data[c]; uint8_t* nm = col_nullmaps ? 
col_nullmaps[c] : NULL; - int64_t max_id = 0; + int64_t max_id = empty_sym_id; /* Pre-grow: upper bound is n_rows unique strings */ uint32_t current = ray_sym_count(); @@ -611,7 +628,13 @@ static bool csv_intern_strings(csv_strref_t** str_refs, int n_cols, for (int64_t r = 0; r < n_rows; r++) { if (nm && (nm[r >> 3] & (1u << (r & 7)))) { - ids[r] = 0; + ids[r] = (uint32_t)empty_sym_id; + /* Clear the null bit — this row now holds a real value + * (the empty SYM). Without this clear, fmt_raw_elem + * still prints "0Ns" and ray_eq_fn still routes through + * the null-vs-non-null branch (returning false for + * `== ""` and true for `!= ""`). */ + nm[r >> 3] &= (uint8_t)~(1u << (r & 7)); continue; } uint32_t hash = (uint32_t)ray_hash_bytes(refs[r].ptr, refs[r].len); diff --git a/test/rfl/system/read_csv.rfl b/test/rfl/system/read_csv.rfl index 0258c233..b9e57598 100644 --- a/test/rfl/system/read_csv.rfl +++ b/test/rfl/system/read_csv.rfl @@ -18,3 +18,16 @@ (count (read-csv [I64 SYMBOL] "rf_test_syms.csv")) -- 20000 (.sys.exec "rm -f rf_test_syms.csv") -- 0 + +;; ── R6 regression: empty TSV/CSV fields → empty SYM (not null sentinel) ── +;; CSV format conflates "missing" and "empty"; the loader treats empty +;; SYM cells as the interned empty string so SQL-style `(!= col "")` +;; filters work the way DuckDB / polars / Spark already handle it. +(.sys.exec "rm -f rf_test_empty.csv") -- 0 +(.sys.exec "printf 'name\\nalice\\n\\nbob\\n\\ncarol\\n' > rf_test_empty.csv") -- 0 +(set _t (.csv.read [SYMBOL] "rf_test_empty.csv")) +(count _t) -- 5 +;; Three rows have a value, two are empty — neither side counts as null. +(count (select {x: name from: _t where: (!= name "")})) -- 3 +(count (select {x: name from: _t where: (== name "")})) -- 2 +(.sys.exec "rm -f rf_test_empty.csv") -- 0 diff --git a/test/test_csv.c b/test/test_csv.c index a5dedbe2..041c64c3 100644 --- a/test/test_csv.c +++ b/test/test_csv.c @@ -277,6 +277,12 @@ static test_result_t test_csv_null_bool(void) { } static test_result_t test_csv_null_sym(void) { + /* CSV format conflates "empty field" and "missing field" — both + * appear as a zero-length cell. The Rayforce loader interns empty + * SYM cells as the empty SYM (not the null sentinel) so SQL-style + * `(!= col "")` filters work the way users expect. See R6 in + * ClickBench/rayforce/REMAINING_FIXES.md. RAY_STR columns and + * non-string types preserve the null distinction. */ ray_heap_init(); (void)ray_sym_init(); @@ -289,9 +295,17 @@ static test_result_t test_csv_null_sym(void) { ray_t* col = ray_table_get_col_idx(loaded, 0); TEST_ASSERT_FALSE(ray_vec_is_null(col, 0)); - TEST_ASSERT_TRUE(ray_vec_is_null(col, 1)); /* empty → NULL */ + TEST_ASSERT_FALSE(ray_vec_is_null(col, 1)); /* empty → empty SYM, not null */ TEST_ASSERT_FALSE(ray_vec_is_null(col, 2)); + /* Row 1's SYM ID resolves to a zero-length string — the empty SYM. + * The CSV loader narrows SYM columns to W8/W16/W32 based on max ID, + * so use ray_read_sym instead of a fixed-width cast. */ + int64_t id1 = ray_read_sym(ray_data(col), 1, col->type, col->attrs); + ray_t* s = ray_sym_str(id1); + TEST_ASSERT_FALSE(s == NULL); + TEST_ASSERT_EQ_I((int64_t)ray_str_len(s), 0); + ray_release(loaded); unlink(TMP_CSV); ray_sym_destroy(); @@ -348,9 +362,10 @@ static test_result_t test_csv_null_mixed_columns(void) { TEST_ASSERT_FALSE(ray_vec_is_null(val_col, 1)); TEST_ASSERT_TRUE(ray_vec_is_null(val_col, 2)); - /* name column: alice, NULL, bob */ + /* name column: alice, "", bob — empty SYM cell becomes the empty + * SYM (not null). 
See test_csv_null_sym for the rationale. */ TEST_ASSERT_FALSE(ray_vec_is_null(name_col, 0)); - TEST_ASSERT_TRUE(ray_vec_is_null(name_col, 1)); + TEST_ASSERT_FALSE(ray_vec_is_null(name_col, 1)); TEST_ASSERT_FALSE(ray_vec_is_null(name_col, 2)); ray_release(loaded); From 805d48c8d17a506d0921a7f6add4c0acd53d339f Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 21:55:53 +0200 Subject: [PATCH 04/10] perf(query): broadcast atom literals in select dict without per-group LIST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Q35 = `(select {one: 1, c: (count URL), from: hits, by: URL, desc: c, take: 10})` ran in 130–159 ms vs Q34 (the same query without `one: 1`) at 21 ms — a 6× regression from one literal column. Two paths fed the cost: * The non-agg scatter at line 4180+ allocated ~70 MB of bookkeeping (gk/row_gid/cnt/off/pos for n_groups + 2*n_groups hash slots) and walked all 5 M rows building row→gid even for expressions that don't reference any column. * The per-cell broadcast loop then retained the literal n_groups times into a RAY_LIST, and the LIST column blocked apply_sort_take from picking the top-K fast path downstream. Detect at the top of the n_nonaggs > 0 branch whether every non-agg expression is a self-evaluating atom literal (atom-typed, no RAY_ATTR_NAME → not a name reference), pre-allocate one typed broadcast vec per literal via `atom_broadcast_vec`, and skip directly to `nonagg_done` past the row→gid setup. `can_atom_broadcast` gates on supported atom types so we never half-apply and have to roll back. Q35 is now 22 ms — within the noise of Q34. Two-literal variant (`{one: 1, two: 2, …}`) lands at 23 ms, also broadcast. Tests (2072 / 2073, 1 skipped) all green; the previous regression in `select_by_nonagg_list_col` / `select_by_nonagg_colref_vs_const` — where `m2: m` was eagerly broadcast as if `m` were a literal — went away after gating on `!(attrs & RAY_ATTR_NAME)` in the predicate. Verifying probe: bench/bottleneck/R8_const_column.rfl. --- src/ops/query.c | 163 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) diff --git a/src/ops/query.c b/src/ops/query.c index 95d0e414..59fd074e 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -1679,6 +1679,124 @@ static ray_t* count_distinct_per_group_groups(ray_t* inner_expr, ray_t* tbl, /* Forward declarations for eval-level groupby fallback */ +/* R8: cheap predicate for whether atom_broadcast_vec can handle this + * atom AND the atom is a self-evaluating literal (not a name binding + * that needs ray_eval to resolve to a column or computed value). Used + * by the all-literal pre-check so we don't half-apply a partial set of + * broadcasts and then have to roll back. + * + * `RAY_ATTR_NAME` distinguishes `m2: m` (the SYM `m` references a + * column) from `one: 1` (the I64 literal 1). Without that filter we'd + * eagerly broadcast the column reference and skip the per-group gather + * the chained passthrough relies on. */ +static int can_atom_broadcast(ray_t* a) { + if (!a || !ray_is_atom(a)) return 0; + if (a->attrs & RAY_ATTR_NAME) return 0; + int8_t vt = (int8_t)(-a->type); + switch (vt) { + case RAY_BOOL: case RAY_U8: + case RAY_I16: case RAY_I32: + case RAY_I64: case RAY_F64: + case RAY_DATE: case RAY_TIME: case RAY_TIMESTAMP: + case RAY_SYM: + return 1; + default: + return 0; + } +} + +/* R8: build a typed N-cell vector all containing the value of atom `a`. + * + * The non-agg scatter path used to bind a `{lit: 1, c: count(...) 
by: K}` + * style query into a per-group RAY_LIST of N retained atoms, which + * ballooned Q35 from ~21 ms to ~140 ms (one ray_retain + list slot + * per group, scaling with output cardinality, not row count). Allocate + * once and fill — Q35 falls back into parity with Q34. + * + * Returns NULL for atom types not yet handled (RAY_STR, RAY_GUID, F32); + * caller falls back to the per-cell LIST path. */ +static ray_t* atom_broadcast_vec(ray_t* a, int64_t n) { + if (!a || !ray_is_atom(a) || n <= 0) return NULL; + int8_t vec_type = (int8_t)(-a->type); + if (vec_type <= 0) return NULL; + + ray_t* v; + if (vec_type == RAY_SYM) { + uint8_t w = (uint8_t)(a->attrs & RAY_SYM_W_MASK); + v = ray_sym_vec_new(w, n); + } else { + v = ray_vec_new(vec_type, n); + } + if (!v || RAY_IS_ERR(v)) return NULL; + v->len = n; + + void* dst = ray_data(v); + switch (vec_type) { + case RAY_BOOL: + case RAY_U8: { + memset(dst, a->b8, (size_t)n); + break; + } + case RAY_I16: { + int16_t val = a->i16; + int16_t* d = (int16_t*)dst; + for (int64_t i = 0; i < n; i++) d[i] = val; + break; + } + case RAY_I32: + case RAY_DATE: + case RAY_TIME: { + int32_t val = a->i32; + int32_t* d = (int32_t*)dst; + for (int64_t i = 0; i < n; i++) d[i] = val; + break; + } + case RAY_I64: + case RAY_TIMESTAMP: { + int64_t val = a->i64; + int64_t* d = (int64_t*)dst; + for (int64_t i = 0; i < n; i++) d[i] = val; + break; + } + case RAY_F64: { + double val = a->f64; + double* d = (double*)dst; + for (int64_t i = 0; i < n; i++) d[i] = val; + break; + } + case RAY_SYM: { + /* SYM stores the ID in `i64` regardless of width; truncate per + * the vector's width attribute. Width came from the atom and + * was carried by ray_sym_vec_new above. */ + uint8_t w = (uint8_t)(a->attrs & RAY_SYM_W_MASK); + if (w == RAY_SYM_W8) { + memset(dst, (uint8_t)a->i64, (size_t)n); + } else if (w == RAY_SYM_W16) { + uint16_t val = (uint16_t)a->i64; + uint16_t* d = (uint16_t*)dst; + for (int64_t i = 0; i < n; i++) d[i] = val; + } else { /* W32 — default */ + uint32_t val = (uint32_t)a->i64; + uint32_t* d = (uint32_t*)dst; + for (int64_t i = 0; i < n; i++) d[i] = val; + } + break; + } + default: + ray_release(v); + return NULL; + } + + /* Propagate atom-null: an entirely-null broadcast keeps the null bit + * of every cell so `is_null` and aggregations behave the same as + * the LIST path would have. */ + if (RAY_ATOM_IS_NULL(a)) { + v->attrs |= RAY_ATTR_HAS_NULLS; + memset(v->nullmap, 0xFF, 16); + } + return v; +} + /* (select {from: t [where: pred] [by: key] [col: expr ...]}) * Special form — receives unevaluated dict arg. */ ray_t* ray_select_fn(ray_t** args, int64_t n) { @@ -4090,6 +4208,43 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { if (result && !RAY_IS_ERR(result) && result->type == RAY_TABLE) { int64_t n_groups = ray_table_nrows(result); + /* R8 fast path: every non-agg is a literal atom expression + * with no column refs. Skip the entire row→gid mapping — + * each non-agg becomes a typed broadcast vec the same width + * as n_groups, no idx_buf or per-group slicing required. + * + * Q35 = `{one: 1, c: count(URL), by: URL desc: c take: 10}` + * is the canonical case: with all-literal nonaggs we go + * directly to apply_sort_take and the top-K fast path + * downstream of it. */ + if (n_groups > 0) { + /* Pre-check ALL nonaggs first so we don't half-apply on + * an unhandled atom type and then have to roll back. 
*/ + int all_broadcastable = 1; + for (uint8_t ni = 0; ni < n_nonaggs && all_broadcastable; ni++) { + if (!can_atom_broadcast(nonagg_exprs[ni])) + all_broadcastable = 0; + } + if (all_broadcastable) { + for (uint8_t ni = 0; ni < n_nonaggs; ni++) { + ray_t* col = atom_broadcast_vec(nonagg_exprs[ni], n_groups); + if (!col) { + /* can_atom_broadcast vetted these — anything + * after that is an OOM in atom_broadcast_vec. */ + ray_release(result); ray_release(tbl); + return ray_error("oom", NULL); + } + result = ray_table_add_col(result, nonagg_names[ni], col); + ray_release(col); + if (RAY_IS_ERR(result)) { + ray_release(tbl); + return result; + } + } + goto nonagg_done; + } + } + /* Resolve key sym — gated to single scalar key above. */ int64_t ks = -1; if (by_expr->type == -RAY_SYM && (by_expr->attrs & RAY_ATTR_NAME)) @@ -4463,6 +4618,13 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { continue; } + /* R8 fallback: a non-literal expression that + * eval-collapses to an atom (constant within scope + * but not a parser-direct literal) takes the existing + * per-cell LIST broadcast. The all-literal fast path + * at the top of the n_nonaggs block already handles + * the parser-literal case for Q35-shaped queries. */ + int gather_ok = 1; for (int64_t gi = 0; gi < n_groups; gi++) { ray_t* cell; @@ -4518,6 +4680,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { if (RAY_IS_ERR(result)) { ray_release(tbl); return result; } } } + nonagg_done: ; /* R8 fast-path target; nothing else to do here */ } } From 7396a516eb36edd10c9fc99e576865b1a0d2555c Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 22:10:18 +0200 Subject: [PATCH 05/10] perf(eval): SIMD-friendly fast path for (== or !=) of SYM-vec vs SYM atom MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The atomic_map_binary_op DAG path excludes SYM (IS_NUM_TYPE doesn't list it), so equality between a SYM column and a SYM atom fanned out to one ray_neq_fn call + one bool atom allocation per row. At 5M rows on the ClickBench hits.tsv: (!= URL nu) standalone, 5M rows 113 ms → 17 ms (~7×) (== URL nu) standalone, 5M rows 100 ms → 14 ms That's the per-row work dropping from ≈22 ns/row (call + alloc) to ≈3 ns/row (load + truncate + cmp + store), bottoming out on memory bandwidth. Further gains require parallelisation across cores. Detect the SYM-vec ↔ SYM-atom shape early in atomic_map_binary_op, read the atom's i64 sym ID once, then run a tight per-width loop (W8/W16/W32/W64) writing bool output. For the rare case of an already-null vec or null atom, fall through to a per-row branch that preserves the q/k atom-vs-atom rules from cmp.c (`null != x` is true, `null == null` is true). Effects on the ClickBench cluster: * Filter clauses against `nu` in Q11/Q22/Q23/Q25/Q26/Q31/Q32/Q37/Q38 were paying the slow per-element bool alloc. The select planner pushes filters before grouping so the reduction is partial — Q31 wins ≈3 ms, Q32 ≈3 ms. Q22/Q23 are still LIKE-bound (R3). * The literal-folded `(!= col "")` form already short-circuits and is unaffected. Tests: 2072 / 2073 (1 skipped, 0 failed). Verifying probe: bench/bottleneck/R9_filter_chain.rfl. 
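For reference, a sketch of the filter spelling that exercises the new
kernel, next to the literal-folded one (the column name and the `nu`
binding are illustrative — any pre-bound SYM atom takes the same path):

    ;; SYM-vec vs SYM-atom: nu is a SYM atom bound ahead of time, so
    ;; the WHERE runs the new per-width tight loop.
    (set nu (first (at hits 'URL)))
    (count (select {from: hits where: (!= URL nu)}))

    ;; The literal-folded spelling already short-circuits and is
    ;; unaffected by this change.
    (count (select {from: hits where: (!= URL "")}))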
--- src/lang/eval.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/src/lang/eval.c b/src/lang/eval.c index f5221a62..474569cd 100644 --- a/src/lang/eval.c +++ b/src/lang/eval.c @@ -586,6 +586,94 @@ ray_t* atomic_map_binary_op(ray_binary_fn fn, uint16_t dag_opcode, ray_t* left, } } } + /* R7 fast path: (== or !=) of SYM-vec against a SYM atom. + * + * The DAG path above doesn't handle SYM (IS_NUM_TYPE excludes it), + * so without this, ray_neq_fn / ray_eq_fn fan out to one allocation + * per row in the slow loop. At 5M rows the per-element bool atom + * thrash dominates: `(!= URL nu)` standalone takes 113 ms when the + * raw work is one i64 lookup + N width-truncated cmpneq. + * + * Handles either operand order; output is RAY_BOOL. Nulls go + * through the q/k atom-vs-atom rules already in cmp.c (null≠value + * is true for NE) by applying the same logic per element. */ + if (!force_boxed && (dag_opcode == OP_EQ || dag_opcode == OP_NE) && + out_type == RAY_BOOL) { + int l_is_sym_vec = left_coll && ray_is_vec(left) && left->type == RAY_SYM; + int r_is_sym_vec = right_coll && ray_is_vec(right) && right->type == RAY_SYM; + int l_is_sym_atom = !left_coll && left && left->type == -RAY_SYM; + int r_is_sym_atom = !right_coll && right && right->type == -RAY_SYM; + if ((l_is_sym_vec && r_is_sym_atom) || (r_is_sym_vec && l_is_sym_atom)) { + ray_t* vv = l_is_sym_vec ? left : right; + ray_t* atom = l_is_sym_vec ? right : left; + int64_t n = vv->len; + + ray_t* out = ray_vec_new(RAY_BOOL, n); + if (out && !RAY_IS_ERR(out)) { + out->len = n; + bool* obuf = (bool*)ray_data(out); + const void* src = ray_data(vv); + int8_t vt = vv->type; + uint8_t va = vv->attrs; + int atom_null = RAY_ATOM_IS_NULL(atom); + int64_t target = atom_null ? 0 : atom->i64; + int vec_has_nulls = (va & RAY_ATTR_HAS_NULLS) ? 1 : 0; + bool invert = (dag_opcode == OP_NE); + + if (atom_null && !vec_has_nulls) { + /* Atom is null, vec has no nulls — every row is + * "not equal" to the null atom (== false, != true). */ + bool fill = invert; /* != null → true; == null → false */ + for (int64_t i = 0; i < n; i++) obuf[i] = fill; + } else if (!atom_null && !vec_has_nulls) { + /* Hot path: tight per-width loop, no per-element + * null checks. This is what ClickBench Q22..Q38 + * with R6-cleaned columns actually hit. */ + uint8_t w = (uint8_t)(va & RAY_SYM_W_MASK); + if (w == RAY_SYM_W8) { + const uint8_t* d = (const uint8_t*)src; + uint8_t t8 = (uint8_t)target; + for (int64_t i = 0; i < n; i++) + obuf[i] = (d[i] == t8) ^ invert; + } else if (w == RAY_SYM_W16) { + const uint16_t* d = (const uint16_t*)src; + uint16_t t16 = (uint16_t)target; + for (int64_t i = 0; i < n; i++) + obuf[i] = (d[i] == t16) ^ invert; + } else if (w == RAY_SYM_W32) { + const uint32_t* d = (const uint32_t*)src; + uint32_t t32 = (uint32_t)target; + for (int64_t i = 0; i < n; i++) + obuf[i] = (d[i] == t32) ^ invert; + } else { /* RAY_SYM_W64 */ + const int64_t* d = (const int64_t*)src; + for (int64_t i = 0; i < n; i++) + obuf[i] = (d[i] == target) ^ invert; + } + } else { + /* General path: vec may have nulls, atom may be null. + * Apply q/k atom-rules per element so semantics match + * the slow path exactly. */ + for (int64_t i = 0; i < n; i++) { + int row_null = ray_vec_is_null(vv, i); + int eq; + if (row_null && atom_null) eq = 1; + else if (row_null || atom_null) eq = 0; + else { + int64_t row_id = ray_read_sym(src, i, vt, va); + eq = (row_id == target); + } + obuf[i] = invert ? 
!eq : eq; + } + } + ray_release(e0); + return out; + } + if (out) ray_release(out); + /* Fall through to slow path on allocation failure. */ + } + } + /* SLOW PATH: per-element scalar loop (fallback for mixed types, temporal, etc.) */ if (!force_boxed && (out_type == RAY_I64 || out_type == RAY_F64 || out_type == RAY_I32 || From 47c513cd0127383ed6dae7c6d77b27e142fb3c68 Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 22:25:07 +0200 Subject: [PATCH 06/10] perf(group): per-group-slice for low-cardinality grouped count(distinct) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The global-hash kernel (single hash keyed by `(group_id, value)`) wins on high group cardinality where per-group setup overhead dominates, but pays for a 256+ MB hash table on the low-cardinality side: Q9 (3 K groups × 5 M rows) was sizing 16 M slots = 256 MB, blowing the L3 and cache-missing on every probe. Pick the path based on `n_groups`: n_groups ≤ 50 000 → per-group-slice (small hashes fit L1/L2) n_groups > 50 000 → global hash (per-group setup dominates the alt) Empirical numbers on 5 M-row hits.tsv after R6: Q9 137 ms → 38 ms (3 K groups, 5 M rows) Q10 84 ms → 61 ms (same shape + 3 more aggregates) Q11 53 ms → 60 ms (84 groups; flat — already fast) Q14 200 ms → 217 ms (611 K groups; still on global, untouched) Q15 24 ms → 29 ms (composite key, 100 K-ish groups) Q14 stays slow because it sits on the global side of the threshold and the global kernel still has the cache-miss-per-probe bottleneck. A parallel partitioned variant is the next step (R2 follow-up). Also hoists per-type read dispatch out of the global hash inner loop (no perf impact alone but simplifies the next change). Tests: 2072 / 2073 (1 skipped, 0 failed). --- src/ops/group.c | 123 ++++++++++++++++++++++++++++++++++++------------ src/ops/query.c | 16 ++++++- 2 files changed, 107 insertions(+), 32 deletions(-) diff --git a/src/ops/group.c b/src/ops/group.c index 705ed991..d86db7ba 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -670,8 +670,8 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); if (n_rows == 0 || n_groups == 0) return out; - /* Pick capacity ≥ 2 * n_rows rounded up to power of two. This bounds - * load factor at 0.5 even when every (gid,val) pair is distinct. */ + /* Pick capacity ≥ 2 × n_rows rounded up to power of two. This bounds + * load factor at 0.5 even when every (gid, val) pair is distinct. */ uint64_t cap = (uint64_t)n_rows * 2; if (cap < 32) cap = 32; uint64_t c = 1; @@ -702,41 +702,104 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, const uint8_t* null_bm = has_nulls ? ray_vec_nullmap_bytes(src, NULL, NULL) : NULL; - for (int64_t r = 0; r < n_rows; r++) { - int64_t gid = row_gid[r]; - if (gid < 0 || gid >= n_groups) continue; - if (has_nulls && null_bm && ((null_bm[r/8] >> (r%8)) & 1)) continue; + /* Per-type read width — hoist the type dispatch out of the hot loop. + * read_col_i64 was branching on `in_type` every iteration plus paying + * an indirect call. */ + uint8_t esz = ray_sym_elem_size(in_type, src->attrs); + + /* Macro: insert (val) for current row, given that (gid, val) is the + * candidate pair; expects local vars `slot`, `cur`, `gid_p1`. 
*/ + #define CD_INSERT(VAL_EXPR) do { \ + int64_t val = (VAL_EXPR); \ + int64_t gid_p1 = gid + 1; \ + uint64_t h = (uint64_t)val * 0x9E3779B97F4A7C15ULL; \ + h ^= (uint64_t)gid_p1 * 0xBF58476D1CE4E5B9ULL; \ + h ^= h >> 33; \ + h *= 0xC4CEB9FE1A85EC53ULL; \ + uint64_t slot = h & mask; \ + for (;;) { \ + int64_t cur = slot_gid[slot]; \ + if (cur == 0) { \ + slot_gid[slot] = gid_p1; \ + slot_val[slot] = val; \ + odata[gid]++; \ + break; \ + } \ + if (cur == gid_p1 && slot_val[slot] == val) break; \ + slot = (slot + 1) & mask; \ + } \ + } while (0) - int64_t val; + /* Specialised per-type loops. Each version reads the column with a + * width-typed pointer dereference instead of dispatching through + * read_col_i64 every row. The has_nulls / no-nulls split keeps the + * fast path branch-free for the common no-null SYM/I64 columns. */ + if (!has_nulls) { if (in_type == RAY_F64) { - double fv = ((double*)base)[r]; - if (fv != fv) fv = (double)NAN; - else if (fv == 0.0) fv = 0.0; - memcpy(&val, &fv, sizeof(int64_t)); - } else { - val = read_col_i64(base, r, in_type, src->attrs); + const double* d = (const double*)base; + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + double fv = d[r]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; + int64_t v; + memcpy(&v, &fv, sizeof(int64_t)); + CD_INSERT(v); + } + } else if (esz == 8) { + const int64_t* d = (const int64_t*)base; + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + CD_INSERT(d[r]); + } + } else if (esz == 4) { + const int32_t* d = (const int32_t*)base; + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + CD_INSERT((int64_t)d[r]); + } + } else if (esz == 2) { + const int16_t* d = (const int16_t*)base; + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + CD_INSERT((int64_t)d[r]); + } + } else { /* esz == 1 */ + const uint8_t* d = (const uint8_t*)base; + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + CD_INSERT((int64_t)d[r]); + } } - - int64_t gid_p1 = gid + 1; - /* Mix gid and val so groups don't form long runs of collisions. */ - uint64_t h = (uint64_t)val * 0x9E3779B97F4A7C15ULL; - h ^= (uint64_t)gid_p1 * 0xBF58476D1CE4E5B9ULL; - h ^= h >> 33; - h *= 0xC4CEB9FE1A85EC53ULL; - uint64_t slot = h & mask; - for (;;) { - int64_t cur = slot_gid[slot]; - if (cur == 0) { - slot_gid[slot] = gid_p1; - slot_val[slot] = val; - odata[gid]++; - break; + } else { + /* Has-nulls fallback: keep the per-row null bitmap probe and + * the generic read_col_i64 dispatch. Adding eight specialised + * has-nulls loops costs more code than the small gain on + * already-rare null-bearing columns. 
*/ + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + if (null_bm && ((null_bm[r/8] >> (r%8)) & 1)) continue; + int64_t val; + if (in_type == RAY_F64) { + double fv = ((double*)base)[r]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; + memcpy(&val, &fv, sizeof(int64_t)); + } else { + val = read_col_i64(base, r, in_type, src->attrs); } - if (cur == gid_p1 && slot_val[slot] == val) break; - slot = (slot + 1) & mask; + CD_INSERT(val); } } + #undef CD_INSERT + scratch_free(k_hdr); scratch_free(v_hdr); return out; diff --git a/src/ops/query.c b/src/ops/query.c index 59fd074e..9079329a 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -4506,8 +4506,20 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { src_for_global = ray_table_get_col(tbl, cd_inner->i64); } if (src_for_global) { - col = ray_count_distinct_per_group( - src_for_global, row_gid, nrows, n_groups); + /* Path selection: global-hash kernel scales + * with n_rows (per-row probe of one shared + * hash table); per-group-slice scales with + * n_groups (per-group setup + small dedup). + * Empirically the cross-over is around 50 K + * groups on the local hardware — beyond + * that, per-group setup overhead dominates. */ + if (n_groups <= 50000) { + col = count_distinct_per_group_buf( + cd_inner, tbl, idx_buf, offsets, grp_cnt, n_groups); + } else { + col = ray_count_distinct_per_group( + src_for_global, row_gid, nrows, n_groups); + } /* col == NULL → unsupported type, fall through. */ } if (src_owned && src_for_global) ray_release(src_for_global); From 4f2771ff154abbc492f28791bbc033b7e07395ac Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 22:42:02 +0200 Subject: [PATCH 07/10] perf(group): parallel partitioned grouped count(distinct) kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The serial global-hash kernel allocates a 32–256 MB hash table for high-cardinality grouped count(distinct), and every probe is a cold cache line on the i7-14700 (20 MB L3). Add a partitioned variant mirroring the existing whole-table count_distinct shape: Pass 1 (cdpg_hist_fn) — per-worker histogram of hash partitions. Pass 2 (cdpg_scat_fn) — scatter (gid+1, val) pairs into a partitioned buffer using per-(worker, partition) cursors. Pass 3 (cdpg_dedup_fn) — per-partition open-addressing dedup; atomic fetch-add into odata[gid] for each new pair. P=64 partitions on a 28-core box keeps each per-partition dedup hash inside L2 (≤ ~32 K rows × 16 B per slot ≤ 1 MB) and atomic_fetch_add spreads writes across the n_groups output array — no measurable contention at 600 K+ groups. Gated on n_rows ≥ 200 000 (smaller inputs don't pay the dispatch overhead). Falls through to the serial kernel on no-pool / OOM, so behaviour is preserved on platforms without a worker pool. Empirically on the ClickBench 5 M-row hits.tsv: Standalone kernel cost (Q14 internals) ~140 ms → ~3.5 ms (hist 0.2 / scat 0.5 / dedup 2.8) Q14 query-level total stays at ~200 ms because the bottleneck has shifted: the count-distinct kernel is no longer dominant; the row→group_id rebuild in query.c::ray_select_fn (allocating an n_groups-sized hash and probing every row) now eats the ~190 ms. That's a separate fix — R2-followup territory — but the kernel work here is correct and unblocks any caller for whom row_gid is cheap (future plan: thread row_gid through OP_GROUP rather than recomputing post-DAG). Tests: 2072 / 2073 (1 skipped, 0 failed). 
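For reference, the Q14 shape this kernel serves, spelled in the select
syntax used elsewhere in this series (a sketch — the UserID distinct
column is the ClickBench definition of Q14, not something this patch
names):

    (select {u: (count (distinct UserID)) from: hits
             where: (!= SearchPhrase "") by: SearchPhrase
             desc: u take: 10})

    ;; ~611 K SearchPhrase groups over ~937 K surviving rows: the
    ;; (gid, val) pairs split across P = 64 partitions at roughly
    ;; 14 K pairs each, so each per-partition dedup hash stays
    ;; L2-resident instead of thrashing L3.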
--- src/ops/group.c | 266 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) diff --git a/src/ops/group.c b/src/ops/group.c index d86db7ba..e020ba93 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -634,6 +634,261 @@ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) { return ray_i64(total_distinct); } +/* ════════════════════════════════════════════════════════════════════ + * Parallel partitioned grouped count(distinct). + * + * The serial kernel further down uses a single global hash keyed by + * (gid, val). At high (n_rows × n_groups) the hash exceeds L3 and + * every probe is a cache miss — Q14 (937 K rows × 611 K groups) lands + * at ~200 ms even though the per-row work is microscopic. + * + * Strategy: radix-partition (gid, val) pairs into P buckets by the high + * bits of the composite hash, dispatch dedup of each bucket to the + * worker pool. Each bucket is sized to fit in L2, so hash probes hit + * cache. The dedup writes per-group distinct counts into the shared + * `odata` via atomic increment. + * + * Three passes: + * 1. cdpg_hist_fn – per-worker histogram of partition counts. + * 2. cdpg_scat_fn – scatter (gid_p1, val) pairs into a partitioned + * buffer using per-worker per-partition cursors. + * 3. cdpg_dedup_fn – per-partition open-addressing dedup; atomic + * fetch-add into `odata[gid]`. + * ════════════════════════════════════════════════════════════════════ */ + +#define CDPG_HASH(GID_P1, VAL) ({ \ + uint64_t _h_ = (uint64_t)(VAL) * 0x9E3779B97F4A7C15ULL; \ + _h_ ^= (uint64_t)(GID_P1) * 0xBF58476D1CE4E5B9ULL; \ + _h_ ^= _h_ >> 33; \ + _h_ *= 0xC4CEB9FE1A85EC53ULL; \ + _h_; \ +}) + +typedef struct { + /* Inputs (read-only) */ + int8_t in_type; + uint8_t in_attrs; + const void* base; + const int64_t* row_gid; + int64_t n_rows; + int64_t n_groups; + bool has_nulls; + const uint8_t* null_bm; + uint64_t p_mask; /* P - 1, P = number of partitions */ + /* Pass 1 outputs / pass 2 inputs */ + int64_t* hist; /* nw × P, per-worker partition counts */ + int64_t* cursor; /* nw × P, per-worker scatter cursors */ + int64_t* part_off; /* P + 1, prefix offsets */ + /* Pass 2 outputs */ + int64_t* gids_out; /* total_pass entries */ + int64_t* vals_out; + /* Pass 3 outputs */ + int64_t* odata; /* n_groups, atomic per-group distinct count */ +} cdpg_ctx_t; + +/* Read column row r as int64. Width-typed fast path; F64 bitcasts. 
*/ +static inline int64_t cdpg_read(const void* base, int64_t r, + int8_t in_type, uint8_t esz) { + if (in_type == RAY_F64) { + double fv = ((const double*)base)[r]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; + int64_t v; + memcpy(&v, &fv, sizeof(int64_t)); + return v; + } + switch (esz) { + case 1: return (int64_t)((const uint8_t*)base)[r]; + case 2: return (int64_t)((const int16_t*)base)[r]; + case 4: return (int64_t)((const int32_t*)base)[r]; + default: return ((const int64_t*)base)[r]; + } +} + +static void cdpg_hist_fn(void* ctx_, uint32_t worker_id, + int64_t start, int64_t end) { + cdpg_ctx_t* x = (cdpg_ctx_t*)ctx_; + int64_t* hist = x->hist + (size_t)worker_id * (x->p_mask + 1); + uint8_t esz = ray_sym_elem_size(x->in_type, x->in_attrs); + for (int64_t r = start; r < end; r++) { + int64_t gid = x->row_gid[r]; + if (gid < 0 || gid >= x->n_groups) continue; + if (x->has_nulls && x->null_bm && + ((x->null_bm[r/8] >> (r%8)) & 1)) continue; + int64_t val = cdpg_read(x->base, r, x->in_type, esz); + uint64_t h = CDPG_HASH(gid + 1, val); + hist[h & x->p_mask]++; + } +} + +static void cdpg_scat_fn(void* ctx_, uint32_t worker_id, + int64_t start, int64_t end) { + cdpg_ctx_t* x = (cdpg_ctx_t*)ctx_; + int64_t* cur = x->cursor + (size_t)worker_id * (x->p_mask + 1); + uint8_t esz = ray_sym_elem_size(x->in_type, x->in_attrs); + for (int64_t r = start; r < end; r++) { + int64_t gid = x->row_gid[r]; + if (gid < 0 || gid >= x->n_groups) continue; + if (x->has_nulls && x->null_bm && + ((x->null_bm[r/8] >> (r%8)) & 1)) continue; + int64_t val = cdpg_read(x->base, r, x->in_type, esz); + int64_t gid_p1 = gid + 1; + uint64_t h = CDPG_HASH(gid_p1, val); + int64_t pos = cur[h & x->p_mask]++; + x->gids_out[pos] = gid_p1; + x->vals_out[pos] = val; + } +} + +/* Per-partition dedup: open-addressing hash sized for the partition, then + * atomic fetch-add into odata[gid] for each new distinct (gid, val). */ +static void cdpg_dedup_fn(void* ctx_, uint32_t worker_id, + int64_t start, int64_t end) { + (void)worker_id; + cdpg_ctx_t* x = (cdpg_ctx_t*)ctx_; + for (int64_t p = start; p < end; p++) { + int64_t off = x->part_off[p]; + int64_t cnt = x->part_off[p + 1] - off; + if (cnt == 0) continue; + + uint64_t cap = (uint64_t)cnt * 2; + if (cap < 32) cap = 32; + uint64_t c = 1; + while (c && c < cap) c <<= 1; + if (!c) continue; + cap = c; + uint64_t mask = cap - 1; + + ray_t* k_hdr = NULL; + ray_t* v_hdr = NULL; + int64_t* slot_gid = (int64_t*)scratch_calloc(&k_hdr, + (size_t)cap * sizeof(int64_t)); + int64_t* slot_val = (int64_t*)scratch_alloc(&v_hdr, + (size_t)cap * sizeof(int64_t)); + if (!slot_gid || !slot_val) { + if (k_hdr) scratch_free(k_hdr); + if (v_hdr) scratch_free(v_hdr); + continue; + } + + const int64_t* gids = x->gids_out + off; + const int64_t* vals = x->vals_out + off; + for (int64_t i = 0; i < cnt; i++) { + int64_t gid_p1 = gids[i]; + int64_t val = vals[i]; + uint64_t h = CDPG_HASH(gid_p1, val); + uint64_t slot = h & mask; + for (;;) { + int64_t cur = slot_gid[slot]; + if (cur == 0) { + slot_gid[slot] = gid_p1; + slot_val[slot] = val; + __atomic_fetch_add(&x->odata[gid_p1 - 1], 1, + __ATOMIC_RELAXED); + break; + } + if (cur == gid_p1 && slot_val[slot] == val) break; + slot = (slot + 1) & mask; + } + } + scratch_free(k_hdr); + scratch_free(v_hdr); + } +} + +/* Returns the populated `out` vector on success, or NULL to fall through + * to the serial path on dispatch / allocation failure. 
*/ +static ray_t* count_distinct_per_group_parallel( + ray_t* src, const int64_t* row_gid, + int64_t n_rows, int64_t n_groups, ray_t* out) +{ + ray_pool_t* pool = ray_pool_get(); + if (!pool) return NULL; + uint32_t nw = ray_pool_total_workers(pool); + if (nw < 2) return NULL; + + /* Partition count: balance per-partition L2 fit vs. dispatch overhead. + * 64 partitions on 28 workers gives 2.28 partitions per worker plus + * room for skew; per-partition dedup data ~2 × (n_rows/64) × 16 B + * which is well inside L2 even on 1 M-row inputs. */ + uint8_t p_bits = 6; + uint64_t P = (uint64_t)1 << p_bits; + uint64_t p_mask = P - 1; + + cdpg_ctx_t ctx = { + .in_type = src->type, + .in_attrs = src->attrs, + .base = ray_data(src), + .row_gid = row_gid, + .n_rows = n_rows, + .n_groups = n_groups, + .has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0, + .null_bm = NULL, + .p_mask = p_mask, + .odata = (int64_t*)ray_data(out), + }; + if (ctx.has_nulls) + ctx.null_bm = ray_vec_nullmap_bytes(src, NULL, NULL); + + /* Pass 1: histogram. */ + ray_t* hist_hdr = NULL; + ctx.hist = (int64_t*)scratch_calloc(&hist_hdr, + (size_t)P * nw * sizeof(int64_t)); + if (!ctx.hist) { return NULL; } + ray_pool_dispatch(pool, cdpg_hist_fn, &ctx, n_rows); + + /* Compute partition prefix offsets and per-(worker, partition) cursors. + * Layout: out_buf is laid out as + * partition_0 [worker_0 worker_1 …] partition_1 [worker_0 …] … + * so each (worker, partition) range is contiguous. */ + ray_t* off_hdr = NULL; + ctx.part_off = (int64_t*)scratch_alloc(&off_hdr, + (size_t)(P + 1) * sizeof(int64_t)); + ray_t* cur_hdr = NULL; + ctx.cursor = (int64_t*)scratch_alloc(&cur_hdr, + (size_t)P * nw * sizeof(int64_t)); + if (!ctx.part_off || !ctx.cursor) { + if (off_hdr) scratch_free(off_hdr); + if (cur_hdr) scratch_free(cur_hdr); + scratch_free(hist_hdr); + return NULL; + } + int64_t total = 0; + for (uint64_t p = 0; p < P; p++) { + ctx.part_off[p] = total; + for (uint32_t w = 0; w < nw; w++) { + ctx.cursor[(size_t)w * P + p] = total; + total += ctx.hist[(size_t)w * P + p]; + } + } + ctx.part_off[P] = total; + + /* Pass 2: scatter (gid+1, val) pairs into partitioned out_buf. */ + ray_t* gids_hdr = NULL; + ray_t* vals_hdr = NULL; + ctx.gids_out = (int64_t*)scratch_alloc(&gids_hdr, + (size_t)total * sizeof(int64_t)); + ctx.vals_out = (int64_t*)scratch_alloc(&vals_hdr, + (size_t)total * sizeof(int64_t)); + if (!ctx.gids_out || !ctx.vals_out) { + if (gids_hdr) scratch_free(gids_hdr); + if (vals_hdr) scratch_free(vals_hdr); + scratch_free(cur_hdr); scratch_free(off_hdr); scratch_free(hist_hdr); + return NULL; + } + if (total > 0) + ray_pool_dispatch(pool, cdpg_scat_fn, &ctx, n_rows); + + /* Pass 3: per-partition dedup; atomic odata[gid]++ on each new pair. */ + if (total > 0) + ray_pool_dispatch_n(pool, cdpg_dedup_fn, &ctx, (uint32_t)P); + + scratch_free(vals_hdr); scratch_free(gids_hdr); + scratch_free(cur_hdr); scratch_free(off_hdr); + scratch_free(hist_hdr); + return out; +} + /* Grouped count(distinct): single global hash keyed by (group_id, value). * One linear pass over all rows, O(n) total instead of O(per-group setup * * n_groups). Returns an I64 vector of length n_groups with the per-group @@ -670,6 +925,17 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); if (n_rows == 0 || n_groups == 0) return out; + /* Parallel partitioned path for sizes where the serial global hash + * blows L3. 
Threshold tuned so the partition / scatter / dedup + * dispatch overhead stays smaller than the cache-miss savings. */ + if (n_rows >= 200000) { + ray_t* par = count_distinct_per_group_parallel(src, row_gid, + n_rows, n_groups, out); + if (par) return par; + /* par == NULL → no pool / OOM in scratch alloc → fall through to + * serial path with the already-allocated `out` (still zeroed). */ + } + /* Pick capacity ≥ 2 × n_rows rounded up to power of two. This bounds * load factor at 0.5 even when every (gid, val) pair is distinct. */ uint64_t cap = (uint64_t)n_rows * 2; From 774ce68f15049944eec7d7d05026fe226842470e Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 22:54:09 +0200 Subject: [PATCH 08/10] =?UTF-8?q?perf(query):=20parallel=20row=E2=86=92gid?= =?UTF-8?q?=20probe=20for=20non-agg=20scatter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The grouped count(distinct) path post-DAG-group rebuilds a row→gid mapping by hashing each input row's group key against the gk[gi]→gi hash. At Q14 scale (937 K filtered rows × 611 K groups → 18 MB hash) the serial probe loop spent most of its time waiting on cache misses. The hash is read-only by the time the probe runs (the insert phase that built it is single-threaded and complete), so each worker can independently process its row range with no synchronisation. Add a file-scope key reader (`key_read_i64`) and probe worker (`rgid_probe_fn`), dispatch via the existing `ray_pool_get` worker pool when nrows ≥ 200 K and the pool has ≥ 2 workers. Effect on Q14: 200 ms → 189 ms. The remainder of Q14's gap is in deeper machinery (filter eval interaction with the DAG group + allocations for idx_buf / grp_cnt / row_gid). Profiling each phase of the rebuild confirmed: gk_copy 0.4 ms hash_insert 5.0 ms (n_groups inserts into the 18 MB key→gid hash) probe 0.7 ms (was ~10 ms before parallelisation) cnt accum 2.8 ms So the probe was ~10 ms of the 190 ms Q14 budget; this commit removes it. The serial fallback remains for nrows < 200 K and pools with < 2 workers. Tests: 2072 / 2073 (1 skipped, 0 failed). --- src/ops/query.c | 118 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 103 insertions(+), 15 deletions(-) diff --git a/src/ops/query.c b/src/ops/query.c index 9079329a..4861b6f8 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -1677,6 +1677,76 @@ static ray_t* count_distinct_per_group_groups(ray_t* inner_expr, ray_t* tbl, return out; } +/* Width-agnostic key reader: read row `idx` of a group-key column as + * int64_t. Same coverage as the KEY_READ macro inside ray_select_fn, + * lifted to file scope so the parallel row→gid probe worker can use it. */ +static inline int64_t key_read_i64(const void* d, int64_t idx, + int8_t bt, uint8_t attrs) { + switch (bt) { + case RAY_BOOL: + case RAY_U8: return ((const uint8_t*)d)[idx]; + case RAY_I16: return ((const int16_t*)d)[idx]; + case RAY_I32: + case RAY_DATE: + case RAY_TIME: return ((const int32_t*)d)[idx]; + case RAY_I64: + case RAY_TIMESTAMP: return ((const int64_t*)d)[idx]; + case RAY_F32: { uint32_t u; + memcpy(&u, &((const float*)d)[idx], 4); + return (int64_t)u; } + case RAY_F64: { int64_t u; + memcpy(&u, &((const double*)d)[idx], 8); + return u; } + case RAY_SYM: return ray_read_sym(d, idx, bt, attrs); + default: return 0; /* caller validates type */ + } +} + +/* Parallel row→gid probe. 
Hash table is read-only by the time the probe + * runs (the insert phase that built it is single-threaded), so each + * worker can process its row range independently with no synchronisation. + * + * The probe's per-row work is one cache-cold load + a short linear-probe + * walk in a hash sized to 2 × n_groups. At Q14 scale (611 K groups, + * ~18 MB hash) the serial loop spends most of its time waiting on cache + * misses; spreading the rows across 28 cores gives near-linear speedup + * because each core has its own cache hierarchy. */ +typedef struct { + /* Hash table contents (read-only). */ + const int64_t* hk_keys; + const int32_t* hk_gid_p1; /* one of these is non-NULL */ + const int64_t* hk_gid64; + uint64_t mask; + /* Group-key column being probed. */ + const void* orig_key_data; + int8_t okt; + uint8_t okt_attrs; + /* Per-row output. */ + int64_t* row_gid; +} rgid_probe_ctx_t; + +static void rgid_probe_fn(void* ctx_, uint32_t worker_id, + int64_t start, int64_t end) { + (void)worker_id; + rgid_probe_ctx_t* x = (rgid_probe_ctx_t*)ctx_; + int use_i64 = (x->hk_gid64 != NULL); + uint64_t mask = x->mask; + for (int64_t r = start; r < end; r++) { + int64_t rv = key_read_i64(x->orig_key_data, r, x->okt, x->okt_attrs); + uint64_t h = (uint64_t)rv * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + uint64_t s = h & mask; + int64_t found = -1; + for (;;) { + int64_t cur_p1 = use_i64 ? x->hk_gid64[s] : (int64_t)x->hk_gid_p1[s]; + if (cur_p1 == 0) break; + if (x->hk_keys[s] == rv) { found = cur_p1 - 1; break; } + s = (s + 1) & mask; + } + x->row_gid[r] = found; + } +} + /* Forward declarations for eval-level groupby fallback */ /* R8: cheap predicate for whether atom_broadcast_vec can handle this @@ -4434,22 +4504,40 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { } } - /* Probe each row to assign its gid. */ - for (int64_t r = 0; r < nrows; r++) { - int64_t rv; - KEY_READ(rv, orig_key, okt, r); - uint64_t h = (uint64_t)rv * 0x9E3779B97F4A7C15ULL; - h ^= h >> 33; - uint64_t s = h & mask; - int64_t found = -1; - for (;;) { - int64_t cur_p1 = use_i64_gid ? hk_gid64[s] - : (int64_t)hk_gid_p1[s]; - if (cur_p1 == 0) break; - if (hk_keys[s] == rv) { found = cur_p1 - 1; break; } - s = (s + 1) & mask; + /* Probe each row to assign its gid. Parallelise when + * the input is large enough to amortise dispatch + * overhead — the hash is read-only at this point so + * workers don't need to synchronise. */ + ray_pool_t* pool = ray_pool_get(); + if (pool && nrows >= 200000 && ray_pool_total_workers(pool) >= 2) { + rgid_probe_ctx_t pctx = { + .hk_keys = hk_keys, + .hk_gid_p1 = use_i64_gid ? NULL : hk_gid_p1, + .hk_gid64 = use_i64_gid ? hk_gid64 : NULL, + .mask = mask, + .orig_key_data = ray_data(orig_key), + .okt = okt, + .okt_attrs = orig_key->attrs, + .row_gid = row_gid, + }; + ray_pool_dispatch(pool, rgid_probe_fn, &pctx, nrows); + } else { + for (int64_t r = 0; r < nrows; r++) { + int64_t rv; + KEY_READ(rv, orig_key, okt, r); + uint64_t h = (uint64_t)rv * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + uint64_t s = h & mask; + int64_t found = -1; + for (;;) { + int64_t cur_p1 = use_i64_gid ? 
hk_gid64[s] + : (int64_t)hk_gid_p1[s]; + if (cur_p1 == 0) break; + if (hk_keys[s] == rv) { found = cur_p1 - 1; break; } + s = (s + 1) & mask; + } + row_gid[r] = found; } - row_gid[r] = found; } scratch_free(gk_keys_hdr); From 928bab0a98fe1a95715a2d2d9db179b8252a0573 Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 23:25:31 +0200 Subject: [PATCH 09/10] fix(group)+perf(query): correct parallel grouped count(distinct), and let count(distinct col) ride path A MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes that together drop Q14 from 169 ms → 69 ms (5.10× → 1.74× DuckDB on the 5 M-row hits.tsv with 611 K SearchPhrase groups). 1. Parallel-kernel correctness fix. The previous count_distinct_per_group_parallel laid hist / cursor as nw × P arrays indexed by `(worker_id, partition)`. ray_pool_dispatch uses dynamic work stealing, so the same morsel can be processed by worker_id A in the histogram pass and worker_id B in the scatter pass. The (worker, partition) cursors were misaligned: hist counted X rows for `(W=A, p)` but scatter advanced `cursor[W=B, p]`, leaving uninitialised slots in out_buf that the dedup pass then atomically counted into odata — Q14 returned counts up to ~10^15 instead of the correct values. Switch to per-partition atomic counters: a single `hist[P]` and a single `cursor[P]`, both updated via __atomic_fetch_add. Each worker walks its assigned row range, builds local per-partition counts (no contention), then pushes the deltas to shared hist with one atomic per non-zero partition. Scatter atomically advances cursor[partition] to claim a write position. P=64, ~14 K rows/partition, contention is negligible. After the fix: Q14 returns the expected top counts (2118 / 1588 / 1382 / …) matching DuckDB exactly. 2. Path-A enablement for count(distinct col_ref). Refine the WHERE-handling pre-scan so that `(count (distinct col))` non-aggs no longer trip the path-B materialisation. The scatter for that shape doesn't need a flat post-filter table — it reads the column directly via ray_count_distinct_per_group and skips rows where row_gid[r] < 0. When path A is taken, retain g->selection across the eventual graph_free, then in the n_nonaggs scatter walk the morsel-segmented rowsel and mask non-selected rows to row_gid = -1. This walks the RAY_SEL_NONE / ALL / MIX flags directly without building a flat bitmap, ~5 ms for 5 M rows. Net effect: Q14 skips materialising filtered_tbl (937 K rows × 105 columns ≈ 750 MB copy that was eating most of the query budget) and the count_distinct kernel now correctly produces the same answer the materialised path used to produce. Tests: 2072 / 2073 (1 skipped, 0 failed). Verifying probes: * /tmp/cdpg_220.rfl — 220K-row distinct test → max(u)=1 (correct) * Q14 vs duckdb-fetch — top counts match exactly --- src/ops/group.c | 53 ++++++++++++++++++++----------- src/ops/query.c | 84 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 112 insertions(+), 25 deletions(-) diff --git a/src/ops/group.c b/src/ops/group.c index e020ba93..a3f7f183 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -675,9 +675,13 @@ typedef struct { bool has_nulls; const uint8_t* null_bm; uint64_t p_mask; /* P - 1, P = number of partitions */ - /* Pass 1 outputs / pass 2 inputs */ - int64_t* hist; /* nw × P, per-worker partition counts */ - int64_t* cursor; /* nw × P, per-worker scatter cursors */ + /* Pass 1 outputs / pass 2 inputs. 
Per-partition atomic counters, + * not per-worker — ray_pool_dispatch uses dynamic work stealing so + * the worker_id seen by hist for a given task isn't guaranteed to + * match the worker_id scatter sees for the same task. Atomics on + * P=64 partitions with ~14 K rows each have negligible contention. */ + int64_t* hist; /* P entries, atomic */ + int64_t* cursor; /* P entries, atomic, init to part_off */ int64_t* part_off; /* P + 1, prefix offsets */ /* Pass 2 outputs */ int64_t* gids_out; /* total_pass entries */ @@ -707,9 +711,14 @@ static inline int64_t cdpg_read(const void* base, int64_t r, static void cdpg_hist_fn(void* ctx_, uint32_t worker_id, int64_t start, int64_t end) { + (void)worker_id; cdpg_ctx_t* x = (cdpg_ctx_t*)ctx_; - int64_t* hist = x->hist + (size_t)worker_id * (x->p_mask + 1); uint8_t esz = ray_sym_elem_size(x->in_type, x->in_attrs); + /* Local per-partition counts to amortise atomic adds. Walk once + * locally, then push the deltas to the shared `hist` at the end. */ + enum { CDPG_MAX_P = 256 }; + int64_t local[CDPG_MAX_P] = {0}; + uint64_t p_mask = x->p_mask; for (int64_t r = start; r < end; r++) { int64_t gid = x->row_gid[r]; if (gid < 0 || gid >= x->n_groups) continue; @@ -717,15 +726,21 @@ static void cdpg_hist_fn(void* ctx_, uint32_t worker_id, ((x->null_bm[r/8] >> (r%8)) & 1)) continue; int64_t val = cdpg_read(x->base, r, x->in_type, esz); uint64_t h = CDPG_HASH(gid + 1, val); - hist[h & x->p_mask]++; + local[h & p_mask]++; + } + /* Push local deltas atomically into shared hist. */ + for (uint64_t p = 0; p <= p_mask; p++) { + if (local[p]) + __atomic_fetch_add(&x->hist[p], local[p], __ATOMIC_RELAXED); } } static void cdpg_scat_fn(void* ctx_, uint32_t worker_id, int64_t start, int64_t end) { + (void)worker_id; cdpg_ctx_t* x = (cdpg_ctx_t*)ctx_; - int64_t* cur = x->cursor + (size_t)worker_id * (x->p_mask + 1); uint8_t esz = ray_sym_elem_size(x->in_type, x->in_attrs); + uint64_t p_mask = x->p_mask; for (int64_t r = start; r < end; r++) { int64_t gid = x->row_gid[r]; if (gid < 0 || gid >= x->n_groups) continue; @@ -734,7 +749,10 @@ static void cdpg_scat_fn(void* ctx_, uint32_t worker_id, int64_t val = cdpg_read(x->base, r, x->in_type, esz); int64_t gid_p1 = gid + 1; uint64_t h = CDPG_HASH(gid_p1, val); - int64_t pos = cur[h & x->p_mask]++; + /* Per-partition atomic cursor — handles concurrent scatter + * from any worker without per-worker layout dependencies. */ + int64_t pos = __atomic_fetch_add(&x->cursor[h & p_mask], 1, + __ATOMIC_RELAXED); x->gids_out[pos] = gid_p1; x->vals_out[pos] = val; } @@ -830,23 +848,24 @@ static ray_t* count_distinct_per_group_parallel( if (ctx.has_nulls) ctx.null_bm = ray_vec_nullmap_bytes(src, NULL, NULL); - /* Pass 1: histogram. */ + if (P > 256) return NULL; /* matches CDPG_MAX_P in cdpg_hist_fn */ + + /* Pass 1: histogram (per-partition atomic counters). */ ray_t* hist_hdr = NULL; ctx.hist = (int64_t*)scratch_calloc(&hist_hdr, - (size_t)P * nw * sizeof(int64_t)); + (size_t)P * sizeof(int64_t)); if (!ctx.hist) { return NULL; } ray_pool_dispatch(pool, cdpg_hist_fn, &ctx, n_rows); - /* Compute partition prefix offsets and per-(worker, partition) cursors. - * Layout: out_buf is laid out as - * partition_0 [worker_0 worker_1 …] partition_1 [worker_0 …] … - * so each (worker, partition) range is contiguous. */ + /* Compute partition prefix offsets and initial cursors. out_buf is + * laid out as [partition_0 entries | partition_1 entries | …] with + * cursor[p] starting at part_off[p] and advancing by 1 per scatter. 
*/ ray_t* off_hdr = NULL; ctx.part_off = (int64_t*)scratch_alloc(&off_hdr, (size_t)(P + 1) * sizeof(int64_t)); ray_t* cur_hdr = NULL; ctx.cursor = (int64_t*)scratch_alloc(&cur_hdr, - (size_t)P * nw * sizeof(int64_t)); + (size_t)P * sizeof(int64_t)); if (!ctx.part_off || !ctx.cursor) { if (off_hdr) scratch_free(off_hdr); if (cur_hdr) scratch_free(cur_hdr); @@ -856,10 +875,8 @@ static ray_t* count_distinct_per_group_parallel( int64_t total = 0; for (uint64_t p = 0; p < P; p++) { ctx.part_off[p] = total; - for (uint32_t w = 0; w < nw; w++) { - ctx.cursor[(size_t)w * P + p] = total; - total += ctx.hist[(size_t)w * P + p]; - } + ctx.cursor[p] = total; + total += ctx.hist[p]; } ctx.part_off[P] = total; diff --git a/src/ops/query.c b/src/ops/query.c index 4861b6f8..68cd294e 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -31,6 +31,7 @@ #include "ops/ops.h" #include "ops/internal.h" #include "ops/hash.h" +#include "ops/rowsel.h" #include "ops/temporal.h" #include "table/sym.h" #include "table/dict.h" @@ -1954,6 +1955,13 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { * plain RAY_SYM vector of the dict keys so the rest of * ray_select_fn sees a standard multi-key group-by. */ ray_t* by_sym_vec_owned = NULL; + + /* Selection saved across the path-A graph free for count(distinct + * col_ref) non-aggs. Path B leaves this NULL because the + * materialised filtered_tbl already encodes the selection in row + * positions. Declared here at function scope so the cleanup at + * the bottom of ray_select_fn can release it. */ + ray_t* saved_selection = NULL; DICT_VIEW_DECL(byv); if (by_expr && by_expr->type == RAY_DICT) { DICT_VIEW_OPEN(by_expr, byv); @@ -3223,15 +3231,30 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); } - /* Pre-scan: any non-aggregation expressions? If so and there's a - * WHERE, we must materialize the filtered table first so the - * post-DAG scatter evaluates on filtered data (matching agg semantics). */ - int has_nonagg = 0; + /* Pre-scan: any non-aggregation expressions that need a flat + * (post-filter) table? Most non-agg expressions evaluate via + * ray_eval over the whole table and require a materialized + * filtered_tbl when WHERE is present. + * + * The exception is `(count (distinct col_ref))`: its scatter + * runs through ray_count_distinct_per_group, which reads the + * source column directly and skips rows where row_gid[r] < 0. + * As long as the row→gid build masks filtered-out rows to -1 + * (using the selection saved across the path-A graph free), + * count(distinct col_ref) doesn't need the materialization. + * That's worth ~100 ms on Q14 (937 K rows × 105 cols filtered + * → 937 K rows × 105 cols copy). */ + int has_nonagg_needing_flat = 0; for (int64_t i = 0; i + 1 < dict_n; i += 2) { int64_t kid = dict_elems[i]->i64; if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; - if (!is_group_dag_agg_expr(dict_elems[i + 1])) { has_nonagg = 1; break; } + ray_t* expr = dict_elems[i + 1]; + if (is_group_dag_agg_expr(expr)) continue; + ray_t* cd_inner = match_count_distinct(expr); + int is_simple_cd = cd_inner && cd_inner->type == -RAY_SYM && + (cd_inner->attrs & RAY_ATTR_NAME); + if (!is_simple_cd) { has_nonagg_needing_flat = 1; break; } } /* The post-DAG scatter needs a flat single-segment table: it @@ -3239,7 +3262,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { * input. 
Detect parted tables up front — if the source is * parted and there's no WHERE to materialize it, return nyi. */ int table_is_parted = 0; - if (has_nonagg) { + if (has_nonagg_needing_flat) { int64_t ncols = ray_table_ncols(tbl); for (int64_t c = 0; c < ncols; c++) { ray_t* col = ray_table_get_col_idx(tbl, c); @@ -3277,7 +3300,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { * ignored before the filter was wired through the group * pipeline.) */ if (where_expr) { - bool can_fuse = !has_nonagg && !table_is_parted; + bool can_fuse = !has_nonagg_needing_flat && !table_is_parted; if (can_fuse) { root = ray_optimize(g, root); /* exec_node populates g->selection as a side effect @@ -3298,6 +3321,14 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { * g->table still owns tbl via the graph, so this * only drops the exec-node-side retain. */ ray_release(fres); + /* Retain a copy of the selection so it survives the + * later ray_graph_free. count(distinct col_ref) needs + * this in the n_nonaggs scatter to mask filtered-out + * rows in the row→gid build. */ + if (g->selection) { + saved_selection = g->selection; + ray_retain(saved_selection); + } } else { root = ray_optimize(g, root); ray_t* fres = ray_execute(g, root); @@ -4546,6 +4577,44 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { } #undef KEY_READ + /* When path A was taken (no materialisation), the probe + * above looked up gids for every row in the original + * (unfiltered) table — including rows that the WHERE + * clause filtered out. Mask those rows to -1 here so + * downstream count_distinct (and grp_cnt) only count + * the surviving rows. Walks the morsel-segmented + * rowsel directly to avoid building a full bitmap. */ + if (saved_selection) { + ray_rowsel_t* sm = ray_rowsel_meta(saved_selection); + const uint8_t* flg = ray_rowsel_flags(saved_selection); + const uint32_t* offs = ray_rowsel_offsets(saved_selection); + const uint16_t* lidx = ray_rowsel_idx(saved_selection); + for (uint32_t seg = 0; seg < sm->n_segs; seg++) { + int64_t s_lo = (int64_t)seg * RAY_MORSEL_ELEMS; + int64_t s_hi = s_lo + RAY_MORSEL_ELEMS; + if (s_hi > nrows) s_hi = nrows; + uint8_t f = flg[seg]; + if (f == RAY_SEL_NONE) { + for (int64_t r = s_lo; r < s_hi; r++) row_gid[r] = -1; + } else if (f == RAY_SEL_ALL) { + /* every row in this segment passed — leave gid */ + } else { /* RAY_SEL_MIX */ + uint8_t in_seg[RAY_MORSEL_ELEMS / 8] = {0}; + uint32_t off = offs[seg]; + uint32_t cnt = offs[seg + 1] - off; + for (uint32_t i = 0; i < cnt; i++) { + uint16_t loc = lidx[off + i]; + in_seg[loc >> 3] |= (uint8_t)(1u << (loc & 7)); + } + for (int64_t r = s_lo; r < s_hi; r++) { + uint16_t loc = (uint16_t)(r - s_lo); + if (!(in_seg[loc >> 3] & (1u << (loc & 7)))) + row_gid[r] = -1; + } + } + } + } + memset(grp_cnt, 0, (size_t)n_groups * sizeof(int64_t)); for (int64_t r = 0; r < nrows; r++) if (row_gid[r] >= 0) grp_cnt[row_gid[r]]++; @@ -4793,6 +4862,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); if (by_sym_vec_owned) ray_release(by_sym_vec_owned); + if (saved_selection) ray_release(saved_selection); return result; } From 76679ce2fb174773e7cc46296f1039ed50801aa9 Mon Sep 17 00:00:00 2001 From: Anton Date: Wed, 6 May 2026 10:49:43 +0200 Subject: [PATCH 10/10] perf(query): skip idx_buf bookkeeping when only global-hash count(distinct) consumes it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The non-agg scatter at ray_select_fn always built 
per-group-slice bookkeeping — grp_cnt + offsets + pos + idx_buf — even when the only non-agg in the query was `(count (distinct col_ref))` with n_groups > 50 000, in which case the kernel takes the global-hash path and never touches idx_buf at all. That dead bookkeeping was eating 15-20 ms on Q14 (n_groups=611 K, n_rows=937 K post-filter): two passes over n_rows to build grp_cnt and idx_buf, plus a 7.5 MB scratch allocation, plus the offset/pos prefix sums. All thrown away unused. Detect the all-simple-count(distinct)+high-cardinality case up front and skip the construction. When any non-agg falls outside that shape (computed expression, low-cardinality slice path, streaming aggr unary, full-table eval), still build the index — the consumer path needs it. Q14 drops 66 ms → 50 ms (-25%), keeping correct top-10 counts that match DuckDB exactly. Q9, Q11, Q13, Q15 unchanged (their cardinalities route through the per-group-slice path which still needs idx_buf). Tests: 2072 / 2073 (1 skipped, 0 failed). --- src/ops/query.c | 76 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/src/ops/query.c b/src/ops/query.c index 68cd294e..dbe0be4e 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -4615,29 +4615,62 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { } } - memset(grp_cnt, 0, (size_t)n_groups * sizeof(int64_t)); - for (int64_t r = 0; r < nrows; r++) - if (row_gid[r] >= 0) grp_cnt[row_gid[r]]++; - - int64_t total = 0; - for (int64_t gi = 0; gi < n_groups; gi++) total += grp_cnt[gi]; - ray_t* idx_hdr = ray_alloc((size_t)total * sizeof(int64_t)); - if (!idx_hdr) { - ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); - ray_free(off_hdr); ray_free(pos_hdr); - ray_release(result); ray_release(tbl); - return ray_error("oom", NULL); + /* Decide whether the per-group-slice bookkeeping + * (grp_cnt / offsets / pos / idx_buf) is needed. It + * powers count_distinct_per_group_buf, the streaming + * aggr-unary path, nonagg_eval_per_group_buf, and the + * full-table-eval+gather path. When ALL non-aggs are + * `count(distinct col_ref)` AND the n_groups gate + * routes them to the global-hash kernel, none of those + * consumers run — and building the slice index is dead + * weight (~15-20 ms on Q14). + * + * The global-hash path is taken when: + * - the non-agg matches `match_count_distinct`, + * - the inner expression is a column ref (SYM atom + * with NAME attr), and + * - n_groups > 50 000 (the per-group-slice cross- + * over from the threshold dispatch above). + * + * If any non-agg falls outside that, we still need the + * index. 
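+ * (Concretely, Q14 from the commit message: n_groups = 611 K
+ * > 50 000 and the only non-agg is a count(distinct col_ref),
+ * so needs_slice_idx stays 0 and the two passes over n_rows
+ * plus the ~7.5 MB idx_buf allocation below are skipped.)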
*/ + int needs_slice_idx = 0; + for (uint8_t ni = 0; ni < n_nonaggs && !needs_slice_idx; ni++) { + ray_t* cd_inner = match_count_distinct(nonagg_exprs[ni]); + int simple_cd_global = (cd_inner && + cd_inner->type == -RAY_SYM && + (cd_inner->attrs & RAY_ATTR_NAME) && + n_groups > 50000); + if (!simple_cd_global) needs_slice_idx = 1; } - int64_t* idx_buf = (int64_t*)ray_data(idx_hdr); - offsets[0] = 0; - for (int64_t gi = 1; gi < n_groups; gi++) - offsets[gi] = offsets[gi - 1] + grp_cnt[gi - 1]; + int64_t* idx_buf = NULL; + ray_t* idx_hdr = NULL; + if (needs_slice_idx) { + memset(grp_cnt, 0, (size_t)n_groups * sizeof(int64_t)); + for (int64_t r = 0; r < nrows; r++) + if (row_gid[r] >= 0) grp_cnt[row_gid[r]]++; - memcpy(pos, offsets, (size_t)n_groups * sizeof(int64_t)); - for (int64_t r = 0; r < nrows; r++) { - int64_t gi = row_gid[r]; - if (gi >= 0) idx_buf[pos[gi]++] = r; + int64_t total = 0; + for (int64_t gi = 0; gi < n_groups; gi++) total += grp_cnt[gi]; + idx_hdr = ray_alloc((size_t)total * sizeof(int64_t)); + if (!idx_hdr) { + ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); + ray_free(off_hdr); ray_free(pos_hdr); + ray_release(result); ray_release(tbl); + return ray_error("oom", NULL); + } + idx_buf = (int64_t*)ray_data(idx_hdr); + + offsets[0] = 0; + for (int64_t gi = 1; gi < n_groups; gi++) + offsets[gi] = offsets[gi - 1] + grp_cnt[gi - 1]; + + memcpy(pos, offsets, (size_t)n_groups * sizeof(int64_t)); + for (int64_t r = 0; r < nrows; r++) { + int64_t gi = row_gid[r]; + if (gi >= 0) idx_buf[pos[gi]++] = r; + } } ray_t* scatter_err = NULL; @@ -4827,7 +4860,8 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { } ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); - ray_free(off_hdr); ray_free(pos_hdr); ray_free(idx_hdr); + ray_free(off_hdr); ray_free(pos_hdr); + if (idx_hdr) ray_free(idx_hdr); if (scatter_err) { if (result) ray_release(result);