From 1f54abc3386f6057acddfc4cf0b1c7e939dc9b0a Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 10:41:22 +0200 Subject: [PATCH 01/10] fix(query): collapse scalar agg without `by:` to ONE row MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `(select {s: (sum a) from: t})` was returning N copies of the same value instead of a single row. The projection-only path lowered aggregates as ordinary column expressions, so OP_SELECT saw a scalar atom and broadcast it to the input row count (exec.c: vec->type<0 -> broadcast_scalar). Route the all-aggregate / no-by case through ray_group(n_keys=0), which already has a 1-row scalar-aggregate fast path. WHERE is pre-executed (same pattern as the by-with-where fuse path) so the lazy g->selection bitmap reaches the reduction. The n_keys==0 parallel scalar path was effectively dead code before this and its FIRST/LAST merge silently relied on worker-id order matching row-index order — broken under work-stealing dispatch. Force serial execution when FIRST/LAST is in play; the DA path stays parallel and tracks per-slot first_row/last_row already. Two existing tests asserted the buggy broadcast row count (groupby_aggregators.rfl:64, group_coverage.rfl:417); updated to the correct 1-row expectation. --- src/ops/group.c | 12 ++- src/ops/query.c | 98 +++++++++++++++++--- test/rfl/integration/groupby_aggregators.rfl | 5 +- test/rfl/ops/group_coverage.rfl | 5 +- test/rfl/table/select.rfl | 11 +++ 5 files changed, 113 insertions(+), 18 deletions(-) diff --git a/src/ops/group.c b/src/ops/group.c index 4665155f..a437bd3c 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -2647,6 +2647,7 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, /* ---- Scalar aggregate fast path (n_keys == 0): flat vector scan ---- */ if (n_keys == 0 && nrows > 0) { uint8_t need_flags = DA_NEED_COUNT; + bool has_first_last = false; for (uint8_t a = 0; a < n_aggs; a++) { uint16_t aop = ext->agg_ops[a]; if (aop == OP_SUM || aop == OP_PROD || aop == OP_AVG || aop == OP_FIRST || aop == OP_LAST) @@ -2655,6 +2656,7 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, { need_flags |= DA_NEED_SUM; need_flags |= DA_NEED_SUMSQ; } else if (aop == OP_MIN) need_flags |= DA_NEED_MIN; else if (aop == OP_MAX) need_flags |= DA_NEED_MAX; + if (aop == OP_FIRST || aop == OP_LAST) has_first_last = true; } void* agg_ptrs[vla_aggs]; @@ -2670,7 +2672,15 @@ ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl, } ray_pool_t* sc_pool = ray_pool_get(); - uint32_t sc_n = (sc_pool && nrows >= RAY_PARALLEL_THRESHOLD) + /* Pool dispatch is work-stealing: chunks may be processed out of + * row-index order across workers, so the "count[0]==1" sentinel + * scalar_accum_row uses for FIRST (and the always-overwrite for + * LAST) only yields the per-worker first/last, not the global + * one. The merge step then picks worker[0]'s FIRST regardless + * of which range it actually covered. Force serial execution + * when FIRST/LAST is in play; the DA path (which does track + * per-slot row bounds) is still preferred when we have keys. */ + uint32_t sc_n = (sc_pool && nrows >= RAY_PARALLEL_THRESHOLD && !has_first_last) ? 
ray_pool_total_workers(sc_pool) : 1; ray_t* sc_hdr; diff --git a/src/ops/query.c b/src/ops/query.c index 34d01bf4..4d437a60 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -3170,27 +3170,97 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); } } else if (n_out > 0) { - /* Projection only (no group by) — select specific columns */ - ray_op_t* col_ops[16]; - uint8_t nc = 0; + /* No `by:` but explicit output expressions. + * + * Two sub-cases: + * (a) All outputs are aggregates → scalar reduction. Route + * through ray_group(n_keys=0) so the result is ONE row, + * not the input row count broadcast. The naive ray_select + * path lowers `(sum c)` to OP_SUM as a column expression; + * OP_SELECT then broadcasts the scalar atom to nrows + * (exec.c: vec->type < 0 → broadcast_scalar), producing + * N copies of the same value. + * (b) At least one non-agg output → keep the existing + * projection (broadcast-as-column), matching q's + * per-row evaluation semantics. + * + * Mixed agg+non-agg without `by:` continues to flow through (b); + * q's semantics there imply LIST/scalar mixing that is out of + * scope for this fix. */ + int has_agg = 0; + int has_nonagg_out = 0; for (int64_t i = 0; i + 1 < dict_n; i += 2) { int64_t kid = dict_elems[i]->i64; - if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id || kid == nearest_id) continue; - if (nc < 16) { - col_ops[nc] = compile_expr_dag(g, dict_elems[i + 1]); - if (!col_ops[nc]) { - /* Nearest-path resources must be freed here too — the - * rerank handle/query buffers are held across the whole - * ray_select_fn body, not just inside the nearest block. */ - if (nearest_handle_owned) ray_release(nearest_handle_owned); - if (nearest_query_owned) ray_sys_free(nearest_query_owned); + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == asc_id || kid == desc_id || kid == nearest_id) continue; + if (is_agg_expr(dict_elems[i + 1])) has_agg = 1; + else has_nonagg_out = 1; + } + + if (has_agg && !has_nonagg_out && !nearest_expr) { + /* Scalar reduction. Pre-execute the WHERE filter (already + * wired as ray_filter at the top) so OP_FILTER on the table + * input populates g->selection, which exec_group then + * honours in its n_keys==0 fast path. */ + if (where_expr) { + root = ray_optimize(g, root); + ray_t* fres = exec_node(g, root); + if (!fres || RAY_IS_ERR(fres)) { + if (g->selection) { + ray_release(g->selection); + g->selection = NULL; + } + ray_graph_free(g); ray_release(tbl); + return fres ? 
fres : ray_error("domain", NULL); + } + ray_release(fres); + } + + uint16_t s_agg_ops[16]; + ray_op_t* s_agg_ins[16]; + uint8_t s_n_aggs = 0; + for (int64_t i = 0; i + 1 < dict_n && s_n_aggs < 16; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == asc_id || kid == desc_id || kid == nearest_id) continue; + ray_t* val_expr = dict_elems[i + 1]; + ray_t** agg_elems = (ray_t**)ray_data(val_expr); + s_agg_ops[s_n_aggs] = resolve_agg_opcode(agg_elems[0]->i64); + s_agg_ins[s_n_aggs] = compile_expr_dag(g, agg_elems[1]); + if (!s_agg_ins[s_n_aggs]) { + if (g->selection) { + ray_release(g->selection); + g->selection = NULL; + } ray_graph_free(g); ray_release(tbl); return ray_error("domain", NULL); } - nc++; + s_n_aggs++; + } + root = ray_group(g, NULL, 0, s_agg_ops, s_agg_ins, s_n_aggs); + } else { + /* Projection only (no group by) — select specific columns */ + ray_op_t* col_ops[16]; + uint8_t nc = 0; + for (int64_t i = 0; i + 1 < dict_n; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id || kid == nearest_id) continue; + if (nc < 16) { + col_ops[nc] = compile_expr_dag(g, dict_elems[i + 1]); + if (!col_ops[nc]) { + /* Nearest-path resources must be freed here too — the + * rerank handle/query buffers are held across the whole + * ray_select_fn body, not just inside the nearest block. */ + if (nearest_handle_owned) ray_release(nearest_handle_owned); + if (nearest_query_owned) ray_sys_free(nearest_query_owned); + ray_graph_free(g); ray_release(tbl); + return ray_error("domain", NULL); + } + nc++; + } } + root = ray_select(g, root, col_ops, nc); } - root = ray_select(g, root, col_ops, nc); } /* Sort: collect asc/desc columns in dict iteration order. diff --git a/test/rfl/integration/groupby_aggregators.rfl b/test/rfl/integration/groupby_aggregators.rfl index 6d1cc304..9a1832a6 100644 --- a/test/rfl/integration/groupby_aggregators.rfl +++ b/test/rfl/integration/groupby_aggregators.rfl @@ -59,10 +59,11 @@ (count (select {s: (sum v) from: T by: g where: (< v 500)})) -- 50 ;; ────────────── group-by no `by` clause: aggregate over whole table ────────────── -;; pure aggregations without grouping +;; pure aggregations without grouping → ONE row, not nrows broadcast. (set Whole (select {tot: (sum v) ct: (count v) avg_v: (avg v) from: T})) -(count Whole) -- 1000 +(count Whole) -- 1 (at (at Whole 'tot) 0) -- 499500 +(at (at Whole 'ct) 0) -- 1000 ;; ────────────── group-by SYM key ────────────── (set Tsym (table [k v] (list (take ['A 'B 'C 'D 'E] N) (til N)))) diff --git a/test/rfl/ops/group_coverage.rfl b/test/rfl/ops/group_coverage.rfl index 46131f0a..f823d99c 100644 --- a/test/rfl/ops/group_coverage.rfl +++ b/test/rfl/ops/group_coverage.rfl @@ -413,9 +413,12 @@ ;; ────────────── 43. Scalar agg with all-stat-aggs combination ────────────── ;; Multi-agg pack of var, var_pop, stddev, stddev_pop, sum, count, avg ;; in scalar mode (no by) — exercises full need_flags=SUM+SUMSQ+COUNT. +;; A scalar reduction collapses to ONE row (count v == nrows is the +;; row-count *value*, not the row count of the result table). 
(set Tall (table [v] (list (til 200)))) -(count (select {s: (sum v) c: (count v) av: (avg v) v: (var v) vp: (var_pop v) sd: (stddev v) sp: (stddev_pop v) from: Tall})) -- 200 +(count (select {s: (sum v) c: (count v) av: (avg v) v: (var v) vp: (var_pop v) sd: (stddev v) sp: (stddev_pop v) from: Tall})) -- 1 (at (at (select {s: (sum v) c: (count v) av: (avg v) v: (var v) vp: (var_pop v) sd: (stddev v) sp: (stddev_pop v) from: Tall}) 's) 0) -- 19900 +(at (at (select {s: (sum v) c: (count v) av: (avg v) v: (var v) vp: (var_pop v) sd: (stddev v) sp: (stddev_pop v) from: Tall}) 'c) 0) -- 200 (at (at (select {c: (count v) v: (var v) sd: (stddev v) from: Tall}) 'c) 0) -- 200 ;; ────────────── 44. Group var/stddev with mixed enough/insufficient ────────────── diff --git a/test/rfl/table/select.rfl b/test/rfl/table/select.rfl index 2f849506..e47b48a4 100644 --- a/test/rfl/table/select.rfl +++ b/test/rfl/table/select.rfl @@ -38,6 +38,17 @@ (at (at (select {m: (min size) from: trades where: (> price 200.0)}) 'm) 0) -- 40 (at (at (select {a: (avg size) from: trades where: (== sym 'AAPL)}) 'a) 0) -- 115.0 +;; ── scalar aggregation (no `by:`) collapses to ONE row, NOT N broadcast +;; copies of the same value. Regression test for the projection path +;; that used to compile `(sum c)` as a column expression and broadcast +;; the resulting scalar across the input row count. +(count (select {s: (sum size) from: trades})) -- 1 +(count (select {s: (sum size) from: trades where: (== sym 'AAPL)})) -- 1 +(count (select {s: (sum size) c: (count size) from: trades})) -- 1 +(count (select {a: (avg price) m: (max size) from: trades where: (> price 200)})) -- 1 +(at (at (select {s: (sum size) from: trades}) 's) 0) -- 1240 +(at (at (select {c: (count size) from: trades}) 'c) 0) -- 10 + ;; Larger fixture (>= RAY_PARALLEL_THRESHOLD) to exercise the parallel ;; reduction worker path of exec_reduction. (set big-T (table [v] (list (til 100000)))) From 1bd49a27a22e336a6f1db61c22a3d519c5acce5a Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 19:23:09 +0200 Subject: [PATCH 02/10] =?UTF-8?q?perf(ops):=20ClickBench=20bottleneck=20fi?= =?UTF-8?q?xes=20=E2=80=94=20top-K,=20grouped=20count(distinct),=20LIKE=20?= =?UTF-8?q?on=20dict=20SYM?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lands the four findings + bonus from RAYFORCE_BOTTLENECKS.md, taking ClickBench hot-run total from ~1.6 M ms to ~14 K ms across 40 measurable queries (≈99% reduction). * Fused `select { … asc/desc: c take: K }` lowers to bounded-heap top-K when k << nrows and keys resolve to plain column refs. Single-key uses the radix-encoded fast path; multi-key falls back to the comparator-based heap. Q26 SearchPhrase: 5 186 → 72 ms. * Grouped `count(distinct)` no longer routed through per-group eval-fallback — the fused OP_COUNT_DISTINCT runs per group-slice. Scaling moves from 94×/decade to ≈4.6×/decade between 100 K and 1 M rows (essentially linear). * LIKE on dict-encoded SYM scans the dictionary once and lifts the result through the codes vector instead of re-evaluating per row. Low-card SYM (54-unique BrowserCountry): 52 → 3.65 ms (14×). High-card SYM (1.73 M-unique URL): 498 → 220 ms (2.3×). * Unifies the previously-divergent glob matchers (eval used `*?[abc]`, DAG used SQL `%_`; one variant blew up exponentially on `a*a*…a*b` against an a-only string) behind a single iterative two-pointer implementation in src/ops/glob.{c,h}. Both call sites delegate. 
* Bonus: `(at table (iasc table.col))` no longer crashes on tables — re-indexes each column to return a TABLE. Tests: query_coverage / read_csv / reserved_namespace updated for the new dispatch paths; cross_type_workout / collection/at extended. --- include/rayforce.h | 8 + src/lang/eval.c | 13 +- src/ops/collection.c | 33 + src/ops/glob.c | 96 +++ src/ops/glob.h | 43 + src/ops/group.c | 518 +++++++++++- src/ops/idiom.c | 38 +- src/ops/idiom.h | 7 +- src/ops/internal.h | 8 + src/ops/ops.h | 14 + src/ops/opt.c | 5 +- src/ops/query.c | 856 +++++++++++++++++++- src/ops/sort.c | 341 ++++++++ src/ops/string.c | 204 ++++- src/ops/strop.c | 125 ++- src/table/sym.c | 26 + test/rfl/collection/at.rfl | 4 + test/rfl/integration/cross_type_workout.rfl | 1 + test/rfl/ops/query_coverage.rfl | 20 + test/rfl/system/read_csv.rfl | 1 + test/rfl/system/reserved_namespace.rfl | 4 +- test/test_csv.c | 5 +- 22 files changed, 2272 insertions(+), 98 deletions(-) diff --git a/include/rayforce.h b/include/rayforce.h index 3152dbe1..5ee643e3 100644 --- a/include/rayforce.h +++ b/include/rayforce.h @@ -359,6 +359,14 @@ int64_t ray_sym_intern(const char* str, size_t len); int64_t ray_sym_find(const char* str, size_t len); ray_t* ray_sym_str(int64_t id); uint32_t ray_sym_count(void); + +/* Borrow a snapshot of the sym → string array. Returns a pointer to + * the underlying ray_t** strings table along with its length; valid + * only while no concurrent ray_sym_intern occurs (i.e. read-only + * execution phases). Lock is taken once for the snapshot and dropped + * before return — caller may iterate freely. Both *out_strings and + * *out_count must be non-NULL. */ +void ray_sym_strings_borrow(ray_t*** out_strings, uint32_t* out_count); bool ray_sym_ensure_cap(uint32_t needed); ray_err_t ray_sym_save(const char* path); ray_err_t ray_sym_load(const char* path); diff --git a/src/lang/eval.c b/src/lang/eval.c index 9046dd66..f5221a62 100644 --- a/src/lang/eval.c +++ b/src/lang/eval.c @@ -875,10 +875,6 @@ ray_t* gather_by_idx(ray_t* vec, int64_t* idx, int64_t n) { case 1: for (int64_t i = 0; i < n; i++) dst[i] = src[idx[i]]; break; default: for (int64_t i = 0; i < n; i++) memcpy(dst + i*esz, src + idx[i]*esz, esz); break; } - if (vec->sym_dict) { - ray_retain(vec->sym_dict); - result->sym_dict = vec->sym_dict; - } if (has_nulls) { for (int64_t i = 0; i < n; i++) if (ray_vec_is_null(vec, idx[i])) @@ -2280,7 +2276,12 @@ static void ray_register_builtins(void) { register_vary("update", RAY_FN_SPECIAL_FORM | RAY_FN_RESTRICTED, ray_update_fn); register_vary("insert", RAY_FN_SPECIAL_FORM | RAY_FN_RESTRICTED, ray_insert_fn); register_vary("upsert", RAY_FN_SPECIAL_FORM | RAY_FN_RESTRICTED, ray_upsert_fn); - register_binary("xbar", RAY_FN_ATOMIC, ray_xbar_fn); + /* xbar is registered NON-atomic so the call path lands in + * ray_xbar_fn(VEC, scalar) directly. ray_xbar_fn handles the + * vector fast path itself (tight per-element loop, no per-atom + * allocation) and recurses through atomic_map_binary for the rare + * (collection, collection) zip case. 
*/
+    register_binary("xbar", RAY_FN_NONE, ray_xbar_fn);
 
     /* Join operations */
     register_vary("left-join", RAY_FN_NONE, ray_left_join_fn);
@@ -2294,6 +2295,8 @@ static void ray_register_builtins(void) {
     register_vary("println", RAY_FN_NONE, ray_println_fn);
     register_vary("show", RAY_FN_NONE, ray_show_fn);
     register_vary("format", RAY_FN_NONE, ray_format_fn);
+    register_vary("read-csv", RAY_FN_RESTRICTED, ray_read_csv_fn);
+    register_vary("write-csv", RAY_FN_RESTRICTED, ray_write_csv_fn);
     register_vary(".csv.read", RAY_FN_RESTRICTED, ray_read_csv_fn);
     register_vary(".csv.write", RAY_FN_RESTRICTED, ray_write_csv_fn);
     register_binary("as", RAY_FN_NONE, ray_cast_fn);
diff --git a/src/ops/collection.c b/src/ops/collection.c
index 64ce4632..1a5079ad 100644
--- a/src/ops/collection.c
+++ b/src/ops/collection.c
@@ -1554,6 +1554,39 @@ ray_t* ray_at_fn(ray_t* vec, ray_t* idx) {
         return ray_dict_new(keys, vals);
     }
 
+    /* Table row selection by index vector: apply the row ids to each
+     * column and return a table. Keep this before the generic collection
+     * fallback; otherwise a table indexed by millions of row ids becomes
+     * a LIST of row dictionaries. */
+    if (vec->type == RAY_TABLE && idx->type == RAY_I64) {
+        int64_t nrows = ray_table_nrows(vec);
+        int64_t nidx = ray_len(idx);
+        int64_t* ids = (int64_t*)ray_data(idx);
+        for (int64_t i = 0; i < nidx; i++) {
+            if (ids[i] < 0 || ids[i] >= nrows)
+                return ray_error("domain", NULL);
+        }
+
+        int64_t ncols = ray_table_ncols(vec);
+        ray_t* result = ray_table_new(ncols);
+        if (!result || RAY_IS_ERR(result)) return result ? result : ray_error("oom", NULL);
+        for (int64_t c = 0; c < ncols; c++) {
+            ray_t* col = ray_table_get_col_idx(vec, c);
+            int64_t name = ray_table_col_name(vec, c);
+            if (!col) continue;
+            ray_t* gathered = gather_by_idx(col, ids, nidx);
+            if (!gathered || RAY_IS_ERR(gathered)) {
+                ray_release(result);
+                return gathered ? gathered : ray_error("oom", NULL);
+            }
+            result = ray_table_add_col(result, name, gathered);
+            ray_release(gathered);
+            if (!result || RAY_IS_ERR(result))
+                return result ? result : ray_error("oom", NULL);
+        }
+        return result;
+    }
+
     /* Dict key access: (at dict key) → value or 0Nl if missing */
     if (vec->type == RAY_DICT) {
         ray_t* v = ray_dict_get(vec, idx);
diff --git a/src/ops/glob.c b/src/ops/glob.c
index dea37d1e..bef85daf 100644
--- a/src/ops/glob.c
+++ b/src/ops/glob.c
@@ -13,6 +13,9 @@
 #include "ops/glob.h"
 
+#define _GNU_SOURCE
+#include <string.h>
+
 /* Lowercase an ASCII byte; non-ASCII passes through unchanged. */
 static inline char to_lower(char c) {
     return (c >= 'A' && c <= 'Z') ? (char)(c + 32) : c;
 }
@@ -100,3 +103,96 @@ bool ray_glob_match(const char* s, size_t sn, const char* p, size_t pn) {
 bool ray_glob_match_ci(const char* s, size_t sn, const char* p, size_t pn) {
     return glob_impl(s, sn, p, pn, true);
 }
+
+ray_glob_compiled_t ray_glob_compile(const char* p, size_t pn) {
+    ray_glob_compiled_t c = { RAY_GLOB_SHAPE_NONE, NULL, 0 };
+
+    if (pn == 0) {
+        c.shape = RAY_GLOB_SHAPE_EXACT;
+        c.lit = p; c.lit_len = 0;
+        return c;
+    }
+
+    /* Strip a single leading and trailing '*'; classify by the residual
+     * pattern. Any other glob metachar (`?`, `[`, or interior `*`)
+     * forces the general matcher. */
+    size_t lo = 0, hi = pn;
+    bool leading_star = (p[0] == '*');
+    bool trailing_star = (pn > 0 && p[pn - 1] == '*' &&
+                          /* don't double-count single '*' as both */
+                          (pn > 1 || !leading_star));
+    if (leading_star) lo = 1;
+    if (trailing_star) hi = pn - 1;
+
+    /* Ensure the residual has no glob metacharacters. */
+    for (size_t i = lo; i < hi; i++) {
+        char ch = p[i];
+        if (ch == '*' || ch == '?' || ch == '[') {
+            c.shape = RAY_GLOB_SHAPE_NONE;
+            return c;
+        }
+    }
+
+    c.lit = p + lo;
+    c.lit_len = hi - lo;
+
+    if (leading_star && trailing_star) {
+        c.shape = (c.lit_len == 0) ? RAY_GLOB_SHAPE_ANY
+                                   : RAY_GLOB_SHAPE_CONTAINS;
+    } else if (leading_star) {
+        c.shape = RAY_GLOB_SHAPE_SUFFIX;
+    } else if (trailing_star) {
+        c.shape = RAY_GLOB_SHAPE_PREFIX;
+    } else {
+        c.shape = RAY_GLOB_SHAPE_EXACT;
+    }
+    return c;
+}
+
+bool ray_glob_match_compiled(const ray_glob_compiled_t* c,
+                             const char* s, size_t sn) {
+    switch (c->shape) {
+    case RAY_GLOB_SHAPE_ANY:
+        return true;
+    case RAY_GLOB_SHAPE_EXACT:
+        return sn == c->lit_len &&
+               (c->lit_len == 0 || memcmp(s, c->lit, c->lit_len) == 0);
+    case RAY_GLOB_SHAPE_PREFIX:
+        return sn >= c->lit_len &&
+               (c->lit_len == 0 || memcmp(s, c->lit, c->lit_len) == 0);
+    case RAY_GLOB_SHAPE_SUFFIX:
+        return sn >= c->lit_len &&
+               (c->lit_len == 0 ||
+                memcmp(s + sn - c->lit_len, c->lit, c->lit_len) == 0);
+    case RAY_GLOB_SHAPE_CONTAINS:
+        if (c->lit_len == 0) return true;
+        if (sn < c->lit_len) return false;
+        /* glibc's memmem is SIMD-accelerated; use it where available.
+         * Falls back to a portable memchr/memcmp scan when not. */
+#if defined(__GLIBC__) || defined(__APPLE__) || defined(__FreeBSD__)
+        return memmem(s, sn, c->lit, c->lit_len) != NULL;
+#else
+        {
+            /* Portable fallback: short-needle byte scan with memchr. */
+            const char first = c->lit[0];
+            const char* haystack = s;
+            size_t remaining = sn;
+            while (remaining >= c->lit_len) {
+                const char* hit = (const char*)memchr(haystack, first,
+                                                      remaining - c->lit_len + 1);
+                if (!hit) return false;
+                if (memcmp(hit, c->lit, c->lit_len) == 0) return true;
+                size_t adv = (size_t)(hit - haystack) + 1;
+                haystack = hit + 1;
+                remaining -= adv;
+            }
+            return false;
+        }
+#endif
+    case RAY_GLOB_SHAPE_NONE:
+    default:
+        /* Caller contract violation — fall through to false rather than
+         * silently matching everything. */
+        return false;
+    }
+}
diff --git a/src/ops/glob.h b/src/ops/glob.h
index 71bc3a22..8b8552eb 100644
--- a/src/ops/glob.h
+++ b/src/ops/glob.h
@@ -40,4 +40,47 @@ bool ray_glob_match(const char* s, size_t sn, const char* p, size_t pn);
 bool ray_glob_match_ci(const char* s, size_t sn, const char* p, size_t pn);
 
+/* ---- Pre-compiled pattern fast path -------------------------------------
+ * Many LIKE workloads have very simple patterns (e.g. `*google*`). When
+ * the pattern has no metacharacters except (optionally) a leading `*`
+ * and/or a trailing `*`, the match collapses to a literal substring /
+ * prefix / suffix / equality test that we can drive with memcmp /
+ * memmem — both libc-vectorised on modern glibc. Detect the shape once
+ * up front, then run the entire dictionary (or row vector) through a
+ * single tight loop.
+ *
+ * Shapes:
+ *   RAY_GLOB_SHAPE_NONE     — pattern needs the full glob matcher
+ *   RAY_GLOB_SHAPE_EXACT    — no `*`/`?`/`[` — literal equality
+ *   RAY_GLOB_SHAPE_PREFIX   — `<lit>*` — prefix memcmp
+ *   RAY_GLOB_SHAPE_SUFFIX   — `*<lit>` — tail equality
+ *   RAY_GLOB_SHAPE_CONTAINS — `*<lit>*` — memmem
+ *   RAY_GLOB_SHAPE_ANY      — pattern is "*" — always true
+ * The compiled struct caches a pointer/length into the original
+ * pattern buffer, so the caller must keep the pattern alive while the
+ * compiled view is in use.
*/ +typedef enum { + RAY_GLOB_SHAPE_NONE = 0, + RAY_GLOB_SHAPE_EXACT, + RAY_GLOB_SHAPE_PREFIX, + RAY_GLOB_SHAPE_SUFFIX, + RAY_GLOB_SHAPE_CONTAINS, + RAY_GLOB_SHAPE_ANY, +} ray_glob_shape_t; + +typedef struct { + ray_glob_shape_t shape; + const char* lit; /* literal substring inside the pattern */ + size_t lit_len; +} ray_glob_compiled_t; + +/* Classify a pattern. Returns the simplest matching shape; falls back + * to RAY_GLOB_SHAPE_NONE when the pattern needs the general matcher. */ +ray_glob_compiled_t ray_glob_compile(const char* p, size_t pn); + +/* Match a single string against a compiled simple-shape pattern. + * Caller must guarantee shape != RAY_GLOB_SHAPE_NONE. */ +bool ray_glob_match_compiled(const ray_glob_compiled_t* c, + const char* s, size_t sn); + #endif /* RAY_OPS_GLOB_H */ diff --git a/src/ops/group.c b/src/ops/group.c index a437bd3c..705ed991 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -218,7 +218,275 @@ static void reduce_merge(reduce_acc_t* dst, const reduce_acc_t* src, int8_t in_t * and the last worker's last is the global last. */ } -/* Hash-based count distinct for integer/float columns */ +/* Hash mixing constants used by the count-distinct kernel and helpers. */ +#define CD_HASH_K1 0x9E3779B97F4A7C15ULL +#define CD_HASH_K2 0xBF58476D1CE4E5B9ULL + +/* Per-partition hash-distinct. Each worker is given a contiguous slice + * of partition payloads (already grouped by hash high bits) and counts + * distinct values within. Since distinct values are guaranteed to fall + * into the same partition, the global distinct count is the sum of + * per-partition counts. */ +typedef struct { + int64_t* values; /* concatenated partition payloads */ + int64_t* part_off; /* P+1 prefix sums, partition boundaries */ + int64_t* part_count; /* OUT: per-partition distinct count */ +} cd_part_ctx_t; + +static void cd_part_dedup_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + (void)worker_id; + cd_part_ctx_t* x = (cd_part_ctx_t*)ctx; + for (int64_t p = start; p < end; p++) { + int64_t off = x->part_off[p]; + int64_t cnt = x->part_off[p + 1] - off; + if (cnt == 0) { x->part_count[p] = 0; continue; } + + uint64_t cap = (uint64_t)cnt * 2; + if (cap < 32) cap = 32; + uint64_t c = 1; + while (c && c < cap) c <<= 1; + if (!c) { x->part_count[p] = -1; continue; } + cap = c; + uint64_t mask = cap - 1; + + ray_t* set_hdr = NULL; + ray_t* used_hdr = NULL; + int64_t* set = (int64_t*)scratch_alloc (&set_hdr, + (size_t)cap * sizeof(int64_t)); + uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, + (size_t)cap * sizeof(uint8_t)); + if (!set || !used) { + if (set_hdr) scratch_free(set_hdr); + if (used_hdr) scratch_free(used_hdr); + x->part_count[p] = -1; + continue; + } + + int64_t* base = x->values + off; + int64_t distinct = 0; + for (int64_t i = 0; i < cnt; i++) { + int64_t v = base[i]; + uint64_t h = (uint64_t)v * CD_HASH_K1; + h ^= h >> 33; + uint64_t slot = h & mask; + while (used[slot]) { + if (set[slot] == v) goto cd_next; + slot = (slot + 1) & mask; + } + set[slot] = v; + used[slot] = 1; + distinct++; + cd_next:; + } + scratch_free(set_hdr); + scratch_free(used_hdr); + x->part_count[p] = distinct; + } +} + +/* Width-specialised value extraction for the partition pass. Reading + * row-by-row through read_col_i64 was the dispatch overhead in the + * sequential path; specialising on the column width lets the autovec + * pass tighten the loop. 
*/ +typedef struct { + const void* base; + int64_t* counts; /* P per-partition row counts (per worker) */ + uint32_t p_bits; + uint64_t p_mask; + uint8_t stride_log2; /* log2(elem size) for plain int paths */ + uint8_t is_f64; + int8_t type; + uint8_t attrs; +} cd_count_ctx_t; + +/* Count rows per partition (per worker, into worker-local slot). Two + * passes: this one fills the histograms; the next does the scatter. */ +static void cd_hist_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + cd_count_ctx_t* x = (cd_count_ctx_t*)ctx; + int64_t* hist = x->counts + (size_t)worker_id * (x->p_mask + 1); + const void* base = x->base; + int8_t in_type = x->type; + uint8_t in_attrs = x->attrs; + uint64_t p_mask = x->p_mask; + if (x->is_f64) { + const double* d = (const double*)base; + for (int64_t i = start; i < end; i++) { + double fv = d[i]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; + int64_t val; + memcpy(&val, &fv, sizeof(int64_t)); + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + hist[p]++; + } + } else if (in_type == RAY_I64 || in_type == RAY_TIMESTAMP) { + const int64_t* d = (const int64_t*)base; + for (int64_t i = start; i < end; i++) { + int64_t val = d[i]; + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + hist[p]++; + } + } else if (in_type == RAY_I32 || in_type == RAY_DATE || in_type == RAY_TIME) { + const int32_t* d = (const int32_t*)base; + for (int64_t i = start; i < end; i++) { + int64_t val = d[i]; + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + hist[p]++; + } + } else if (in_type == RAY_I16) { + const int16_t* d = (const int16_t*)base; + for (int64_t i = start; i < end; i++) { + int64_t val = d[i]; + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + hist[p]++; + } + } else if (in_type == RAY_BOOL || in_type == RAY_U8) { + const uint8_t* d = (const uint8_t*)base; + for (int64_t i = start; i < end; i++) { + int64_t val = d[i]; + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + hist[p]++; + } + } else if (in_type == RAY_SYM) { + for (int64_t i = start; i < end; i++) { + int64_t val = read_col_i64(base, i, in_type, in_attrs); + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + hist[p]++; + } + } +} + +typedef struct { + const void* base; + int64_t* out_buf; /* concatenated payloads (output) */ + int64_t* cursor; /* per-worker × P; advances per scatter */ + uint32_t p_bits; + uint64_t p_mask; + uint8_t is_f64; + int8_t type; + uint8_t attrs; +} cd_scatter_ctx_t; + +static void cd_scatter_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + cd_scatter_ctx_t* x = (cd_scatter_ctx_t*)ctx; + int64_t* cur = x->cursor + (size_t)worker_id * (x->p_mask + 1); + int64_t* out = x->out_buf; + const void* base = x->base; + int8_t in_type = x->type; + uint8_t in_attrs = x->attrs; + uint64_t p_mask = x->p_mask; + #define SCATTER_BODY(LOAD) \ + for (int64_t i = start; i < end; i++) { \ + int64_t val = (LOAD); \ + uint64_t h = (uint64_t)val * CD_HASH_K1; \ + h ^= h >> 33; \ + uint64_t p = (h ^ (h >> 33)) & p_mask; \ + out[cur[p]++] = val; \ + } + if (x->is_f64) { + const double* d = (const double*)base; + for (int64_t i = start; i < end; i++) { + double fv = d[i]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; + int64_t val; + 
memcpy(&val, &fv, sizeof(int64_t)); + uint64_t h = (uint64_t)val * CD_HASH_K1; + h ^= h >> 33; + uint64_t p = (h ^ (h >> 33)) & p_mask; + out[cur[p]++] = val; + } + } else if (in_type == RAY_I64 || in_type == RAY_TIMESTAMP) { + const int64_t* d = (const int64_t*)base; + SCATTER_BODY(d[i]) + } else if (in_type == RAY_I32 || in_type == RAY_DATE || in_type == RAY_TIME) { + const int32_t* d = (const int32_t*)base; + SCATTER_BODY(d[i]) + } else if (in_type == RAY_I16) { + const int16_t* d = (const int16_t*)base; + SCATTER_BODY(d[i]) + } else if (in_type == RAY_BOOL || in_type == RAY_U8) { + const uint8_t* d = (const uint8_t*)base; + SCATTER_BODY(d[i]) + } else { /* RAY_SYM */ + SCATTER_BODY(read_col_i64(base, i, in_type, in_attrs)) + } + #undef SCATTER_BODY +} + +/* Sequential fallback for small inputs / when the pool isn't available. + * Same algorithm as the original: open-addressing hash set, single pass. */ +static int64_t cd_seq_count(int8_t in_type, uint8_t in_attrs, + const void* base, int64_t len) { + uint64_t cap = (uint64_t)(len < 16 ? 32 : len) * 2; + uint64_t c = 1; + while (c && c < cap) c <<= 1; + if (!c) return -1; + cap = c; + uint64_t mask = cap - 1; + + ray_t* set_hdr = NULL; + ray_t* used_hdr = NULL; + int64_t* set = (int64_t*)scratch_alloc (&set_hdr, (size_t)cap * sizeof(int64_t)); + uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)cap * sizeof(uint8_t)); + if (!set || !used) { + if (set_hdr) scratch_free(set_hdr); + if (used_hdr) scratch_free(used_hdr); + return -1; + } + int64_t count = 0; + for (int64_t i = 0; i < len; i++) { + int64_t val; + if (in_type == RAY_F64) { + double fv = ((const double*)base)[i]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; + memcpy(&val, &fv, sizeof(int64_t)); + } else { + val = read_col_i64(base, i, in_type, in_attrs); + } + uint64_t h = (uint64_t)val * CD_HASH_K1; + uint64_t slot = h & mask; + while (used[slot]) { + if (set[slot] == val) goto cd_seq_next; + slot = (slot + 1) & mask; + } + set[slot] = val; + used[slot] = 1; + count++; + cd_seq_next:; + } + scratch_free(set_hdr); + scratch_free(used_hdr); + return count; +} + +/* Hash-based count distinct for integer/float columns. + * + * Strategy: + * - small inputs → sequential single-pass hash set (low overhead). + * - large inputs → radix-partition by hash high bits across the + * worker pool, then dedup each partition in + * parallel. Each partition fits L2, eliminating + * the cache-miss-per-probe pattern of one giant + * global set. Distinct values land in the same + * partition, so the global count is the sum of + * per-partition counts. */ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) { (void)g; (void)op; if (!input || RAY_IS_ERR(input)) return input; @@ -228,70 +496,250 @@ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) { if (len == 0) return ray_i64(0); - /* Only numeric/ordinal/sym column types are supported */ switch (in_type) { case RAY_BOOL: case RAY_U8: case RAY_I16: case RAY_I32: case RAY_I64: case RAY_F64: case RAY_DATE: case RAY_TIME: case RAY_TIMESTAMP: case RAY_SYM: break; + case RAY_STR: + case RAY_GUID: + case RAY_LIST: { + /* The hash kernel only handles fixed-width scalar types. For + * STR / GUID / LIST the rewrite-aware path is to delegate to + * distinct_vec_eager (which uses the row-aware hashset_t) and + * count its result. Slower than the radix kernel but correct. */ + ray_t* dist = distinct_vec_eager(input); + if (!dist || RAY_IS_ERR(dist)) return dist ? 
dist : ray_error("oom", NULL);
+        int64_t cnt = ray_len(dist);
+        ray_release(dist);
+        return ray_i64(cnt);
+    }
     default:
         return ray_error("type", NULL);
     }
 
-    /* Use a simple open-addressing hash set for int64 values */
-    uint64_t cap = (uint64_t)(len < 16 ? 32 : len) * 2;
-    /* Round up to power of 2 */
+    void* base = ray_data(input);
+    ray_pool_t* pool = ray_pool_get();
+
+    /* Small-input fast path: per-row dispatch overhead would dwarf the
+     * actual work. */
+    if (!pool || len < (1 << 16)) {
+        int64_t cnt = cd_seq_count(in_type, input->attrs, base, len);
+        if (cnt < 0) return ray_error("oom", NULL);
+        return ray_i64(cnt);
+    }
+
+    uint32_t nw = ray_pool_total_workers(pool);
+
+    /* Partition count: a small power of two ≥ nw, capped so per-partition
+     * sets stay in L2. 16 suffices for pools of ≤8 workers; 32 covers
+     * pools up to 32 workers; 64 beyond that. */
+    uint32_t p_bits;
+    if (nw <= 8)       p_bits = 4;  /* 16 partitions */
+    else if (nw <= 32) p_bits = 5;  /* 32 partitions */
+    else               p_bits = 6;  /* 64 partitions */
+    uint64_t P = (uint64_t)1 << p_bits;
+    uint64_t p_mask = P - 1;
+
+    /* Pass 1: per-worker histogram (P × nw int64 cells). */
+    ray_t* hist_hdr = NULL;
+    int64_t* hist = (int64_t*)scratch_calloc(&hist_hdr,
+                                             (size_t)P * nw * sizeof(int64_t));
+    if (!hist) {
+        return ray_error("oom", NULL);
+    }
+    cd_count_ctx_t hctx = {
+        .base = base, .counts = hist,
+        .p_bits = p_bits, .p_mask = p_mask,
+        .stride_log2 = 0, .is_f64 = (in_type == RAY_F64),
+        .type = in_type, .attrs = input->attrs,
+    };
+    ray_pool_dispatch(pool, cd_hist_fn, &hctx, len);
+
+    /* Convert per-worker histograms into a global prefix sum. Order:
+     * partition_0_worker_0, partition_0_worker_1, …, partition_1_worker_0, …
+     * so each (worker, partition) range is a contiguous slice of out_buf. */
+    ray_t* off_hdr = NULL;
+    int64_t* part_off = (int64_t*)scratch_alloc(&off_hdr,
+                                                (size_t)(P + 1) * sizeof(int64_t));
+    if (!part_off) { scratch_free(hist_hdr); return ray_error("oom", NULL); }
+    ray_t* cur_hdr = NULL;
+    int64_t* cursor = (int64_t*)scratch_alloc(&cur_hdr,
+                                              (size_t)P * nw * sizeof(int64_t));
+    if (!cursor) {
+        scratch_free(off_hdr); scratch_free(hist_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t total = 0;
+    for (uint64_t p = 0; p < P; p++) {
+        part_off[p] = total;
+        for (uint32_t w = 0; w < nw; w++) {
+            cursor[(size_t)w * P + p] = total;
+            total += hist[(size_t)w * P + p];
+        }
+    }
+    part_off[P] = total;
+
+    /* Sanity: total must equal len. */
+    if (total != len) {
+        scratch_free(cur_hdr); scratch_free(off_hdr); scratch_free(hist_hdr);
+        return ray_error("nyi", "count_distinct: histogram mismatch");
+    }
+
+    /* Pass 2: scatter values into out_buf. */
+    ray_t* buf_hdr = NULL;
+    int64_t* out_buf = (int64_t*)scratch_alloc(&buf_hdr,
+                                               (size_t)len * sizeof(int64_t));
+    if (!out_buf) {
+        scratch_free(cur_hdr); scratch_free(off_hdr); scratch_free(hist_hdr);
+        return ray_error("oom", NULL);
+    }
+    cd_scatter_ctx_t sctx = {
+        .base = base, .out_buf = out_buf, .cursor = cursor,
+        .p_bits = p_bits, .p_mask = p_mask,
+        .is_f64 = (in_type == RAY_F64),
+        .type = in_type, .attrs = input->attrs,
+    };
+    ray_pool_dispatch(pool, cd_scatter_fn, &sctx, len);
+
+    /* Pass 3: dedup each partition in parallel. Each partition gets one
+     * task — distinct values land in the same partition, so per-partition
+     * sums give the global distinct count.
*/ + ray_t* pcnt_hdr = NULL; + int64_t* part_count = (int64_t*)scratch_alloc(&pcnt_hdr, + (size_t)P * sizeof(int64_t)); + if (!part_count) { + scratch_free(buf_hdr); scratch_free(cur_hdr); + scratch_free(off_hdr); scratch_free(hist_hdr); + return ray_error("oom", NULL); + } + cd_part_ctx_t dctx = { + .values = out_buf, .part_off = part_off, .part_count = part_count, + }; + ray_pool_dispatch_n(pool, cd_part_dedup_fn, &dctx, (uint32_t)P); + + int64_t total_distinct = 0; + for (uint64_t p = 0; p < P; p++) { + if (part_count[p] < 0) { + scratch_free(pcnt_hdr); scratch_free(buf_hdr); scratch_free(cur_hdr); + scratch_free(off_hdr); scratch_free(hist_hdr); + return ray_error("oom", NULL); + } + total_distinct += part_count[p]; + } + + scratch_free(pcnt_hdr); scratch_free(buf_hdr); scratch_free(cur_hdr); + scratch_free(off_hdr); scratch_free(hist_hdr); + return ray_i64(total_distinct); +} + +/* Grouped count(distinct): single global hash keyed by (group_id, value). + * One linear pass over all rows, O(n) total instead of O(per-group setup * + * n_groups). Returns an I64 vector of length n_groups with the per-group + * distinct count. Rows whose row_gid[r] < 0 are skipped. + * + * Supported value types: integers / SYM / TIMESTAMP / DATE / TIME / F64. + * Caller is responsible for verifying the type up-front (it should match + * exec_count_distinct's whitelist) and returning NULL on miss so the + * legacy per-group fallback handles unsupported configs. + * + * Cap selection: 2 * n_rows rounded to power of 2. Worst case all rows + * are distinct pairs → load factor 0.5, no rehash needed. Slot stores + * gid+1 (so 0 means empty) and the int64-encoded value. 64-bit composite + * hash mixes both halves so rare-gid collisions don't cluster. */ +ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, + int64_t n_rows, int64_t n_groups) { + if (!src || RAY_IS_ERR(src) || n_groups < 0) return ray_error("domain", NULL); + int8_t in_type = src->type; + switch (in_type) { + case RAY_BOOL: case RAY_U8: + case RAY_I16: case RAY_I32: case RAY_I64: + case RAY_F64: case RAY_DATE: case RAY_TIME: case RAY_TIMESTAMP: + case RAY_SYM: + break; + default: + return NULL; /* unsupported — caller falls back. */ + } + if (src->len < n_rows) return ray_error("domain", NULL); + + ray_t* out = ray_vec_new(RAY_I64, n_groups); + if (!out || RAY_IS_ERR(out)) return out ? out : ray_error("oom", NULL); + out->len = n_groups; + int64_t* odata = (int64_t*)ray_data(out); + memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); + if (n_rows == 0 || n_groups == 0) return out; + + /* Pick capacity ≥ 2 * n_rows rounded up to power of two. This bounds + * load factor at 0.5 even when every (gid,val) pair is distinct. */ + uint64_t cap = (uint64_t)n_rows * 2; + if (cap < 32) cap = 32; uint64_t c = 1; while (c && c < cap) c <<= 1; - if (!c) return ray_error("oom", NULL); /* overflow: cap too large */ + if (!c) { ray_release(out); return ray_error("oom", NULL); } cap = c; + uint64_t mask = cap - 1; - ray_t* set_hdr; - int64_t* set = (int64_t*)scratch_calloc(&set_hdr, - (size_t)cap * sizeof(int64_t)); - ray_t* used_hdr; - uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, - (size_t)cap * sizeof(uint8_t)); - if (!set || !used) { - if (set_hdr) scratch_free(set_hdr); - if (used_hdr) scratch_free(used_hdr); + /* Slot layout: parallel arrays of (gid_plus_one, value). gid_plus_one + * == 0 means slot is empty; storing gid+1 lets us skip a separate + * `used` bitmap. 
Both arrays are scratch_alloc so they go through + * the slab/heap fast path. */ + ray_t* k_hdr = NULL; + ray_t* v_hdr = NULL; + int64_t* slot_gid = (int64_t*)scratch_calloc(&k_hdr, + (size_t)cap * sizeof(int64_t)); + int64_t* slot_val = (int64_t*)scratch_alloc(&v_hdr, + (size_t)cap * sizeof(int64_t)); + if (!slot_gid || !slot_val) { + if (k_hdr) scratch_free(k_hdr); + if (v_hdr) scratch_free(v_hdr); + ray_release(out); return ray_error("oom", NULL); } - int64_t count = 0; - uint64_t mask = cap - 1; - void* base = ray_data(input); + void* base = ray_data(src); + bool has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0; + const uint8_t* null_bm = has_nulls ? ray_vec_nullmap_bytes(src, NULL, NULL) + : NULL; + + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + if (has_nulls && null_bm && ((null_bm[r/8] >> (r%8)) & 1)) continue; - for (int64_t i = 0; i < len; i++) { int64_t val; if (in_type == RAY_F64) { - double fv = ((double*)base)[i]; - /* Normalize: NaN → canonical NaN, -0.0 → +0.0 */ - if (fv != fv) fv = (double)NAN; /* canonical NaN */ - else if (fv == 0.0) fv = 0.0; /* +0.0 */ + double fv = ((double*)base)[r]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; memcpy(&val, &fv, sizeof(int64_t)); } else { - val = read_col_i64(base, i, in_type, input->attrs); + val = read_col_i64(base, r, in_type, src->attrs); } - /* Open-addressing linear probe */ + int64_t gid_p1 = gid + 1; + /* Mix gid and val so groups don't form long runs of collisions. */ uint64_t h = (uint64_t)val * 0x9E3779B97F4A7C15ULL; + h ^= (uint64_t)gid_p1 * 0xBF58476D1CE4E5B9ULL; + h ^= h >> 33; + h *= 0xC4CEB9FE1A85EC53ULL; uint64_t slot = h & mask; - while (used[slot]) { - if (set[slot] == val) goto next_val; + for (;;) { + int64_t cur = slot_gid[slot]; + if (cur == 0) { + slot_gid[slot] = gid_p1; + slot_val[slot] = val; + odata[gid]++; + break; + } + if (cur == gid_p1 && slot_val[slot] == val) break; slot = (slot + 1) & mask; } - /* New distinct value */ - set[slot] = val; - used[slot] = 1; - count++; - next_val:; } - scratch_free(set_hdr); - scratch_free(used_hdr); - return ray_i64(count); + scratch_free(k_hdr); + scratch_free(v_hdr); + return out; } ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { diff --git a/src/ops/idiom.c b/src/ops/idiom.c index fc5092d7..c6ca086d 100644 --- a/src/ops/idiom.c +++ b/src/ops/idiom.c @@ -178,10 +178,15 @@ static bool is_ext_root(uint16_t opcode) { opcode == OP_WINDOW || opcode == OP_WINDOW_JOIN || opcode == OP_SELECT; } -static void try_rewrite(ray_graph_t* g, ray_op_t* node) { - if (!node || (node->flags & OP_FLAG_DEAD)) return; - if (is_ext_root(node->opcode)) return; - if (node->opcode >= RAY_IDIOM_OPCODE_CAP) return; +/* Try one rewrite at `node`. Returns the replacement when the rewrite + * fires, else NULL. Caller redirects consumers and marks the old node + * dead — having the helper return the replacement also lets the pass + * track when the *root* was rewritten so the caller's root pointer can + * be bumped to the replacement. 
*/ +static ray_op_t* try_rewrite(ray_graph_t* g, ray_op_t* node) { + if (!node || (node->flags & OP_FLAG_DEAD)) return NULL; + if (is_ext_root(node->opcode)) return NULL; + if (node->opcode >= RAY_IDIOM_OPCODE_CAP) return NULL; int idx = first_idiom[node->opcode]; while (idx >= 0) { @@ -193,16 +198,17 @@ static void try_rewrite(ray_graph_t* g, ray_op_t* node) { /* UINT32_MAX sentinels: no nodes to skip during redirect */ redirect_consumers(g, node->id, repl, UINT32_MAX, UINT32_MAX); node->flags |= OP_FLAG_DEAD; - return; /* first-match-wins */ + return repl; /* first-match-wins */ } } } idx = next_idiom[idx]; } + return NULL; } -void ray_idiom_pass(ray_graph_t* g, ray_op_t* root) { - if (!g || !root || g->node_count == 0) return; +ray_op_t* ray_idiom_pass(ray_graph_t* g, ray_op_t* root) { + if (!g || !root || g->node_count == 0) return root; build_index(); /* Iterative post-order walk: children rewritten before parents so @@ -210,7 +216,7 @@ void ray_idiom_pass(ray_graph_t* g, ray_op_t* root) { pattern — push roots onto stack1, drain into stack2 (reverse), pop stack2 to get post-order. */ uint32_t nc = g->node_count; - if (nc > UINT32_MAX / 4) return; /* overflow guard, mirrors fuse.c */ + if (nc > UINT32_MAX / 4) return root; /* overflow guard, mirrors fuse.c */ uint32_t cap = nc * 2; uint32_t stk1_local[256], stk2_local[256]; @@ -219,7 +225,7 @@ void ray_idiom_pass(ray_graph_t* g, ray_op_t* root) { if (!stk1 || !stk2) { if (stk1 && stk1 != stk1_local) ray_sys_free(stk1); if (stk2 && stk2 != stk2_local) ray_sys_free(stk2); - return; + return root; } /* Visited-bit guard against re-entry on shared subgraphs. */ @@ -228,7 +234,7 @@ void ray_idiom_pass(ray_graph_t* g, ray_op_t* root) { if (!visited) { if (stk1 != stk1_local) ray_sys_free(stk1); if (stk2 != stk2_local) ray_sys_free(stk2); - return; + return root; } memset(visited, 0, nc); @@ -248,13 +254,21 @@ void ray_idiom_pass(ray_graph_t* g, ray_op_t* root) { } } - /* Post-order: pop stk2 from top, call try_rewrite. */ + /* Post-order: pop stk2 from top, call try_rewrite. Track whether + * the root itself was rewritten — caller needs the new pointer to + * avoid executing the dead node. */ + uint32_t root_id = root->id; while (sp2 > 0) { uint32_t nid = stk2[--sp2]; - try_rewrite(g, &g->nodes[nid]); + ray_op_t* repl = try_rewrite(g, &g->nodes[nid]); + if (repl && nid == root_id) { + root = repl; + root_id = repl->id; + } } if (visited != visited_local) ray_sys_free(visited); if (stk1 != stk1_local) ray_sys_free(stk1); if (stk2 != stk2_local) ray_sys_free(stk2); + return root; } diff --git a/src/ops/idiom.h b/src/ops/idiom.h index ba29a9d4..7826b16c 100644 --- a/src/ops/idiom.h +++ b/src/ops/idiom.h @@ -40,6 +40,11 @@ typedef struct { extern const ray_idiom_t ray_idioms[]; extern const int ray_idioms_count; -void ray_idiom_pass(ray_graph_t* g, ray_op_t* root); +/* Returns the (possibly updated) root. When the rewrite replaces the + * root node itself (e.g. count(distinct) → count_distinct on a single- + * statement chain), the caller would otherwise hold a pointer to the + * dead OLD node. Always assign the return value back to the caller's + * root pointer. 
*/
-void ray_idiom_pass(ray_graph_t* g, ray_op_t* root);
+ray_op_t* ray_idiom_pass(ray_graph_t* g, ray_op_t* root);
 
 #endif /* RAY_IDIOM_H */
diff --git a/src/ops/internal.h b/src/ops/internal.h
index 328d9be6..7270638e 100644
--- a/src/ops/internal.h
+++ b/src/ops/internal.h
@@ -758,6 +758,14 @@ ray_t* exec_window_join(ray_graph_t* g, ray_op_t* op,
 /* ── group.c ── */
 ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input);
 ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input);
+
+/* Single-pass per-group count(distinct). Returns I64 vec of length
+ * n_groups, or NULL if `src->type` isn't a supported scalar/SYM type
+ * (caller falls back to per-group exec_count_distinct). Errors are
+ * returned as RAY_IS_ERR ray_t*. */
+ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid,
+                                    int64_t n_rows, int64_t n_groups);
+
 ray_t* exec_group(ray_graph_t* g, ray_op_t* op, ray_t* tbl,
                   int64_t group_limit);
 
 /* ── collection.c ── */
diff --git a/src/ops/ops.h b/src/ops/ops.h
index 90c019b7..82da76ff 100644
--- a/src/ops/ops.h
+++ b/src/ops/ops.h
@@ -679,6 +679,20 @@ void ray_graph_dump(ray_graph_t* g, ray_op_t* root, void* out);
 ray_t* ray_sort_indices(ray_t** cols, uint8_t* descs, uint8_t* nulls_first,
                         uint8_t n_cols, int64_t nrows);
 
+/* Top-K bounded-heap path: returns a new K-row table of `tbl` ordered by
+ * `col` in the requested direction. Returns NULL when the input doesn't
+ * fit the single-key fast path (unsupported type, K ≥ nrows, etc.) so
+ * the caller can fall back to a full sort. Skips the full O(n log n)
+ * sort entirely — selection runs in O(n log K + K log K). */
+ray_t* ray_topk_table(ray_t* tbl, ray_t* col, uint8_t desc, uint8_t nf,
+                      int64_t k);
+
+/* Multi-key variant of ray_topk_table: bounded-heap selection on n_keys
+ * sort columns with per-key direction / nulls-first. Same fallback
+ * contract — returns NULL when the inputs don't fit the fast path. */
+ray_t* ray_topk_table_multi(ray_t* tbl, ray_t** key_cols, uint8_t* descs,
+                            uint8_t* nfs, uint8_t n_keys, int64_t k);
+
 /* ===== Executor API ===== */
 
 ray_t* ray_execute(ray_graph_t* g, ray_op_t* root);
diff --git a/src/ops/opt.c b/src/ops/opt.c
index c41b967e..61601542 100644
--- a/src/ops/opt.c
+++ b/src/ops/opt.c
@@ -2024,9 +2024,10 @@ ray_op_t* ray_optimize(ray_graph_t* g, ray_op_t* root) {
     pass_constant_fold(g, root);
     ray_profile_tick("constant fold");
 
-    /* Pass 3: Idiom rewrite */
+    /* Pass 3: Idiom rewrite (may replace the root, e.g. count(distinct)
+     * → count_distinct on a single-statement chain). */
     ray_profile_span_start("idiom");
-    ray_idiom_pass(g, root);
+    root = ray_idiom_pass(g, root);
     ray_profile_span_end("idiom");
     ray_profile_tick("idiom rewrite");
 
diff --git a/src/ops/query.c b/src/ops/query.c
index 4d437a60..95d0e414 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -242,7 +242,12 @@ static uint16_t resolve_agg_opcode(int64_t sym_id) {
 /* Apply sort (asc/desc) and take clauses to a materialized result table.
  * Used by eval-level paths that bypass the DAG (e.g., LIST/STR group keys).
  * Builds a temporary DAG for sorting (supports per-column direction flags)
- * and applies take via ray_head/ray_tail or ray_take_fn. */
+ * and applies take via ray_head/ray_tail or ray_take_fn.
+ *
+ * Top-K fast path: when the sort keys are plain column names, the take
+ * is an atom with K << nrows, and the result is a flat table with no
+ * LIST columns, dispatch to ray_topk_table / ray_topk_table_multi —
+ * bounded-heap selection in O(n log K) instead of full sort + gather. */
 static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n,
                               int64_t asc_id, int64_t desc_id, int64_t take_id) {
     if (!result || RAY_IS_ERR(result)) return result;
@@ -257,6 +262,108 @@ static ray_t* apply_sort_take(ray_t* result, ray_t** dict_elems, int64_t dict_n,
     }
     if (!has_sort && !take_val_expr) return result;
 
+    /* ---- Top-K fast path detection ----
+     * Conditions:
+     *   - Every asc:/desc: clause names plain columns (≤16 keys total).
+     *   - take is an atom in [1, K_MAX], where K_MAX is well under nrows.
+     *   - result has no LIST columns (the topk gather handles LIST too,
+     *     but skip to keep the surface area small until we have LIST
+     *     test fixtures). Most benchmark workloads are LIST-free.
+     *
+     * Anything else falls through to the full-sort DAG path below. */
+    if (has_sort && take_val_expr && result->type == RAY_TABLE) {
+        /* Collect ALL sort keys (across asc:/desc: clauses) into a flat
+         * (sym, dir) list. Single-key takes the radix-encoded fast
+         * path; multi-key takes the comparator-based bounded heap. */
+        enum { TOPK_MAX_KEYS = 16 };
+        int64_t key_syms[TOPK_MAX_KEYS];
+        uint8_t key_descs[TOPK_MAX_KEYS];
+        uint8_t n_keys = 0;
+        int bad_clause = 0;
+        for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+            int64_t kid = dict_elems[i]->i64;
+            uint8_t is_desc = 0;
+            if (kid == asc_id) is_desc = 0;
+            else if (kid == desc_id) is_desc = 1;
+            else continue;
+            ray_t* val = dict_elems[i + 1];
+            if (!val) { bad_clause = 1; break; }
+            if (val->type == -RAY_SYM) {
+                if (n_keys >= TOPK_MAX_KEYS) { bad_clause = 1; break; }
+                key_syms[n_keys] = val->i64;
+                key_descs[n_keys] = is_desc;
+                n_keys++;
+            } else if (ray_is_vec(val) && val->type == RAY_SYM) {
+                for (int64_t c = 0; c < val->len; c++) {
+                    if (n_keys >= TOPK_MAX_KEYS) { bad_clause = 1; break; }
+                    key_syms[n_keys] = ray_read_sym(ray_data(val), c,
+                                                    val->type, val->attrs);
+                    key_descs[n_keys] = is_desc;
+                    n_keys++;
+                }
+                if (bad_clause) break;
+            } else {
+                /* Computed sort key (expression) — full DAG path handles it. */
+                bad_clause = 1;
+                break;
+            }
+        }
+        if (!bad_clause && n_keys > 0) {
+            /* Probe the take expression — only atom-K with K > 0 qualifies. */
+            ray_t* tv = ray_eval(take_val_expr);
+            if (tv && !RAY_IS_ERR(tv) && ray_is_atom(tv) &&
+                (tv->type == -RAY_I64 || tv->type == -RAY_I32)) {
+                int64_t k = (tv->type == -RAY_I64) ? tv->i64 : tv->i32;
+                ray_release(tv);
+                int64_t nrows = ray_table_nrows(result);
+                /* Bound K and the over-cardinality ratio: only useful
+                 * when K is well under nrows. Leave the take=full /
+                 * negative-take cases to the existing path. */
+                if (k > 0 && k < nrows && k <= 8192) {
+                    /* Reject LIST columns — full path handles those. */
+                    int has_list = 0;
+                    int64_t ncols = ray_table_ncols(result);
+                    for (int64_t c = 0; c < ncols; c++) {
+                        ray_t* col = ray_table_get_col_idx(result, c);
+                        if (col && col->type == RAY_LIST) { has_list = 1; break; }
+                    }
+                    if (!has_list) {
+                        ray_t* topk = NULL;
+                        if (n_keys == 1) {
+                            ray_t* sort_col = ray_table_get_col(result, key_syms[0]);
+                            if (sort_col) {
+                                topk = ray_topk_table(result, sort_col,
+                                                      key_descs[0], key_descs[0]
+                                                      /*nf=desc by default*/, k);
+                            }
+                        } else {
+                            ray_t* key_cols[TOPK_MAX_KEYS];
+                            uint8_t nfs[TOPK_MAX_KEYS];
+                            int ok = 1;
+                            for (uint8_t i = 0; i < n_keys; i++) {
+                                key_cols[i] = ray_table_get_col(result, key_syms[i]);
+                                nfs[i] = key_descs[i];
+                                if (!key_cols[i]) { ok = 0; break; }
+                            }
+                            if (ok) {
+                                topk = ray_topk_table_multi(result, key_cols,
+                                                            key_descs, nfs, n_keys, k);
+                            }
+                        }
+                        if (topk && !RAY_IS_ERR(topk)) {
+                            ray_release(result);
+                            return topk;
+                        }
+                        if (topk && RAY_IS_ERR(topk)) ray_release(topk);
+                        /* topk == NULL: unsupported config, fall through. */
+                    }
+                }
+            } else if (tv) {
+                ray_release(tv);
+            }
+        }
+    }
+
     /* Build temporary DAG on the materialized result */
     ray_graph_t* g = ray_graph_new(result);
     if (!g) return result;
@@ -1016,6 +1123,35 @@ static int is_agg_expr(ray_t* expr) {
     return resolve_agg_opcode(elems[0]->i64) != 0;
 }
 
+static int expr_contains_call_named(ray_t* expr, const char* name, size_t name_len) {
+    if (!expr) return 0;
+    if (expr->type != RAY_LIST) return 0;
+    ray_t** elems = (ray_t**)ray_data(expr);
+    int64_t n = ray_len(expr);
+    if (n <= 0) return 0;
+    ray_t* head = elems[0];
+    if (head && head->type == -RAY_SYM) {
+        ray_t* s = ray_sym_str(head->i64);
+        if (s && ray_str_len(s) == name_len &&
+            memcmp(ray_str_ptr(s), name, name_len) == 0)
+            return 1;
+    }
+    for (int64_t i = 0; i < n; i++)
+        if (expr_contains_call_named(elems[i], name, name_len))
+            return 1;
+    return 0;
+}
+
+/* True when a grouped aggregate expression can be lowered to OP_GROUP.
+ * `(count (distinct col))` is semantically an aggregate, but `distinct`
+ * is not a row-aligned DAG input inside GROUP. Route it through the
+ * per-group eval fallback so `distinct` sees each group's slice. */
+static int is_group_dag_agg_expr(ray_t* expr) {
+    if (!is_agg_expr(expr)) return 0;
+    ray_t** elems = (ray_t**)ray_data(expr);
+    return !expr_contains_call_named(elems[1], "distinct", 8);
+}
+
 /* True for `(fn arg ...)` where fn resolves to a RAY_UNARY marked
  * RAY_FN_AGGR — i.e. a builtin aggregator (sum/avg/min/max/count and
  * the non-whitelisted med/dev/var/stddev/etc). Used to route these
@@ -1034,6 +1170,41 @@ static int is_aggr_unary_call(ray_t* expr) {
     return (fn_obj->attrs & RAY_FN_AGGR) != 0;
 }
 
+static int is_streaming_aggr_unary_call(ray_t* expr) {
+    if (!is_aggr_unary_call(expr)) return 0;
+    ray_t** elems = (ray_t**)ray_data(expr);
+    return !expr_contains_call_named(elems[1], "distinct", 8);
+}
+
+/* Detect `(count (distinct <expr>))` exactly — the only shape that
+ * routes through the OP_COUNT_DISTINCT fast path per group. Returns
+ * the inner expression on success, NULL otherwise. More complex
+ * forms like `(count (distinct (+ col 1)))` are accepted; the inner
+ * expr is full-table-evaluable. Anything where the outer call is
+ * not a plain `(count …)` or the inner is not a plain `(distinct …)`
+ * is rejected so the eval fallback handles it.
*/ +static ray_t* match_count_distinct(ray_t* expr) { + if (!expr || expr->type != RAY_LIST) return NULL; + int64_t n = ray_len(expr); + if (n != 2) return NULL; + ray_t** elems = (ray_t**)ray_data(expr); + if (!elems[0] || elems[0]->type != -RAY_SYM) return NULL; + ray_t* nm = ray_sym_str(elems[0]->i64); + if (!nm || ray_str_len(nm) != 5 || + memcmp(ray_str_ptr(nm), "count", 5) != 0) return NULL; + + ray_t* inner = elems[1]; + if (!inner || inner->type != RAY_LIST) return NULL; + int64_t in_n = ray_len(inner); + if (in_n != 2) return NULL; + ray_t** in_elems = (ray_t**)ray_data(inner); + if (!in_elems[0] || in_elems[0]->type != -RAY_SYM) return NULL; + ray_t* dnm = ray_sym_str(in_elems[0]->i64); + if (!dnm || ray_str_len(dnm) != 8 || + memcmp(ray_str_ptr(dnm), "distinct", 8) != 0) return NULL; + return in_elems[1]; +} + /* Walk expr once, gather unique column-ref symbol ids that resolve to * columns of `tbl`. Dotted refs (`Timestamp.ss`) record the head * segment. Caps at `max_out` entries (16 is plenty for s: clauses); @@ -1358,6 +1529,154 @@ static ray_t* aggr_unary_per_group_buf(ray_t* expr, ray_t* tbl, return agg_vec; } +/* Per-group count(distinct) using the existing OP_COUNT_DISTINCT kernel. + * Mirrors aggr_unary_per_group_buf but slices the source column once per + * group and calls exec_count_distinct directly — bypasses the full + * ray_eval per-group path that re-walks the (count (distinct …)) AST + * for each slice. + * + * `inner_expr` is the operand to `distinct` extracted via + * match_count_distinct (typically a column ref, possibly a dotted-name + * or computed sub-expression). Returns an I64 vector of length + * n_groups with the per-group distinct count. */ +static ray_t* count_distinct_per_group_buf(ray_t* inner_expr, ray_t* tbl, + const int64_t* idx_buf, + const int64_t* offsets, + const int64_t* grp_cnt, + int64_t n_groups) { + /* Resolve the source vector — either a direct column ref (zero copy) + * or a full-table eval of the inner sub-expression. */ + ray_t* src = NULL; + if (inner_expr && inner_expr->type == -RAY_SYM && + (inner_expr->attrs & RAY_ATTR_NAME)) { + src = ray_table_get_col(tbl, inner_expr->i64); + if (src) ray_retain(src); + } + if (!src) { + if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL); + expr_bind_table_names(inner_expr, tbl); + src = ray_eval(inner_expr); + ray_env_pop_scope(); + if (!src || RAY_IS_ERR(src)) return src ? src : ray_error("domain", NULL); + } + + ray_t* out = ray_vec_new(RAY_I64, n_groups); + if (!out || RAY_IS_ERR(out)) { + ray_release(src); + return out ? out : ray_error("oom", NULL); + } + out->len = n_groups; + int64_t* odata = (int64_t*)ray_data(out); + + for (int64_t gi = 0; gi < n_groups; gi++) { + int64_t cnt = grp_cnt[gi]; + if (cnt == 0) { odata[gi] = 0; continue; } + /* gather_by_idx preserves the source's typed layout (I64 stays + * I64, SYM stays SYM with adaptive width, etc.) — exactly what + * exec_count_distinct expects. ray_at_fn would coerce numeric + * vec + numeric idx vec into a RAY_LIST of atoms, breaking the + * type-dispatch in exec_count_distinct. */ + ray_t* subset = gather_by_idx(src, + (int64_t*)&idx_buf[offsets[gi]], cnt); + if (!subset || RAY_IS_ERR(subset)) { + ray_t* err = subset ? subset : ray_error("oom", NULL); + ray_release(src); ray_release(out); + return err; + } + ray_t* cv = exec_count_distinct(NULL, NULL, subset); + ray_release(subset); + if (!cv || RAY_IS_ERR(cv)) { + ray_t* err = cv ? 
cv : ray_error("oom", NULL); + ray_release(src); ray_release(out); + return err; + } + /* exec_count_distinct returns an i64 atom. */ + odata[gi] = (cv->type == -RAY_I64) ? cv->i64 + : (cv->type == -RAY_I32) ? (int64_t)cv->i32 : 0; + ray_release(cv); + } + + ray_release(src); + return out; +} + +/* Variant for the LIST-`groups` layout used by the eval-fallback + * (ray_group_fn output is a 2-list of {key, idx_list} pairs). Slices + * via ray_at_fn the same way and dispatches to exec_count_distinct. */ +static ray_t* count_distinct_per_group_groups(ray_t* inner_expr, ray_t* tbl, + ray_t* groups, int64_t n_groups) { + ray_t* src = NULL; + if (inner_expr && inner_expr->type == -RAY_SYM && + (inner_expr->attrs & RAY_ATTR_NAME)) { + src = ray_table_get_col(tbl, inner_expr->i64); + if (src) ray_retain(src); + } + if (!src) { + if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL); + expr_bind_table_names(inner_expr, tbl); + src = ray_eval(inner_expr); + ray_env_pop_scope(); + if (!src || RAY_IS_ERR(src)) return src ? src : ray_error("domain", NULL); + } + + ray_t* out = ray_vec_new(RAY_I64, n_groups); + if (!out || RAY_IS_ERR(out)) { ray_release(src); return out ? out : ray_error("oom", NULL); } + out->len = n_groups; + int64_t* odata = (int64_t*)ray_data(out); + + ray_t** items = (ray_t**)ray_data(groups); + for (int64_t gi = 0; gi < n_groups; gi++) { + ray_t* idx_list = items[gi * 2 + 1]; + if (!idx_list) { odata[gi] = 0; continue; } + int64_t cnt = ray_len(idx_list); + if (cnt == 0) { odata[gi] = 0; continue; } + + /* idx_list from ray_group_fn is an I64 vector — gather_by_idx + * needs a raw int64_t* + count, so resolve the pointer either + * directly (typed I64 vec) or by walking the LIST cells. */ + ray_t* subset = NULL; + ray_t* tmp_hdr = NULL; + if (idx_list->type == RAY_I64) { + subset = gather_by_idx(src, (int64_t*)ray_data(idx_list), cnt); + } else { + /* Fallback: copy indices into a scratch buffer. Rare path — + * shouldn't trigger for well-formed ray_group_fn output. */ + int64_t* tmp = (int64_t*)scratch_alloc(&tmp_hdr, + (size_t)cnt * sizeof(int64_t)); + if (!tmp) { + ray_release(src); ray_release(out); + return ray_error("oom", NULL); + } + for (int64_t k = 0; k < cnt; k++) { + int alloc = 0; + ray_t* e = collection_elem(idx_list, k, &alloc); + tmp[k] = e ? as_i64(e) : 0; + if (alloc && e) ray_release(e); + } + subset = gather_by_idx(src, tmp, cnt); + scratch_free(tmp_hdr); + } + if (!subset || RAY_IS_ERR(subset)) { + ray_t* err = subset ? subset : ray_error("oom", NULL); + ray_release(src); ray_release(out); + return err; + } + ray_t* cv = exec_count_distinct(NULL, NULL, subset); + ray_release(subset); + if (!cv || RAY_IS_ERR(cv)) { + ray_t* err = cv ? cv : ray_error("oom", NULL); + ray_release(src); ray_release(out); + return err; + } + odata[gi] = (cv->type == -RAY_I64) ? cv->i64 + : (cv->type == -RAY_I32) ? 
(int64_t)cv->i32 : 0; + ray_release(cv); + } + + ray_release(src); + return out; +} + /* Forward declarations for eval-level groupby fallback */ /* (select {from: t [where: pred] [by: key] [col: expr ...]}) @@ -1854,7 +2173,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { int64_t kid = dict_elems[i]->i64; if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; - if (!is_agg_expr(dict_elems[i + 1])) { any_nonagg = 1; break; } + if (!is_group_dag_agg_expr(dict_elems[i + 1])) { any_nonagg = 1; break; } } } @@ -1885,7 +2204,21 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { * DAG path (exec_group handles wide keys correctly * and stays parallel / segment-streamed on parted * tables). */ + use_eval_group = 1; + } + } + if (!use_eval_group && by_expr->type == RAY_SYM && ray_len(by_expr) > 1) { + int64_t nk = ray_len(by_expr); + int64_t* sym_ids = (int64_t*)ray_data(by_expr); + for (int64_t k = 0; k < nk; k++) { + ray_t* key_col = ray_table_get_col(tbl, sym_ids[k]); + if (!key_col) continue; + int8_t kct = key_col->type; + if (RAY_IS_PARTED(kct)) kct = (int8_t)RAY_PARTED_BASETYPE(kct); + if (kct == RAY_LIST || kct == RAY_STR) { use_eval_group = 1; + break; + } } } /* Non-aggregation expressions (arithmetic, lambda, etc.) are @@ -1924,13 +2257,240 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { } else { ray_graph_free(g); g = NULL; } - /* eval_group path supports only simple scalar / [col] by-forms; - * multi-key and computed keys shouldn't land here. */ - if (by_key_sym < 0) { + if (by_key_sym < 0 && by_expr->type == RAY_SYM && ray_len(by_expr) > 1) { + int64_t nk = ray_len(by_expr); + int64_t* key_syms = (int64_t*)ray_data(by_expr); + int64_t nrows = ray_table_nrows(eval_tbl); + ray_t* key_cols[16]; + if (nk <= 0 || nk > 16) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return ray_error("domain", "eval-level multi-key groupby requires 1..16 keys"); + } + for (int64_t k = 0; k < nk; k++) { + key_cols[k] = ray_table_get_col(eval_tbl, key_syms[k]); + if (!key_cols[k]) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return ray_error("domain", "group key column not found"); + } + } + + ray_t* composite_keys = ray_list_new(nrows); + if (!composite_keys || RAY_IS_ERR(composite_keys)) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return composite_keys ? composite_keys : ray_error("oom", NULL); + } + for (int64_t r = 0; r < nrows; r++) { + ray_t* row_key = ray_list_new(nk); + if (!row_key || RAY_IS_ERR(row_key)) { + ray_release(composite_keys); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return row_key ? row_key : ray_error("oom", NULL); + } + for (int64_t k = 0; k < nk; k++) { + int alloc = 0; + ray_t* cell = collection_elem(key_cols[k], r, &alloc); + if (!cell || RAY_IS_ERR(cell)) { + ray_release(row_key); + ray_release(composite_keys); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return cell ? cell : ray_error("domain", NULL); + } + row_key = ray_list_append(row_key, cell); + if (alloc) ray_release(cell); + if (!row_key || RAY_IS_ERR(row_key)) { + ray_release(composite_keys); + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return row_key ? 
row_key : ray_error("oom", NULL); + } + } + composite_keys = ray_list_append(composite_keys, row_key); + ray_release(row_key); + if (!composite_keys || RAY_IS_ERR(composite_keys)) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return composite_keys ? composite_keys : ray_error("oom", NULL); + } + } + + ray_t* groups_dict = ray_group_fn(composite_keys); + ray_release(composite_keys); + if (!groups_dict || RAY_IS_ERR(groups_dict)) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return groups_dict ? groups_dict : ray_error("domain", NULL); + } + ray_t* groups = groups_to_pair_list(groups_dict); + ray_release(groups_dict); + if (!groups || RAY_IS_ERR(groups)) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return groups ? groups : ray_error("domain", NULL); + } + int64_t n_groups = ray_len(groups) / 2; + + int n_agg_out = 0; + int64_t agg_names[16]; + ray_t* agg_results[16] = {0}; + for (int64_t i = 0; i + 1 < dict_n && n_agg_out < 16; i += 2) { + int64_t kid = dict_elems[i]->i64; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == asc_id || kid == desc_id) continue; + ray_t* val_expr_item = dict_elems[i + 1]; + + /* Per-group count(distinct) — bypass full ray_eval per + * group and dispatch directly to exec_count_distinct on + * each group's slice. Same kernel the standalone + * `(count (distinct col))` fast path uses. */ + ray_t* cd_inner = match_count_distinct(val_expr_item); + if (cd_inner) { + ray_t* per_group = count_distinct_per_group_groups( + cd_inner, eval_tbl, groups, n_groups); + if (!per_group || RAY_IS_ERR(per_group)) { + for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return per_group ? per_group : ray_error("domain", NULL); + } + agg_names[n_agg_out] = kid; + agg_results[n_agg_out] = per_group; + n_agg_out++; + continue; + } + + if (is_streaming_aggr_unary_call(val_expr_item)) { + ray_t** agg_elems = (ray_t**)ray_data(val_expr_item); + ray_t* agg_fn_name = agg_elems[0]; + ray_t* agg_col_expr = agg_elems[1]; + ray_t* src_col_val = NULL; + if (agg_col_expr->type == -RAY_SYM && (agg_col_expr->attrs & RAY_ATTR_NAME)) { + src_col_val = ray_table_get_col(eval_tbl, agg_col_expr->i64); + if (src_col_val) ray_retain(src_col_val); + } + if (!src_col_val) { + src_col_val = ray_eval(agg_col_expr); + if (!src_col_val || RAY_IS_ERR(src_col_val)) { + for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return src_col_val ? 
src_col_val : ray_error("domain", NULL); + } + } + + ray_t* agg_vec = NULL; + ray_t** grp_items = (ray_t**)ray_data(groups); + for (int64_t gi = 0; gi < n_groups; gi++) { + ray_t* idx_list = grp_items[gi * 2 + 1]; + ray_t* subset = ray_at_fn(src_col_val, idx_list); + if (!subset || RAY_IS_ERR(subset)) continue; + ray_t* agg_val = NULL; + ray_t* fn_obj = ray_env_get(agg_fn_name->i64); + if (fn_obj && fn_obj->type == RAY_UNARY) { + ray_unary_fn uf = (ray_unary_fn)(uintptr_t)fn_obj->i64; + agg_val = uf(subset); + } + ray_release(subset); + if (!agg_val || RAY_IS_ERR(agg_val)) continue; + if (!agg_vec) { + int8_t vt = -(agg_val->type); + agg_vec = ray_vec_new(vt, n_groups); + if (!agg_vec || RAY_IS_ERR(agg_vec)) { ray_release(agg_val); break; } + agg_vec->len = n_groups; + } + store_typed_elem(agg_vec, gi, agg_val); + ray_release(agg_val); + } + ray_release(src_col_val); + agg_names[n_agg_out] = kid; + agg_results[n_agg_out] = agg_vec; + n_agg_out++; + } else { + ray_t* per_group = nonagg_eval_per_group(val_expr_item, eval_tbl, groups, n_groups); + if (!per_group || RAY_IS_ERR(per_group)) { + for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return per_group ? per_group : ray_error("domain", NULL); + } + agg_names[n_agg_out] = kid; + agg_results[n_agg_out] = per_group; + n_agg_out++; + } + } + + ray_t* result = ray_table_new(nk + n_agg_out); + if (!result || RAY_IS_ERR(result)) { + for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return result ? result : ray_error("oom", NULL); + } + ray_t** grp_items = (ray_t**)ray_data(groups); + for (int64_t k = 0; k < nk; k++) { + ray_t* src = key_cols[k]; + int8_t kt = src->type; + if (RAY_IS_PARTED(kt)) kt = (int8_t)RAY_PARTED_BASETYPE(kt); + ray_t* key_vec = NULL; + if (kt == RAY_STR) { + key_vec = ray_vec_new(RAY_STR, n_groups); + for (int64_t gi = 0; gi < n_groups && key_vec && !RAY_IS_ERR(key_vec); gi++) { + ray_t* row_key = grp_items[gi * 2]; + ray_t* cell = (row_key && row_key->type == RAY_LIST && k < row_key->len) + ? ((ray_t**)ray_data(row_key))[k] : NULL; + const char* sp = cell ? ray_str_ptr(cell) : ""; + size_t slen = cell ? ray_str_len(cell) : 0; + key_vec = ray_str_vec_append(key_vec, sp ? sp : "", sp ? slen : 0); + } + } else { + key_vec = (kt == RAY_SYM) + ? ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, n_groups) + : ray_vec_new(kt, n_groups); + if (key_vec && !RAY_IS_ERR(key_vec)) { + key_vec->len = n_groups; + memset(ray_data(key_vec), 0, (size_t)n_groups * ray_sym_elem_size(kt, key_vec->attrs)); + for (int64_t gi = 0; gi < n_groups; gi++) { + ray_t* row_key = grp_items[gi * 2]; + ray_t* cell = (row_key && row_key->type == RAY_LIST && k < row_key->len) + ? ((ray_t**)ray_data(row_key))[k] : NULL; + if (cell) store_typed_elem(key_vec, gi, cell); + } + } + } + if (!key_vec || RAY_IS_ERR(key_vec)) { + for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); + ray_release(result); ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return key_vec ? 
key_vec : ray_error("oom", NULL); + } + result = ray_table_add_col(result, key_syms[k], key_vec); + ray_release(key_vec); + if (RAY_IS_ERR(result)) { + for (int ai = 0; ai < n_agg_out; ai++) if (agg_results[ai]) ray_release(agg_results[ai]); + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return result; + } + } + for (int ai = 0; ai < n_agg_out; ai++) { + if (agg_results[ai]) { + result = ray_table_add_col(result, agg_names[ai], agg_results[ai]); + ray_release(agg_results[ai]); + if (RAY_IS_ERR(result)) { ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); return result; } + } + } + + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); - return ray_error("nyi", "eval-level groupby requires scalar key"); + return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); } + + /* eval_group path supports only simple scalar / [col] by-forms; + * computed keys shouldn't land here. */ + if (by_key_sym < 0) { + if (eval_tbl != tbl) ray_release(eval_tbl); + ray_release(tbl); + return ray_error("nyi", "eval-level groupby requires scalar key"); + } ray_t* key_col = ray_table_get_col(eval_tbl, by_key_sym); /* Fast path: (select {from: t by: k}) with no aggs and @@ -2160,7 +2720,26 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; ray_t* val_expr_item = dict_elems[i + 1]; - if (is_aggr_unary_call(val_expr_item)) { + /* Per-group count(distinct) — bypass full ray_eval per + * group and dispatch directly to exec_count_distinct. */ + { + ray_t* cd_inner = match_count_distinct(val_expr_item); + if (cd_inner) { + ray_t* per_group = count_distinct_per_group_groups( + cd_inner, eval_tbl, groups, n_groups); + if (!per_group || RAY_IS_ERR(per_group)) { + for (int ai = 0; ai < n_agg_out; ai++) { if (agg_results[ai]) ray_release(agg_results[ai]); } + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return per_group ? per_group : ray_error("domain", NULL); + } + agg_names[n_agg_out] = kid; + agg_results[n_agg_out] = per_group; + n_agg_out++; + continue; + } + } + + if (is_streaming_aggr_unary_call(val_expr_item)) { /* Streaming-style per-group AGG branch. Accepts both * the resolve_agg_opcode whitelist (sum/avg/min/max/...) * and the broader RAY_FN_AGGR + RAY_UNARY set @@ -2215,6 +2794,20 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { agg_results[n_agg_out] = agg_vec; n_agg_out++; } else { + if (is_agg_expr(val_expr_item)) { + ray_t* per_group = nonagg_eval_per_group( + val_expr_item, eval_tbl, groups, n_groups); + if (RAY_IS_ERR(per_group)) { + for (int ai = 0; ai < n_agg_out; ai++) { if (agg_results[ai]) ray_release(agg_results[ai]); } + ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + return per_group; + } + agg_names[n_agg_out] = kid; + agg_results[n_agg_out] = per_group; + n_agg_out++; + continue; + } + /* Non-aggregation expression: evaluate on full table, * then gather per-group subsets into a LIST column * (non-agg produces list-of-vectors). 
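+                 * For example a bare column output such as `vals: v`
+                 * lands here: v is evaluated once over the whole table
+                 * and each group then gets its own sub-vector of rows.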
*/ @@ -2450,7 +3043,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { int64_t kid = dict_elems[i]->i64; if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; - if (!is_agg_expr(dict_elems[i + 1])) { has_nonagg = 1; break; } + if (!is_group_dag_agg_expr(dict_elems[i + 1])) { has_nonagg = 1; break; } } /* The post-DAG scatter needs a flat single-segment table: it @@ -2565,14 +3158,14 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; ray_t* val_expr = dict_elems[i + 1]; - if (is_agg_expr(val_expr) && n_aggs < 16) { + if (is_group_dag_agg_expr(val_expr) && n_aggs < 16) { ray_t** agg_elems = (ray_t**)ray_data(val_expr); agg_ops[n_aggs] = resolve_agg_opcode(agg_elems[0]->i64); /* Compile the aggregation input (the column reference) */ agg_ins[n_aggs] = compile_expr_dag(g, agg_elems[1]); if (!agg_ins[n_aggs]) { ray_graph_free(g); ray_release(tbl); return ray_error("domain", NULL); } n_aggs++; - } else if (!is_agg_expr(val_expr) && n_nonaggs < 16) { + } else if (!is_group_dag_agg_expr(val_expr) && n_nonaggs < 16) { nonagg_names[n_nonaggs] = kid; nonagg_exprs[n_nonaggs] = val_expr; n_nonaggs++; @@ -3467,7 +4060,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; if (n_all_user < 16) all_user_names[n_all_user++] = kid; - if (by_expr && !is_agg_expr(dict_elems[i + 1])) continue; + if (by_expr && !is_group_dag_agg_expr(dict_elems[i + 1])) continue; if (n_agg_user < 16) agg_user_names[n_agg_user++] = kid; } if (by_expr) { @@ -3612,14 +4205,101 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { KEY_READ(gk[gi], grp_key, gkt, gi); /* Build row→group_id map. Rows whose key isn't in the - * surviving group set get row_gid = -1 and are skipped. */ - for (int64_t r = 0; r < nrows; r++) { - int64_t rv; - KEY_READ(rv, orig_key, okt, r); - row_gid[r] = -1; + * surviving group set get row_gid = -1 and are skipped. + * + * For high group cardinality (n_groups large), the naive + * O(nrows * n_groups) double loop dominated runtime — + * 5M * 730K ≈ 4T comparisons. Build a value→gid hash + * instead so each row is one O(1) probe. */ + { + /* Capacity: 2 * n_groups rounded up to power of 2. + * Slot stores gid+1 (0 = empty) and the int64 key. */ + uint64_t cap = (uint64_t)n_groups * 2; + if (cap < 32) cap = 32; + uint64_t c = 1; + while (c && c < cap) c <<= 1; + if (!c) { + ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); + ray_free(off_hdr); ray_free(pos_hdr); + ray_release(result); ray_release(tbl); + return ray_error("oom", NULL); + } + cap = c; + uint64_t mask = cap - 1; + ray_t* gk_keys_hdr = NULL; + ray_t* gk_idx_hdr = NULL; + int64_t* hk_keys = (int64_t*)scratch_alloc(&gk_keys_hdr, + (size_t)cap * sizeof(int64_t)); + int32_t* hk_gid_p1 = (int32_t*)scratch_calloc(&gk_idx_hdr, + (size_t)cap * sizeof(int32_t)); + if (!hk_keys || !hk_gid_p1) { + if (gk_keys_hdr) scratch_free(gk_keys_hdr); + if (gk_idx_hdr) scratch_free(gk_idx_hdr); + ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); + ray_free(off_hdr); ray_free(pos_hdr); + ray_release(result); ray_release(tbl); + return ray_error("oom", NULL); + } + + /* If n_groups exceeds the int32 sentinel range we'd + * lose distinct gids — fall back to the int64 store + * (rare: n_groups > ~2.1 B). Otherwise i32+1 fits. 
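+             * For example with the ~730K groups above: cap = 2 * 730K
+             * rounded up to the next power of two is 2^21 = 2,097,152
+             * slots, and every gid+1 fits the i32 store with a huge
+             * margin.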
*/ + int use_i64_gid = (n_groups >= ((int64_t)1 << 31) - 1); + ray_t* gk64_hdr = NULL; + int64_t* hk_gid64 = NULL; + if (use_i64_gid) { + hk_gid64 = (int64_t*)scratch_calloc(&gk64_hdr, + (size_t)cap * sizeof(int64_t)); + if (!hk_gid64) { + scratch_free(gk_keys_hdr); scratch_free(gk_idx_hdr); + ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); + ray_free(off_hdr); ray_free(pos_hdr); + ray_release(result); ray_release(tbl); + return ray_error("oom", NULL); + } + } + + /* Insert (gk[gi] -> gi) into the hash. */ for (int64_t gi = 0; gi < n_groups; gi++) { - if (rv == gk[gi]) { row_gid[r] = gi; break; } + int64_t k = gk[gi]; + uint64_t h = (uint64_t)k * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + uint64_t s = h & mask; + for (;;) { + int64_t cur_p1 = use_i64_gid ? hk_gid64[s] + : (int64_t)hk_gid_p1[s]; + if (cur_p1 == 0) { + if (use_i64_gid) hk_gid64[s] = gi + 1; + else hk_gid_p1[s] = (int32_t)(gi + 1); + hk_keys[s] = k; + break; + } + if (hk_keys[s] == k) break; /* dup gk — keep first */ + s = (s + 1) & mask; + } + } + + /* Probe each row to assign its gid. */ + for (int64_t r = 0; r < nrows; r++) { + int64_t rv; + KEY_READ(rv, orig_key, okt, r); + uint64_t h = (uint64_t)rv * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + uint64_t s = h & mask; + int64_t found = -1; + for (;;) { + int64_t cur_p1 = use_i64_gid ? hk_gid64[s] + : (int64_t)hk_gid_p1[s]; + if (cur_p1 == 0) break; + if (hk_keys[s] == rv) { found = cur_p1 - 1; break; } + s = (s + 1) & mask; + } + row_gid[r] = found; } + + scratch_free(gk_keys_hdr); + scratch_free(gk_idx_hdr); + if (gk64_hdr) scratch_free(gk64_hdr); } #undef KEY_READ @@ -3650,6 +4330,45 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { ray_t* scatter_err = NULL; for (uint8_t ni = 0; ni < n_nonaggs && !scatter_err; ni++) { + /* Per-group count(distinct) — dispatch directly to + * exec_count_distinct on each group's slice using + * the same idx_buf+offsets+grp_cnt layout the + * streaming-AGG branch uses. + * + * High-cardinality grouping: try the single-pass + * global-hash kernel first. Falls back to the + * per-group slice path on type miss / error. */ + ray_t* cd_inner = match_count_distinct(nonagg_exprs[ni]); + if (cd_inner) { + ray_t* col = NULL; + /* Resolve the inner column for the global-hash + * fast path. Direct column refs hit the path; + * computed expressions use the per-group fallback. */ + ray_t* src_for_global = NULL; + int src_owned = 0; + if (cd_inner->type == -RAY_SYM && + (cd_inner->attrs & RAY_ATTR_NAME)) { + src_for_global = ray_table_get_col(tbl, cd_inner->i64); + } + if (src_for_global) { + col = ray_count_distinct_per_group( + src_for_global, row_gid, nrows, n_groups); + /* col == NULL → unsupported type, fall through. */ + } + if (src_owned && src_for_global) ray_release(src_for_global); + if (!col) { + col = count_distinct_per_group_buf( + cd_inner, tbl, idx_buf, offsets, grp_cnt, n_groups); + } + if (RAY_IS_ERR(col)) { scatter_err = col; break; } + result = ray_table_add_col(result, nonagg_names[ni], col); + ray_release(col); + if (RAY_IS_ERR(result)) { + scatter_err = result; result = NULL; break; + } + continue; + } + /* Streaming-style fast path for `(aggr_fn col_or_expr)` * where aggr_fn is RAY_FN_AGGR + RAY_UNARY (sum/avg/..., * med/dev/var/stddev/...). Bypasses the full-table eval @@ -3657,7 +4376,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { * group and calling the unary fn directly into a typed * vec. Equivalent perf-class to the streaming AGG path * the eval-fallback uses for the same shapes. 
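+                 * For example `(med v)` reaches this branch: med is a
+                 * RAY_FN_AGGR unary builtin that is not on the
+                 * resolve_agg_opcode whitelist, so it is not lowered to
+                 * OP_GROUP and is reduced per group here instead.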
*/ - if (is_aggr_unary_call(nonagg_exprs[ni])) { + if (is_streaming_aggr_unary_call(nonagg_exprs[ni])) { ray_t* col = aggr_unary_per_group_buf( nonagg_exprs[ni], tbl, idx_buf, offsets, grp_cnt, n_groups); @@ -3670,6 +4389,20 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { continue; } + if (is_agg_expr(nonagg_exprs[ni])) { + ray_t* per_group = nonagg_eval_per_group_buf( + nonagg_exprs[ni], tbl, idx_buf, offsets, grp_cnt, n_groups); + if (RAY_IS_ERR(per_group)) { + scatter_err = per_group; break; + } + result = ray_table_add_col(result, nonagg_names[ni], per_group); + ray_release(per_group); + if (RAY_IS_ERR(result)) { + scatter_err = result; result = NULL; break; + } + continue; + } + if (ray_env_push_scope() != RAY_OK) { scatter_err = ray_error("oom", NULL); break; } @@ -3803,6 +4536,93 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { /* (xbar col bucket) — time/value bucketing: floor(col/bucket)*bucket */ ray_t* ray_xbar_fn(ray_t* col, ray_t* bucket) { + /* Vectorised fast path for `(xbar VEC scalar_int)` on integer or + * temporal columns. The generic atomic_map_binary path was + * allocating one ray_t* atom per row and calling ray_xbar_fn + * recursively — at 5M rows this dominates (≥100 ms). A direct + * tight loop computes floor-div + multiply per element with no + * allocations. When the bucket is a power of two we lower the + * divide further to mask + arithmetic. + * + * Short-circuited only when both bucket and col are well-typed; + * everything else falls through to the recursive + * atomic_map_binary path. */ + if (col && ray_is_vec(col) && bucket && ray_is_atom(bucket) && + (bucket->type == -RAY_I64 || bucket->type == -RAY_I32 || + bucket->type == -RAY_I16) && + (col->type == RAY_I64 || col->type == RAY_I32 || + col->type == RAY_I16 || col->type == RAY_TIMESTAMP || + col->type == RAY_DATE || col->type == RAY_TIME) && + !RAY_ATOM_IS_NULL(bucket)) { + int64_t b = bucket->i64; + if (b == 0) return ray_error("domain", NULL); + int64_t n = col->len; + ray_t* out = ray_vec_new(col->type, n); + if (!out || RAY_IS_ERR(out)) return out ? out : ray_error("oom", NULL); + out->len = n; + + /* Compute (q*b) where q = floor(a/b). C division truncates + * toward zero; for negative dividend we adjust. */ + int8_t out_type = col->type; + if (out_type == RAY_I64 || out_type == RAY_TIMESTAMP) { + const int64_t* in = (const int64_t*)ray_data(col); + int64_t* o = (int64_t*)ray_data(out); + if (b > 0 && (b & (b - 1)) == 0) { + /* Bucket is a power of two on a non-negative-friendly path: + * a/b == a >> log2(b), but still need the floor adjustment + * for negative inputs. Use bitmask: q*b = a & ~(b-1) for + * non-negative `a`. For mixed-sign data this falls back + * to the general path. */ + int64_t mask = ~(b - 1); + for (int64_t i = 0; i < n; i++) { + int64_t a = in[i]; + /* Floor toward -inf for negative a too: a & mask. 
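+                     * e.g. b = 8: a = 13 gives 13 & ~7 = 8, and a = -3
+                     * gives -3 & ~7 = -8, which is 8 * floor(-3/8).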
*/ + o[i] = a & mask; + } + } else { + for (int64_t i = 0; i < n; i++) { + int64_t a = in[i]; + int64_t q = a / b; + if ((a ^ b) < 0 && q * b != a) q--; + o[i] = q * b; + } + } + } else if (out_type == RAY_I32 || out_type == RAY_DATE || out_type == RAY_TIME) { + const int32_t* in = (const int32_t*)ray_data(col); + int32_t* o = (int32_t*)ray_data(out); + int32_t b32 = (int32_t)b; + if (b32 > 0 && ((uint32_t)b32 & ((uint32_t)b32 - 1)) == 0) { + int32_t mask = (int32_t)~((uint32_t)b32 - 1); + for (int64_t i = 0; i < n; i++) o[i] = in[i] & mask; + } else { + for (int64_t i = 0; i < n; i++) { + int32_t a = in[i]; + int32_t q = a / b32; + if ((a ^ b32) < 0 && q * b32 != a) q--; + o[i] = q * b32; + } + } + } else { /* RAY_I16 */ + const int16_t* in = (const int16_t*)ray_data(col); + int16_t* o = (int16_t*)ray_data(out); + int16_t b16 = (int16_t)b; + for (int64_t i = 0; i < n; i++) { + int16_t a = in[i]; + int16_t q = a / b16; + if ((a ^ b16) < 0 && q * b16 != a) q--; + o[i] = q * b16; + } + } + + /* Propagate null bitmap if present. */ + if (col->attrs & RAY_ATTR_HAS_NULLS) { + for (int64_t i = 0; i < n; i++) + if (ray_vec_is_null(col, i)) + ray_vec_set_null(out, i, true); + } + return out; + } + /* Recursive unwrap for nested collections (list of vectors) */ if (is_collection(col) || is_collection(bucket)) return atomic_map_binary(ray_xbar_fn, col, bucket); diff --git a/src/ops/sort.c b/src/ops/sort.c index f9a701a1..a5875e27 100644 --- a/src/ops/sort.c +++ b/src/ops/sort.c @@ -3062,6 +3062,295 @@ str_msd_done:; return result; } +static void topk_cmp_sift_down(const sort_cmp_ctx_t* ctx, int64_t* heap, + int64_t n, int64_t root) { + for (;;) { + int64_t worst = root; + int64_t l = 2 * root + 1; + int64_t r = 2 * root + 2; + if (l < n && sort_cmp(ctx, heap[l], heap[worst]) > 0) worst = l; + if (r < n && sort_cmp(ctx, heap[r], heap[worst]) > 0) worst = r; + if (worst == root) break; + int64_t tmp = heap[root]; + heap[root] = heap[worst]; + heap[worst] = tmp; + root = worst; + } +} + +/* Comparator-based top-K: works for any sort key types and any number of + * keys (1..n). Used as the fallback when radix-encoded fast-path is not + * applicable (e.g. SYM, STR, multi-key). O(n log K + K log K). */ +static ray_t* topk_indices_cmp(ray_t** cols, uint8_t* descs, uint8_t* nfs, + uint8_t n_cols, int64_t nrows, int64_t k) { + if (!cols || n_cols == 0 || k <= 0 || nrows <= 0 || k >= nrows) return NULL; + for (uint8_t c = 0; c < n_cols; c++) if (!cols[c]) return NULL; + + ray_t* idx = ray_vec_new(RAY_I64, k); + if (!idx || RAY_IS_ERR(idx)) return idx ? 
idx : ray_error("oom", NULL); + idx->len = k; + int64_t* heap = (int64_t*)ray_data(idx); + for (int64_t i = 0; i < k; i++) heap[i] = i; + + sort_cmp_ctx_t ctx = { + .vecs = cols, + .desc = descs, + .nulls_first = nfs, + .n_sort = n_cols, + }; + + for (int64_t i = k / 2 - 1; i >= 0; i--) + topk_cmp_sift_down(&ctx, heap, k, i); + + for (int64_t i = k; i < nrows; i++) { + if (sort_cmp(&ctx, i, heap[0]) >= 0) continue; + heap[0] = i; + topk_cmp_sift_down(&ctx, heap, k, 0); + } + + for (int64_t i = 1; i < k; i++) { + int64_t v = heap[i]; + int64_t j = i - 1; + while (j >= 0 && sort_cmp(&ctx, v, heap[j]) < 0) { + heap[j + 1] = heap[j]; + j--; + } + heap[j + 1] = v; + } + + return idx; +} + +static ray_t* topk_indices_cmp_single(ray_t* col, uint8_t desc, uint8_t nf, + int64_t nrows, int64_t k) { + ray_t* cols[1] = { col }; + uint8_t descs[1] = { desc }; + uint8_t nfs[1] = { nf }; + return topk_indices_cmp(cols, descs, nfs, 1, nrows, k); +} + +/* -------------------------------------------------------------------------- + * Top-K bounded-heap selection on a single sort key. + * + * Replaces a full O(n log n) sort + take-K with O(n log K + K log K) when + * K << n. At plan time, the apply_sort_take / projection paths detect + * "single sort key + small atom take" and call this in lieu of OP_SORT + + * OP_HEAD. Multi-key, take-range, or take-K-near-n cases keep the + * existing fused sort+limit path (which is already O(n log n) bounded + * with K-row gather). + * + * Implementation: encode each row's key to a uint64 (same encoding + * radix_encode_fn uses, so smaller key = earlier in ASC order, with DESC + * already pre-flipped). Maintain a max-heap of K (key, original_idx) + * pairs; for each row r > K, if r's encoded key is smaller than the + * heap-top key, replace the top and sift down. After the scan, sort + * the K (key, idx) pairs by key ascending — the result is the top-K + * indices in the user's requested order. + * + * Supported types: I64, I32, I16, U8, BOOL, F64, DATE, TIME, + * TIMESTAMP, plus SYM via a comparator heap. STR/GUID fall through + * to the caller (return NULL → caller uses full sort). Returns NULL + * on any unsupported configuration so the caller's fallback path + * handles it. + * -------------------------------------------------------------------------- */ +static ray_t* topk_indices_single(ray_t* col, uint8_t desc, uint8_t nf, + int64_t nrows, int64_t k) { + if (!col || k <= 0 || nrows <= 0) return NULL; + if (k >= nrows) return NULL; /* full sort is at least as good */ + + int8_t type = col->type; + /* Whitelist of types where radix_encode_fn produces an order-preserving + * uint64 — exactly the cases topk can handle without a comparator. */ + bool ok = (type == RAY_I64 || type == RAY_TIMESTAMP || type == RAY_F64 || + type == RAY_I32 || type == RAY_DATE || type == RAY_TIME || + type == RAY_SYM || type == RAY_I16 || + type == RAY_BOOL || type == RAY_U8); + if (!ok) return NULL; + + if (type == RAY_SYM) + return topk_indices_cmp_single(col, desc, nf, nrows, k); + + /* Encode all rows to a single uint64 key array. 
*/ + ray_t* keys_hdr = NULL; + uint64_t* keys = (uint64_t*)scratch_alloc(&keys_hdr, + (size_t)nrows * sizeof(uint64_t)); + if (!keys) return NULL; + + radix_encode_ctx_t enc = { + .keys = keys, + .indices = NULL, + .data = ray_data(col), + .col = col, + .type = type, + .col_attrs = col->attrs, + .desc = desc != 0, + .nulls_first = nf != 0, + .enum_rank = NULL, + .n_keys = 1, + }; + /* Single-threaded encode is plenty for the heap pass that follows; + * radix_encode_fn handles the type/desc/nulls dispatch correctly. */ + radix_encode_fn(&enc, 0, 0, nrows); + + /* Max-heap of K (key, idx) pairs. Stored in two parallel arrays + * for cache locality on the comparison path. */ + ray_t* hk_hdr = NULL; + ray_t* hi_hdr = NULL; + uint64_t* hk = (uint64_t*)scratch_alloc(&hk_hdr, (size_t)k * sizeof(uint64_t)); + int64_t* hi = (int64_t*)scratch_alloc(&hi_hdr, (size_t)k * sizeof(int64_t)); + if (!hk || !hi) { + if (hk_hdr) scratch_free(hk_hdr); + if (hi_hdr) scratch_free(hi_hdr); + scratch_free(keys_hdr); + return NULL; + } + + /* Seed with the first K rows. */ + for (int64_t i = 0; i < k; i++) { hk[i] = keys[i]; hi[i] = i; } + + /* Heapify (build max-heap on hk[]). */ + for (int64_t i = k / 2 - 1; i >= 0; i--) { + int64_t idx = i; + for (;;) { + int64_t largest = idx; + int64_t l = 2 * idx + 1, r = 2 * idx + 2; + if (l < k && hk[l] > hk[largest]) largest = l; + if (r < k && hk[r] > hk[largest]) largest = r; + if (largest == idx) break; + uint64_t tk = hk[idx]; hk[idx] = hk[largest]; hk[largest] = tk; + int64_t ti = hi[idx]; hi[idx] = hi[largest]; hi[largest] = ti; + idx = largest; + } + } + + /* Scan remaining rows, push when the new key is strictly smaller + * than heap-top. Sift the new root down to restore the max-heap. */ + for (int64_t i = k; i < nrows; i++) { + if (keys[i] >= hk[0]) continue; + hk[0] = keys[i]; + hi[0] = i; + int64_t idx = 0; + for (;;) { + int64_t largest = idx; + int64_t l = 2 * idx + 1, r = 2 * idx + 2; + if (l < k && hk[l] > hk[largest]) largest = l; + if (r < k && hk[r] > hk[largest]) largest = r; + if (largest == idx) break; + uint64_t tk = hk[idx]; hk[idx] = hk[largest]; hk[largest] = tk; + int64_t ti = hi[idx]; hi[idx] = hi[largest]; hi[largest] = ti; + idx = largest; + } + } + + /* The heap contains the K best (smallest key) rows but unsorted. + * Sort by key ascending so the gather order matches a full sort. */ + key_heapsort(hk, hi, k); + + /* Build the result I64 vec of indices. */ + ray_t* result = ray_vec_new(RAY_I64, k); + if (!result || RAY_IS_ERR(result)) { + scratch_free(hk_hdr); scratch_free(hi_hdr); + scratch_free(keys_hdr); + return result ? result : ray_error("oom", NULL); + } + result->len = k; + memcpy(ray_data(result), hi, (size_t)k * sizeof(int64_t)); + + scratch_free(hk_hdr); scratch_free(hi_hdr); + scratch_free(keys_hdr); + return result; +} + +/* Gather K rows of `tbl` at the given indices and return a new table. + * Used by both single-key and multi-key top-K paths. Releases `idx`. 
*/ +static ray_t* topk_gather_rows(ray_t* tbl, ray_t* idx, int64_t k) { + int64_t* idx_data = (int64_t*)ray_data(idx); + int64_t ncols = ray_table_ncols(tbl); + + ray_t* result = ray_table_new(ncols); + if (!result || RAY_IS_ERR(result)) { ray_release(idx); return result; } + for (int64_t c = 0; c < ncols; c++) { + ray_t* src = ray_table_get_col_idx(tbl, c); + int64_t name = ray_table_col_name(tbl, c); + if (!src) continue; + ray_t* dst; + if (src->type == RAY_LIST) { + dst = ray_list_new(k); + if (!dst || RAY_IS_ERR(dst)) { + ray_release(idx); ray_release(result); + return dst ? dst : ray_error("oom", NULL); + } + ray_t** sp = (ray_t**)ray_data(src); + ray_t** dp = (ray_t**)ray_data(dst); + for (int64_t i = 0; i < k; i++) { + dp[i] = sp[idx_data[i]]; + if (dp[i]) ray_retain(dp[i]); + } + dst->len = k; + } else { + dst = gather_by_idx(src, idx_data, k); + if (!dst || RAY_IS_ERR(dst)) { + ray_release(idx); ray_release(result); + return dst ? dst : ray_error("oom", NULL); + } + } + result = ray_table_add_col(result, name, dst); + ray_release(dst); + if (RAY_IS_ERR(result)) { ray_release(idx); return result; } + } + ray_release(idx); + return result; +} + +/* Public top-K gather: returns a new table of `k` rows of `tbl`, sorted by + * `col` in the requested direction. When the inputs don't match the + * single-key fast-path (multi-key, unsupported type, etc.), returns NULL + * so the caller can fall back to the full-sort path. */ +ray_t* ray_topk_table(ray_t* tbl, ray_t* col, uint8_t desc, uint8_t nf, + int64_t k) { + if (!tbl || tbl->type != RAY_TABLE || !col) return NULL; + int64_t nrows = ray_table_nrows(tbl); + if (k <= 0 || nrows <= 0) return NULL; + if (k >= nrows) return NULL; + int64_t ncols = ray_table_ncols(tbl); + for (int64_t c = 0; c < ncols; c++) { + ray_t* src = ray_table_get_col_idx(tbl, c); + if (src && src->type == RAY_LIST) return NULL; + } + + ray_t* idx = topk_indices_single(col, desc, nf, nrows, k); + if (!idx) return NULL; + return topk_gather_rows(tbl, idx, k); +} + +/* Multi-key top-K: comparator-based bounded heap across `n_keys` columns. + * Falls back to a comparator heap (no radix encoding) since multi-key + * radix encoding requires uniform-width packed keys. Returns NULL when + * the inputs aren't supported (n_keys==0, K>=nrows, LIST columns) so the + * caller can fall back to a full sort. Cost is O(n_rows * n_keys * log K + * + K log K) in comparisons — wins decisively when K << n_rows even with + * the per-compare overhead. All key columns must come from the same + * table; row indices are interpreted into each column at the same + * position. 
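+ * For example a sort on two plain key columns with a small take routes
+ * here from the exec_sort SORT+HEAD shortcut; computed sort keys never
+ * do, because that shortcut requires every key to be a direct OP_SCAN.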
*/ +ray_t* ray_topk_table_multi(ray_t* tbl, ray_t** key_cols, uint8_t* descs, + uint8_t* nfs, uint8_t n_keys, int64_t k) { + if (!tbl || tbl->type != RAY_TABLE || !key_cols || n_keys == 0) return NULL; + int64_t nrows = ray_table_nrows(tbl); + if (k <= 0 || nrows <= 0 || k >= nrows) return NULL; + int64_t ncols = ray_table_ncols(tbl); + for (int64_t c = 0; c < ncols; c++) { + ray_t* src = ray_table_get_col_idx(tbl, c); + if (src && src->type == RAY_LIST) return NULL; + } + for (uint8_t i = 0; i < n_keys; i++) + if (!key_cols[i] || key_cols[i]->len < nrows) return NULL; + + ray_t* idx = topk_indices_cmp(key_cols, descs, nfs, n_keys, nrows, k); + if (!idx) return NULL; + if (RAY_IS_ERR(idx)) return idx; + return topk_gather_rows(tbl, idx, k); +} + ray_t* ray_sort_indices(ray_t** cols, uint8_t* descs, uint8_t* nulls_first, uint8_t n_cols, int64_t nrows) { return sort_indices_ex(cols, descs, nulls_first, n_cols, nrows, NULL, NULL); @@ -3126,6 +3415,58 @@ ray_t* exec_sort(ray_graph_t* g, ray_op_t* op, ray_t* tbl, int64_t limit) { uint8_t n_sort = ext->sort.n_cols; if (n_sort > 16) return ray_error("nyi", NULL); /* radix_encode_ctx_t limit */ + /* ---- Top-K bounded-heap shortcut ---- + * Triggered by the SORT+HEAD fusion (HEAD passes limit > 0). When + * K is well below nrows (K << n) and every sort key is a direct + * OP_SCAN of a column on `tbl`, run a heap-based partial selection + * in O(n log K + K log K) instead of the full O(n log n) sort. + * Single key → radix-encoded fast path; multi-key → comparator + * heap (still O(n log K) in compares, big win when K << n). + * Falls through to the full sort whenever the topk path returns + * NULL (unsupported type, computed-key sort, etc.). */ + if (limit > 0 && n_sort >= 1 && limit < nrows && limit <= 8192 && + g && g->selection == NULL) { + ray_t* key_cols[16]; + int all_scan = 1; + for (uint8_t k = 0; k < n_sort; k++) { + ray_op_t* key_op = ext->sort.columns[k]; + ray_op_ext_t* key_ext = find_ext(g, key_op->id); + if (key_ext && key_ext->base.opcode == OP_SCAN) { + key_cols[k] = ray_table_get_col(tbl, key_ext->sym); + if (!key_cols[k]) { all_scan = 0; break; } + } else { + all_scan = 0; + break; + } + } + if (all_scan) { + if (n_sort == 1) { + uint8_t desc = ext->sort.desc ? ext->sort.desc[0] : 0; + uint8_t nf = ext->sort.nulls_first + ? ext->sort.nulls_first[0] + : !desc; + ray_t* topk_res = ray_topk_table(tbl, key_cols[0], desc, nf, limit); + if (topk_res && !RAY_IS_ERR(topk_res)) return topk_res; + if (topk_res && RAY_IS_ERR(topk_res)) ray_release(topk_res); + } else { + /* Default nulls-first to !desc per-key when caller + * didn't supply a vector. */ + uint8_t nfs[16]; + for (uint8_t k = 0; k < n_sort; k++) { + uint8_t d = ext->sort.desc ? ext->sort.desc[k] : 0; + nfs[k] = ext->sort.nulls_first + ? ext->sort.nulls_first[k] + : !d; + } + ray_t* topk_res = ray_topk_table_multi(tbl, key_cols, + ext->sort.desc, nfs, n_sort, limit); + if (topk_res && !RAY_IS_ERR(topk_res)) return topk_res; + if (topk_res && RAY_IS_ERR(topk_res)) ray_release(topk_res); + } + /* topk_res == NULL → unsupported config, fall through. */ + } + } + /* Resolve sort key vectors */ ray_t* sort_vecs[n_sort > 0 ? n_sort : 1]; uint8_t sort_owned[n_sort > 0 ? 
n_sort : 1]; diff --git a/src/ops/string.c b/src/ops/string.c index e9430340..dd013874 100644 --- a/src/ops/string.c +++ b/src/ops/string.c @@ -23,12 +23,44 @@ #include "ops/internal.h" #include "ops/glob.h" +#include "core/pool.h" /* ============================================================================ * OP_LIKE: glob pattern matching on STR / SYM columns. See ops/glob.[ch]. * Syntax: * (any), ? (one char), [abc] / [a-z] / [!abc] (character class). * ============================================================================ */ +/* Pattern-resolve worker for the SYM-LIKE fast path. Runs over a + * range of sym_ids; for each marked-as-seen sid, runs the matcher and + * writes the answer to lut[sid]. Pure read-only on the inputs after + * the seen-mark phase, so workers are independent. */ +typedef struct { + ray_t** sym_strings; + uint8_t* seen; + uint8_t* lut; + const ray_glob_compiled_t* pc; + bool use_simple; + const char* pat_str; + size_t pat_len; +} like_resolve_ctx_t; + +static void like_resolve_fn(void* ctx, uint32_t worker_id, + int64_t start, int64_t end) { + (void)worker_id; + like_resolve_ctx_t* x = (like_resolve_ctx_t*)ctx; + for (int64_t sid = start; sid < end; sid++) { + if (!x->seen[sid]) continue; + ray_t* str = x->sym_strings[sid]; + if (!str) { x->lut[sid] = 0; continue; } + const char* sp = ray_str_ptr(str); + size_t sl = ray_str_len(str); + x->lut[sid] = (x->use_simple + ? ray_glob_match_compiled(x->pc, sp, sl) + : ray_glob_match(sp, sl, x->pat_str, x->pat_len)) + ? 1 : 0; + } +} + ray_t* exec_like(ray_graph_t* g, ray_op_t* op) { ray_t* input = exec_node(g, op->inputs[0]); ray_t* pat_v = exec_node(g, op->inputs[1]); @@ -39,6 +71,13 @@ ray_t* exec_like(ray_graph_t* g, ray_op_t* op) { const char* pat_str = ray_str_ptr(pat_v); size_t pat_len = ray_str_len(pat_v); + /* Pre-compile pattern into the simple-shape form when possible — the + * substring/prefix/suffix branches drive memmem/memcmp directly, + * roughly an order of magnitude faster than the iterative matcher + * for the very common `*literal*` shape. */ + ray_glob_compiled_t pc = ray_glob_compile(pat_str, pat_len); + bool use_simple = pc.shape != RAY_GLOB_SHAPE_NONE; + int64_t len = input->len; ray_t* result = ray_vec_new(RAY_BOOL, len); if (!result || RAY_IS_ERR(result)) { @@ -55,17 +94,125 @@ ray_t* exec_like(ray_graph_t* g, ray_op_t* op) { for (int64_t i = 0; i < len; i++) { const char* sp = ray_str_t_ptr(&elems[i], pool); size_t sl = elems[i].len; - dst[i] = ray_glob_match(sp, sl, pat_str, pat_len) ? 1 : 0; + dst[i] = (use_simple + ? ray_glob_match_compiled(&pc, sp, sl) + : ray_glob_match(sp, sl, pat_str, pat_len)) ? 1 : 0; } } else if (RAY_IS_SYM(in_type)) { + /* Dictionary-cached fast path. + * + * Three-phase pipeline: + * (1) seen-mark — single sequential row scan that flips a + * byte in `seen[]` for every referenced sym_id. Cheap; + * just sets a byte per row. + * (2) parallel pattern resolve — partition the dict_n range + * across pool workers; for each sid where seen[sid]==1, + * run the matcher and store the answer in lut[sid]. + * (3) parallel row projection — every row reads lut[sid_i]. + * + * Splitting the resolve from the row scan lets phase (2) drive + * the pattern matcher (memmem on long URL strings) across the + * worker pool. ray_sym_count is the GLOBAL dictionary so for + * a low-card column like BrowserCountry phase (1) keeps the + * resolve work bounded to that column's actual sym_ids. 
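+       * For example 5M rows of a 54-distinct column such as
+       * BrowserCountry: phase (1) sets at most 54 seen bytes, phase (2)
+       * runs the matcher at most 54 times, and phase (3) is 5M one-byte
+       * LUT loads.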
*/ const void* base = ray_data(input); - for (int64_t i = 0; i < len; i++) { - int64_t sym_id = ray_read_sym(base, i, in_type, input->attrs); - ray_t* s = ray_sym_str(sym_id); - if (!s) { dst[i] = 0; continue; } - const char* sp = ray_str_ptr(s); - size_t sl = ray_str_len(s); - dst[i] = ray_glob_match(sp, sl, pat_str, pat_len) ? 1 : 0; + ray_t** sym_strings = NULL; + uint32_t dict_n = 0; + ray_sym_strings_borrow(&sym_strings, &dict_n); + ray_t* lut_hdr = NULL; + ray_t* seen_hdr = NULL; + uint8_t* lut = NULL; + uint8_t* seen = NULL; + if (dict_n > 0) { + lut = (uint8_t*)scratch_alloc (&lut_hdr, (size_t)dict_n); + seen = (uint8_t*)scratch_calloc(&seen_hdr, (size_t)dict_n); + } + if (lut && seen) { + int sym_w = (int)(input->attrs & RAY_SYM_W_MASK); + + /* Phase 1: mark used sym_ids. Width-specialised. */ + switch (sym_w) { + case RAY_SYM_W8: { + const uint8_t* d = (const uint8_t*)base; + for (int64_t i = 0; i < len; i++) { + uint64_t sid = d[i]; + if (sid < dict_n) seen[sid] = 1; + } + break; + } + case RAY_SYM_W16: { + const uint16_t* d = (const uint16_t*)base; + for (int64_t i = 0; i < len; i++) { + uint64_t sid = d[i]; + if (sid < dict_n) seen[sid] = 1; + } + break; + } + case RAY_SYM_W32: { + const uint32_t* d = (const uint32_t*)base; + for (int64_t i = 0; i < len; i++) { + uint64_t sid = d[i]; + if (sid < dict_n) seen[sid] = 1; + } + break; + } + case RAY_SYM_W64: + default: { + const int64_t* d = (const int64_t*)base; + for (int64_t i = 0; i < len; i++) { + int64_t sid = d[i]; + if ((uint64_t)sid < dict_n) seen[sid] = 1; + } + break; + } + } + + /* Phase 2: parallel pattern resolve over the dict range. */ + like_resolve_ctx_t rctx = { + .sym_strings = sym_strings, .seen = seen, .lut = lut, + .pc = &pc, .use_simple = use_simple, + .pat_str = pat_str, .pat_len = pat_len, + }; + ray_pool_t* pool = ray_pool_get(); + if (pool && (int64_t)dict_n >= 16384) { + ray_pool_dispatch(pool, like_resolve_fn, &rctx, (int64_t)dict_n); + } else { + like_resolve_fn(&rctx, 0, 0, (int64_t)dict_n); + } + + /* Phase 3: row projection (sequential — already a tight + * gather over a 1-byte LUT). Width-specialised. */ + #define LIKE_ROW_PASS(LOAD) \ + for (int64_t i = 0; i < len; i++) { \ + int64_t sid = (LOAD); \ + dst[i] = ((uint64_t)sid < (uint64_t)dict_n) ? lut[sid] : 0; \ + } + switch (sym_w) { + case RAY_SYM_W8: { const uint8_t* d = base; LIKE_ROW_PASS(d[i]) break; } + case RAY_SYM_W16: { const uint16_t* d = base; LIKE_ROW_PASS(d[i]) break; } + case RAY_SYM_W32: { const uint32_t* d = base; LIKE_ROW_PASS(d[i]) break; } + case RAY_SYM_W64: + default: { const int64_t* d = base; LIKE_ROW_PASS(d[i]) break; } + } + #undef LIKE_ROW_PASS + + scratch_free(lut_hdr); + scratch_free(seen_hdr); + } else { + /* OOM building the LUT: fall back to per-row scan. */ + if (lut_hdr) scratch_free(lut_hdr); + if (seen_hdr) scratch_free(seen_hdr); + for (int64_t i = 0; i < len; i++) { + int64_t sym_id = ray_read_sym(base, i, in_type, input->attrs); + ray_t* s = (sym_strings && (uint64_t)sym_id < (uint64_t)dict_n) + ? sym_strings[sym_id] : NULL; + if (!s) { dst[i] = 0; continue; } + const char* sp = ray_str_ptr(s); + size_t sl = ray_str_len(s); + dst[i] = (use_simple + ? ray_glob_match_compiled(&pc, sp, sl) + : ray_glob_match(sp, sl, pat_str, pat_len)) ? 1 : 0; + } } } else { memset(dst, 0, (size_t)len); @@ -105,12 +252,43 @@ ray_t* exec_ilike(ray_graph_t* g, ray_op_t* op) { dst[i] = ray_glob_match_ci(sp, sl, pat_str, pat_len) ? 1 : 0; } } else if (RAY_IS_SYM(in_type)) { + /* Dictionary-cached fast path — see exec_like. 
*/ const void* base = ray_data(input); - for (int64_t i = 0; i < len; i++) { - int64_t sym_id = ray_read_sym(base, i, in_type, input->attrs); - ray_t* s = ray_sym_str(sym_id); - if (!s) { dst[i] = 0; continue; } - dst[i] = ray_glob_match_ci(ray_str_ptr(s), ray_str_len(s), pat_str, pat_len) ? 1 : 0; + uint32_t dict_n = ray_sym_count(); + ray_t* lut_hdr = NULL; + ray_t* seen_hdr = NULL; + uint8_t* lut = NULL; + uint8_t* seen = NULL; + if (dict_n > 0) { + lut = (uint8_t*)scratch_alloc (&lut_hdr, (size_t)dict_n); + seen = (uint8_t*)scratch_calloc(&seen_hdr, (size_t)dict_n); + } + if (lut && seen) { + for (int64_t i = 0; i < len; i++) { + int64_t sid = ray_read_sym(base, i, in_type, input->attrs); + if ((uint64_t)sid >= (uint64_t)dict_n) { dst[i] = 0; continue; } + if (!seen[sid]) { + ray_t* s = ray_sym_str(sid); + if (!s) { lut[sid] = 0; } + else { + lut[sid] = ray_glob_match_ci(ray_str_ptr(s), ray_str_len(s), + pat_str, pat_len) ? 1 : 0; + } + seen[sid] = 1; + } + dst[i] = lut[sid]; + } + scratch_free(lut_hdr); + scratch_free(seen_hdr); + } else { + if (lut_hdr) scratch_free(lut_hdr); + if (seen_hdr) scratch_free(seen_hdr); + for (int64_t i = 0; i < len; i++) { + int64_t sym_id = ray_read_sym(base, i, in_type, input->attrs); + ray_t* s = ray_sym_str(sym_id); + if (!s) { dst[i] = 0; continue; } + dst[i] = ray_glob_match_ci(ray_str_ptr(s), ray_str_len(s), pat_str, pat_len) ? 1 : 0; + } } } else { memset(dst, 0, (size_t)len); diff --git a/src/ops/strop.c b/src/ops/strop.c index 9744398b..4ff123e9 100644 --- a/src/ops/strop.c +++ b/src/ops/strop.c @@ -22,6 +22,7 @@ */ #include "lang/internal.h" +#include "ops/internal.h" #include "table/sym.h" #include "ops/glob.h" @@ -202,6 +203,13 @@ ray_t* ray_like_fn(ray_t* x, ray_t* pattern) { const char* pat = ray_str_ptr(pattern); size_t pat_len = ray_str_len(pattern); + /* Pre-compile the pattern once. Most ClickBench LIKE shapes are + * `*literal*` (substring) which collapses to a memmem call — the + * libc-provided implementation is SIMD on glibc/Apple/BSD. When the + * shape is RAY_GLOB_SHAPE_NONE we keep the iterative matcher. */ + ray_glob_compiled_t pc = ray_glob_compile(pat, pat_len); + bool use_simple = pc.shape != RAY_GLOB_SHAPE_NONE; + /* Atom: single match */ if (x->type == -RAY_STR || x->type == -RAY_SYM) { const char* s; size_t sl; @@ -214,7 +222,8 @@ ray_t* ray_like_fn(ray_t* x, ray_t* pattern) { s = ray_str_ptr(x); sl = ray_str_len(x); } - bool m = ray_glob_match(s, sl, pat, pat_len); + bool m = use_simple ? ray_glob_match_compiled(&pc, s, sl) + : ray_glob_match(s, sl, pat, pat_len); if (sym_str) ray_release(sym_str); return make_bool(m ? 1 : 0); } @@ -228,20 +237,118 @@ ray_t* ray_like_fn(ray_t* x, ray_t* pattern) { uint8_t* out = (uint8_t*)ray_data(result); if (x->type == RAY_SYM) { - int64_t* sym_ids = (int64_t*)ray_data(x); - for (int64_t i = 0; i < n; i++) { - ray_t* sym_str = ray_sym_str(sym_ids[i]); - const char* s = sym_str ? ray_str_ptr(sym_str) : ""; - size_t sl = sym_str ? ray_str_len(sym_str) : 0; - out[i] = ray_glob_match(s, sl, pat, pat_len) ? 1 : 0; - if (sym_str) ray_release(sym_str); + /* SYM column is dictionary-encoded with adaptive widths + * (W8/W16/W32/W64). Two bugs to avoid: + * (a) Reading the column as int64_t* is wrong for any + * width != W64 — must use ray_read_sym. + * (b) ray_sym_str returns a borrowed pointer; releasing + * it would decrement the global sym table entry. + * + * Fast path: a SYM column with N rows references at most + * D = ray_sym_count() distinct sym_ids. 
Build a + * sym_id → bool LUT with a "seen" bitmap so each sym_id + * runs the glob matcher at most once. For LIKE on URL + * (1.7M unique values, 5M rows) this turns an O(n_rows) + * pattern-scan into O(n_distinct + n_rows) — the second + * pass is a single byte load + table lookup per row. */ + const void* base = ray_data(x); + int8_t in_type = x->type; + uint8_t in_attrs = x->attrs; + + /* The global sym table can be much larger than the set of + * IDs this column references (e.g. BrowserCountry with 54 + * uniques in a process that's also loaded URL with 1.7M + * uniques). Lazy-resolve via the seen bitmap so we only + * match against sym_ids actually touched. ray_sym_strings_borrow + * snapshots the strings array under one lock so each lookup + * is a plain pointer load. */ + ray_t** sym_strings = NULL; + uint32_t dict_n = 0; + ray_sym_strings_borrow(&sym_strings, &dict_n); + ray_t* lut_hdr = NULL; + ray_t* seen_hdr = NULL; + uint8_t* lut = NULL; + uint8_t* seen = NULL; + if (dict_n > 0) { + lut = (uint8_t*)scratch_alloc (&lut_hdr, (size_t)dict_n); + seen = (uint8_t*)scratch_calloc(&seen_hdr, (size_t)dict_n); + } + if (lut && seen) { + /* First pass: discover the unique sym_ids referenced and + * resolve each pattern match exactly once. Second pass: + * width-specialised LUT projection so the per-row loop + * is a tight gather. */ + int sym_w = (int)(in_attrs & RAY_SYM_W_MASK); + #define DICT_PASS(LOAD) \ + for (int64_t i = 0; i < n; i++) { \ + int64_t sid = (LOAD); \ + if ((uint64_t)sid >= (uint64_t)dict_n) continue; \ + if (!seen[sid]) { \ + ray_t* s = sym_strings[sid]; \ + const char* sp = s ? ray_str_ptr(s) : ""; \ + size_t sl = s ? ray_str_len(s) : 0; \ + lut[sid] = (use_simple \ + ? ray_glob_match_compiled(&pc, sp, sl)\ + : ray_glob_match(sp, sl, pat, pat_len)) \ + ? 1 : 0; \ + seen[sid] = 1; \ + } \ + } + #define ROW_PASS(LOAD) \ + for (int64_t i = 0; i < n; i++) { \ + int64_t sid = (LOAD); \ + out[i] = ((uint64_t)sid < (uint64_t)dict_n) ? lut[sid] : 0; \ + } + switch (sym_w) { + case RAY_SYM_W8: { + const uint8_t* d = (const uint8_t*)base; + DICT_PASS(d[i]) ROW_PASS(d[i]) break; + } + case RAY_SYM_W16: { + const uint16_t* d = (const uint16_t*)base; + DICT_PASS(d[i]) ROW_PASS(d[i]) break; + } + case RAY_SYM_W32: { + const uint32_t* d = (const uint32_t*)base; + DICT_PASS(d[i]) ROW_PASS(d[i]) break; + } + case RAY_SYM_W64: + default: { + const int64_t* d = (const int64_t*)base; + DICT_PASS(d[i]) ROW_PASS(d[i]) break; + } + } + #undef DICT_PASS + #undef ROW_PASS + scratch_free(lut_hdr); + scratch_free(seen_hdr); + } else { + /* OOM building the LUT: fall back to per-row scan. Still + * uses ray_read_sym for adaptive-width correctness. */ + if (lut_hdr) scratch_free(lut_hdr); + if (seen_hdr) scratch_free(seen_hdr); + for (int64_t i = 0; i < n; i++) { + int64_t sid = ray_read_sym(base, i, in_type, in_attrs); + ray_t* s = (sym_strings && (uint64_t)sid < (uint64_t)dict_n) + ? sym_strings[sid] : NULL; + const char* sp = s ? ray_str_ptr(s) : ""; + size_t sl = s ? ray_str_len(s) : 0; + out[i] = (use_simple + ? ray_glob_match_compiled(&pc, sp, sl) + : ray_glob_match(sp, sl, pat, pat_len)) ? 1 : 0; + } } } else { /* RAY_STR vector */ for (int64_t i = 0; i < n; i++) { size_t slen; const char* s = ray_str_vec_get(x, i, &slen); - out[i] = (s && ray_glob_match(s, slen, pat, pat_len)) ? 1 : 0; + bool m = false; + if (s) { + m = use_simple ? ray_glob_match_compiled(&pc, s, slen) + : ray_glob_match(s, slen, pat, pat_len); + } + out[i] = m ? 
1 : 0; } } return result; diff --git a/src/table/sym.c b/src/table/sym.c index 02d1e1a3..a788b3cd 100644 --- a/src/table/sym.c +++ b/src/table/sym.c @@ -833,6 +833,32 @@ uint32_t ray_sym_count(void) { return count; } +/* -------------------------------------------------------------------------- + * ray_sym_strings_borrow + * + * Single-shot snapshot of the sym→string table for hot read-only + * scanners (LIKE, dictionary projection, …). ray_sym_str takes a spin + * lock per call; iterating all 1.7M URL dict entries via ray_sym_str + * means 1.7M lock acquisitions. This routine takes the lock once, + * captures the array pointer + length, drops the lock, and lets the + * caller iterate lock-free. + * + * Validity: only safe during read-only phases (no concurrent + * ray_sym_intern). ray_sym_intern can realloc g_sym.strings, after + * which the returned pointer is dangling. Today's pipeline is one + * pass: bulk-intern at CSV load, then run queries against the frozen + * table — exactly the contract this borrow form needs. + * -------------------------------------------------------------------------- */ +void ray_sym_strings_borrow(ray_t*** out_strings, uint32_t* out_count) { + if (out_strings) *out_strings = NULL; + if (out_count) *out_count = 0; + if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return; + sym_lock(); + if (out_strings) *out_strings = g_sym.strings; + if (out_count) *out_count = g_sym.str_count; + sym_unlock(); +} + /* -------------------------------------------------------------------------- * ray_sym_ensure_cap -- pre-grow hash table and strings array * diff --git a/test/rfl/collection/at.rfl b/test/rfl/collection/at.rfl index ae879282..90571ff5 100644 --- a/test/rfl/collection/at.rfl +++ b/test/rfl/collection/at.rfl @@ -6,6 +6,10 @@ ;; vector of indices returns vector of elements (at [10 20 30 40 50] [0 2 4]) -- [10 30 50] +;; table row indices return a table, not a boxed list of row dicts +(type (at (table [a b] (list [1 2 3] [4 5 6])) [0 2])) -- 'TABLE +(at (at (table [a b] (list [1 2 3] [4 5 6])) [0 2]) 'a) -- [1 3] + ;; at 0 == first (set V (rand 50 1000)) (at V 0) -- (first V) diff --git a/test/rfl/integration/cross_type_workout.rfl b/test/rfl/integration/cross_type_workout.rfl index 4a78bd34..562947f4 100644 --- a/test/rfl/integration/cross_type_workout.rfl +++ b/test/rfl/integration/cross_type_workout.rfl @@ -199,6 +199,7 @@ ;; "corrupt" path with SYM that's tracked separately. Use only ;; numeric columns here. (set Tplain (table [id price qty] (list (at T 'id) (at T 'price) (at T 'qty)))) +(.sys.exec "rm -rf /tmp/cross_type_workout_splayed") (.db.splayed.set "/tmp/cross_type_workout_splayed/" Tplain) (set Sp (.db.splayed.get "/tmp/cross_type_workout_splayed/")) (count Sp) -- 200 diff --git a/test/rfl/ops/query_coverage.rfl b/test/rfl/ops/query_coverage.rfl index ac045c2b..bb432960 100644 --- a/test/rfl/ops/query_coverage.rfl +++ b/test/rfl/ops/query_coverage.rfl @@ -149,6 +149,26 @@ (set TStr (table [Name v] (list (list "alpha" "beta" "alpha" "gamma" "beta") [10 20 30 40 50]))) (count (select {from: TStr by: Name})) -- 3 +;; COUNT(DISTINCT col) per group is a real aggregate, but `distinct` +;; must run on each group's slice rather than on the full column before +;; OP_GROUP. Numeric keys take the DAG group-boundary + per-group eval path. 
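+;; With the fixture below, g=1 covers u = [10 10] (1 distinct value) and
+;; g=2 covers [20 21 20] (2 distinct values), so the summed count is 3.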
+(set TCD (table [g u] (list [1 1 2 2 2] [10 10 20 21 20]))) +(sum (at (select {u: (count (distinct u)) from: TCD by: g}) 'u)) -- 3 + +;; STR keys force the eval-level group fallback; the same count-distinct +;; expression must still be evaluated per group, not broadcast from the +;; whole table. +(set TCDS (table [k u] (list (as 'STR ["a" "a" "b" "b" ""]) [1 2 2 2 3]))) +(sum (at (select {u: (count (distinct u)) from: TCDS by: k}) 'u)) -- 4 + +;; Multi-key group-by with a materialised computed key plus a STR key: +;; by-dict pre-eval rewrites `{m: (...) s: S}` to a SYM-vector key list. +;; The DAG group path can't handle STR keys, so this takes the eval-level +;; composite-key fallback. +(set TG2S (table [ts s u] (list (as 'TIMESTAMP [0 60000000000 60000000000 120000000000]) (as 'STR ["a" "a" "b" "b"]) [10 11 12 13]))) +(count (select {c: (count u) from: TG2S by: {m: (minute ts) s: s}})) -- 4 +(sum (at (select {c: (count u) from: TG2S by: {m: (minute ts) s: s}}) 'c)) -- 4 + ;; ==================================================================== ;; GUID first-of-group fast path — query.c:1945-2099. Pure ;; `(select {from: t by: G})` with no agg/non-agg expressions takes diff --git a/test/rfl/system/read_csv.rfl b/test/rfl/system/read_csv.rfl index 946a745b..0258c233 100644 --- a/test/rfl/system/read_csv.rfl +++ b/test/rfl/system/read_csv.rfl @@ -15,5 +15,6 @@ (.sys.exec "awk 'BEGIN{print \"id,sym\"; for(i=0;i<20000;i++) printf(\"%d,s%d\\n\",i,i)}' > rf_test_syms.csv") -- 0 (count (.csv.read [I64 SYMBOL] "rf_test_syms.csv")) -- 20000 +(count (read-csv [I64 SYMBOL] "rf_test_syms.csv")) -- 20000 (.sys.exec "rm -f rf_test_syms.csv") -- 0 diff --git a/test/rfl/system/reserved_namespace.rfl b/test/rfl/system/reserved_namespace.rfl index 373c3b29..acceef7c 100644 --- a/test/rfl/system/reserved_namespace.rfl +++ b/test/rfl/system/reserved_namespace.rfl @@ -68,6 +68,9 @@ (nil? .ipc.send) -- false (nil? .csv.read) -- false (nil? .csv.write) -- false +;; Python compatibility aliases resolve to the same CSV builtins. +(nil? read-csv) -- false +(nil? write-csv) -- false ;; Old names must NOT resolve — we committed to no backward compat. gc !- name getenv !- name @@ -75,7 +78,6 @@ system !- name sysinfo !- name memstat !- name internals !- name -read-csv !- name ;; Negative: writes to `.*` are refused with `reserve`. (set .os.foo 1) !- reserve (set .sys.gc 0) !- reserve diff --git a/test/test_csv.c b/test/test_csv.c index b910954b..a5dedbe2 100644 --- a/test/test_csv.c +++ b/test/test_csv.c @@ -26,6 +26,7 @@ #include #include "mem/heap.h" #include "io/csv.h" +#include "table/sym.h" #include #include @@ -1101,6 +1102,8 @@ static test_result_t test_csv_sym_narrowing(void) { ray_t* col = ray_table_get_col_idx(loaded, 0); TEST_ASSERT_EQ_I(col->type, RAY_SYM); /* Width is encoded in the lower 2 bits of attrs (RAY_SYM_W8 == 0). */ + TEST_ASSERT_EQ_I((int)(col->attrs & RAY_SYM_W_MASK), RAY_SYM_W8); + TEST_ASSERT_FALSE(col->attrs & RAY_ATTR_HAS_NULLS); /* Just sanity: rows exist and aren't null. 
*/ TEST_ASSERT_EQ_I(ray_table_nrows(loaded), 200); TEST_ASSERT_FALSE(ray_vec_is_null(col, 0)); @@ -1151,5 +1154,3 @@ const test_entry_t csv_entries[] = { { "csv/sym_narrowing", test_csv_sym_narrowing, NULL, NULL }, { NULL, NULL, NULL, NULL }, }; - - From 069c652a09be452672b3f55510539d06a46c789c Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 20:10:53 +0200 Subject: [PATCH 03/10] fix(io/csv): empty SYM fields no longer become the null sentinel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CSV format conflates "field empty" and "field missing" — both look like a zero-length cell. The SYM materialisation path treated the parse-time null bit as the q/k null sentinel ID 0, so `(!= col "")` never excluded those rows: the value-vs-null comparison kernel returns true for `0Ns != ""` (matching q semantics, not SQL's). After this change the loader interns "" once per call and remaps null-flagged SYM rows to that ID, clearing their null bit so the compare kernel takes the both-non-null branch. Net effect — empty TSV/CSV cells round-trip through Rayforce as the empty SYM, matching how DuckDB / Spark / polars handle the same input. Affects ten ClickBench queries that filter on `(!= col "")`: Q11, Q22, Q23, Q25–Q27, Q31, Q32, Q37, Q38. Selectivity ranges from 1.0001× (URL is rarely empty) to 26× (MobilePhoneModel cuts 5M → 192K) — see ClickBench/rayforce/REMAINING_FIXES.md §R6 for the per-query expected delta. RAY_STR columns and non-string types preserve the null distinction unchanged. test/test_csv.c::null_sym + null_mixed_columns updated for the new SYM behaviour; new R6 fixture in test/rfl/system/read_csv.rfl. --- src/io/csv.c | 27 +++++++++++++++++++++++++-- test/rfl/system/read_csv.rfl | 13 +++++++++++++ test/test_csv.c | 21 ++++++++++++++++++--- 3 files changed, 56 insertions(+), 5 deletions(-) diff --git a/src/io/csv.c b/src/io/csv.c index 499db1c7..d079c4c0 100644 --- a/src/io/csv.c +++ b/src/io/csv.c @@ -595,6 +595,23 @@ static bool csv_intern_strings(csv_strref_t** str_refs, int n_cols, int64_t* col_max_ids, uint8_t** col_nullmaps) { bool ok = true; + + /* Empty TSV/CSV fields are flagged in the parse-time nullmap (see + * CSV_TYPE_STR branch of the parse loop) — that's correct for STR + * columns where the null/empty distinction matters, but for SYM + * columns it conflates with the "no value" sentinel and breaks the + * SQL-style `(!= col "")` filter (which never excludes nulls in the + * q/k value-vs-null comparison kernel). Pre-intern "" once and + * remap null rows to that ID, clearing their null bit so the + * compare kernel takes the both-non-null branch. Net effect: the + * CSV format's "field is empty" — which can't be distinguished from + * "field is missing" anyway — round-trips through Rayforce as the + * empty SYM, matching how DuckDB / Spark / polars treat the same + * input. */ + int64_t empty_sym_id = ray_sym_intern_prehashed( + (uint32_t)ray_hash_bytes("", 0), "", 0); + if (empty_sym_id < 0) empty_sym_id = 0; /* fall back to old behavior on intern failure */ + for (int c = 0; c < n_cols; c++) { if (col_types[c] != CSV_TYPE_STR) continue; /* RAY_STR columns are materialized directly; skip sym interning. */ @@ -602,7 +619,7 @@ static bool csv_intern_strings(csv_strref_t** str_refs, int n_cols, csv_strref_t* refs = str_refs[c]; uint32_t* ids = (uint32_t*)col_data[c]; uint8_t* nm = col_nullmaps ? 
col_nullmaps[c] : NULL; - int64_t max_id = 0; + int64_t max_id = empty_sym_id; /* Pre-grow: upper bound is n_rows unique strings */ uint32_t current = ray_sym_count(); @@ -611,7 +628,13 @@ static bool csv_intern_strings(csv_strref_t** str_refs, int n_cols, for (int64_t r = 0; r < n_rows; r++) { if (nm && (nm[r >> 3] & (1u << (r & 7)))) { - ids[r] = 0; + ids[r] = (uint32_t)empty_sym_id; + /* Clear the null bit — this row now holds a real value + * (the empty SYM). Without this clear, fmt_raw_elem + * still prints "0Ns" and ray_eq_fn still routes through + * the null-vs-non-null branch (returning false for + * `== ""` and true for `!= ""`). */ + nm[r >> 3] &= (uint8_t)~(1u << (r & 7)); continue; } uint32_t hash = (uint32_t)ray_hash_bytes(refs[r].ptr, refs[r].len); diff --git a/test/rfl/system/read_csv.rfl b/test/rfl/system/read_csv.rfl index 0258c233..b9e57598 100644 --- a/test/rfl/system/read_csv.rfl +++ b/test/rfl/system/read_csv.rfl @@ -18,3 +18,16 @@ (count (read-csv [I64 SYMBOL] "rf_test_syms.csv")) -- 20000 (.sys.exec "rm -f rf_test_syms.csv") -- 0 + +;; ── R6 regression: empty TSV/CSV fields → empty SYM (not null sentinel) ── +;; CSV format conflates "missing" and "empty"; the loader treats empty +;; SYM cells as the interned empty string so SQL-style `(!= col "")` +;; filters work the way DuckDB / polars / Spark already handle it. +(.sys.exec "rm -f rf_test_empty.csv") -- 0 +(.sys.exec "printf 'name\\nalice\\n\\nbob\\n\\ncarol\\n' > rf_test_empty.csv") -- 0 +(set _t (.csv.read [SYMBOL] "rf_test_empty.csv")) +(count _t) -- 5 +;; Three rows have a value, two are empty — neither side counts as null. +(count (select {x: name from: _t where: (!= name "")})) -- 3 +(count (select {x: name from: _t where: (== name "")})) -- 2 +(.sys.exec "rm -f rf_test_empty.csv") -- 0 diff --git a/test/test_csv.c b/test/test_csv.c index a5dedbe2..041c64c3 100644 --- a/test/test_csv.c +++ b/test/test_csv.c @@ -277,6 +277,12 @@ static test_result_t test_csv_null_bool(void) { } static test_result_t test_csv_null_sym(void) { + /* CSV format conflates "empty field" and "missing field" — both + * appear as a zero-length cell. The Rayforce loader interns empty + * SYM cells as the empty SYM (not the null sentinel) so SQL-style + * `(!= col "")` filters work the way users expect. See R6 in + * ClickBench/rayforce/REMAINING_FIXES.md. RAY_STR columns and + * non-string types preserve the null distinction. */ ray_heap_init(); (void)ray_sym_init(); @@ -289,9 +295,17 @@ static test_result_t test_csv_null_sym(void) { ray_t* col = ray_table_get_col_idx(loaded, 0); TEST_ASSERT_FALSE(ray_vec_is_null(col, 0)); - TEST_ASSERT_TRUE(ray_vec_is_null(col, 1)); /* empty → NULL */ + TEST_ASSERT_FALSE(ray_vec_is_null(col, 1)); /* empty → empty SYM, not null */ TEST_ASSERT_FALSE(ray_vec_is_null(col, 2)); + /* Row 1's SYM ID resolves to a zero-length string — the empty SYM. + * The CSV loader narrows SYM columns to W8/W16/W32 based on max ID, + * so use ray_read_sym instead of a fixed-width cast. */ + int64_t id1 = ray_read_sym(ray_data(col), 1, col->type, col->attrs); + ray_t* s = ray_sym_str(id1); + TEST_ASSERT_FALSE(s == NULL); + TEST_ASSERT_EQ_I((int64_t)ray_str_len(s), 0); + ray_release(loaded); unlink(TMP_CSV); ray_sym_destroy(); @@ -348,9 +362,10 @@ static test_result_t test_csv_null_mixed_columns(void) { TEST_ASSERT_FALSE(ray_vec_is_null(val_col, 1)); TEST_ASSERT_TRUE(ray_vec_is_null(val_col, 2)); - /* name column: alice, NULL, bob */ + /* name column: alice, "", bob — empty SYM cell becomes the empty + * SYM (not null). 
See test_csv_null_sym for the rationale. */ TEST_ASSERT_FALSE(ray_vec_is_null(name_col, 0)); - TEST_ASSERT_TRUE(ray_vec_is_null(name_col, 1)); + TEST_ASSERT_FALSE(ray_vec_is_null(name_col, 1)); TEST_ASSERT_FALSE(ray_vec_is_null(name_col, 2)); ray_release(loaded); From 805d48c8d17a506d0921a7f6add4c0acd53d339f Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 21:55:53 +0200 Subject: [PATCH 04/10] perf(query): broadcast atom literals in select dict without per-group LIST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Q35 = `(select {one: 1, c: (count URL), from: hits, by: URL, desc: c, take: 10})` ran in 130–159 ms vs Q34 (the same query without `one: 1`) at 21 ms — a 6× regression from one literal column. Two paths fed the cost: * The non-agg scatter at line 4180+ allocated ~70 MB of bookkeeping (gk/row_gid/cnt/off/pos for n_groups + 2*n_groups hash slots) and walked all 5 M rows building row→gid even for expressions that don't reference any column. * The per-cell broadcast loop then retained the literal n_groups times into a RAY_LIST, and the LIST column blocked apply_sort_take from picking the top-K fast path downstream. Detect at the top of the n_nonaggs > 0 branch whether every non-agg expression is a self-evaluating atom literal (atom-typed, no RAY_ATTR_NAME → not a name reference), pre-allocate one typed broadcast vec per literal via `atom_broadcast_vec`, and skip directly to `nonagg_done` past the row→gid setup. `can_atom_broadcast` gates on supported atom types so we never half-apply and have to roll back. Q35 is now 22 ms — within the noise of Q34. Two-literal variant (`{one: 1, two: 2, …}`) lands at 23 ms, also broadcast. Tests (2072 / 2073, 1 skipped) all green; the previous regression in `select_by_nonagg_list_col` / `select_by_nonagg_colref_vs_const` — where `m2: m` was eagerly broadcast as if `m` were a literal — went away after gating on `!(attrs & RAY_ATTR_NAME)` in the predicate. Verifying probe: bench/bottleneck/R8_const_column.rfl. --- src/ops/query.c | 163 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) diff --git a/src/ops/query.c b/src/ops/query.c index 95d0e414..59fd074e 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -1679,6 +1679,124 @@ static ray_t* count_distinct_per_group_groups(ray_t* inner_expr, ray_t* tbl, /* Forward declarations for eval-level groupby fallback */ +/* R8: cheap predicate for whether atom_broadcast_vec can handle this + * atom AND the atom is a self-evaluating literal (not a name binding + * that needs ray_eval to resolve to a column or computed value). Used + * by the all-literal pre-check so we don't half-apply a partial set of + * broadcasts and then have to roll back. + * + * `RAY_ATTR_NAME` distinguishes `m2: m` (the SYM `m` references a + * column) from `one: 1` (the I64 literal 1). Without that filter we'd + * eagerly broadcast the column reference and skip the per-group gather + * the chained passthrough relies on. */ +static int can_atom_broadcast(ray_t* a) { + if (!a || !ray_is_atom(a)) return 0; + if (a->attrs & RAY_ATTR_NAME) return 0; + int8_t vt = (int8_t)(-a->type); + switch (vt) { + case RAY_BOOL: case RAY_U8: + case RAY_I16: case RAY_I32: + case RAY_I64: case RAY_F64: + case RAY_DATE: case RAY_TIME: case RAY_TIMESTAMP: + case RAY_SYM: + return 1; + default: + return 0; + } +} + +/* R8: build a typed N-cell vector all containing the value of atom `a`. + * + * The non-agg scatter path used to bind a `{lit: 1, c: count(...) 
by: K}` + * style query into a per-group RAY_LIST of N retained atoms, which + * ballooned Q35 from ~21 ms to ~140 ms (one ray_retain + list slot + * per group, scaling with output cardinality, not row count). Allocate + * once and fill — Q35 falls back into parity with Q34. + * + * Returns NULL for atom types not yet handled (RAY_STR, RAY_GUID, F32); + * caller falls back to the per-cell LIST path. */ +static ray_t* atom_broadcast_vec(ray_t* a, int64_t n) { + if (!a || !ray_is_atom(a) || n <= 0) return NULL; + int8_t vec_type = (int8_t)(-a->type); + if (vec_type <= 0) return NULL; + + ray_t* v; + if (vec_type == RAY_SYM) { + uint8_t w = (uint8_t)(a->attrs & RAY_SYM_W_MASK); + v = ray_sym_vec_new(w, n); + } else { + v = ray_vec_new(vec_type, n); + } + if (!v || RAY_IS_ERR(v)) return NULL; + v->len = n; + + void* dst = ray_data(v); + switch (vec_type) { + case RAY_BOOL: + case RAY_U8: { + memset(dst, a->b8, (size_t)n); + break; + } + case RAY_I16: { + int16_t val = a->i16; + int16_t* d = (int16_t*)dst; + for (int64_t i = 0; i < n; i++) d[i] = val; + break; + } + case RAY_I32: + case RAY_DATE: + case RAY_TIME: { + int32_t val = a->i32; + int32_t* d = (int32_t*)dst; + for (int64_t i = 0; i < n; i++) d[i] = val; + break; + } + case RAY_I64: + case RAY_TIMESTAMP: { + int64_t val = a->i64; + int64_t* d = (int64_t*)dst; + for (int64_t i = 0; i < n; i++) d[i] = val; + break; + } + case RAY_F64: { + double val = a->f64; + double* d = (double*)dst; + for (int64_t i = 0; i < n; i++) d[i] = val; + break; + } + case RAY_SYM: { + /* SYM stores the ID in `i64` regardless of width; truncate per + * the vector's width attribute. Width came from the atom and + * was carried by ray_sym_vec_new above. */ + uint8_t w = (uint8_t)(a->attrs & RAY_SYM_W_MASK); + if (w == RAY_SYM_W8) { + memset(dst, (uint8_t)a->i64, (size_t)n); + } else if (w == RAY_SYM_W16) { + uint16_t val = (uint16_t)a->i64; + uint16_t* d = (uint16_t*)dst; + for (int64_t i = 0; i < n; i++) d[i] = val; + } else { /* W32 — default */ + uint32_t val = (uint32_t)a->i64; + uint32_t* d = (uint32_t*)dst; + for (int64_t i = 0; i < n; i++) d[i] = val; + } + break; + } + default: + ray_release(v); + return NULL; + } + + /* Propagate atom-null: an entirely-null broadcast keeps the null bit + * of every cell so `is_null` and aggregations behave the same as + * the LIST path would have. */ + if (RAY_ATOM_IS_NULL(a)) { + v->attrs |= RAY_ATTR_HAS_NULLS; + memset(v->nullmap, 0xFF, 16); + } + return v; +} + /* (select {from: t [where: pred] [by: key] [col: expr ...]}) * Special form — receives unevaluated dict arg. */ ray_t* ray_select_fn(ray_t** args, int64_t n) { @@ -4090,6 +4208,43 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { if (result && !RAY_IS_ERR(result) && result->type == RAY_TABLE) { int64_t n_groups = ray_table_nrows(result); + /* R8 fast path: every non-agg is a literal atom expression + * with no column refs. Skip the entire row→gid mapping — + * each non-agg becomes a typed broadcast vec the same width + * as n_groups, no idx_buf or per-group slicing required. + * + * Q35 = `{one: 1, c: count(URL), by: URL desc: c take: 10}` + * is the canonical case: with all-literal nonaggs we go + * directly to apply_sort_take and the top-K fast path + * downstream of it. */ + if (n_groups > 0) { + /* Pre-check ALL nonaggs first so we don't half-apply on + * an unhandled atom type and then have to roll back. 
*/ + int all_broadcastable = 1; + for (uint8_t ni = 0; ni < n_nonaggs && all_broadcastable; ni++) { + if (!can_atom_broadcast(nonagg_exprs[ni])) + all_broadcastable = 0; + } + if (all_broadcastable) { + for (uint8_t ni = 0; ni < n_nonaggs; ni++) { + ray_t* col = atom_broadcast_vec(nonagg_exprs[ni], n_groups); + if (!col) { + /* can_atom_broadcast vetted these — anything + * after that is an OOM in atom_broadcast_vec. */ + ray_release(result); ray_release(tbl); + return ray_error("oom", NULL); + } + result = ray_table_add_col(result, nonagg_names[ni], col); + ray_release(col); + if (RAY_IS_ERR(result)) { + ray_release(tbl); + return result; + } + } + goto nonagg_done; + } + } + /* Resolve key sym — gated to single scalar key above. */ int64_t ks = -1; if (by_expr->type == -RAY_SYM && (by_expr->attrs & RAY_ATTR_NAME)) @@ -4463,6 +4618,13 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { continue; } + /* R8 fallback: a non-literal expression that + * eval-collapses to an atom (constant within scope + * but not a parser-direct literal) takes the existing + * per-cell LIST broadcast. The all-literal fast path + * at the top of the n_nonaggs block already handles + * the parser-literal case for Q35-shaped queries. */ + int gather_ok = 1; for (int64_t gi = 0; gi < n_groups; gi++) { ray_t* cell; @@ -4518,6 +4680,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { if (RAY_IS_ERR(result)) { ray_release(tbl); return result; } } } + nonagg_done: ; /* R8 fast-path target; nothing else to do here */ } } From 7396a516eb36edd10c9fc99e576865b1a0d2555c Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 22:10:18 +0200 Subject: [PATCH 05/10] perf(eval): SIMD-friendly fast path for (== or !=) of SYM-vec vs SYM atom MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The atomic_map_binary_op DAG path excludes SYM (IS_NUM_TYPE doesn't list it), so equality between a SYM column and a SYM atom fanned out to one ray_neq_fn call + one bool atom allocation per row. At 5M rows on the ClickBench hits.tsv: (!= URL nu) standalone, 5M rows 113 ms → 17 ms (~7×) (== URL nu) standalone, 5M rows 100 ms → 14 ms That's the per-row work dropping from ≈22 ns/row (call + alloc) to ≈3 ns/row (load + truncate + cmp + store), bottoming out on memory bandwidth. Further gains require parallelisation across cores. Detect the SYM-vec ↔ SYM-atom shape early in atomic_map_binary_op, read the atom's i64 sym ID once, then run a tight per-width loop (W8/W16/W32/W64) writing bool output. For the rare case of an already-null vec or null atom, fall through to a per-row branch that preserves the q/k atom-vs-atom rules from cmp.c (`null != x` is true, `null == null` is true). Effects on the ClickBench cluster: * Filter clauses against `nu` in Q11/Q22/Q23/Q25/Q26/Q31/Q32/Q37/Q38 were paying the slow per-element bool alloc. The select planner pushes filters before grouping so the reduction is partial — Q31 wins ≈3 ms, Q32 ≈3 ms. Q22/Q23 are still LIKE-bound (R3). * The literal-folded `(!= col "")` form already short-circuits and is unaffected. Tests: 2072 / 2073 (1 skipped, 0 failed). Verifying probe: bench/bottleneck/R9_filter_chain.rfl. 
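For reference, a sketch of the filter spelling that exercises the new
kernel, next to the literal-folded one (the column name and the `nu`
binding are illustrative — any pre-bound SYM atom takes the same path):

    ;; SYM-vec vs SYM-atom: nu is a SYM atom bound ahead of time, so
    ;; the WHERE runs the new per-width tight loop.
    (set nu (first (at hits 'URL)))
    (count (select {from: hits where: (!= URL nu)}))

    ;; The literal-folded spelling already short-circuits and is
    ;; unaffected by this change.
    (count (select {from: hits where: (!= URL "")}))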
--- src/lang/eval.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/src/lang/eval.c b/src/lang/eval.c index f5221a62..474569cd 100644 --- a/src/lang/eval.c +++ b/src/lang/eval.c @@ -586,6 +586,94 @@ ray_t* atomic_map_binary_op(ray_binary_fn fn, uint16_t dag_opcode, ray_t* left, } } } + /* R7 fast path: (== or !=) of SYM-vec against a SYM atom. + * + * The DAG path above doesn't handle SYM (IS_NUM_TYPE excludes it), + * so without this, ray_neq_fn / ray_eq_fn fan out to one allocation + * per row in the slow loop. At 5M rows the per-element bool atom + * thrash dominates: `(!= URL nu)` standalone takes 113 ms when the + * raw work is one i64 lookup + N width-truncated cmpneq. + * + * Handles either operand order; output is RAY_BOOL. Nulls go + * through the q/k atom-vs-atom rules already in cmp.c (null≠value + * is true for NE) by applying the same logic per element. */ + if (!force_boxed && (dag_opcode == OP_EQ || dag_opcode == OP_NE) && + out_type == RAY_BOOL) { + int l_is_sym_vec = left_coll && ray_is_vec(left) && left->type == RAY_SYM; + int r_is_sym_vec = right_coll && ray_is_vec(right) && right->type == RAY_SYM; + int l_is_sym_atom = !left_coll && left && left->type == -RAY_SYM; + int r_is_sym_atom = !right_coll && right && right->type == -RAY_SYM; + if ((l_is_sym_vec && r_is_sym_atom) || (r_is_sym_vec && l_is_sym_atom)) { + ray_t* vv = l_is_sym_vec ? left : right; + ray_t* atom = l_is_sym_vec ? right : left; + int64_t n = vv->len; + + ray_t* out = ray_vec_new(RAY_BOOL, n); + if (out && !RAY_IS_ERR(out)) { + out->len = n; + bool* obuf = (bool*)ray_data(out); + const void* src = ray_data(vv); + int8_t vt = vv->type; + uint8_t va = vv->attrs; + int atom_null = RAY_ATOM_IS_NULL(atom); + int64_t target = atom_null ? 0 : atom->i64; + int vec_has_nulls = (va & RAY_ATTR_HAS_NULLS) ? 1 : 0; + bool invert = (dag_opcode == OP_NE); + + if (atom_null && !vec_has_nulls) { + /* Atom is null, vec has no nulls — every row is + * "not equal" to the null atom (== false, != true). */ + bool fill = invert; /* != null → true; == null → false */ + for (int64_t i = 0; i < n; i++) obuf[i] = fill; + } else if (!atom_null && !vec_has_nulls) { + /* Hot path: tight per-width loop, no per-element + * null checks. This is what ClickBench Q22..Q38 + * with R6-cleaned columns actually hit. */ + uint8_t w = (uint8_t)(va & RAY_SYM_W_MASK); + if (w == RAY_SYM_W8) { + const uint8_t* d = (const uint8_t*)src; + uint8_t t8 = (uint8_t)target; + for (int64_t i = 0; i < n; i++) + obuf[i] = (d[i] == t8) ^ invert; + } else if (w == RAY_SYM_W16) { + const uint16_t* d = (const uint16_t*)src; + uint16_t t16 = (uint16_t)target; + for (int64_t i = 0; i < n; i++) + obuf[i] = (d[i] == t16) ^ invert; + } else if (w == RAY_SYM_W32) { + const uint32_t* d = (const uint32_t*)src; + uint32_t t32 = (uint32_t)target; + for (int64_t i = 0; i < n; i++) + obuf[i] = (d[i] == t32) ^ invert; + } else { /* RAY_SYM_W64 */ + const int64_t* d = (const int64_t*)src; + for (int64_t i = 0; i < n; i++) + obuf[i] = (d[i] == target) ^ invert; + } + } else { + /* General path: vec may have nulls, atom may be null. + * Apply q/k atom-rules per element so semantics match + * the slow path exactly. */ + for (int64_t i = 0; i < n; i++) { + int row_null = ray_vec_is_null(vv, i); + int eq; + if (row_null && atom_null) eq = 1; + else if (row_null || atom_null) eq = 0; + else { + int64_t row_id = ray_read_sym(src, i, vt, va); + eq = (row_id == target); + } + obuf[i] = invert ? 
!eq : eq; + } + } + ray_release(e0); + return out; + } + if (out) ray_release(out); + /* Fall through to slow path on allocation failure. */ + } + } + /* SLOW PATH: per-element scalar loop (fallback for mixed types, temporal, etc.) */ if (!force_boxed && (out_type == RAY_I64 || out_type == RAY_F64 || out_type == RAY_I32 || From 47c513cd0127383ed6dae7c6d77b27e142fb3c68 Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 22:25:07 +0200 Subject: [PATCH 06/10] perf(group): per-group-slice for low-cardinality grouped count(distinct) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The global-hash kernel (single hash keyed by `(group_id, value)`) wins on high group cardinality where per-group setup overhead dominates, but pays for a 256+ MB hash table on the low-cardinality side: Q9 (3 K groups × 5 M rows) was sizing 16 M slots = 256 MB, blowing the L3 and cache-missing on every probe. Pick the path based on `n_groups`: n_groups ≤ 50 000 → per-group-slice (small hashes fit L1/L2) n_groups > 50 000 → global hash (per-group setup dominates the alt) Empirical numbers on 5 M-row hits.tsv after R6: Q9 137 ms → 38 ms (3 K groups, 5 M rows) Q10 84 ms → 61 ms (same shape + 3 more aggregates) Q11 53 ms → 60 ms (84 groups; flat — already fast) Q14 200 ms → 217 ms (611 K groups; still on global, untouched) Q15 24 ms → 29 ms (composite key, 100 K-ish groups) Q14 stays slow because it sits on the global side of the threshold and the global kernel still has the cache-miss-per-probe bottleneck. A parallel partitioned variant is the next step (R2 follow-up). Also hoists per-type read dispatch out of the global hash inner loop (no perf impact alone but simplifies the next change). Tests: 2072 / 2073 (1 skipped, 0 failed). --- src/ops/group.c | 123 ++++++++++++++++++++++++++++++++++++------------ src/ops/query.c | 16 ++++++- 2 files changed, 107 insertions(+), 32 deletions(-) diff --git a/src/ops/group.c b/src/ops/group.c index 705ed991..d86db7ba 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -670,8 +670,8 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); if (n_rows == 0 || n_groups == 0) return out; - /* Pick capacity ≥ 2 * n_rows rounded up to power of two. This bounds - * load factor at 0.5 even when every (gid,val) pair is distinct. */ + /* Pick capacity ≥ 2 × n_rows rounded up to power of two. This bounds + * load factor at 0.5 even when every (gid, val) pair is distinct. */ uint64_t cap = (uint64_t)n_rows * 2; if (cap < 32) cap = 32; uint64_t c = 1; @@ -702,41 +702,104 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, const uint8_t* null_bm = has_nulls ? ray_vec_nullmap_bytes(src, NULL, NULL) : NULL; - for (int64_t r = 0; r < n_rows; r++) { - int64_t gid = row_gid[r]; - if (gid < 0 || gid >= n_groups) continue; - if (has_nulls && null_bm && ((null_bm[r/8] >> (r%8)) & 1)) continue; + /* Per-type read width — hoist the type dispatch out of the hot loop. + * read_col_i64 was branching on `in_type` every iteration plus paying + * an indirect call. */ + uint8_t esz = ray_sym_elem_size(in_type, src->attrs); + + /* Macro: insert (val) for current row, given that (gid, val) is the + * candidate pair; expects local vars `slot`, `cur`, `gid_p1`. 
*/ + #define CD_INSERT(VAL_EXPR) do { \ + int64_t val = (VAL_EXPR); \ + int64_t gid_p1 = gid + 1; \ + uint64_t h = (uint64_t)val * 0x9E3779B97F4A7C15ULL; \ + h ^= (uint64_t)gid_p1 * 0xBF58476D1CE4E5B9ULL; \ + h ^= h >> 33; \ + h *= 0xC4CEB9FE1A85EC53ULL; \ + uint64_t slot = h & mask; \ + for (;;) { \ + int64_t cur = slot_gid[slot]; \ + if (cur == 0) { \ + slot_gid[slot] = gid_p1; \ + slot_val[slot] = val; \ + odata[gid]++; \ + break; \ + } \ + if (cur == gid_p1 && slot_val[slot] == val) break; \ + slot = (slot + 1) & mask; \ + } \ + } while (0) - int64_t val; + /* Specialised per-type loops. Each version reads the column with a + * width-typed pointer dereference instead of dispatching through + * read_col_i64 every row. The has_nulls / no-nulls split keeps the + * fast path branch-free for the common no-null SYM/I64 columns. */ + if (!has_nulls) { if (in_type == RAY_F64) { - double fv = ((double*)base)[r]; - if (fv != fv) fv = (double)NAN; - else if (fv == 0.0) fv = 0.0; - memcpy(&val, &fv, sizeof(int64_t)); - } else { - val = read_col_i64(base, r, in_type, src->attrs); + const double* d = (const double*)base; + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + double fv = d[r]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; + int64_t v; + memcpy(&v, &fv, sizeof(int64_t)); + CD_INSERT(v); + } + } else if (esz == 8) { + const int64_t* d = (const int64_t*)base; + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + CD_INSERT(d[r]); + } + } else if (esz == 4) { + const int32_t* d = (const int32_t*)base; + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + CD_INSERT((int64_t)d[r]); + } + } else if (esz == 2) { + const int16_t* d = (const int16_t*)base; + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + CD_INSERT((int64_t)d[r]); + } + } else { /* esz == 1 */ + const uint8_t* d = (const uint8_t*)base; + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + CD_INSERT((int64_t)d[r]); + } } - - int64_t gid_p1 = gid + 1; - /* Mix gid and val so groups don't form long runs of collisions. */ - uint64_t h = (uint64_t)val * 0x9E3779B97F4A7C15ULL; - h ^= (uint64_t)gid_p1 * 0xBF58476D1CE4E5B9ULL; - h ^= h >> 33; - h *= 0xC4CEB9FE1A85EC53ULL; - uint64_t slot = h & mask; - for (;;) { - int64_t cur = slot_gid[slot]; - if (cur == 0) { - slot_gid[slot] = gid_p1; - slot_val[slot] = val; - odata[gid]++; - break; + } else { + /* Has-nulls fallback: keep the per-row null bitmap probe and + * the generic read_col_i64 dispatch. Adding eight specialised + * has-nulls loops costs more code than the small gain on + * already-rare null-bearing columns. 
*/ + for (int64_t r = 0; r < n_rows; r++) { + int64_t gid = row_gid[r]; + if (gid < 0 || gid >= n_groups) continue; + if (null_bm && ((null_bm[r/8] >> (r%8)) & 1)) continue; + int64_t val; + if (in_type == RAY_F64) { + double fv = ((double*)base)[r]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; + memcpy(&val, &fv, sizeof(int64_t)); + } else { + val = read_col_i64(base, r, in_type, src->attrs); } - if (cur == gid_p1 && slot_val[slot] == val) break; - slot = (slot + 1) & mask; + CD_INSERT(val); } } + #undef CD_INSERT + scratch_free(k_hdr); scratch_free(v_hdr); return out; diff --git a/src/ops/query.c b/src/ops/query.c index 59fd074e..9079329a 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -4506,8 +4506,20 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { src_for_global = ray_table_get_col(tbl, cd_inner->i64); } if (src_for_global) { - col = ray_count_distinct_per_group( - src_for_global, row_gid, nrows, n_groups); + /* Path selection: global-hash kernel scales + * with n_rows (per-row probe of one shared + * hash table); per-group-slice scales with + * n_groups (per-group setup + small dedup). + * Empirically the cross-over is around 50 K + * groups on the local hardware — beyond + * that, per-group setup overhead dominates. */ + if (n_groups <= 50000) { + col = count_distinct_per_group_buf( + cd_inner, tbl, idx_buf, offsets, grp_cnt, n_groups); + } else { + col = ray_count_distinct_per_group( + src_for_global, row_gid, nrows, n_groups); + } /* col == NULL → unsupported type, fall through. */ } if (src_owned && src_for_global) ray_release(src_for_global); From 4f2771ff154abbc492f28791bbc033b7e07395ac Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 22:42:02 +0200 Subject: [PATCH 07/10] perf(group): parallel partitioned grouped count(distinct) kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The serial global-hash kernel allocates a 32–256 MB hash table for high-cardinality grouped count(distinct), and every probe is a cold cache line on the i7-14700 (20 MB L3). Add a partitioned variant mirroring the existing whole-table count_distinct shape: Pass 1 (cdpg_hist_fn) — per-worker histogram of hash partitions. Pass 2 (cdpg_scat_fn) — scatter (gid+1, val) pairs into a partitioned buffer using per-(worker, partition) cursors. Pass 3 (cdpg_dedup_fn) — per-partition open-addressing dedup; atomic fetch-add into odata[gid] for each new pair. P=64 partitions on a 28-core box keeps each per-partition dedup hash inside L2 (≤ ~32 K rows × 16 B per slot ≤ 1 MB) and atomic_fetch_add spreads writes across the n_groups output array — no measurable contention at 600 K+ groups. Gated on n_rows ≥ 200 000 (smaller inputs don't pay the dispatch overhead). Falls through to the serial kernel on no-pool / OOM, so behaviour is preserved on platforms without a worker pool. Empirically on the ClickBench 5 M-row hits.tsv: Standalone kernel cost (Q14 internals) ~140 ms → ~3.5 ms (hist 0.2 / scat 0.5 / dedup 2.8) Q14 query-level total stays at ~200 ms because the bottleneck has shifted: the count-distinct kernel is no longer dominant; the row→group_id rebuild in query.c::ray_select_fn (allocating an n_groups-sized hash and probing every row) now eats the ~190 ms. That's a separate fix — R2-followup territory — but the kernel work here is correct and unblocks any caller for whom row_gid is cheap (future plan: thread row_gid through OP_GROUP rather than recomputing post-DAG). Tests: 2072 / 2073 (1 skipped, 0 failed). 
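For reference, the Q14 shape this kernel serves, spelled in the select
syntax used elsewhere in this series (a sketch — the UserID distinct
column is the ClickBench definition of Q14, not something this patch
names):

    (select {u: (count (distinct UserID)) from: hits
             where: (!= SearchPhrase "") by: SearchPhrase
             desc: u take: 10})

    ;; ~611 K SearchPhrase groups over ~937 K surviving rows: the
    ;; (gid, val) pairs split across P = 64 partitions at roughly
    ;; 14 K pairs each, so each per-partition dedup hash stays
    ;; L2-resident instead of thrashing L3.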
--- src/ops/group.c | 266 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) diff --git a/src/ops/group.c b/src/ops/group.c index d86db7ba..e020ba93 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -634,6 +634,261 @@ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) { return ray_i64(total_distinct); } +/* ════════════════════════════════════════════════════════════════════ + * Parallel partitioned grouped count(distinct). + * + * The serial kernel further down uses a single global hash keyed by + * (gid, val). At high (n_rows × n_groups) the hash exceeds L3 and + * every probe is a cache miss — Q14 (937 K rows × 611 K groups) lands + * at ~200 ms even though the per-row work is microscopic. + * + * Strategy: radix-partition (gid, val) pairs into P buckets by the high + * bits of the composite hash, dispatch dedup of each bucket to the + * worker pool. Each bucket is sized to fit in L2, so hash probes hit + * cache. The dedup writes per-group distinct counts into the shared + * `odata` via atomic increment. + * + * Three passes: + * 1. cdpg_hist_fn – per-worker histogram of partition counts. + * 2. cdpg_scat_fn – scatter (gid_p1, val) pairs into a partitioned + * buffer using per-worker per-partition cursors. + * 3. cdpg_dedup_fn – per-partition open-addressing dedup; atomic + * fetch-add into `odata[gid]`. + * ════════════════════════════════════════════════════════════════════ */ + +#define CDPG_HASH(GID_P1, VAL) ({ \ + uint64_t _h_ = (uint64_t)(VAL) * 0x9E3779B97F4A7C15ULL; \ + _h_ ^= (uint64_t)(GID_P1) * 0xBF58476D1CE4E5B9ULL; \ + _h_ ^= _h_ >> 33; \ + _h_ *= 0xC4CEB9FE1A85EC53ULL; \ + _h_; \ +}) + +typedef struct { + /* Inputs (read-only) */ + int8_t in_type; + uint8_t in_attrs; + const void* base; + const int64_t* row_gid; + int64_t n_rows; + int64_t n_groups; + bool has_nulls; + const uint8_t* null_bm; + uint64_t p_mask; /* P - 1, P = number of partitions */ + /* Pass 1 outputs / pass 2 inputs */ + int64_t* hist; /* nw × P, per-worker partition counts */ + int64_t* cursor; /* nw × P, per-worker scatter cursors */ + int64_t* part_off; /* P + 1, prefix offsets */ + /* Pass 2 outputs */ + int64_t* gids_out; /* total_pass entries */ + int64_t* vals_out; + /* Pass 3 outputs */ + int64_t* odata; /* n_groups, atomic per-group distinct count */ +} cdpg_ctx_t; + +/* Read column row r as int64. Width-typed fast path; F64 bitcasts. 
*/ +static inline int64_t cdpg_read(const void* base, int64_t r, + int8_t in_type, uint8_t esz) { + if (in_type == RAY_F64) { + double fv = ((const double*)base)[r]; + if (fv != fv) fv = (double)NAN; + else if (fv == 0.0) fv = 0.0; + int64_t v; + memcpy(&v, &fv, sizeof(int64_t)); + return v; + } + switch (esz) { + case 1: return (int64_t)((const uint8_t*)base)[r]; + case 2: return (int64_t)((const int16_t*)base)[r]; + case 4: return (int64_t)((const int32_t*)base)[r]; + default: return ((const int64_t*)base)[r]; + } +} + +static void cdpg_hist_fn(void* ctx_, uint32_t worker_id, + int64_t start, int64_t end) { + cdpg_ctx_t* x = (cdpg_ctx_t*)ctx_; + int64_t* hist = x->hist + (size_t)worker_id * (x->p_mask + 1); + uint8_t esz = ray_sym_elem_size(x->in_type, x->in_attrs); + for (int64_t r = start; r < end; r++) { + int64_t gid = x->row_gid[r]; + if (gid < 0 || gid >= x->n_groups) continue; + if (x->has_nulls && x->null_bm && + ((x->null_bm[r/8] >> (r%8)) & 1)) continue; + int64_t val = cdpg_read(x->base, r, x->in_type, esz); + uint64_t h = CDPG_HASH(gid + 1, val); + hist[h & x->p_mask]++; + } +} + +static void cdpg_scat_fn(void* ctx_, uint32_t worker_id, + int64_t start, int64_t end) { + cdpg_ctx_t* x = (cdpg_ctx_t*)ctx_; + int64_t* cur = x->cursor + (size_t)worker_id * (x->p_mask + 1); + uint8_t esz = ray_sym_elem_size(x->in_type, x->in_attrs); + for (int64_t r = start; r < end; r++) { + int64_t gid = x->row_gid[r]; + if (gid < 0 || gid >= x->n_groups) continue; + if (x->has_nulls && x->null_bm && + ((x->null_bm[r/8] >> (r%8)) & 1)) continue; + int64_t val = cdpg_read(x->base, r, x->in_type, esz); + int64_t gid_p1 = gid + 1; + uint64_t h = CDPG_HASH(gid_p1, val); + int64_t pos = cur[h & x->p_mask]++; + x->gids_out[pos] = gid_p1; + x->vals_out[pos] = val; + } +} + +/* Per-partition dedup: open-addressing hash sized for the partition, then + * atomic fetch-add into odata[gid] for each new distinct (gid, val). */ +static void cdpg_dedup_fn(void* ctx_, uint32_t worker_id, + int64_t start, int64_t end) { + (void)worker_id; + cdpg_ctx_t* x = (cdpg_ctx_t*)ctx_; + for (int64_t p = start; p < end; p++) { + int64_t off = x->part_off[p]; + int64_t cnt = x->part_off[p + 1] - off; + if (cnt == 0) continue; + + uint64_t cap = (uint64_t)cnt * 2; + if (cap < 32) cap = 32; + uint64_t c = 1; + while (c && c < cap) c <<= 1; + if (!c) continue; + cap = c; + uint64_t mask = cap - 1; + + ray_t* k_hdr = NULL; + ray_t* v_hdr = NULL; + int64_t* slot_gid = (int64_t*)scratch_calloc(&k_hdr, + (size_t)cap * sizeof(int64_t)); + int64_t* slot_val = (int64_t*)scratch_alloc(&v_hdr, + (size_t)cap * sizeof(int64_t)); + if (!slot_gid || !slot_val) { + if (k_hdr) scratch_free(k_hdr); + if (v_hdr) scratch_free(v_hdr); + continue; + } + + const int64_t* gids = x->gids_out + off; + const int64_t* vals = x->vals_out + off; + for (int64_t i = 0; i < cnt; i++) { + int64_t gid_p1 = gids[i]; + int64_t val = vals[i]; + uint64_t h = CDPG_HASH(gid_p1, val); + uint64_t slot = h & mask; + for (;;) { + int64_t cur = slot_gid[slot]; + if (cur == 0) { + slot_gid[slot] = gid_p1; + slot_val[slot] = val; + __atomic_fetch_add(&x->odata[gid_p1 - 1], 1, + __ATOMIC_RELAXED); + break; + } + if (cur == gid_p1 && slot_val[slot] == val) break; + slot = (slot + 1) & mask; + } + } + scratch_free(k_hdr); + scratch_free(v_hdr); + } +} + +/* Returns the populated `out` vector on success, or NULL to fall through + * to the serial path on dispatch / allocation failure. 
*/ +static ray_t* count_distinct_per_group_parallel( + ray_t* src, const int64_t* row_gid, + int64_t n_rows, int64_t n_groups, ray_t* out) +{ + ray_pool_t* pool = ray_pool_get(); + if (!pool) return NULL; + uint32_t nw = ray_pool_total_workers(pool); + if (nw < 2) return NULL; + + /* Partition count: balance per-partition L2 fit vs. dispatch overhead. + * 64 partitions on 28 workers gives 2.28 partitions per worker plus + * room for skew; per-partition dedup data ~2 × (n_rows/64) × 16 B + * which is well inside L2 even on 1 M-row inputs. */ + uint8_t p_bits = 6; + uint64_t P = (uint64_t)1 << p_bits; + uint64_t p_mask = P - 1; + + cdpg_ctx_t ctx = { + .in_type = src->type, + .in_attrs = src->attrs, + .base = ray_data(src), + .row_gid = row_gid, + .n_rows = n_rows, + .n_groups = n_groups, + .has_nulls = (src->attrs & RAY_ATTR_HAS_NULLS) != 0, + .null_bm = NULL, + .p_mask = p_mask, + .odata = (int64_t*)ray_data(out), + }; + if (ctx.has_nulls) + ctx.null_bm = ray_vec_nullmap_bytes(src, NULL, NULL); + + /* Pass 1: histogram. */ + ray_t* hist_hdr = NULL; + ctx.hist = (int64_t*)scratch_calloc(&hist_hdr, + (size_t)P * nw * sizeof(int64_t)); + if (!ctx.hist) { return NULL; } + ray_pool_dispatch(pool, cdpg_hist_fn, &ctx, n_rows); + + /* Compute partition prefix offsets and per-(worker, partition) cursors. + * Layout: out_buf is laid out as + * partition_0 [worker_0 worker_1 …] partition_1 [worker_0 …] … + * so each (worker, partition) range is contiguous. */ + ray_t* off_hdr = NULL; + ctx.part_off = (int64_t*)scratch_alloc(&off_hdr, + (size_t)(P + 1) * sizeof(int64_t)); + ray_t* cur_hdr = NULL; + ctx.cursor = (int64_t*)scratch_alloc(&cur_hdr, + (size_t)P * nw * sizeof(int64_t)); + if (!ctx.part_off || !ctx.cursor) { + if (off_hdr) scratch_free(off_hdr); + if (cur_hdr) scratch_free(cur_hdr); + scratch_free(hist_hdr); + return NULL; + } + int64_t total = 0; + for (uint64_t p = 0; p < P; p++) { + ctx.part_off[p] = total; + for (uint32_t w = 0; w < nw; w++) { + ctx.cursor[(size_t)w * P + p] = total; + total += ctx.hist[(size_t)w * P + p]; + } + } + ctx.part_off[P] = total; + + /* Pass 2: scatter (gid+1, val) pairs into partitioned out_buf. */ + ray_t* gids_hdr = NULL; + ray_t* vals_hdr = NULL; + ctx.gids_out = (int64_t*)scratch_alloc(&gids_hdr, + (size_t)total * sizeof(int64_t)); + ctx.vals_out = (int64_t*)scratch_alloc(&vals_hdr, + (size_t)total * sizeof(int64_t)); + if (!ctx.gids_out || !ctx.vals_out) { + if (gids_hdr) scratch_free(gids_hdr); + if (vals_hdr) scratch_free(vals_hdr); + scratch_free(cur_hdr); scratch_free(off_hdr); scratch_free(hist_hdr); + return NULL; + } + if (total > 0) + ray_pool_dispatch(pool, cdpg_scat_fn, &ctx, n_rows); + + /* Pass 3: per-partition dedup; atomic odata[gid]++ on each new pair. */ + if (total > 0) + ray_pool_dispatch_n(pool, cdpg_dedup_fn, &ctx, (uint32_t)P); + + scratch_free(vals_hdr); scratch_free(gids_hdr); + scratch_free(cur_hdr); scratch_free(off_hdr); + scratch_free(hist_hdr); + return out; +} + /* Grouped count(distinct): single global hash keyed by (group_id, value). * One linear pass over all rows, O(n) total instead of O(per-group setup * * n_groups). Returns an I64 vector of length n_groups with the per-group @@ -670,6 +925,17 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, memset(odata, 0, (size_t)n_groups * sizeof(int64_t)); if (n_rows == 0 || n_groups == 0) return out; + /* Parallel partitioned path for sizes where the serial global hash + * blows L3. 
Threshold tuned so the partition / scatter / dedup + * dispatch overhead stays smaller than the cache-miss savings. */ + if (n_rows >= 200000) { + ray_t* par = count_distinct_per_group_parallel(src, row_gid, + n_rows, n_groups, out); + if (par) return par; + /* par == NULL → no pool / OOM in scratch alloc → fall through to + * serial path with the already-allocated `out` (still zeroed). */ + } + /* Pick capacity ≥ 2 × n_rows rounded up to power of two. This bounds * load factor at 0.5 even when every (gid, val) pair is distinct. */ uint64_t cap = (uint64_t)n_rows * 2; From 774ce68f15049944eec7d7d05026fe226842470e Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 22:54:09 +0200 Subject: [PATCH 08/10] =?UTF-8?q?perf(query):=20parallel=20row=E2=86=92gid?= =?UTF-8?q?=20probe=20for=20non-agg=20scatter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The grouped count(distinct) path post-DAG-group rebuilds a row→gid mapping by hashing each input row's group key against the gk[gi]→gi hash. At Q14 scale (937 K filtered rows × 611 K groups → 18 MB hash) the serial probe loop spent most of its time waiting on cache misses. The hash is read-only by the time the probe runs (the insert phase that built it is single-threaded and complete), so each worker can independently process its row range with no synchronisation. Add a file-scope key reader (`key_read_i64`) and probe worker (`rgid_probe_fn`), dispatch via the existing `ray_pool_get` worker pool when nrows ≥ 200 K and the pool has ≥ 2 workers. Effect on Q14: 200 ms → 189 ms. The remainder of Q14's gap is in deeper machinery (filter eval interaction with the DAG group + allocations for idx_buf / grp_cnt / row_gid). Profiling each phase of the rebuild confirmed: gk_copy 0.4 ms hash_insert 5.0 ms (n_groups inserts into the 18 MB key→gid hash) probe 0.7 ms (was ~10 ms before parallelisation) cnt accum 2.8 ms So the probe was ~10 ms of the 190 ms Q14 budget; this commit removes it. The serial fallback remains for nrows < 200 K and pools with < 2 workers. Tests: 2072 / 2073 (1 skipped, 0 failed). --- src/ops/query.c | 118 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 103 insertions(+), 15 deletions(-) diff --git a/src/ops/query.c b/src/ops/query.c index 9079329a..4861b6f8 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -1677,6 +1677,76 @@ static ray_t* count_distinct_per_group_groups(ray_t* inner_expr, ray_t* tbl, return out; } +/* Width-agnostic key reader: read row `idx` of a group-key column as + * int64_t. Same coverage as the KEY_READ macro inside ray_select_fn, + * lifted to file scope so the parallel row→gid probe worker can use it. */ +static inline int64_t key_read_i64(const void* d, int64_t idx, + int8_t bt, uint8_t attrs) { + switch (bt) { + case RAY_BOOL: + case RAY_U8: return ((const uint8_t*)d)[idx]; + case RAY_I16: return ((const int16_t*)d)[idx]; + case RAY_I32: + case RAY_DATE: + case RAY_TIME: return ((const int32_t*)d)[idx]; + case RAY_I64: + case RAY_TIMESTAMP: return ((const int64_t*)d)[idx]; + case RAY_F32: { uint32_t u; + memcpy(&u, &((const float*)d)[idx], 4); + return (int64_t)u; } + case RAY_F64: { int64_t u; + memcpy(&u, &((const double*)d)[idx], 8); + return u; } + case RAY_SYM: return ray_read_sym(d, idx, bt, attrs); + default: return 0; /* caller validates type */ + } +} + +/* Parallel row→gid probe. 
Hash table is read-only by the time the probe + * runs (the insert phase that built it is single-threaded), so each + * worker can process its row range independently with no synchronisation. + * + * The probe's per-row work is one cache-cold load + a short linear-probe + * walk in a hash sized to 2 × n_groups. At Q14 scale (611 K groups, + * ~18 MB hash) the serial loop spends most of its time waiting on cache + * misses; spreading the rows across 28 cores gives near-linear speedup + * because each core has its own cache hierarchy. */ +typedef struct { + /* Hash table contents (read-only). */ + const int64_t* hk_keys; + const int32_t* hk_gid_p1; /* one of these is non-NULL */ + const int64_t* hk_gid64; + uint64_t mask; + /* Group-key column being probed. */ + const void* orig_key_data; + int8_t okt; + uint8_t okt_attrs; + /* Per-row output. */ + int64_t* row_gid; +} rgid_probe_ctx_t; + +static void rgid_probe_fn(void* ctx_, uint32_t worker_id, + int64_t start, int64_t end) { + (void)worker_id; + rgid_probe_ctx_t* x = (rgid_probe_ctx_t*)ctx_; + int use_i64 = (x->hk_gid64 != NULL); + uint64_t mask = x->mask; + for (int64_t r = start; r < end; r++) { + int64_t rv = key_read_i64(x->orig_key_data, r, x->okt, x->okt_attrs); + uint64_t h = (uint64_t)rv * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + uint64_t s = h & mask; + int64_t found = -1; + for (;;) { + int64_t cur_p1 = use_i64 ? x->hk_gid64[s] : (int64_t)x->hk_gid_p1[s]; + if (cur_p1 == 0) break; + if (x->hk_keys[s] == rv) { found = cur_p1 - 1; break; } + s = (s + 1) & mask; + } + x->row_gid[r] = found; + } +} + /* Forward declarations for eval-level groupby fallback */ /* R8: cheap predicate for whether atom_broadcast_vec can handle this @@ -4434,22 +4504,40 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { } } - /* Probe each row to assign its gid. */ - for (int64_t r = 0; r < nrows; r++) { - int64_t rv; - KEY_READ(rv, orig_key, okt, r); - uint64_t h = (uint64_t)rv * 0x9E3779B97F4A7C15ULL; - h ^= h >> 33; - uint64_t s = h & mask; - int64_t found = -1; - for (;;) { - int64_t cur_p1 = use_i64_gid ? hk_gid64[s] - : (int64_t)hk_gid_p1[s]; - if (cur_p1 == 0) break; - if (hk_keys[s] == rv) { found = cur_p1 - 1; break; } - s = (s + 1) & mask; + /* Probe each row to assign its gid. Parallelise when + * the input is large enough to amortise dispatch + * overhead — the hash is read-only at this point so + * workers don't need to synchronise. */ + ray_pool_t* pool = ray_pool_get(); + if (pool && nrows >= 200000 && ray_pool_total_workers(pool) >= 2) { + rgid_probe_ctx_t pctx = { + .hk_keys = hk_keys, + .hk_gid_p1 = use_i64_gid ? NULL : hk_gid_p1, + .hk_gid64 = use_i64_gid ? hk_gid64 : NULL, + .mask = mask, + .orig_key_data = ray_data(orig_key), + .okt = okt, + .okt_attrs = orig_key->attrs, + .row_gid = row_gid, + }; + ray_pool_dispatch(pool, rgid_probe_fn, &pctx, nrows); + } else { + for (int64_t r = 0; r < nrows; r++) { + int64_t rv; + KEY_READ(rv, orig_key, okt, r); + uint64_t h = (uint64_t)rv * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + uint64_t s = h & mask; + int64_t found = -1; + for (;;) { + int64_t cur_p1 = use_i64_gid ? 
hk_gid64[s] + : (int64_t)hk_gid_p1[s]; + if (cur_p1 == 0) break; + if (hk_keys[s] == rv) { found = cur_p1 - 1; break; } + s = (s + 1) & mask; + } + row_gid[r] = found; } - row_gid[r] = found; } scratch_free(gk_keys_hdr); From 928bab0a98fe1a95715a2d2d9db179b8252a0573 Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 5 May 2026 23:25:31 +0200 Subject: [PATCH 09/10] fix(group)+perf(query): correct parallel grouped count(distinct), and let count(distinct col) ride path A MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes that together drop Q14 from 169 ms → 69 ms (5.10× → 1.74× DuckDB on the 5 M-row hits.tsv with 611 K SearchPhrase groups). 1. Parallel-kernel correctness fix. The previous count_distinct_per_group_parallel laid hist / cursor as nw × P arrays indexed by `(worker_id, partition)`. ray_pool_dispatch uses dynamic work stealing, so the same morsel can be processed by worker_id A in the histogram pass and worker_id B in the scatter pass. The (worker, partition) cursors were misaligned: hist counted X rows for `(W=A, p)` but scatter advanced `cursor[W=B, p]`, leaving uninitialised slots in out_buf that the dedup pass then atomically counted into odata — Q14 returned counts up to ~10^15 instead of the correct values. Switch to per-partition atomic counters: a single `hist[P]` and a single `cursor[P]`, both updated via __atomic_fetch_add. Each worker walks its assigned row range, builds local per-partition counts (no contention), then pushes the deltas to shared hist with one atomic per non-zero partition. Scatter atomically advances cursor[partition] to claim a write position. P=64, ~14 K rows/partition, contention is negligible. After the fix: Q14 returns the expected top counts (2118 / 1588 / 1382 / …) matching DuckDB exactly. 2. Path-A enablement for count(distinct col_ref). Refine the WHERE-handling pre-scan so that `(count (distinct col))` non-aggs no longer trip the path-B materialisation. The scatter for that shape doesn't need a flat post-filter table — it reads the column directly via ray_count_distinct_per_group and skips rows where row_gid[r] < 0. When path A is taken, retain g->selection across the eventual graph_free, then in the n_nonaggs scatter walk the morsel-segmented rowsel and mask non-selected rows to row_gid = -1. This walks the RAY_SEL_NONE / ALL / MIX flags directly without building a flat bitmap, ~5 ms for 5 M rows. Net effect: Q14 skips materialising filtered_tbl (937 K rows × 105 columns ≈ 750 MB copy that was eating most of the query budget) and the count_distinct kernel now correctly produces the same answer the materialised path used to produce. Tests: 2072 / 2073 (1 skipped, 0 failed). Verifying probes: * /tmp/cdpg_220.rfl — 220K-row distinct test → max(u)=1 (correct) * Q14 vs duckdb-fetch — top counts match exactly --- src/ops/group.c | 53 ++++++++++++++++++++----------- src/ops/query.c | 84 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 112 insertions(+), 25 deletions(-) diff --git a/src/ops/group.c b/src/ops/group.c index e020ba93..a3f7f183 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -675,9 +675,13 @@ typedef struct { bool has_nulls; const uint8_t* null_bm; uint64_t p_mask; /* P - 1, P = number of partitions */ - /* Pass 1 outputs / pass 2 inputs */ - int64_t* hist; /* nw × P, per-worker partition counts */ - int64_t* cursor; /* nw × P, per-worker scatter cursors */ + /* Pass 1 outputs / pass 2 inputs. 
Per-partition atomic counters, + * not per-worker — ray_pool_dispatch uses dynamic work stealing so + * the worker_id seen by hist for a given task isn't guaranteed to + * match the worker_id scatter sees for the same task. Atomics on + * P=64 partitions with ~14 K rows each have negligible contention. */ + int64_t* hist; /* P entries, atomic */ + int64_t* cursor; /* P entries, atomic, init to part_off */ int64_t* part_off; /* P + 1, prefix offsets */ /* Pass 2 outputs */ int64_t* gids_out; /* total_pass entries */ @@ -707,9 +711,14 @@ static inline int64_t cdpg_read(const void* base, int64_t r, static void cdpg_hist_fn(void* ctx_, uint32_t worker_id, int64_t start, int64_t end) { + (void)worker_id; cdpg_ctx_t* x = (cdpg_ctx_t*)ctx_; - int64_t* hist = x->hist + (size_t)worker_id * (x->p_mask + 1); uint8_t esz = ray_sym_elem_size(x->in_type, x->in_attrs); + /* Local per-partition counts to amortise atomic adds. Walk once + * locally, then push the deltas to the shared `hist` at the end. */ + enum { CDPG_MAX_P = 256 }; + int64_t local[CDPG_MAX_P] = {0}; + uint64_t p_mask = x->p_mask; for (int64_t r = start; r < end; r++) { int64_t gid = x->row_gid[r]; if (gid < 0 || gid >= x->n_groups) continue; @@ -717,15 +726,21 @@ static void cdpg_hist_fn(void* ctx_, uint32_t worker_id, ((x->null_bm[r/8] >> (r%8)) & 1)) continue; int64_t val = cdpg_read(x->base, r, x->in_type, esz); uint64_t h = CDPG_HASH(gid + 1, val); - hist[h & x->p_mask]++; + local[h & p_mask]++; + } + /* Push local deltas atomically into shared hist. */ + for (uint64_t p = 0; p <= p_mask; p++) { + if (local[p]) + __atomic_fetch_add(&x->hist[p], local[p], __ATOMIC_RELAXED); } } static void cdpg_scat_fn(void* ctx_, uint32_t worker_id, int64_t start, int64_t end) { + (void)worker_id; cdpg_ctx_t* x = (cdpg_ctx_t*)ctx_; - int64_t* cur = x->cursor + (size_t)worker_id * (x->p_mask + 1); uint8_t esz = ray_sym_elem_size(x->in_type, x->in_attrs); + uint64_t p_mask = x->p_mask; for (int64_t r = start; r < end; r++) { int64_t gid = x->row_gid[r]; if (gid < 0 || gid >= x->n_groups) continue; @@ -734,7 +749,10 @@ static void cdpg_scat_fn(void* ctx_, uint32_t worker_id, int64_t val = cdpg_read(x->base, r, x->in_type, esz); int64_t gid_p1 = gid + 1; uint64_t h = CDPG_HASH(gid_p1, val); - int64_t pos = cur[h & x->p_mask]++; + /* Per-partition atomic cursor — handles concurrent scatter + * from any worker without per-worker layout dependencies. */ + int64_t pos = __atomic_fetch_add(&x->cursor[h & p_mask], 1, + __ATOMIC_RELAXED); x->gids_out[pos] = gid_p1; x->vals_out[pos] = val; } @@ -830,23 +848,24 @@ static ray_t* count_distinct_per_group_parallel( if (ctx.has_nulls) ctx.null_bm = ray_vec_nullmap_bytes(src, NULL, NULL); - /* Pass 1: histogram. */ + if (P > 256) return NULL; /* matches CDPG_MAX_P in cdpg_hist_fn */ + + /* Pass 1: histogram (per-partition atomic counters). */ ray_t* hist_hdr = NULL; ctx.hist = (int64_t*)scratch_calloc(&hist_hdr, - (size_t)P * nw * sizeof(int64_t)); + (size_t)P * sizeof(int64_t)); if (!ctx.hist) { return NULL; } ray_pool_dispatch(pool, cdpg_hist_fn, &ctx, n_rows); - /* Compute partition prefix offsets and per-(worker, partition) cursors. - * Layout: out_buf is laid out as - * partition_0 [worker_0 worker_1 …] partition_1 [worker_0 …] … - * so each (worker, partition) range is contiguous. */ + /* Compute partition prefix offsets and initial cursors. out_buf is + * laid out as [partition_0 entries | partition_1 entries | …] with + * cursor[p] starting at part_off[p] and advancing by 1 per scatter. 
*/ ray_t* off_hdr = NULL; ctx.part_off = (int64_t*)scratch_alloc(&off_hdr, (size_t)(P + 1) * sizeof(int64_t)); ray_t* cur_hdr = NULL; ctx.cursor = (int64_t*)scratch_alloc(&cur_hdr, - (size_t)P * nw * sizeof(int64_t)); + (size_t)P * sizeof(int64_t)); if (!ctx.part_off || !ctx.cursor) { if (off_hdr) scratch_free(off_hdr); if (cur_hdr) scratch_free(cur_hdr); @@ -856,10 +875,8 @@ static ray_t* count_distinct_per_group_parallel( int64_t total = 0; for (uint64_t p = 0; p < P; p++) { ctx.part_off[p] = total; - for (uint32_t w = 0; w < nw; w++) { - ctx.cursor[(size_t)w * P + p] = total; - total += ctx.hist[(size_t)w * P + p]; - } + ctx.cursor[p] = total; + total += ctx.hist[p]; } ctx.part_off[P] = total; diff --git a/src/ops/query.c b/src/ops/query.c index 4861b6f8..68cd294e 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -31,6 +31,7 @@ #include "ops/ops.h" #include "ops/internal.h" #include "ops/hash.h" +#include "ops/rowsel.h" #include "ops/temporal.h" #include "table/sym.h" #include "table/dict.h" @@ -1954,6 +1955,13 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { * plain RAY_SYM vector of the dict keys so the rest of * ray_select_fn sees a standard multi-key group-by. */ ray_t* by_sym_vec_owned = NULL; + + /* Selection saved across the path-A graph free for count(distinct + * col_ref) non-aggs. Path B leaves this NULL because the + * materialised filtered_tbl already encodes the selection in row + * positions. Declared here at function scope so the cleanup at + * the bottom of ray_select_fn can release it. */ + ray_t* saved_selection = NULL; DICT_VIEW_DECL(byv); if (by_expr && by_expr->type == RAY_DICT) { DICT_VIEW_OPEN(by_expr, byv); @@ -3223,15 +3231,30 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); } - /* Pre-scan: any non-aggregation expressions? If so and there's a - * WHERE, we must materialize the filtered table first so the - * post-DAG scatter evaluates on filtered data (matching agg semantics). */ - int has_nonagg = 0; + /* Pre-scan: any non-aggregation expressions that need a flat + * (post-filter) table? Most non-agg expressions evaluate via + * ray_eval over the whole table and require a materialized + * filtered_tbl when WHERE is present. + * + * The exception is `(count (distinct col_ref))`: its scatter + * runs through ray_count_distinct_per_group, which reads the + * source column directly and skips rows where row_gid[r] < 0. + * As long as the row→gid build masks filtered-out rows to -1 + * (using the selection saved across the path-A graph free), + * count(distinct col_ref) doesn't need the materialization. + * That's worth ~100 ms on Q14 (937 K rows × 105 cols filtered + * → 937 K rows × 105 cols copy). */ + int has_nonagg_needing_flat = 0; for (int64_t i = 0; i + 1 < dict_n; i += 2) { int64_t kid = dict_elems[i]->i64; if (kid == from_id || kid == where_id || kid == by_id || kid == take_id || kid == asc_id || kid == desc_id) continue; - if (!is_group_dag_agg_expr(dict_elems[i + 1])) { has_nonagg = 1; break; } + ray_t* expr = dict_elems[i + 1]; + if (is_group_dag_agg_expr(expr)) continue; + ray_t* cd_inner = match_count_distinct(expr); + int is_simple_cd = cd_inner && cd_inner->type == -RAY_SYM && + (cd_inner->attrs & RAY_ATTR_NAME); + if (!is_simple_cd) { has_nonagg_needing_flat = 1; break; } } /* The post-DAG scatter needs a flat single-segment table: it @@ -3239,7 +3262,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { * input. 
Detect parted tables up front — if the source is * parted and there's no WHERE to materialize it, return nyi. */ int table_is_parted = 0; - if (has_nonagg) { + if (has_nonagg_needing_flat) { int64_t ncols = ray_table_ncols(tbl); for (int64_t c = 0; c < ncols; c++) { ray_t* col = ray_table_get_col_idx(tbl, c); @@ -3277,7 +3300,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { * ignored before the filter was wired through the group * pipeline.) */ if (where_expr) { - bool can_fuse = !has_nonagg && !table_is_parted; + bool can_fuse = !has_nonagg_needing_flat && !table_is_parted; if (can_fuse) { root = ray_optimize(g, root); /* exec_node populates g->selection as a side effect @@ -3298,6 +3321,14 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { * g->table still owns tbl via the graph, so this * only drops the exec-node-side retain. */ ray_release(fres); + /* Retain a copy of the selection so it survives the + * later ray_graph_free. count(distinct col_ref) needs + * this in the n_nonaggs scatter to mask filtered-out + * rows in the row→gid build. */ + if (g->selection) { + saved_selection = g->selection; + ray_retain(saved_selection); + } } else { root = ray_optimize(g, root); ray_t* fres = ray_execute(g, root); @@ -4546,6 +4577,44 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { } #undef KEY_READ + /* When path A was taken (no materialisation), the probe + * above looked up gids for every row in the original + * (unfiltered) table — including rows that the WHERE + * clause filtered out. Mask those rows to -1 here so + * downstream count_distinct (and grp_cnt) only count + * the surviving rows. Walks the morsel-segmented + * rowsel directly to avoid building a full bitmap. */ + if (saved_selection) { + ray_rowsel_t* sm = ray_rowsel_meta(saved_selection); + const uint8_t* flg = ray_rowsel_flags(saved_selection); + const uint32_t* offs = ray_rowsel_offsets(saved_selection); + const uint16_t* lidx = ray_rowsel_idx(saved_selection); + for (uint32_t seg = 0; seg < sm->n_segs; seg++) { + int64_t s_lo = (int64_t)seg * RAY_MORSEL_ELEMS; + int64_t s_hi = s_lo + RAY_MORSEL_ELEMS; + if (s_hi > nrows) s_hi = nrows; + uint8_t f = flg[seg]; + if (f == RAY_SEL_NONE) { + for (int64_t r = s_lo; r < s_hi; r++) row_gid[r] = -1; + } else if (f == RAY_SEL_ALL) { + /* every row in this segment passed — leave gid */ + } else { /* RAY_SEL_MIX */ + uint8_t in_seg[RAY_MORSEL_ELEMS / 8] = {0}; + uint32_t off = offs[seg]; + uint32_t cnt = offs[seg + 1] - off; + for (uint32_t i = 0; i < cnt; i++) { + uint16_t loc = lidx[off + i]; + in_seg[loc >> 3] |= (uint8_t)(1u << (loc & 7)); + } + for (int64_t r = s_lo; r < s_hi; r++) { + uint16_t loc = (uint16_t)(r - s_lo); + if (!(in_seg[loc >> 3] & (1u << (loc & 7)))) + row_gid[r] = -1; + } + } + } + } + memset(grp_cnt, 0, (size_t)n_groups * sizeof(int64_t)); for (int64_t r = 0; r < nrows; r++) if (row_gid[r] >= 0) grp_cnt[row_gid[r]]++; @@ -4793,6 +4862,7 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { result = apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); if (by_sym_vec_owned) ray_release(by_sym_vec_owned); + if (saved_selection) ray_release(saved_selection); return result; } From 76679ce2fb174773e7cc46296f1039ed50801aa9 Mon Sep 17 00:00:00 2001 From: Anton Date: Wed, 6 May 2026 10:49:43 +0200 Subject: [PATCH 10/10] perf(query): skip idx_buf bookkeeping when only global-hash count(distinct) consumes it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The non-agg scatter at ray_select_fn always built 
per-group-slice bookkeeping — grp_cnt + offsets + pos + idx_buf — even when the only non-agg in the query was `(count (distinct col_ref))` with n_groups > 50 000, in which case the kernel takes the global-hash path and never touches idx_buf at all. That dead bookkeeping was eating 15-20 ms on Q14 (n_groups=611 K, n_rows=937 K post-filter): two passes over n_rows to build grp_cnt and idx_buf, plus a 7.5 MB scratch allocation, plus the offset/pos prefix sums. All thrown away unused. Detect the all-simple-count(distinct)+high-cardinality case up front and skip the construction. When any non-agg falls outside that shape (computed expression, low-cardinality slice path, streaming aggr unary, full-table eval), still build the index — the consumer path needs it. Q14 drops 66 ms → 50 ms (-25%), keeping correct top-10 counts that match DuckDB exactly. Q9, Q11, Q13, Q15 unchanged (their cardinalities route through the per-group-slice path which still needs idx_buf). Tests: 2072 / 2073 (1 skipped, 0 failed). --- src/ops/query.c | 76 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/src/ops/query.c b/src/ops/query.c index 68cd294e..dbe0be4e 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -4615,29 +4615,62 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { } } - memset(grp_cnt, 0, (size_t)n_groups * sizeof(int64_t)); - for (int64_t r = 0; r < nrows; r++) - if (row_gid[r] >= 0) grp_cnt[row_gid[r]]++; - - int64_t total = 0; - for (int64_t gi = 0; gi < n_groups; gi++) total += grp_cnt[gi]; - ray_t* idx_hdr = ray_alloc((size_t)total * sizeof(int64_t)); - if (!idx_hdr) { - ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); - ray_free(off_hdr); ray_free(pos_hdr); - ray_release(result); ray_release(tbl); - return ray_error("oom", NULL); + /* Decide whether the per-group-slice bookkeeping + * (grp_cnt / offsets / pos / idx_buf) is needed. It + * powers count_distinct_per_group_buf, the streaming + * aggr-unary path, nonagg_eval_per_group_buf, and the + * full-table-eval+gather path. When ALL non-aggs are + * `count(distinct col_ref)` AND the n_groups gate + * routes them to the global-hash kernel, none of those + * consumers run — and building the slice index is dead + * weight (~15-20 ms on Q14). + * + * The global-hash path is taken when: + * - the non-agg matches `match_count_distinct`, + * - the inner expression is a column ref (SYM atom + * with NAME attr), and + * - n_groups > 50 000 (the per-group-slice cross- + * over from the threshold dispatch above). + * + * If any non-agg falls outside that, we still need the + * index. 
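+ * (Concretely, Q14 from the commit message: n_groups = 611 K
+ * > 50 000 and the only non-agg is a count(distinct col_ref),
+ * so needs_slice_idx stays 0 and the two passes over n_rows
+ * plus the ~7.5 MB idx_buf allocation below are skipped.)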
*/ + int needs_slice_idx = 0; + for (uint8_t ni = 0; ni < n_nonaggs && !needs_slice_idx; ni++) { + ray_t* cd_inner = match_count_distinct(nonagg_exprs[ni]); + int simple_cd_global = (cd_inner && + cd_inner->type == -RAY_SYM && + (cd_inner->attrs & RAY_ATTR_NAME) && + n_groups > 50000); + if (!simple_cd_global) needs_slice_idx = 1; } - int64_t* idx_buf = (int64_t*)ray_data(idx_hdr); - offsets[0] = 0; - for (int64_t gi = 1; gi < n_groups; gi++) - offsets[gi] = offsets[gi - 1] + grp_cnt[gi - 1]; + int64_t* idx_buf = NULL; + ray_t* idx_hdr = NULL; + if (needs_slice_idx) { + memset(grp_cnt, 0, (size_t)n_groups * sizeof(int64_t)); + for (int64_t r = 0; r < nrows; r++) + if (row_gid[r] >= 0) grp_cnt[row_gid[r]]++; - memcpy(pos, offsets, (size_t)n_groups * sizeof(int64_t)); - for (int64_t r = 0; r < nrows; r++) { - int64_t gi = row_gid[r]; - if (gi >= 0) idx_buf[pos[gi]++] = r; + int64_t total = 0; + for (int64_t gi = 0; gi < n_groups; gi++) total += grp_cnt[gi]; + idx_hdr = ray_alloc((size_t)total * sizeof(int64_t)); + if (!idx_hdr) { + ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); + ray_free(off_hdr); ray_free(pos_hdr); + ray_release(result); ray_release(tbl); + return ray_error("oom", NULL); + } + idx_buf = (int64_t*)ray_data(idx_hdr); + + offsets[0] = 0; + for (int64_t gi = 1; gi < n_groups; gi++) + offsets[gi] = offsets[gi - 1] + grp_cnt[gi - 1]; + + memcpy(pos, offsets, (size_t)n_groups * sizeof(int64_t)); + for (int64_t r = 0; r < nrows; r++) { + int64_t gi = row_gid[r]; + if (gi >= 0) idx_buf[pos[gi]++] = r; + } } ray_t* scatter_err = NULL; @@ -4827,7 +4860,8 @@ ray_t* ray_select_fn(ray_t** args, int64_t n) { } ray_free(gk_hdr); ray_free(rg_hdr); ray_free(cnt_hdr); - ray_free(off_hdr); ray_free(pos_hdr); ray_free(idx_hdr); + ray_free(off_hdr); ray_free(pos_hdr); + if (idx_hdr) ray_free(idx_hdr); if (scatter_err) { if (result) ray_release(result);