From 5e22fb2ed1658eb63fa1393b693109d0bdae81b3 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Wed, 1 Apr 2026 21:47:37 +0000 Subject: [PATCH 1/7] Add a new GUC and the SortedMergeAdapter --- .../distributed/executor/multi_executor.c | 3 + .../distributed/executor/sorted_merge.c | 204 ++++++++++++++++++ src/backend/distributed/shared_library_init.c | 15 ++ src/include/distributed/citus_custom_scan.h | 3 + src/include/distributed/multi_executor.h | 1 + src/include/distributed/sorted_merge.h | 14 ++ 6 files changed, 240 insertions(+) diff --git a/src/backend/distributed/executor/multi_executor.c b/src/backend/distributed/executor/multi_executor.c index 8661d367345..91479019df1 100644 --- a/src/backend/distributed/executor/multi_executor.c +++ b/src/backend/distributed/executor/multi_executor.c @@ -88,6 +88,9 @@ bool SortReturning = false; /* when true at planning time, enables coordinator sorted merge for ORDER BY */ bool EnableSortedMerge = false; +/* when true, uses streaming adapter instead of eager merge for sorted merge */ +bool EnableStreamingSortedMerge = false; + /* * How many nested executors have we started? This can happen for SQL * UDF calls. The outer query starts an executor, then postgres opens diff --git a/src/backend/distributed/executor/sorted_merge.c b/src/backend/distributed/executor/sorted_merge.c index f3514dfdb38..76b9989b504 100644 --- a/src/backend/distributed/executor/sorted_merge.c +++ b/src/backend/distributed/executor/sorted_merge.c @@ -331,3 +331,207 @@ MergeHeapComparator(Datum a, Datum b, void *arg) return 0; } + + +/* + * SortedMergeAdapter streams tuples from K pre-sorted per-task stores + * via a binary heap, returning one globally-sorted tuple per call. + * + * This is the streaming replacement for MergePerTaskStoresIntoFinalStore(). + * Instead of copying all tuples into a final tuplestore, the adapter holds + * the per-task stores and heap alive, producing tuples on demand. + * + * Modeled after PostgreSQL's MergeAppend (nodeMergeAppend.c), which uses + * the same binary-heap-over-sorted-inputs pattern. + */ +struct SortedMergeAdapter +{ + Tuplestorestate **perTaskStores; /* K per-task stores (owned) */ + int nstores; + + binaryheap *heap; + + MergeContext mergeCtx; /* embedded — passed to heap as bh_arg */ + + TupleDesc tupleDesc; + bool exhausted; + bool initialized; +}; + + +/* + * CreateSortedMergeAdapter builds a streaming merge adapter over K per-task + * stores. The adapter takes ownership of perTaskStores — the caller must + * not free them; FreeSortedMergeAdapter() handles cleanup. + * + * All memory is allocated in CurrentMemoryContext. The caller must ensure + * this context outlives the adapter (the AdaptiveExecutor local context + * already satisfies this — see adaptive_executor.c). + */ +SortedMergeAdapter * +CreateSortedMergeAdapter(Tuplestorestate **perTaskStores, + int nstores, + SortedMergeKey *mergeKeys, + int nkeys, + TupleDesc tupleDesc) +{ + SortedMergeAdapter *adapter = palloc0(sizeof(SortedMergeAdapter)); + adapter->perTaskStores = perTaskStores; + adapter->nstores = nstores; + adapter->tupleDesc = tupleDesc; + + /* one comparison slot per store — owned via mergeCtx.slots */ + TupleTableSlot **slots = palloc(nstores * sizeof(TupleTableSlot *)); + for (int i = 0; i < nstores; i++) + { + slots[i] = MakeSingleTupleTableSlot(tupleDesc, &TTSOpsMinimalTuple); + } + + /* build SortSupport (same logic as MergePerTaskStoresIntoFinalStore) */ + SortSupportData *sortKeys = palloc0(nkeys * sizeof(SortSupportData)); + for (int i = 0; i < nkeys; i++) + { + SortSupport sk = &sortKeys[i]; + sk->ssup_cxt = CurrentMemoryContext; + sk->ssup_collation = mergeKeys[i].collation; + sk->ssup_nulls_first = mergeKeys[i].nullsFirst; + sk->ssup_attno = mergeKeys[i].attno; + PrepareSortSupportFromOrderingOp(mergeKeys[i].sortop, sk); + } + + /* set up embedded merge context for heap comparisons */ + adapter->mergeCtx.slots = slots; + adapter->mergeCtx.sortKeys = sortKeys; + adapter->mergeCtx.nkeys = nkeys; + + /* allocate heap with embedded context as comparator arg */ + adapter->heap = binaryheap_allocate(nstores, MergeHeapComparator, + &adapter->mergeCtx); + + return adapter; +} + + +/* + * SortedMergeAdapterNext returns the next globally-sorted tuple from the + * adapter by copying it into the provided scanSlot. Returns true if a tuple + * was returned, false if all stores are exhausted. + * + * The heap uses per-store comparison slots (mergeCtx.slots). After + * identifying the winner, we ExecCopySlot from the winner's comparison + * slot into the scan slot. This is a MinimalTuple copy, comparable in + * cost to the tuplestore_puttupleslot write in the eager merge path. + * + * On each call after the first, we advance the previous winner's store + * and update the heap before selecting the new winner. This matches the + * MergeAppend pattern in nodeMergeAppend.c. + */ +bool +SortedMergeAdapterNext(SortedMergeAdapter *adapter, TupleTableSlot *scanSlot) +{ + if (adapter->exhausted) + { + ExecClearTuple(scanSlot); + return false; + } + + if (!adapter->initialized) + { + /* first call: seed the heap with the first tuple from each store */ + for (int i = 0; i < adapter->nstores; i++) + { + tuplestore_rescan(adapter->perTaskStores[i]); + if (tuplestore_gettupleslot(adapter->perTaskStores[i], true, false, + adapter->mergeCtx.slots[i])) + { + binaryheap_add_unordered(adapter->heap, Int32GetDatum(i)); + } + } + binaryheap_build(adapter->heap); + adapter->initialized = true; + } + else + { + /* advance the previous winner and update the heap */ + int prevWinner = DatumGetInt32(binaryheap_first(adapter->heap)); + if (tuplestore_gettupleslot(adapter->perTaskStores[prevWinner], true, + false, adapter->mergeCtx.slots[prevWinner])) + { + binaryheap_replace_first(adapter->heap, Int32GetDatum(prevWinner)); + } + else + { + (void) binaryheap_remove_first(adapter->heap); + } + } + + if (binaryheap_empty(adapter->heap)) + { + adapter->exhausted = true; + ExecClearTuple(scanSlot); + return false; + } + + int winner = DatumGetInt32(binaryheap_first(adapter->heap)); + ExecCopySlot(scanSlot, adapter->mergeCtx.slots[winner]); + + return true; +} + + +/* + * SortedMergeAdapterRescan resets the adapter to re-read from the beginning. + * Called from CitusReScan() for cursor WITH HOLD patterns. + * + * Cost is O(K log K) to rebuild the heap, which is negligible for typical + * shard counts (4-64). Both binaryheap_reset() and tuplestore_rescan() + * are proven APIs used by PostgreSQL's ExecReScanMergeAppend. + */ +void +SortedMergeAdapterRescan(SortedMergeAdapter *adapter) +{ + binaryheap_reset(adapter->heap); + + for (int i = 0; i < adapter->nstores; i++) + { + tuplestore_rescan(adapter->perTaskStores[i]); + if (tuplestore_gettupleslot(adapter->perTaskStores[i], true, false, + adapter->mergeCtx.slots[i])) + { + binaryheap_add_unordered(adapter->heap, Int32GetDatum(i)); + } + } + binaryheap_build(adapter->heap); + + adapter->exhausted = false; + adapter->initialized = true; +} + + +/* + * FreeSortedMergeAdapter releases all adapter resources including + * per-task stores, comparison slots, sort keys, and the heap. + * Called from CitusEndScan() for deterministic cleanup. + */ +void +FreeSortedMergeAdapter(SortedMergeAdapter *adapter) +{ + if (adapter == NULL) + { + return; + } + + for (int i = 0; i < adapter->nstores; i++) + { + tuplestore_end(adapter->perTaskStores[i]); + ExecDropSingleTupleTableSlot(adapter->mergeCtx.slots[i]); + } + + binaryheap_free(adapter->heap); + pfree(adapter->mergeCtx.slots); + pfree(adapter->mergeCtx.sortKeys); + pfree(adapter->perTaskStores); + + /* mergeCtx is embedded in adapter, freed with the adapter itself */ + pfree(adapter); +} diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 48842050c3b..8b4b366ace3 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -1617,6 +1617,21 @@ RegisterCitusConfigVariables(void) GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, NULL, NULL, NULL); + DefineCustomBoolVariable( + "citus.enable_streaming_sorted_merge", + gettext_noop("Use streaming adapter instead of eager merge for sorted merge."), + gettext_noop("When enabled alongside citus.enable_sorted_merge, the coordinator " + "streams merged tuples directly from per-task stores via a binary " + "heap instead of eagerly copying all tuples into a final tuplestore. " + "This reduces memory usage and improves time-to-first-tuple, " + "especially for LIMIT queries. Requires citus.enable_sorted_merge " + "to also be enabled. This is an experimental feature."), + &EnableStreamingSortedMerge, + false, + PGC_USERSET, + GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + DefineCustomBoolVariable( "citus.enable_stat_counters", gettext_noop("Enables the collection of statistic counters for Citus."), diff --git a/src/include/distributed/citus_custom_scan.h b/src/include/distributed/citus_custom_scan.h index db1f0ce1f2a..dbe71df0856 100644 --- a/src/include/distributed/citus_custom_scan.h +++ b/src/include/distributed/citus_custom_scan.h @@ -28,6 +28,9 @@ typedef struct CitusScanState MultiExecutorType executorType; /* distributed executor type */ bool finishedRemoteScan; /* flag to check if remote scan is finished */ Tuplestorestate *tuplestorestate; /* tuple store to store distributed results */ + + /* streaming sorted merge adapter (NULL when not using sorted merge) */ + struct SortedMergeAdapter *mergeAdapter; } CitusScanState; diff --git a/src/include/distributed/multi_executor.h b/src/include/distributed/multi_executor.h index c18067b5499..da30bd0c838 100644 --- a/src/include/distributed/multi_executor.h +++ b/src/include/distributed/multi_executor.h @@ -71,6 +71,7 @@ extern int MaxAdaptiveExecutorPoolSize; extern int ExecutorSlowStartInterval; extern bool SortReturning; extern bool EnableSortedMerge; +extern bool EnableStreamingSortedMerge; extern int ExecutorLevel; diff --git a/src/include/distributed/sorted_merge.h b/src/include/distributed/sorted_merge.h index eeb3e690d35..d39f03ad777 100644 --- a/src/include/distributed/sorted_merge.h +++ b/src/include/distributed/sorted_merge.h @@ -18,6 +18,10 @@ #include "distributed/tuple_destination.h" +/* opaque streaming merge adapter — full definition in sorted_merge.c */ +typedef struct SortedMergeAdapter SortedMergeAdapter; + + extern TupleDestination * CreatePerTaskDispatchDest(List *taskList, TupleDesc tupleDesc, TupleDestinationStats *sharedStats, @@ -31,4 +35,14 @@ extern void MergePerTaskStoresIntoFinalStore(Tuplestorestate *finalStore, int nkeys, TupleDesc tupleDesc); +extern SortedMergeAdapter * CreateSortedMergeAdapter(Tuplestorestate **perTaskStores, + int nstores, + SortedMergeKey *mergeKeys, + int nkeys, + TupleDesc tupleDesc); +extern bool SortedMergeAdapterNext(SortedMergeAdapter *adapter, + TupleTableSlot *scanSlot); +extern void SortedMergeAdapterRescan(SortedMergeAdapter *adapter); +extern void FreeSortedMergeAdapter(SortedMergeAdapter *adapter); + #endif /* SORTED_MERGE_H */ From 71a653304dc675d975338d49f82115c4377b3b45 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Fri, 3 Apr 2026 20:17:55 +0000 Subject: [PATCH 2/7] Use the SortedMergeAdapter in the existing functions instead of duplicating code --- .../distributed/executor/sorted_merge.c | 151 +++++++----------- src/include/distributed/sorted_merge.h | 3 +- 2 files changed, 62 insertions(+), 92 deletions(-) diff --git a/src/backend/distributed/executor/sorted_merge.c b/src/backend/distributed/executor/sorted_merge.c index 76b9989b504..75e09f92baa 100644 --- a/src/backend/distributed/executor/sorted_merge.c +++ b/src/backend/distributed/executor/sorted_merge.c @@ -66,6 +66,32 @@ typedef struct MergeContext } MergeContext; +/* + * SortedMergeAdapter streams tuples from K pre-sorted per-task stores + * via a binary heap, returning one globally-sorted tuple per call. + * + * Used both as the streaming replacement for MergePerTaskStoresIntoFinalStore() + * and internally by that function itself (to avoid duplicating the merge logic). + * + * Modeled after PostgreSQL's MergeAppend (nodeMergeAppend.c), which uses + * the same binary-heap-over-sorted-inputs pattern. + */ +struct SortedMergeAdapter +{ + Tuplestorestate **perTaskStores; /* K per-task stores (not owned in eager mode) */ + int nstores; + bool ownsStores; /* if true, FreeSortedMergeAdapter frees stores */ + + binaryheap *heap; + + MergeContext mergeCtx; /* embedded — passed to heap as bh_arg */ + + TupleDesc tupleDesc; + bool exhausted; + bool initialized; +}; + + /* forward declarations */ static void PerTaskDispatchPutTuple(TupleDestination *self, Task *task, int placementIndex, int queryNumber, @@ -213,7 +239,9 @@ PerTaskDispatchTupleDescForQuery(TupleDestination *self, int queryNumber) * Each per-task store must contain tuples sorted by the given merge keys. * The output tuplestore will contain all tuples in globally sorted order. * - * Uses PostgreSQL's public binaryheap and SortSupport APIs. + * Implemented by creating a temporary SortedMergeAdapter, draining it into + * the final store, and freeing the adapter. The per-task stores are NOT + * freed by this function — the caller is responsible for that. */ void MergePerTaskStoresIntoFinalStore(Tuplestorestate *finalStore, @@ -228,69 +256,21 @@ MergePerTaskStoresIntoFinalStore(Tuplestorestate *finalStore, return; } - /* allocate one reusable slot per task store */ - TupleTableSlot **slots = palloc(nstores * sizeof(TupleTableSlot *)); - for (int i = 0; i < nstores; i++) - { - slots[i] = MakeSingleTupleTableSlot(tupleDesc, &TTSOpsMinimalTuple); - } + SortedMergeAdapter *adapter = CreateSortedMergeAdapter(perTaskStores, + nstores, mergeKeys, + nkeys, tupleDesc, + false); - /* build SortSupport from serialized merge keys */ - SortSupportData *sortKeys = palloc0(nkeys * sizeof(SortSupportData)); - for (int i = 0; i < nkeys; i++) - { - SortSupport sk = &sortKeys[i]; - sk->ssup_cxt = CurrentMemoryContext; - sk->ssup_collation = mergeKeys[i].collation; - sk->ssup_nulls_first = mergeKeys[i].nullsFirst; - sk->ssup_attno = mergeKeys[i].attno; - PrepareSortSupportFromOrderingOp(mergeKeys[i].sortop, sk); - } - - /* set up merge context for heap comparisons */ - MergeContext ctx; - ctx.slots = slots; - ctx.sortKeys = sortKeys; - ctx.nkeys = nkeys; - - binaryheap *heap = binaryheap_allocate(nstores, MergeHeapComparator, &ctx); + TupleTableSlot *slot = MakeSingleTupleTableSlot(tupleDesc, + &TTSOpsMinimalTuple); - /* seed the heap with the first tuple from each non-empty store */ - for (int i = 0; i < nstores; i++) + while (SortedMergeAdapterNext(adapter, slot)) { - tuplestore_rescan(perTaskStores[i]); - if (tuplestore_gettupleslot(perTaskStores[i], true, false, slots[i])) - { - binaryheap_add_unordered(heap, Int32GetDatum(i)); - } + tuplestore_puttupleslot(finalStore, slot); } - binaryheap_build(heap); - - /* merge loop: extract min, write to final store, advance winner */ - while (!binaryheap_empty(heap)) - { - int winner = DatumGetInt32(binaryheap_first(heap)); - tuplestore_puttupleslot(finalStore, slots[winner]); - if (tuplestore_gettupleslot(perTaskStores[winner], true, false, - slots[winner])) - { - binaryheap_replace_first(heap, Int32GetDatum(winner)); - } - else - { - (void) binaryheap_remove_first(heap); - } - } - - /* free merge-local resources */ - binaryheap_free(heap); - for (int i = 0; i < nstores; i++) - { - ExecDropSingleTupleTableSlot(slots[i]); - } - pfree(slots); - pfree(sortKeys); + ExecDropSingleTupleTableSlot(slot); + FreeSortedMergeAdapter(adapter); } @@ -333,36 +313,11 @@ MergeHeapComparator(Datum a, Datum b, void *arg) } -/* - * SortedMergeAdapter streams tuples from K pre-sorted per-task stores - * via a binary heap, returning one globally-sorted tuple per call. - * - * This is the streaming replacement for MergePerTaskStoresIntoFinalStore(). - * Instead of copying all tuples into a final tuplestore, the adapter holds - * the per-task stores and heap alive, producing tuples on demand. - * - * Modeled after PostgreSQL's MergeAppend (nodeMergeAppend.c), which uses - * the same binary-heap-over-sorted-inputs pattern. - */ -struct SortedMergeAdapter -{ - Tuplestorestate **perTaskStores; /* K per-task stores (owned) */ - int nstores; - - binaryheap *heap; - - MergeContext mergeCtx; /* embedded — passed to heap as bh_arg */ - - TupleDesc tupleDesc; - bool exhausted; - bool initialized; -}; - - /* * CreateSortedMergeAdapter builds a streaming merge adapter over K per-task - * stores. The adapter takes ownership of perTaskStores — the caller must - * not free them; FreeSortedMergeAdapter() handles cleanup. + * stores. When ownsStores is true, FreeSortedMergeAdapter() will call + * tuplestore_end() on each per-task store; when false, the caller retains + * ownership and must free them separately. * * All memory is allocated in CurrentMemoryContext. The caller must ensure * this context outlives the adapter (the AdaptiveExecutor local context @@ -373,11 +328,13 @@ CreateSortedMergeAdapter(Tuplestorestate **perTaskStores, int nstores, SortedMergeKey *mergeKeys, int nkeys, - TupleDesc tupleDesc) + TupleDesc tupleDesc, + bool ownsStores) { SortedMergeAdapter *adapter = palloc0(sizeof(SortedMergeAdapter)); adapter->perTaskStores = perTaskStores; adapter->nstores = nstores; + adapter->ownsStores = ownsStores; adapter->tupleDesc = tupleDesc; /* one comparison slot per store — owned via mergeCtx.slots */ @@ -387,7 +344,7 @@ CreateSortedMergeAdapter(Tuplestorestate **perTaskStores, slots[i] = MakeSingleTupleTableSlot(tupleDesc, &TTSOpsMinimalTuple); } - /* build SortSupport (same logic as MergePerTaskStoresIntoFinalStore) */ + /* build SortSupport from serialized merge keys */ SortSupportData *sortKeys = palloc0(nkeys * sizeof(SortSupportData)); for (int i = 0; i < nkeys; i++) { @@ -425,6 +382,11 @@ CreateSortedMergeAdapter(Tuplestorestate **perTaskStores, * On each call after the first, we advance the previous winner's store * and update the heap before selecting the new winner. This matches the * MergeAppend pattern in nodeMergeAppend.c. + * + * Possible perf optimizations to explore in the future: + * Avoid copying the winning tuple into the scan slot by returning a pointer to the winner's slot instead. + * This would require changes to the caller to not modify the returned slot and to understand that it's owned by the adapter until the next call. + * It would save a copy per tuple at the cost of a more complex API and potential lifetime management issues. */ bool SortedMergeAdapterNext(SortedMergeAdapter *adapter, TupleTableSlot *scanSlot) @@ -523,14 +485,21 @@ FreeSortedMergeAdapter(SortedMergeAdapter *adapter) for (int i = 0; i < adapter->nstores; i++) { - tuplestore_end(adapter->perTaskStores[i]); + if (adapter->ownsStores) + { + tuplestore_end(adapter->perTaskStores[i]); + } ExecDropSingleTupleTableSlot(adapter->mergeCtx.slots[i]); } binaryheap_free(adapter->heap); pfree(adapter->mergeCtx.slots); pfree(adapter->mergeCtx.sortKeys); - pfree(adapter->perTaskStores); + + if (adapter->ownsStores) + { + pfree(adapter->perTaskStores); + } /* mergeCtx is embedded in adapter, freed with the adapter itself */ pfree(adapter); diff --git a/src/include/distributed/sorted_merge.h b/src/include/distributed/sorted_merge.h index d39f03ad777..d82fd626030 100644 --- a/src/include/distributed/sorted_merge.h +++ b/src/include/distributed/sorted_merge.h @@ -39,7 +39,8 @@ extern SortedMergeAdapter * CreateSortedMergeAdapter(Tuplestorestate **perTaskSt int nstores, SortedMergeKey *mergeKeys, int nkeys, - TupleDesc tupleDesc); + TupleDesc tupleDesc, + bool ownsStores); extern bool SortedMergeAdapterNext(SortedMergeAdapter *adapter, TupleTableSlot *scanSlot); extern void SortedMergeAdapterRescan(SortedMergeAdapter *adapter); From 10ac20a030fe5d0153ff7e6351d464ff86701bce Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Fri, 3 Apr 2026 23:16:02 +0000 Subject: [PATCH 3/7] Wire up streaming k way merge and add integration test --- .../distributed/executor/adaptive_executor.c | 48 +- .../distributed/executor/citus_custom_scan.c | 14 +- .../distributed/executor/multi_executor.c | 54 +- .../distributed/planner/distributed_planner.c | 13 +- .../multi_orderby_pushdown_streaming.out | 1888 +++++++++++++++++ .../sql/multi_orderby_pushdown_streaming.sql | 14 + 6 files changed, 2002 insertions(+), 29 deletions(-) create mode 100644 src/test/regress/expected/multi_orderby_pushdown_streaming.out create mode 100644 src/test/regress/sql/multi_orderby_pushdown_streaming.sql diff --git a/src/backend/distributed/executor/adaptive_executor.c b/src/backend/distributed/executor/adaptive_executor.c index a4e5461e51e..53a6c4c7b46 100644 --- a/src/backend/distributed/executor/adaptive_executor.c +++ b/src/backend/distributed/executor/adaptive_executor.c @@ -947,23 +947,45 @@ AdaptiveExecutor(CitusScanState *scanState) * When sorted merge is active, k-way merge the per-task stores into * the final tuplestore. This produces globally sorted output that the * existing ReturnTupleFromTuplestore() path can read unchanged. + * + * When streaming sorted merge is enabled, create an adapter instead + * that delivers tuples one at a time without a final tuplestore. */ if (execution->useSortedMerge && execution->perTaskStoreCount > 0) { - scanState->tuplestorestate = - tuplestore_begin_heap(randomAccess, interTransactions, work_mem); - - MergePerTaskStoresIntoFinalStore(scanState->tuplestorestate, - execution->perTaskStores, - execution->perTaskStoreCount, - distributedPlan->sortedMergeKeys, - distributedPlan->sortedMergeKeyCount, - tupleDescriptor); - - /* free per-task stores — they are no longer needed */ - for (int i = 0; i < execution->perTaskStoreCount; i++) + if (EnableStreamingSortedMerge) + { + /* + * Streaming mode: create an adapter that delivers tuples one + * at a time from the per-task stores via a binary heap. The + * adapter takes ownership of the per-task stores. + */ + scanState->mergeAdapter = CreateSortedMergeAdapter( + execution->perTaskStores, + execution->perTaskStoreCount, + distributedPlan->sortedMergeKeys, + distributedPlan->sortedMergeKeyCount, + tupleDescriptor, + true); + } + else { - tuplestore_end(execution->perTaskStores[i]); + /* Eager mode (default): merge all tuples into a final tuplestore */ + scanState->tuplestorestate = + tuplestore_begin_heap(randomAccess, interTransactions, work_mem); + + MergePerTaskStoresIntoFinalStore(scanState->tuplestorestate, + execution->perTaskStores, + execution->perTaskStoreCount, + distributedPlan->sortedMergeKeys, + distributedPlan->sortedMergeKeyCount, + tupleDescriptor); + + /* free per-task stores — they are no longer needed */ + for (int i = 0; i < execution->perTaskStoreCount; i++) + { + tuplestore_end(execution->perTaskStores[i]); + } } } diff --git a/src/backend/distributed/executor/citus_custom_scan.c b/src/backend/distributed/executor/citus_custom_scan.c index db7e4f725ff..4b1c4701d08 100644 --- a/src/backend/distributed/executor/citus_custom_scan.c +++ b/src/backend/distributed/executor/citus_custom_scan.c @@ -46,6 +46,7 @@ #include "distributed/multi_router_planner.h" #include "distributed/multi_server_executor.h" #include "distributed/shard_utils.h" +#include "distributed/sorted_merge.h" #include "distributed/stats/query_stats.h" #include "distributed/stats/stat_counters.h" #include "distributed/subplan_execution.h" @@ -835,6 +836,12 @@ CitusEndScan(CustomScanState *node) CitusQueryStatsExecutorsEntry(queryId, executorType, partitionKeyString); } + if (scanState->mergeAdapter) + { + FreeSortedMergeAdapter(scanState->mergeAdapter); + scanState->mergeAdapter = NULL; + } + if (scanState->tuplestorestate) { tuplestore_end(scanState->tuplestorestate); @@ -857,7 +864,12 @@ CitusReScan(CustomScanState *node) ExecScanReScan(&node->ss); CitusScanState *scanState = (CitusScanState *) node; - if (scanState->tuplestorestate) + + if (scanState->mergeAdapter) + { + SortedMergeAdapterRescan(scanState->mergeAdapter); + } + else if (scanState->tuplestorestate) { tuplestore_rescan(scanState->tuplestorestate); } diff --git a/src/backend/distributed/executor/multi_executor.c b/src/backend/distributed/executor/multi_executor.c index 91479019df1..8f2774af057 100644 --- a/src/backend/distributed/executor/multi_executor.c +++ b/src/backend/distributed/executor/multi_executor.c @@ -50,6 +50,7 @@ #include "distributed/multi_server_executor.h" #include "distributed/relation_access_tracking.h" #include "distributed/resource_lock.h" +#include "distributed/sorted_merge.h" #include "distributed/transaction_management.h" #include "distributed/version_compat.h" #include "distributed/worker_protocol.h" @@ -346,21 +347,48 @@ CitusCustomScanStateWalker(PlanState *planState, List **citusCustomScanStates) /* - * ReturnTupleFromTuplestore reads the next tuple from the tuple store of the - * given Citus scan node and returns it. It returns null if all tuples are read - * from the tuple store. + * FetchNextScanTuple loads the next tuple into the scan slot. + * Returns true if a tuple was loaded, false if exhausted. + * + * When a merge adapter is active, it streams from the adapter. + * Otherwise, it reads from the tuplestore in the given direction. */ -TupleTableSlot * -ReturnTupleFromTuplestore(CitusScanState *scanState) +static inline bool +FetchNextScanTuple(CitusScanState *scanState, bool forward, TupleTableSlot *slot) { - Tuplestorestate *tupleStore = scanState->tuplestorestate; - bool forwardScanDirection = true; + if (scanState->mergeAdapter != NULL) + { + /* + * Adapter is forward-only. Backward scan should never reach here + * because the planner removes CUSTOMPATH_SUPPORT_BACKWARD_SCAN + * when sorted merge is active, causing PostgreSQL to insert a + * Material node above us for scrollable cursors. + */ + Assert(forward); + return SortedMergeAdapterNext(scanState->mergeAdapter, slot); + } + Tuplestorestate *tupleStore = scanState->tuplestorestate; if (tupleStore == NULL) { - return NULL; + ExecClearTuple(slot); + return false; } + return tuplestore_gettupleslot(tupleStore, forward, false, slot); +} + + +/* + * ReturnTupleFromTuplestore reads the next tuple from the tuple store (or + * streaming merge adapter) of the given Citus scan node and returns it. + * It returns null if all tuples are read. + */ +TupleTableSlot * +ReturnTupleFromTuplestore(CitusScanState *scanState) +{ + bool forwardScanDirection = true; + EState *executorState = ScanStateGetExecutorState(scanState); ScanDirection scanDirection = executorState->es_direction; Assert(ScanDirectionIsValid(scanDirection)); @@ -376,9 +404,9 @@ ReturnTupleFromTuplestore(CitusScanState *scanState) if (!qual && !projInfo) { - /* no quals, nor projections return directly from the tuple store. */ + /* no quals, nor projections return directly from the tuple source. */ TupleTableSlot *slot = scanState->customScanState.ss.ss_ScanTupleSlot; - tuplestore_gettupleslot(tupleStore, forwardScanDirection, false, slot); + FetchNextScanTuple(scanState, forwardScanDirection, slot); return slot; } @@ -397,12 +425,10 @@ ReturnTupleFromTuplestore(CitusScanState *scanState) ResetExprContext(econtext); TupleTableSlot *slot = scanState->customScanState.ss.ss_ScanTupleSlot; - tuplestore_gettupleslot(tupleStore, forwardScanDirection, false, slot); - - if (TupIsNull(slot)) + if (!FetchNextScanTuple(scanState, forwardScanDirection, slot)) { /* - * When the tuple is null we have reached the end of the tuplestore. We will + * When the tuple is null we have reached the end of the source. We will * return a null tuple, however, depending on the existence of a projection we * need to either return the scan tuple or the projected tuple. */ diff --git a/src/backend/distributed/planner/distributed_planner.c b/src/backend/distributed/planner/distributed_planner.c index d80216b3682..5961cca91a5 100644 --- a/src/backend/distributed/planner/distributed_planner.c +++ b/src/backend/distributed/planner/distributed_planner.c @@ -1499,7 +1499,18 @@ FinalizePlan(PlannedStmt *localPlan, DistributedPlan *distributedPlan) customScan->custom_private = list_make1(distributedPlanData); /* necessary to avoid extra Result node in PG15 */ - customScan->flags = CUSTOMPATH_SUPPORT_BACKWARD_SCAN | CUSTOMPATH_SUPPORT_PROJECTION; + int customFlags = CUSTOMPATH_SUPPORT_PROJECTION; + if (!(distributedPlan->useSortedMerge && EnableStreamingSortedMerge)) + { + /* + * Advertise backward-scan support unless both sorted merge and + * the streaming adapter are active. When streaming, the adapter + * is forward-only; PostgreSQL's planner will insert a Material + * node above us for scrollable cursors. + */ + customFlags |= CUSTOMPATH_SUPPORT_BACKWARD_SCAN; + } + customScan->flags = customFlags; /* * Fast path queries cannot have any subplans by definition, so skip diff --git a/src/test/regress/expected/multi_orderby_pushdown_streaming.out b/src/test/regress/expected/multi_orderby_pushdown_streaming.out new file mode 100644 index 00000000000..45d3522aed0 --- /dev/null +++ b/src/test/regress/expected/multi_orderby_pushdown_streaming.out @@ -0,0 +1,1888 @@ +-- +-- MULTI_SORTED_MERGE_STREAMING +-- +-- Runs the same test cases as multi_orderby_pushdown.sql but with the +-- streaming sorted merge adapter enabled via the GUC. This validates +-- that the streaming code path produces identical results to the eager +-- merge path. +-- +SET citus.enable_streaming_sorted_merge TO on; +\i sql/multi_orderby_pushdown.sql +-- +-- MULTI_SORTED_MERGE +-- +-- Tests for the citus.enable_sorted_merge GUC and the sorted merge +-- planner eligibility logic. Verifies that enabling the GUC does not +-- introduce regressions for any query pattern. +-- +SET citus.next_shard_id TO 960000; +-- ================================================================= +-- Setup: create test tables +-- ================================================================= +CREATE TABLE sorted_merge_test ( + id int, + val text, + num numeric, + ts timestamptz DEFAULT now() +); +SELECT create_distributed_table('sorted_merge_test', 'id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- Insert 100 rows + NULLs + duplicates +INSERT INTO sorted_merge_test (id, val, num) +SELECT i, 'val_' || i, (i * 1.5)::numeric +FROM generate_series(1, 100) i; +INSERT INTO sorted_merge_test (id, val, num) VALUES (101, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (102, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (200, 'dup_a', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (201, 'dup_b', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (202, 'dup_c', 10.5); +-- Second table for join tests +CREATE TABLE sorted_merge_events ( + id int, + event_type text, + event_val int +); +SELECT create_distributed_table('sorted_merge_events', 'id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +INSERT INTO sorted_merge_events +SELECT i % 50 + 1, CASE WHEN i % 3 = 0 THEN 'click' WHEN i % 3 = 1 THEN 'view' ELSE 'buy' END, i +FROM generate_series(1, 200) i; +-- ================================================================= +-- 1. GUC basics +-- ================================================================= +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge +--------------------------------------------------------------------- + off +(1 row) + +SET citus.enable_sorted_merge TO on; +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge +--------------------------------------------------------------------- + on +(1 row) + +SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Category A: Eligibility — sort IS pushed to workers +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- A1: ORDER BY distribution column +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 1027 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: 255 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) + +-- A2: ORDER BY DESC +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id FROM sorted_merge_test ORDER BY id DESC; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 420 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC + Tuple data received from node: 104 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id + Sort Key: sorted_merge_test.id DESC + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(15 rows) + +-- A3: ORDER BY DESC NULLS LAST +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.num + Task Count: 4 + Tuple data received from nodes: 1556 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num DESC NULLS LAST + Tuple data received from node: 392 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, num + Sort Key: sorted_merge_test.num DESC NULLS LAST + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, num +(15 rows) + +-- A4: ORDER BY non-distribution column +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test ORDER BY val; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 1027 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY val + Tuple data received from node: 255 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, val + Sort Key: sorted_merge_test.val + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) + +-- A5: Multi-column ORDER BY +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id, val; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 1027 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, val + Tuple data received from node: 255 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id, sorted_merge_test.val + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) + +-- A6: Mixed directions +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: 4 + Tuple data received from nodes: 2163 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, num DESC + Tuple data received from node: 543 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, val, num + Sort Key: sorted_merge_test.id, sorted_merge_test.num DESC + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num +(15 rows) + +-- A7: GROUP BY dist_col ORDER BY dist_col +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.count + Task Count: 4 + Tuple data received from nodes: 1260 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id + Tuple data received from node: 312 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(19 rows) + +-- A8: WHERE clause + ORDER BY +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=67 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 671 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (num OPERATOR(pg_catalog.>) '50'::numeric) ORDER BY id + Tuple data received from node: 130 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=13 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=13 loops=1) + Output: id, val + Filter: (sorted_merge_test.num > '50'::numeric) + Rows Removed by Filter: 13 +(17 rows) + +-- A9: Expression in ORDER BY (non-aggregate) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, num FROM sorted_merge_test ORDER BY id + 1; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.num, remote_scan.worker_column_3 + Task Count: 4 + Tuple data received from nodes: 1976 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, num, (id OPERATOR(pg_catalog.+) 1) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) 1) + Tuple data received from node: 496 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, num, ((id + 1)) + Sort Key: ((sorted_merge_test.id + 1)) + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, num, (id + 1) +(15 rows) + +-- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 80 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 20 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: id + -> Sort (actual rows=5 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(19 rows) + +-- ================================================================= +-- Category B: Ineligibility — sort NOT pushed for merge +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- B1: ORDER BY count(*) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*); + QUERY PLAN +--------------------------------------------------------------------- + Sort (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.count + Sort Key: remote_scan.count + Sort Method: quicksort Memory: 28kB + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.count + Task Count: 4 + Tuple data received from nodes: 1260 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: 312 bytes + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(19 rows) + +-- B2: ORDER BY avg(col) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num); + QUERY PLAN +--------------------------------------------------------------------- + Sort (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.avg + Sort Key: remote_scan.avg + Sort Method: quicksort Memory: 28kB + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.avg + Task Count: 4 + Tuple data received from nodes: 1556 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, avg(num) AS avg FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: 392 bytes + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate (actual rows=26 loops=1) + Output: id, avg(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(19 rows) + +-- B3: GROUP BY non-dist col, ORDER BY non-dist col +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val; + QUERY PLAN +--------------------------------------------------------------------- + Sort (actual rows=104 loops=1) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + Sort Key: remote_scan.val + Sort Method: quicksort Memory: 28kB + -> HashAggregate (actual rows=104 loops=1) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.val, remote_scan.count + Task Count: 4 + Tuple data received from nodes: 1447 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: 359 bytes + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate (actual rows=26 loops=1) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(23 rows) + +-- B4: GROUP BY non-dist col, ORDER BY aggregate +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*); + QUERY PLAN +--------------------------------------------------------------------- + Sort (actual rows=104 loops=1) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + Sort Method: quicksort Memory: 28kB + -> HashAggregate (actual rows=104 loops=1) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.val, remote_scan.count + Task Count: 4 + Tuple data received from nodes: 1447 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: 359 bytes + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate (actual rows=26 loops=1) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(23 rows) + +-- ================================================================= +-- Category C: Correctness — results match GUC off vs on +-- ================================================================= +-- C1: Simple ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- C2: ORDER BY DESC +SET citus.enable_sorted_merge TO off; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id +--------------------------------------------------------------------- + 202 + 201 + 200 + 102 + 101 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id +--------------------------------------------------------------------- + 202 + 201 + 200 + 102 + 101 +(5 rows) + +-- C3: Multi-column ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- C4: ORDER BY non-distribution column +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- C5: GROUP BY dist_col ORDER BY dist_col +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- C6: Mixed directions +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- C7: WHERE + ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +-- C8: Aggregates in SELECT, ORDER BY on dist_col (GROUP BY dist_col) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +-- ================================================================= +-- Category D: Complex queries — regression guards +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- D1: Subquery in FROM with ORDER BY +SELECT * FROM ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) sub ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D2: CTE with ORDER BY +WITH top5 AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) +SELECT * FROM top5 ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D3: Co-located JOIN + ORDER BY +SELECT t.id, t.val, e.event_type +FROM sorted_merge_test t +JOIN sorted_merge_events e ON t.id = e.id +WHERE t.id <= 5 +ORDER BY t.id, e.event_type +LIMIT 10; + id | val | event_type +--------------------------------------------------------------------- + 1 | val_1 | buy + 1 | val_1 | buy + 1 | val_1 | click + 1 | val_1 | view + 2 | val_2 | buy + 2 | val_2 | click + 2 | val_2 | view + 2 | val_2 | view + 3 | val_3 | buy + 3 | val_3 | buy +(10 rows) + +-- D4: UNION ALL + ORDER BY +SELECT id, val FROM sorted_merge_test WHERE id <= 3 +UNION ALL +SELECT id, val FROM sorted_merge_test WHERE id BETWEEN 98 AND 100 +ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 98 | val_98 + 99 | val_99 + 100 | val_100 +(6 rows) + +-- D5: DISTINCT + ORDER BY +SELECT DISTINCT id FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +-- D6: DISTINCT ON + ORDER BY +SELECT DISTINCT ON (id) id, val, num +FROM sorted_merge_test +WHERE id <= 5 +ORDER BY id, num DESC; + id | val | num +--------------------------------------------------------------------- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) + +-- D7: EXISTS subquery + ORDER BY +SELECT id, val FROM sorted_merge_test t +WHERE EXISTS (SELECT 1 FROM sorted_merge_events e WHERE e.id = t.id) +ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D8: IN subquery + ORDER BY +SELECT id, val FROM sorted_merge_test +WHERE id IN (SELECT id FROM sorted_merge_events WHERE event_type = 'click') +ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D9: Multiple aggregates, GROUP BY dist_col, ORDER BY dist_col +SELECT id, count(*), sum(num), avg(num), min(val), max(val) +FROM sorted_merge_test +GROUP BY id +ORDER BY id +LIMIT 5; + id | count | sum | avg | min | max +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 | val_1 | val_1 + 2 | 1 | 3.0 | 3.0000000000000000 | val_2 | val_2 + 3 | 1 | 4.5 | 4.5000000000000000 | val_3 | val_3 + 4 | 1 | 6.0 | 6.0000000000000000 | val_4 | val_4 + 5 | 1 | 7.5 | 7.5000000000000000 | val_5 | val_5 +(5 rows) + +-- D10: CASE expression in SELECT + ORDER BY +SELECT id, + CASE WHEN num > 75 THEN 'high' WHEN num > 25 THEN 'mid' ELSE 'low' END as bucket +FROM sorted_merge_test +WHERE num IS NOT NULL +ORDER BY id +LIMIT 10; + id | bucket +--------------------------------------------------------------------- + 1 | low + 2 | low + 3 | low + 4 | low + 5 | low + 6 | low + 7 | low + 8 | low + 9 | low + 10 | low +(10 rows) + +-- D11: NULL values ordering +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS FIRST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS LAST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS FIRST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST, id DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 +(5 rows) + +-- D12: Large OFFSET +SELECT id FROM sorted_merge_test ORDER BY id OFFSET 100 LIMIT 5; + id +--------------------------------------------------------------------- + 101 + 102 + 200 + 201 + 202 +(5 rows) + +-- D13: ORDER BY ordinal position +SELECT id, val FROM sorted_merge_test ORDER BY 2, 1 LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- ================================================================= +-- Category E: Edge cases +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- E1: Empty result set +SELECT id FROM sorted_merge_test WHERE id < 0 ORDER BY id; + id +--------------------------------------------------------------------- +(0 rows) + +-- E2: Single row (may go through router planner) +SELECT id, val FROM sorted_merge_test WHERE id = 42 ORDER BY id; + id | val +--------------------------------------------------------------------- + 42 | val_42 +(1 row) + +-- E3: All rows with same sort value +SELECT id, num FROM sorted_merge_test WHERE num = 10.5 ORDER BY num, id; + id | num +--------------------------------------------------------------------- + 7 | 10.5 + 200 | 10.5 + 201 | 10.5 + 202 | 10.5 +(4 rows) + +-- E4: Wide sort key (4 columns) +SELECT id, val, num FROM sorted_merge_test +WHERE id <= 5 +ORDER BY num, val, id +LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) + +-- E5: Zero-task defensive path +-- CreatePerTaskDispatchDest handles taskCount=0 gracefully (returns a no-op +-- destination). This cannot be triggered via normal SQL because distributed +-- tables always have at least one shard. The closest we can test is an +-- empty-result query through the sorted merge path to verify no crash. +SELECT id FROM sorted_merge_test WHERE false ORDER BY id; + id +--------------------------------------------------------------------- +(0 rows) + +-- ================================================================= +-- Category F: Existing LIMIT pushdown stability +-- ================================================================= +-- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on +SET citus.enable_sorted_merge TO off; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id + -> Sort (actual rows=5 loops=1) + Output: remote_scan.id + Sort Key: remote_scan.id + Sort Method: top-N heapsort Memory: 25kB + -> Custom Scan (Citus Adaptive) (actual rows=20 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 80 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 20 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: id + -> Sort (actual rows=5 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(23 rows) + +SET citus.enable_sorted_merge TO on; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 80 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 20 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: id + -> Sort (actual rows=5 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(19 rows) + +-- F2: GROUP BY dist_col + ORDER BY + LIMIT +SET citus.enable_sorted_merge TO off; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.count + -> Sort (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.count + Sort Key: remote_scan.id + Sort Method: top-N heapsort Memory: 25kB + -> Custom Scan (Citus Adaptive) (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.count + Task Count: 4 + Tuple data received from nodes: 240 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 60 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: id, (count(*)) + -> Sort (actual rows=5 loops=1) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(27 rows) + +SET citus.enable_sorted_merge TO on; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.count + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.count + Task Count: 4 + Tuple data received from nodes: 240 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 60 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: id, (count(*)) + -> Sort (actual rows=5 loops=1) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(23 rows) + +-- F3: ORDER BY aggregate + LIMIT (not eligible for merge) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- ================================================================= +-- Category G: Phase 4 — Sort elision and advanced scenarios +-- ================================================================= +-- G1: Sort elision verification — coordinator Sort node absent +SET citus.enable_sorted_merge TO off; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------- + Sort (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val + Sort Key: remote_scan.id + Sort Method: quicksort Memory: 28kB + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 1027 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true + Tuple data received from node: 255 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) + +SET citus.enable_sorted_merge TO on; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 1027 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: 255 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) + +-- G2a: PREPARE with merge ON, EXECUTE after turning OFF +-- Plan-time decision is baked in — cached plan must still merge correctly +SET citus.enable_sorted_merge TO on; +PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO off; +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +DEALLOCATE merge_on_stmt; +-- G2b: PREPARE with merge OFF, EXECUTE after turning ON +-- Cached plan has Sort node — must still return sorted results +SET citus.enable_sorted_merge TO off; +PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO on; +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +DEALLOCATE merge_off_stmt; +-- G3: Cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_cursor CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 1 + 2 + 3 +(3 rows) + +FETCH BACKWARD 1 FROM sorted_cursor; +psql:sql/multi_orderby_pushdown.sql:355: ERROR: cursor can only scan forward +HINT: Declare it with SCROLL option to enable backward scan. +FETCH 2 FROM sorted_cursor; +psql:sql/multi_orderby_pushdown.sql:356: ERROR: current transaction is aborted, commands ignored until end of transaction block +CLOSE sorted_cursor; +psql:sql/multi_orderby_pushdown.sql:357: ERROR: current transaction is aborted, commands ignored until end of transaction block +COMMIT; +-- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) +SET citus.enable_sorted_merge TO on; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 80 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 20 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: id + -> Sort (actual rows=5 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(19 rows) + +-- G5: ORDER BY aggregate + LIMIT — crash regression test +-- Previously caused SIGSEGV when sorted merge was enabled because +-- aggregate ORDER BY was erroneously tagged as merge-eligible. +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 3; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 +(3 rows) + +-- G6: Small work_mem with many tasks (32 shards) +SET citus.enable_sorted_merge TO on; +SET work_mem TO '64kB'; +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +RESET work_mem; +-- G7: max_intermediate_result_size with CTE subplan +SET citus.enable_sorted_merge TO on; +SET citus.max_intermediate_result_size TO '4kB'; +WITH cte AS (SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 50) +SELECT * FROM cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; +-- ================================================================= +-- Category H: Subplan + Sorted Merge interactions +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1: CTE subplan with simple ORDER BY — eligible for sorted merge +-- The CTE becomes a subplan; its DistributedPlan may have useSortedMerge=true +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- H2: Multiple CTEs — one eligible (ORDER BY col), one ineligible (ORDER BY agg) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + id | val | cnt +--------------------------------------------------------------------- + 1 | val_1 | 1 + 2 | val_2 | 1 + 3 | val_3 | 1 + 4 | val_4 | 1 + 5 | val_5 | 1 + 6 | val_6 | 1 + 7 | val_7 | 1 + 8 | val_8 | 1 + 9 | val_9 | 1 + 10 | val_10 | 1 + 11 | val_11 | 1 + 12 | val_12 | 1 + 13 | val_13 | 1 + 14 | val_14 | 1 + 15 | val_15 | 1 +(15 rows) + +-- H3: CTE subplan feeding outer ORDER BY — both levels may merge independently +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- H4: Subquery in WHERE with ORDER BY + LIMIT — becomes subplan with merge +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 +(3 rows) + +-- H5: CTE subplan with max_intermediate_result_size enforcement +-- Tests that EnsureIntermediateSizeLimitNotExceeded works through per-task dispatch +SET citus.max_intermediate_result_size TO '4kB'; +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; +-- H6: Cross-join subplan with non-aggregate ORDER BY (crash regression variant) +-- Similar pattern to subquery_complex_target_list but without aggregate ORDER BY +SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + id | bar_id +--------------------------------------------------------------------- + 1 | 1 + 1 | 1 + 1 | 1 + 2 | 1 + 2 | 1 +(5 rows) + +-- H7: CTE correctness comparison — GUC off vs on must produce identical results +SET citus.enable_sorted_merge TO off; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +-- ================================================================= +-- Category H EXPLAIN: Query plans for subplan + sorted merge +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1 EXPLAIN +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 191 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT ordered_cte.id AS worker_column_1, ordered_cte.val AS worker_column_2 FROM (SELECT sorted_merge_test.id, sorted_merge_test.val FROM public.sorted_merge_test_960000 sorted_merge_test ORDER BY sorted_merge_test.id) ordered_cte) worker_subquery ORDER BY worker_column_1 LIMIT '5'::bigint + Tuple data received from node: 47 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=5 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val +(19 rows) + +-- H2 EXPLAIN +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.cnt + -> Distributed Subplan XXX_1 + Intermediate Data Size: 397 bytes + Result destination: Write locally + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 791 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 197 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=20 loops=1) + Output: id, val + -> Sort (actual rows=20 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val + -> Distributed Subplan XXX_2 + Intermediate Data Size: 330 bytes + Result destination: Write locally + -> Limit (actual rows=15 loops=1) + Output: remote_scan.id, remote_scan.cnt + -> Sort (actual rows=15 loops=1) + Output: remote_scan.id, remote_scan.cnt + Sort Key: remote_scan.cnt DESC, remote_scan.id + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.cnt + Task Count: 4 + Tuple data received from nodes: 720 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, count(*) AS cnt FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (count(*)) DESC, id LIMIT '15'::bigint + Tuple data received from node: 180 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=15 loops=1) + Output: id, (count(*)) + -> Sort (actual rows=15 loops=1) + Output: id, (count(*)) + Sort Key: (count(*)) DESC, sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts + Task Count: 1 + Tuple data received from nodes: 87 bytes + Tasks Shown: All + -> Task + Query: SELECT e.id, e.val, i.cnt FROM ((SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) e JOIN (SELECT intermediate_result.id, intermediate_result.cnt FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer, cnt bigint)) i ON ((e.id OPERATOR(pg_catalog.=) i.id))) ORDER BY e.id + Tuple data received from node: 87 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Merge Join (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result_1.cnt + Merge Cond: (intermediate_result.id = intermediate_result_1.id) + -> Sort (actual rows=6 loops=1) + Output: intermediate_result.id, intermediate_result.val + Sort Key: intermediate_result.id + Sort Method: quicksort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Sort (actual rows=15 loops=1) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Sort Key: intermediate_result_1.id + Sort Method: quicksort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=15 loops=1) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(77 rows) + +-- H3 EXPLAIN +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=10 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=10 loops=1) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: 200 bytes + Result destination: Send to 2 nodes + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 320 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 80 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=20 loops=1) + Output: id + -> Sort (actual rows=20 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id + Task Count: 4 + Tuple data received from nodes: 97 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT t.id AS worker_column_1, t.val AS worker_column_2 FROM (public.sorted_merge_test_960000 t JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) top_ids ON ((t.id OPERATOR(pg_catalog.=) top_ids.id)))) worker_subquery ORDER BY worker_column_1 LIMIT '10'::bigint + Tuple data received from node: 97 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=10 loops=1) + Output: t.id, t.val + -> Merge Join (actual rows=10 loops=1) + Output: t.id, t.val + Merge Cond: (intermediate_result.id = t.id) + -> Sort (actual rows=10 loops=1) + Output: intermediate_result.id + Sort Key: intermediate_result.id + Sort Method: quicksort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Sort (actual rows=10 loops=1) + Output: t.id, t.val + Sort Key: t.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 t (actual rows=26 loops=1) + Output: t.id, t.val +(51 rows) + +-- H4 EXPLAIN +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=3 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=3 loops=1) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: 100 bytes + Result destination: Send to 2 nodes + -> Limit (actual rows=10 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=40 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 160 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT '10'::bigint + Tuple data received from node: 40 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=10 loops=1) + Output: id + -> Sort (actual rows=10 loops=1) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=56 loops=1) + Output: id + Task Count: 4 + Tuple data received from nodes: 27 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT sorted_merge_test.id AS worker_column_1, sorted_merge_test.val AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (sorted_merge_test.id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)))) worker_subquery ORDER BY worker_column_1 LIMIT '5'::bigint + Tuple data received from node: 27 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=3 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=3 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Hash Semi Join (actual rows=3 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val + Hash Cond: (sorted_merge_test.id = intermediate_result.id) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val, sorted_merge_test.num, sorted_merge_test.ts + -> Hash (actual rows=10 loops=1) + Output: intermediate_result.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=10 loops=1) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(50 rows) + +-- H5 EXPLAIN +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: 397 bytes + Result destination: Write locally + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 791 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 197 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=20 loops=1) + Output: id, val + -> Sort (actual rows=20 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val + Task Count: 1 + Tuple data received from nodes: 47 bytes + Tasks Shown: All + -> Task + Query: SELECT id, val FROM (SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) small_cte ORDER BY id LIMIT 5 + Tuple data received from node: 47 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val + -> Sort (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(40 rows) + +-- H6 EXPLAIN +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.bar_id + -> Distributed Subplan XXX_1 + Intermediate Data Size: 30 bytes + Result destination: Write locally + -> Limit (actual rows=3 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=12 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 48 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '3'::bigint + Tuple data received from node: 12 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=3 loops=1) + Output: id + -> Sort (actual rows=3 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id + -> Distributed Subplan XXX_2 + Intermediate Data Size: 30 bytes + Result destination: Write locally + -> Limit (actual rows=3 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=12 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 48 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT '3'::bigint + Tuple data received from node: 12 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=3 loops=1) + Output: id + -> Sort (actual rows=3 loops=1) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=56 loops=1) + Output: id + Task Count: 1 + Tuple data received from nodes: 40 bytes + Tasks Shown: All + -> Task + Query: SELECT foo.id, bar.id AS bar_id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) foo, (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) bar ORDER BY foo.id, bar.id LIMIT 5 + Tuple data received from node: 40 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result_1.id + -> Sort (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result_1.id + Sort Key: intermediate_result.id, intermediate_result_1.id + Sort Method: quicksort Memory: 25kB + -> Nested Loop (actual rows=9 loops=1) + Output: intermediate_result.id, intermediate_result_1.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=3 loops=1) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=3 loops=3) + Output: intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(67 rows) + +-- H7 EXPLAIN — GUC off vs on +SET citus.enable_sorted_merge TO off; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Distributed Subplan XXX_1 + Intermediate Data Size: 691 bytes + Result destination: Write locally + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Sort (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: 4 + Tuple data received from nodes: 1673 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 419 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=20 loops=1) + Output: id, val, num + -> Sort (actual rows=20 loops=1) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num + Task Count: 1 + Tuple data received from nodes: 103 bytes + Tasks Shown: All + -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (10)::numeric) ORDER BY id LIMIT 5 + Tuple data received from node: 103 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=14 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > '10'::numeric) + Rows Removed by Filter: 6 +(45 rows) + +SET citus.enable_sorted_merge TO on; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Distributed Subplan XXX_1 + Intermediate Data Size: 699 bytes + Result destination: Write locally + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: 4 + Tuple data received from nodes: 1673 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 419 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=20 loops=1) + Output: id, val, num + -> Sort (actual rows=20 loops=1) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num + Task Count: 1 + Tuple data received from nodes: 101 bytes + Tasks Shown: All + -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (10)::numeric) ORDER BY id LIMIT 5 + Tuple data received from node: 101 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=18 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > '10'::numeric) + Rows Removed by Filter: 2 +(42 rows) + +-- ================================================================= +-- Cleanup +-- ================================================================= +SET citus.enable_sorted_merge TO off; +DROP TABLE sorted_merge_test; +DROP TABLE sorted_merge_events; +RESET citus.enable_streaming_sorted_merge; diff --git a/src/test/regress/sql/multi_orderby_pushdown_streaming.sql b/src/test/regress/sql/multi_orderby_pushdown_streaming.sql new file mode 100644 index 00000000000..e7faed04373 --- /dev/null +++ b/src/test/regress/sql/multi_orderby_pushdown_streaming.sql @@ -0,0 +1,14 @@ +-- +-- MULTI_SORTED_MERGE_STREAMING +-- +-- Runs the same test cases as multi_orderby_pushdown.sql but with the +-- streaming sorted merge adapter enabled via the GUC. This validates +-- that the streaming code path produces identical results to the eager +-- merge path. +-- + +SET citus.enable_streaming_sorted_merge TO on; + +\i sql/multi_orderby_pushdown.sql + +RESET citus.enable_streaming_sorted_merge; From f1f7f3599b7905d85476f73e1ef926869f8ab98e Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Mon, 13 Apr 2026 17:48:32 +0000 Subject: [PATCH 4/7] Refactor regress test for order by pushdown and handle scroll cursors --- .../distributed/executor/multi_executor.c | 22 +- .../distributed/planner/distributed_planner.c | 16 + .../multi_orderby_pushdown_streaming.out | 1947 ++++++++++++++++- .../regress/sql/multi_orderby_pushdown.sql | 48 +- .../sql/multi_orderby_pushdown_streaming.sql | 22 +- .../sql/setup_multi_orderby_pushdown.sql | 45 + 6 files changed, 2033 insertions(+), 67 deletions(-) create mode 100644 src/test/regress/sql/setup_multi_orderby_pushdown.sql diff --git a/src/backend/distributed/executor/multi_executor.c b/src/backend/distributed/executor/multi_executor.c index 8f2774af057..e9857fda136 100644 --- a/src/backend/distributed/executor/multi_executor.c +++ b/src/backend/distributed/executor/multi_executor.c @@ -359,12 +359,24 @@ FetchNextScanTuple(CitusScanState *scanState, bool forward, TupleTableSlot *slot if (scanState->mergeAdapter != NULL) { /* - * Adapter is forward-only. Backward scan should never reach here - * because the planner removes CUSTOMPATH_SUPPORT_BACKWARD_SCAN - * when sorted merge is active, causing PostgreSQL to insert a - * Material node above us for scrollable cursors. + * The streaming merge adapter is forward-only. + * + * Citus replaces the entire plan tree after standard_planner() + * returns, so PostgreSQL's cursor-time materialize_finished_plan() + * check does not see the Citus CustomScan. That means SCROLL + * cursors can reach here with a backward scan request even though + * the adapter cannot satisfy it. Report a user-facing error + * rather than crashing. */ - Assert(forward); + if (!forward) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("streaming sorted merge does not support " + "backward scan"), + errhint("Use SET citus.enable_streaming_sorted_merge " + "TO off to allow backward scan."))); + } return SortedMergeAdapterNext(scanState->mergeAdapter, slot); } diff --git a/src/backend/distributed/planner/distributed_planner.c b/src/backend/distributed/planner/distributed_planner.c index 5961cca91a5..0db8cf338cf 100644 --- a/src/backend/distributed/planner/distributed_planner.c +++ b/src/backend/distributed/planner/distributed_planner.c @@ -855,6 +855,22 @@ CreateDistributedPlannedStmt(DistributedPlanningContext *planContext) /* create final plan by combining local plan with distributed plan */ resultPlan = FinalizePlan(planContext->plan, distributedPlan); + /* + * When the streaming sorted merge adapter is active, the CustomScan + * does not support backward scan. If the query is a SCROLL cursor, + * insert a Material node above the plan tree so backward fetches work. + * + * Normally standard_planner() handles this (planner.c:447-451), but + * Citus replaces the plan tree after standard_planner returns via + * FinalizePlan(), losing any Material node it inserted. + */ + if ((planContext->cursorOptions & CURSOR_OPT_SCROLL) && + distributedPlan->useSortedMerge && EnableStreamingSortedMerge && + !ExecSupportsBackwardScan(resultPlan->planTree)) + { + resultPlan->planTree = materialize_finished_plan(resultPlan->planTree); + } + /* * As explained above, force planning costs to be unrealistically high if * query planning failed (possibly) due to prepared statement parameters or diff --git a/src/test/regress/expected/multi_orderby_pushdown_streaming.out b/src/test/regress/expected/multi_orderby_pushdown_streaming.out index 45d3522aed0..b000e7a0c5f 100644 --- a/src/test/regress/expected/multi_orderby_pushdown_streaming.out +++ b/src/test/regress/expected/multi_orderby_pushdown_streaming.out @@ -1,19 +1,21 @@ -- --- MULTI_SORTED_MERGE_STREAMING +-- MULTI_ORDERBY_PUSHDOWN_STREAMING -- --- Runs the same test cases as multi_orderby_pushdown.sql but with the --- streaming sorted merge adapter enabled via the GUC. This validates --- that the streaming code path produces identical results to the eager --- merge path. +-- Runs the sorted merge test suite (multi_orderby_pushdown.sql) twice: +-- first with the default eager-merge path, then with the streaming +-- adapter enabled via citus.enable_streaming_sorted_merge. Both runs +-- share the same setup tables and must produce identical results +-- (except for the G3 backward-scan test, where the streaming adapter's +-- forward-only cursor correctly errors on FETCH BACKWARD). -- -SET citus.enable_streaming_sorted_merge TO on; -\i sql/multi_orderby_pushdown.sql +\i sql/setup_multi_orderby_pushdown.sql -- --- MULTI_SORTED_MERGE +-- SETUP_MULTI_ORDERBY_PUSHDOWN -- --- Tests for the citus.enable_sorted_merge GUC and the sorted merge --- planner eligibility logic. Verifies that enabling the GUC does not --- introduce regressions for any query pattern. +-- Creates the test tables and data used by multi_orderby_pushdown.sql +-- and its variants (e.g., multi_orderby_pushdown_streaming.sql). +-- This file is meant to be included via \i from test files that need +-- these tables. -- SET citus.next_shard_id TO 960000; -- ================================================================= @@ -55,6 +57,15 @@ SELECT create_distributed_table('sorted_merge_events', 'id'); INSERT INTO sorted_merge_events SELECT i % 50 + 1, CASE WHEN i % 3 = 0 THEN 'click' WHEN i % 3 = 1 THEN 'view' ELSE 'buy' END, i FROM generate_series(1, 200) i; +-- Run 1: eager merge (default) +\i sql/multi_orderby_pushdown.sql +-- +-- MULTI_SORTED_MERGE +-- +-- Tests for the citus.enable_sorted_merge GUC and the sorted merge +-- planner eligibility logic. Verifies that enabling the GUC does not +-- introduce regressions for any query pattern. +-- -- ================================================================= -- 1. GUC basics -- ================================================================= @@ -1164,12 +1175,46 @@ FETCH 3 FROM sorted_cursor; (3 rows) FETCH BACKWARD 1 FROM sorted_cursor; -psql:sql/multi_orderby_pushdown.sql:355: ERROR: cursor can only scan forward -HINT: Declare it with SCROLL option to enable backward scan. + id +--------------------------------------------------------------------- + 2 +(1 row) + FETCH 2 FROM sorted_cursor; -psql:sql/multi_orderby_pushdown.sql:356: ERROR: current transaction is aborted, commands ignored until end of transaction block + id +--------------------------------------------------------------------- + 3 + 4 +(2 rows) + CLOSE sorted_cursor; -psql:sql/multi_orderby_pushdown.sql:357: ERROR: current transaction is aborted, commands ignored until end of transaction block +COMMIT; +-- G3b: SCROLL cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_scroll_cursor SCROLL CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 1 + 2 + 3 +(3 rows) + +FETCH BACKWARD 1 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 2 +(1 row) + +FETCH 2 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 3 + 4 +(2 rows) + +CLOSE sorted_scroll_cursor; COMMIT; -- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) SET citus.enable_sorted_merge TO on; @@ -1883,6 +1928,1872 @@ SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; -- Cleanup -- ================================================================= SET citus.enable_sorted_merge TO off; -DROP TABLE sorted_merge_test; -DROP TABLE sorted_merge_events; -RESET citus.enable_streaming_sorted_merge; +-- Run 2: streaming adapter +SET citus.enable_streaming_sorted_merge TO on; +\i sql/multi_orderby_pushdown.sql +-- +-- MULTI_SORTED_MERGE +-- +-- Tests for the citus.enable_sorted_merge GUC and the sorted merge +-- planner eligibility logic. Verifies that enabling the GUC does not +-- introduce regressions for any query pattern. +-- +-- ================================================================= +-- 1. GUC basics +-- ================================================================= +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge +--------------------------------------------------------------------- + off +(1 row) + +SET citus.enable_sorted_merge TO on; +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge +--------------------------------------------------------------------- + on +(1 row) + +SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Category A: Eligibility — sort IS pushed to workers +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- A1: ORDER BY distribution column +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 1027 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: 255 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) + +-- A2: ORDER BY DESC +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id FROM sorted_merge_test ORDER BY id DESC; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 420 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC + Tuple data received from node: 104 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id + Sort Key: sorted_merge_test.id DESC + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(15 rows) + +-- A3: ORDER BY DESC NULLS LAST +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.num + Task Count: 4 + Tuple data received from nodes: 1556 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num DESC NULLS LAST + Tuple data received from node: 392 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, num + Sort Key: sorted_merge_test.num DESC NULLS LAST + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, num +(15 rows) + +-- A4: ORDER BY non-distribution column +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test ORDER BY val; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 1027 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY val + Tuple data received from node: 255 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, val + Sort Key: sorted_merge_test.val + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) + +-- A5: Multi-column ORDER BY +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id, val; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 1027 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, val + Tuple data received from node: 255 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id, sorted_merge_test.val + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) + +-- A6: Mixed directions +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: 4 + Tuple data received from nodes: 2163 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, num DESC + Tuple data received from node: 543 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, val, num + Sort Key: sorted_merge_test.id, sorted_merge_test.num DESC + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num +(15 rows) + +-- A7: GROUP BY dist_col ORDER BY dist_col +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.count + Task Count: 4 + Tuple data received from nodes: 1260 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id + Tuple data received from node: 312 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(19 rows) + +-- A8: WHERE clause + ORDER BY +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=67 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 671 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (num OPERATOR(pg_catalog.>) '50'::numeric) ORDER BY id + Tuple data received from node: 130 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=13 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=13 loops=1) + Output: id, val + Filter: (sorted_merge_test.num > '50'::numeric) + Rows Removed by Filter: 13 +(17 rows) + +-- A9: Expression in ORDER BY (non-aggregate) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, num FROM sorted_merge_test ORDER BY id + 1; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.num, remote_scan.worker_column_3 + Task Count: 4 + Tuple data received from nodes: 1976 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, num, (id OPERATOR(pg_catalog.+) 1) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) 1) + Tuple data received from node: 496 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, num, ((id + 1)) + Sort Key: ((sorted_merge_test.id + 1)) + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, num, (id + 1) +(15 rows) + +-- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 80 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 20 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: id + -> Sort (actual rows=5 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(19 rows) + +-- ================================================================= +-- Category B: Ineligibility — sort NOT pushed for merge +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- B1: ORDER BY count(*) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*); + QUERY PLAN +--------------------------------------------------------------------- + Sort (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.count + Sort Key: remote_scan.count + Sort Method: quicksort Memory: 28kB + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.count + Task Count: 4 + Tuple data received from nodes: 1260 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: 312 bytes + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(19 rows) + +-- B2: ORDER BY avg(col) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num); + QUERY PLAN +--------------------------------------------------------------------- + Sort (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.avg + Sort Key: remote_scan.avg + Sort Method: quicksort Memory: 28kB + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.avg + Task Count: 4 + Tuple data received from nodes: 1556 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, avg(num) AS avg FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: 392 bytes + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate (actual rows=26 loops=1) + Output: id, avg(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(19 rows) + +-- B3: GROUP BY non-dist col, ORDER BY non-dist col +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val; + QUERY PLAN +--------------------------------------------------------------------- + Sort (actual rows=104 loops=1) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + Sort Key: remote_scan.val + Sort Method: quicksort Memory: 28kB + -> HashAggregate (actual rows=104 loops=1) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.val, remote_scan.count + Task Count: 4 + Tuple data received from nodes: 1447 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: 359 bytes + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate (actual rows=26 loops=1) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(23 rows) + +-- B4: GROUP BY non-dist col, ORDER BY aggregate +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*); + QUERY PLAN +--------------------------------------------------------------------- + Sort (actual rows=104 loops=1) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + Sort Method: quicksort Memory: 28kB + -> HashAggregate (actual rows=104 loops=1) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.val, remote_scan.count + Task Count: 4 + Tuple data received from nodes: 1447 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: 359 bytes + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate (actual rows=26 loops=1) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(23 rows) + +-- ================================================================= +-- Category C: Correctness — results match GUC off vs on +-- ================================================================= +-- C1: Simple ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- C2: ORDER BY DESC +SET citus.enable_sorted_merge TO off; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id +--------------------------------------------------------------------- + 202 + 201 + 200 + 102 + 101 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id +--------------------------------------------------------------------- + 202 + 201 + 200 + 102 + 101 +(5 rows) + +-- C3: Multi-column ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- C4: ORDER BY non-distribution column +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- C5: GROUP BY dist_col ORDER BY dist_col +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- C6: Mixed directions +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- C7: WHERE + ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +-- C8: Aggregates in SELECT, ORDER BY on dist_col (GROUP BY dist_col) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +-- ================================================================= +-- Category D: Complex queries — regression guards +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- D1: Subquery in FROM with ORDER BY +SELECT * FROM ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) sub ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D2: CTE with ORDER BY +WITH top5 AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) +SELECT * FROM top5 ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D3: Co-located JOIN + ORDER BY +SELECT t.id, t.val, e.event_type +FROM sorted_merge_test t +JOIN sorted_merge_events e ON t.id = e.id +WHERE t.id <= 5 +ORDER BY t.id, e.event_type +LIMIT 10; + id | val | event_type +--------------------------------------------------------------------- + 1 | val_1 | buy + 1 | val_1 | buy + 1 | val_1 | click + 1 | val_1 | view + 2 | val_2 | buy + 2 | val_2 | click + 2 | val_2 | view + 2 | val_2 | view + 3 | val_3 | buy + 3 | val_3 | buy +(10 rows) + +-- D4: UNION ALL + ORDER BY +SELECT id, val FROM sorted_merge_test WHERE id <= 3 +UNION ALL +SELECT id, val FROM sorted_merge_test WHERE id BETWEEN 98 AND 100 +ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 98 | val_98 + 99 | val_99 + 100 | val_100 +(6 rows) + +-- D5: DISTINCT + ORDER BY +SELECT DISTINCT id FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +-- D6: DISTINCT ON + ORDER BY +SELECT DISTINCT ON (id) id, val, num +FROM sorted_merge_test +WHERE id <= 5 +ORDER BY id, num DESC; + id | val | num +--------------------------------------------------------------------- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) + +-- D7: EXISTS subquery + ORDER BY +SELECT id, val FROM sorted_merge_test t +WHERE EXISTS (SELECT 1 FROM sorted_merge_events e WHERE e.id = t.id) +ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D8: IN subquery + ORDER BY +SELECT id, val FROM sorted_merge_test +WHERE id IN (SELECT id FROM sorted_merge_events WHERE event_type = 'click') +ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D9: Multiple aggregates, GROUP BY dist_col, ORDER BY dist_col +SELECT id, count(*), sum(num), avg(num), min(val), max(val) +FROM sorted_merge_test +GROUP BY id +ORDER BY id +LIMIT 5; + id | count | sum | avg | min | max +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 | val_1 | val_1 + 2 | 1 | 3.0 | 3.0000000000000000 | val_2 | val_2 + 3 | 1 | 4.5 | 4.5000000000000000 | val_3 | val_3 + 4 | 1 | 6.0 | 6.0000000000000000 | val_4 | val_4 + 5 | 1 | 7.5 | 7.5000000000000000 | val_5 | val_5 +(5 rows) + +-- D10: CASE expression in SELECT + ORDER BY +SELECT id, + CASE WHEN num > 75 THEN 'high' WHEN num > 25 THEN 'mid' ELSE 'low' END as bucket +FROM sorted_merge_test +WHERE num IS NOT NULL +ORDER BY id +LIMIT 10; + id | bucket +--------------------------------------------------------------------- + 1 | low + 2 | low + 3 | low + 4 | low + 5 | low + 6 | low + 7 | low + 8 | low + 9 | low + 10 | low +(10 rows) + +-- D11: NULL values ordering +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS FIRST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS LAST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS FIRST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST, id DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 +(5 rows) + +-- D12: Large OFFSET +SELECT id FROM sorted_merge_test ORDER BY id OFFSET 100 LIMIT 5; + id +--------------------------------------------------------------------- + 101 + 102 + 200 + 201 + 202 +(5 rows) + +-- D13: ORDER BY ordinal position +SELECT id, val FROM sorted_merge_test ORDER BY 2, 1 LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- ================================================================= +-- Category E: Edge cases +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- E1: Empty result set +SELECT id FROM sorted_merge_test WHERE id < 0 ORDER BY id; + id +--------------------------------------------------------------------- +(0 rows) + +-- E2: Single row (may go through router planner) +SELECT id, val FROM sorted_merge_test WHERE id = 42 ORDER BY id; + id | val +--------------------------------------------------------------------- + 42 | val_42 +(1 row) + +-- E3: All rows with same sort value +SELECT id, num FROM sorted_merge_test WHERE num = 10.5 ORDER BY num, id; + id | num +--------------------------------------------------------------------- + 7 | 10.5 + 200 | 10.5 + 201 | 10.5 + 202 | 10.5 +(4 rows) + +-- E4: Wide sort key (4 columns) +SELECT id, val, num FROM sorted_merge_test +WHERE id <= 5 +ORDER BY num, val, id +LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) + +-- E5: Zero-task defensive path +-- CreatePerTaskDispatchDest handles taskCount=0 gracefully (returns a no-op +-- destination). This cannot be triggered via normal SQL because distributed +-- tables always have at least one shard. The closest we can test is an +-- empty-result query through the sorted merge path to verify no crash. +SELECT id FROM sorted_merge_test WHERE false ORDER BY id; + id +--------------------------------------------------------------------- +(0 rows) + +-- ================================================================= +-- Category F: Existing LIMIT pushdown stability +-- ================================================================= +-- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on +SET citus.enable_sorted_merge TO off; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id + -> Sort (actual rows=5 loops=1) + Output: remote_scan.id + Sort Key: remote_scan.id + Sort Method: top-N heapsort Memory: 25kB + -> Custom Scan (Citus Adaptive) (actual rows=20 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 80 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 20 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: id + -> Sort (actual rows=5 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(23 rows) + +SET citus.enable_sorted_merge TO on; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 80 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 20 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: id + -> Sort (actual rows=5 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(19 rows) + +-- F2: GROUP BY dist_col + ORDER BY + LIMIT +SET citus.enable_sorted_merge TO off; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.count + -> Sort (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.count + Sort Key: remote_scan.id + Sort Method: top-N heapsort Memory: 25kB + -> Custom Scan (Citus Adaptive) (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.count + Task Count: 4 + Tuple data received from nodes: 240 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 60 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: id, (count(*)) + -> Sort (actual rows=5 loops=1) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(27 rows) + +SET citus.enable_sorted_merge TO on; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.count + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.count + Task Count: 4 + Tuple data received from nodes: 240 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 60 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: id, (count(*)) + -> Sort (actual rows=5 loops=1) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(23 rows) + +-- F3: ORDER BY aggregate + LIMIT (not eligible for merge) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- ================================================================= +-- Category G: Phase 4 — Sort elision and advanced scenarios +-- ================================================================= +-- G1: Sort elision verification — coordinator Sort node absent +SET citus.enable_sorted_merge TO off; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------- + Sort (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val + Sort Key: remote_scan.id + Sort Method: quicksort Memory: 28kB + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 1027 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true + Tuple data received from node: 255 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) + +SET citus.enable_sorted_merge TO on; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 1027 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: 255 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Sort (actual rows=26 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) + +-- G2a: PREPARE with merge ON, EXECUTE after turning OFF +-- Plan-time decision is baked in — cached plan must still merge correctly +SET citus.enable_sorted_merge TO on; +PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO off; +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +DEALLOCATE merge_on_stmt; +-- G2b: PREPARE with merge OFF, EXECUTE after turning ON +-- Cached plan has Sort node — must still return sorted results +SET citus.enable_sorted_merge TO off; +PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO on; +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +DEALLOCATE merge_off_stmt; +-- G3: Cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_cursor CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 1 + 2 + 3 +(3 rows) + +FETCH BACKWARD 1 FROM sorted_cursor; +psql:sql/multi_orderby_pushdown.sql:319: ERROR: cursor can only scan forward +HINT: Declare it with SCROLL option to enable backward scan. +FETCH 2 FROM sorted_cursor; +psql:sql/multi_orderby_pushdown.sql:320: ERROR: current transaction is aborted, commands ignored until end of transaction block +CLOSE sorted_cursor; +psql:sql/multi_orderby_pushdown.sql:321: ERROR: current transaction is aborted, commands ignored until end of transaction block +COMMIT; +-- G3b: SCROLL cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_scroll_cursor SCROLL CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 1 + 2 + 3 +(3 rows) + +FETCH BACKWARD 1 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 2 +(1 row) + +FETCH 2 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 3 + 4 +(2 rows) + +CLOSE sorted_scroll_cursor; +COMMIT; +-- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) +SET citus.enable_sorted_merge TO on; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 80 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 20 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: id + -> Sort (actual rows=5 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(19 rows) + +-- G5: ORDER BY aggregate + LIMIT — crash regression test +-- Previously caused SIGSEGV when sorted merge was enabled because +-- aggregate ORDER BY was erroneously tagged as merge-eligible. +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 3; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 +(3 rows) + +-- G6: Small work_mem with many tasks (32 shards) +SET citus.enable_sorted_merge TO on; +SET work_mem TO '64kB'; +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +RESET work_mem; +-- G7: max_intermediate_result_size with CTE subplan +SET citus.enable_sorted_merge TO on; +SET citus.max_intermediate_result_size TO '4kB'; +WITH cte AS (SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 50) +SELECT * FROM cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; +-- ================================================================= +-- Category H: Subplan + Sorted Merge interactions +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1: CTE subplan with simple ORDER BY — eligible for sorted merge +-- The CTE becomes a subplan; its DistributedPlan may have useSortedMerge=true +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- H2: Multiple CTEs — one eligible (ORDER BY col), one ineligible (ORDER BY agg) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + id | val | cnt +--------------------------------------------------------------------- + 1 | val_1 | 1 + 2 | val_2 | 1 + 3 | val_3 | 1 + 4 | val_4 | 1 + 5 | val_5 | 1 + 6 | val_6 | 1 + 7 | val_7 | 1 + 8 | val_8 | 1 + 9 | val_9 | 1 + 10 | val_10 | 1 + 11 | val_11 | 1 + 12 | val_12 | 1 + 13 | val_13 | 1 + 14 | val_14 | 1 + 15 | val_15 | 1 +(15 rows) + +-- H3: CTE subplan feeding outer ORDER BY — both levels may merge independently +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- H4: Subquery in WHERE with ORDER BY + LIMIT — becomes subplan with merge +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 +(3 rows) + +-- H5: CTE subplan with max_intermediate_result_size enforcement +-- Tests that EnsureIntermediateSizeLimitNotExceeded works through per-task dispatch +SET citus.max_intermediate_result_size TO '4kB'; +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; +-- H6: Cross-join subplan with non-aggregate ORDER BY (crash regression variant) +-- Similar pattern to subquery_complex_target_list but without aggregate ORDER BY +SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + id | bar_id +--------------------------------------------------------------------- + 1 | 1 + 1 | 1 + 1 | 1 + 2 | 1 + 2 | 1 +(5 rows) + +-- H7: CTE correctness comparison — GUC off vs on must produce identical results +SET citus.enable_sorted_merge TO off; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +-- ================================================================= +-- Category H EXPLAIN: Query plans for subplan + sorted merge +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1 EXPLAIN +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 191 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT ordered_cte.id AS worker_column_1, ordered_cte.val AS worker_column_2 FROM (SELECT sorted_merge_test.id, sorted_merge_test.val FROM public.sorted_merge_test_960000 sorted_merge_test ORDER BY sorted_merge_test.id) ordered_cte) worker_subquery ORDER BY worker_column_1 LIMIT '5'::bigint + Tuple data received from node: 47 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=5 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val +(19 rows) + +-- H2 EXPLAIN +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.cnt + -> Distributed Subplan XXX_1 + Intermediate Data Size: 397 bytes + Result destination: Write locally + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 791 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 197 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=20 loops=1) + Output: id, val + -> Sort (actual rows=20 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val + -> Distributed Subplan XXX_2 + Intermediate Data Size: 330 bytes + Result destination: Write locally + -> Limit (actual rows=15 loops=1) + Output: remote_scan.id, remote_scan.cnt + -> Sort (actual rows=15 loops=1) + Output: remote_scan.id, remote_scan.cnt + Sort Key: remote_scan.cnt DESC, remote_scan.id + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.cnt + Task Count: 4 + Tuple data received from nodes: 720 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, count(*) AS cnt FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (count(*)) DESC, id LIMIT '15'::bigint + Tuple data received from node: 180 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=15 loops=1) + Output: id, (count(*)) + -> Sort (actual rows=15 loops=1) + Output: id, (count(*)) + Sort Key: (count(*)) DESC, sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts + Task Count: 1 + Tuple data received from nodes: 87 bytes + Tasks Shown: All + -> Task + Query: SELECT e.id, e.val, i.cnt FROM ((SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) e JOIN (SELECT intermediate_result.id, intermediate_result.cnt FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer, cnt bigint)) i ON ((e.id OPERATOR(pg_catalog.=) i.id))) ORDER BY e.id + Tuple data received from node: 87 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Merge Join (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result_1.cnt + Merge Cond: (intermediate_result.id = intermediate_result_1.id) + -> Sort (actual rows=6 loops=1) + Output: intermediate_result.id, intermediate_result.val + Sort Key: intermediate_result.id + Sort Method: quicksort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Sort (actual rows=15 loops=1) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Sort Key: intermediate_result_1.id + Sort Method: quicksort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=15 loops=1) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(77 rows) + +-- H3 EXPLAIN +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=10 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=10 loops=1) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: 200 bytes + Result destination: Send to 2 nodes + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 320 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 80 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=20 loops=1) + Output: id + -> Sort (actual rows=20 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id + Task Count: 4 + Tuple data received from nodes: 97 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT t.id AS worker_column_1, t.val AS worker_column_2 FROM (public.sorted_merge_test_960000 t JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) top_ids ON ((t.id OPERATOR(pg_catalog.=) top_ids.id)))) worker_subquery ORDER BY worker_column_1 LIMIT '10'::bigint + Tuple data received from node: 97 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=10 loops=1) + Output: t.id, t.val + -> Merge Join (actual rows=10 loops=1) + Output: t.id, t.val + Merge Cond: (intermediate_result.id = t.id) + -> Sort (actual rows=10 loops=1) + Output: intermediate_result.id + Sort Key: intermediate_result.id + Sort Method: quicksort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Sort (actual rows=10 loops=1) + Output: t.id, t.val + Sort Key: t.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 t (actual rows=26 loops=1) + Output: t.id, t.val +(51 rows) + +-- H4 EXPLAIN +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=3 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=3 loops=1) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: 100 bytes + Result destination: Send to 2 nodes + -> Limit (actual rows=10 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=40 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 160 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT '10'::bigint + Tuple data received from node: 40 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=10 loops=1) + Output: id + -> Sort (actual rows=10 loops=1) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=56 loops=1) + Output: id + Task Count: 4 + Tuple data received from nodes: 27 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT sorted_merge_test.id AS worker_column_1, sorted_merge_test.val AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (sorted_merge_test.id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)))) worker_subquery ORDER BY worker_column_1 LIMIT '5'::bigint + Tuple data received from node: 27 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=3 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=3 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Hash Semi Join (actual rows=3 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val + Hash Cond: (sorted_merge_test.id = intermediate_result.id) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val, sorted_merge_test.num, sorted_merge_test.ts + -> Hash (actual rows=10 loops=1) + Output: intermediate_result.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=10 loops=1) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(50 rows) + +-- H5 EXPLAIN +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: 397 bytes + Result destination: Write locally + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + Output: remote_scan.id, remote_scan.val + Task Count: 4 + Tuple data received from nodes: 791 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 197 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=20 loops=1) + Output: id, val + -> Sort (actual rows=20 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val + Task Count: 1 + Tuple data received from nodes: 47 bytes + Tasks Shown: All + -> Task + Query: SELECT id, val FROM (SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) small_cte ORDER BY id LIMIT 5 + Tuple data received from node: 47 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val + -> Sort (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(40 rows) + +-- H6 EXPLAIN +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.bar_id + -> Distributed Subplan XXX_1 + Intermediate Data Size: 30 bytes + Result destination: Write locally + -> Limit (actual rows=3 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=12 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 48 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '3'::bigint + Tuple data received from node: 12 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=3 loops=1) + Output: id + -> Sort (actual rows=3 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id + -> Distributed Subplan XXX_2 + Intermediate Data Size: 30 bytes + Result destination: Write locally + -> Limit (actual rows=3 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=12 loops=1) + Output: remote_scan.id + Task Count: 4 + Tuple data received from nodes: 48 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT '3'::bigint + Tuple data received from node: 12 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=3 loops=1) + Output: id + -> Sort (actual rows=3 loops=1) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=56 loops=1) + Output: id + Task Count: 1 + Tuple data received from nodes: 40 bytes + Tasks Shown: All + -> Task + Query: SELECT foo.id, bar.id AS bar_id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) foo, (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) bar ORDER BY foo.id, bar.id LIMIT 5 + Tuple data received from node: 40 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result_1.id + -> Sort (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result_1.id + Sort Key: intermediate_result.id, intermediate_result_1.id + Sort Method: quicksort Memory: 25kB + -> Nested Loop (actual rows=9 loops=1) + Output: intermediate_result.id, intermediate_result_1.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=3 loops=1) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=3 loops=3) + Output: intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(67 rows) + +-- H7 EXPLAIN — GUC off vs on +SET citus.enable_sorted_merge TO off; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Distributed Subplan XXX_1 + Intermediate Data Size: 691 bytes + Result destination: Write locally + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Sort (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: 4 + Tuple data received from nodes: 1673 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 419 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=20 loops=1) + Output: id, val, num + -> Sort (actual rows=20 loops=1) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num + Task Count: 1 + Tuple data received from nodes: 103 bytes + Tasks Shown: All + -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (10)::numeric) ORDER BY id LIMIT 5 + Tuple data received from node: 103 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=14 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > '10'::numeric) + Rows Removed by Filter: 6 +(45 rows) + +SET citus.enable_sorted_merge TO on; +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Distributed Subplan XXX_1 + Intermediate Data Size: 699 bytes + Result destination: Write locally + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: 4 + Tuple data received from nodes: 1673 bytes + Tasks Shown: One of 4 + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 419 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=20 loops=1) + Output: id, val, num + -> Sort (actual rows=20 loops=1) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num + Task Count: 1 + Tuple data received from nodes: 101 bytes + Tasks Shown: All + -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (10)::numeric) ORDER BY id LIMIT 5 + Tuple data received from node: 101 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=18 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > '10'::numeric) + Rows Removed by Filter: 2 +(42 rows) + +-- ================================================================= +-- Cleanup +-- ================================================================= +SET citus.enable_sorted_merge TO off; +RESET citus.enable_streaming_sorted_merge; +-- Cleanup +DROP TABLE sorted_merge_test; +DROP TABLE sorted_merge_events; diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql index cc2bb87377f..4fb4f0cab32 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -10,42 +10,6 @@ -- when any node in the cluster acts as coordinator. -- -SET citus.next_shard_id TO 960000; - --- ================================================================= --- Setup: create test tables --- ================================================================= - -CREATE TABLE sorted_merge_test ( - id int, - val text, - num numeric, - ts timestamptz DEFAULT now() -); -SELECT create_distributed_table('sorted_merge_test', 'id'); - --- Insert 100 rows + NULLs + duplicates -INSERT INTO sorted_merge_test (id, val, num) -SELECT i, 'val_' || i, (i * 1.5)::numeric -FROM generate_series(1, 100) i; - -INSERT INTO sorted_merge_test (id, val, num) VALUES (101, NULL, NULL); -INSERT INTO sorted_merge_test (id, val, num) VALUES (102, NULL, NULL); -INSERT INTO sorted_merge_test (id, val, num) VALUES (200, 'dup_a', 10.5); -INSERT INTO sorted_merge_test (id, val, num) VALUES (201, 'dup_b', 10.5); -INSERT INTO sorted_merge_test (id, val, num) VALUES (202, 'dup_c', 10.5); - --- Second table for join tests -CREATE TABLE sorted_merge_events ( - id int, - event_type text, - event_val int -); -SELECT create_distributed_table('sorted_merge_events', 'id'); - -INSERT INTO sorted_merge_events -SELECT i % 50 + 1, CASE WHEN i % 3 = 0 THEN 'click' WHEN i % 3 = 1 THEN 'view' ELSE 'buy' END, i -FROM generate_series(1, 200) i; -- ================================================================= -- 1. GUC basics @@ -361,6 +325,16 @@ FETCH 2 FROM sorted_cursor; CLOSE sorted_cursor; COMMIT; +-- G3b: SCROLL cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_scroll_cursor SCROLL CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_scroll_cursor; +FETCH BACKWARD 1 FROM sorted_scroll_cursor; +FETCH 2 FROM sorted_scroll_cursor; +CLOSE sorted_scroll_cursor; +COMMIT; + -- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) SET citus.enable_sorted_merge TO on; SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); @@ -821,5 +795,3 @@ SET citus.enable_sorted_merge TO off; -- ================================================================= SET citus.enable_sorted_merge TO off; -DROP TABLE sorted_merge_test; -DROP TABLE sorted_merge_events; diff --git a/src/test/regress/sql/multi_orderby_pushdown_streaming.sql b/src/test/regress/sql/multi_orderby_pushdown_streaming.sql index e7faed04373..10c20e26c81 100644 --- a/src/test/regress/sql/multi_orderby_pushdown_streaming.sql +++ b/src/test/regress/sql/multi_orderby_pushdown_streaming.sql @@ -1,14 +1,24 @@ -- --- MULTI_SORTED_MERGE_STREAMING +-- MULTI_ORDERBY_PUSHDOWN_STREAMING -- --- Runs the same test cases as multi_orderby_pushdown.sql but with the --- streaming sorted merge adapter enabled via the GUC. This validates --- that the streaming code path produces identical results to the eager --- merge path. +-- Runs the sorted merge test suite (multi_orderby_pushdown.sql) twice: +-- first with the default eager-merge path, then with the streaming +-- adapter enabled via citus.enable_streaming_sorted_merge. Both runs +-- share the same setup tables and must produce identical results +-- (except for the G3 backward-scan test, where the streaming adapter's +-- forward-only cursor correctly errors on FETCH BACKWARD). -- -SET citus.enable_streaming_sorted_merge TO on; +\i sql/setup_multi_orderby_pushdown.sql +-- Run 1: eager merge (default) \i sql/multi_orderby_pushdown.sql +-- Run 2: streaming adapter +SET citus.enable_streaming_sorted_merge TO on; +\i sql/multi_orderby_pushdown.sql RESET citus.enable_streaming_sorted_merge; + +-- Cleanup +DROP TABLE sorted_merge_test; +DROP TABLE sorted_merge_events; diff --git a/src/test/regress/sql/setup_multi_orderby_pushdown.sql b/src/test/regress/sql/setup_multi_orderby_pushdown.sql new file mode 100644 index 00000000000..a1c6e6c5976 --- /dev/null +++ b/src/test/regress/sql/setup_multi_orderby_pushdown.sql @@ -0,0 +1,45 @@ +-- +-- SETUP_MULTI_ORDERBY_PUSHDOWN +-- +-- Creates the test tables and data used by multi_orderby_pushdown.sql +-- and its variants (e.g., multi_orderby_pushdown_streaming.sql). +-- This file is meant to be included via \i from test files that need +-- these tables. +-- + +SET citus.next_shard_id TO 960000; + +-- ================================================================= +-- Setup: create test tables +-- ================================================================= + +CREATE TABLE sorted_merge_test ( + id int, + val text, + num numeric, + ts timestamptz DEFAULT now() +); +SELECT create_distributed_table('sorted_merge_test', 'id'); + +-- Insert 100 rows + NULLs + duplicates +INSERT INTO sorted_merge_test (id, val, num) +SELECT i, 'val_' || i, (i * 1.5)::numeric +FROM generate_series(1, 100) i; + +INSERT INTO sorted_merge_test (id, val, num) VALUES (101, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (102, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (200, 'dup_a', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (201, 'dup_b', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (202, 'dup_c', 10.5); + +-- Second table for join tests +CREATE TABLE sorted_merge_events ( + id int, + event_type text, + event_val int +); +SELECT create_distributed_table('sorted_merge_events', 'id'); + +INSERT INTO sorted_merge_events +SELECT i % 50 + 1, CASE WHEN i % 3 = 0 THEN 'click' WHEN i % 3 = 1 THEN 'view' ELSE 'buy' END, i +FROM generate_series(1, 200) i; From 1ac461709ec7cb3b2ffc52e1c823dbe91ef48f4f Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Mon, 13 Apr 2026 17:53:04 +0000 Subject: [PATCH 5/7] Change schedule to use new test instead --- src/test/regress/multi_schedule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 06b482ff5c7..6d18db9094c 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -93,7 +93,7 @@ test: multi_reference_table multi_select_for_update relation_access_tracking pg1 test: custom_aggregate_support aggregate_support tdigest_aggregate_support test: multi_average_expression multi_working_columns multi_having_pushdown having_subquery test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown -test: multi_orderby_pushdown +test: multi_orderby_pushdown_streaming test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg ch_bench_having chbenchmark_all_queries expression_reference_join anonymous_columns test: ch_bench_subquery_repartition test: subscripting_op From a25898e61106c3319bc986ac419e4b202ca660a0 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Tue, 14 Apr 2026 21:20:14 +0000 Subject: [PATCH 6/7] Fix up test output due to rebase --- .../multi_orderby_pushdown_streaming.out | 6408 +++++++++++------ 1 file changed, 4283 insertions(+), 2125 deletions(-) diff --git a/src/test/regress/expected/multi_orderby_pushdown_streaming.out b/src/test/regress/expected/multi_orderby_pushdown_streaming.out index b000e7a0c5f..263e93f08f5 100644 --- a/src/test/regress/expected/multi_orderby_pushdown_streaming.out +++ b/src/test/regress/expected/multi_orderby_pushdown_streaming.out @@ -66,6 +66,10 @@ FROM generate_series(1, 200) i; -- planner eligibility logic. Verifies that enabling the GUC does not -- introduce regressions for any query pattern. -- +-- MX verification: this test has been verified to pass with zero diffs +-- under check-base-mx (MX mode), confirming sorted merge works correctly +-- when any node in the cluster acts as coordinator. +-- -- ================================================================= -- 1. GUC basics -- ================================================================= @@ -88,341 +92,337 @@ SET citus.enable_sorted_merge TO off; -- ================================================================= SET citus.enable_sorted_merge TO on; -- A1: ORDER BY distribution column -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val -(15 rows) +(16 rows) -- A2: ORDER BY DESC -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id DESC; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 420 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC - Tuple data received from node: 104 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id DESC - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id -(15 rows) +(16 rows) -- A3: ORDER BY DESC NULLS LAST -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.num - Task Count: 4 - Tuple data received from nodes: 1556 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num DESC NULLS LAST - Tuple data received from node: 392 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, num Sort Key: sorted_merge_test.num DESC NULLS LAST - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, num -(15 rows) +(16 rows) -- A4: ORDER BY non-distribution column -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY val; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY val'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY val - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.val - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val -(15 rows) +(16 rows) -- A5: Multi-column ORDER BY -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id, val; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, val - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.id, sorted_merge_test.val - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val -(15 rows) +(16 rows) -- A6: Mixed directions -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num - Task Count: 4 - Tuple data received from nodes: 2163 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, num DESC - Tuple data received from node: 543 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, val, num Sort Key: sorted_merge_test.id, sorted_merge_test.num DESC - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num -(15 rows) +(16 rows) -- A7: GROUP BY dist_col ORDER BY dist_col -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 1260 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id - Tuple data received from node: 312 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, (count(*)) Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> HashAggregate (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) Output: id, count(*) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts -(19 rows) +(20 rows) -- A8: WHERE clause + ORDER BY -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=67 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 671 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (num OPERATOR(pg_catalog.>) '50'::numeric) ORDER BY id - Tuple data received from node: 130 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=13 loops=1) + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (num OPERATOR(pg_catalog.>) 'N'::numeric) ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=13 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val - Filter: (sorted_merge_test.num > '50'::numeric) - Rows Removed by Filter: 13 -(17 rows) + Filter: (sorted_merge_test.num > 'N'::numeric) + Rows Removed by Filter: N +(18 rows) -- A9: Expression in ORDER BY (non-aggregate) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, num FROM sorted_merge_test ORDER BY id + 1; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.num, remote_scan.worker_column_3 - Task Count: 4 - Tuple data received from nodes: 1976 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, num, (id OPERATOR(pg_catalog.+) 1) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) 1) - Tuple data received from node: 496 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) - Output: id, num, ((id + 1)) - Sort Key: ((sorted_merge_test.id + 1)) - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, num, (id + 1) -(15 rows) + Query: SELECT id, num, (id OPERATOR(pg_catalog.+) N) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) N) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, num, ((id + N)) + Sort Key: ((sorted_merge_test.id + N)) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num, (id + N) +(16 rows) -- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 80 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 20 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id -(19 rows) +(20 rows) -- ================================================================= -- Category B: Ineligibility — sort NOT pushed for merge -- ================================================================= SET citus.enable_sorted_merge TO on; -- B1: ORDER BY count(*) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*); - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*)'); + explain_filter --------------------------------------------------------------------- - Sort (actual rows=105 loops=1) + Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count Sort Key: remote_scan.count - Sort Method: quicksort Memory: 28kB - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 1260 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id - Tuple data received from node: 312 bytes - Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) Output: id, count(*) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (19 rows) -- B2: ORDER BY avg(col) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num); - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num)'); + explain_filter --------------------------------------------------------------------- - Sort (actual rows=105 loops=1) + Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.avg Sort Key: remote_scan.avg - Sort Method: quicksort Memory: 28kB - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.avg - Task Count: 4 - Tuple data received from nodes: 1556 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, avg(num) AS avg FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id - Tuple data received from node: 392 bytes - Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) Output: id, avg(num) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (19 rows) -- B3: GROUP BY non-dist col, ORDER BY non-dist col -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val'); + explain_filter --------------------------------------------------------------------- - Sort (actual rows=104 loops=1) - Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + Sort (actual rows=N loops=N) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) Sort Key: remote_scan.val - Sort Method: quicksort Memory: 28kB - -> HashAggregate (actual rows=104 loops=1) - Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint) Group Key: remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.val, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 1447 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val - Tuple data received from node: 359 bytes - Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) Output: val, count(*) Group Key: sorted_merge_test.val - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (23 rows) -- B4: GROUP BY non-dist col, ORDER BY aggregate -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*); - QUERY PLAN ---------------------------------------------------------------------- - Sort (actual rows=104 loops=1) - Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) - Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) - Sort Method: quicksort Memory: 28kB - -> HashAggregate (actual rows=104 loops=1) - Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint) Group Key: remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.val, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 1447 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val - Tuple data received from node: 359 bytes - Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) Output: val, count(*) Group Key: sorted_merge_test.val - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (23 rows) @@ -901,123 +901,121 @@ SELECT id FROM sorted_merge_test WHERE false ORDER BY id; -- ================================================================= -- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: remote_scan.id Sort Key: remote_scan.id - Sort Method: top-N heapsort Memory: 25kB - -> Custom Scan (Citus Adaptive) (actual rows=20 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 80 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 20 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id (23 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 80 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 20 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id -(19 rows) +(20 rows) -- F2: GROUP BY dist_col + ORDER BY + LIMIT SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count Sort Key: remote_scan.id - Sort Method: top-N heapsort Memory: 25kB - -> Custom Scan (Citus Adaptive) (actual rows=20 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 240 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 60 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, (count(*)) - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: id, (count(*)) Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> HashAggregate (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) Output: id, count(*) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (27 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 240 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 60 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, (count(*)) - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: id, (count(*)) Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> HashAggregate (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) Output: id, count(*) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts -(23 rows) +(24 rows) -- F3: ORDER BY aggregate + LIMIT (not eligible for merge) SET citus.enable_sorted_merge TO off; @@ -1047,51 +1045,52 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, i -- ================================================================= -- G1: Sort elision verification — coordinator Sort node absent SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter --------------------------------------------------------------------- - Sort (actual rows=105 loops=1) + Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val Sort Key: remote_scan.id - Sort Method: quicksort Memory: 28kB - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val (15 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val -(15 rows) +(16 rows) -- G2a: PREPARE with merge ON, EXECUTE after turning OFF --- Plan-time decision is baked in — cached plan must still merge correctly +-- Plan-time decision is baked in — cached plan must still merge correctly. +-- Execute 6+ times to trigger PostgreSQL's generic plan caching, then +-- verify the plan shape is preserved after toggling the GUC. SET citus.enable_sorted_merge TO on; PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; EXECUTE merge_on_stmt; @@ -1109,7 +1108,100 @@ EXECUTE merge_on_stmt; 10 | val_10 (10 rows) +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Verify plan shape after caching — no Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Merge Method: sorted merge + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(11 rows) + SET citus.enable_sorted_merge TO off; +-- Cached plan retains the sorted merge decision from planning time EXECUTE merge_on_stmt; id | val --------------------------------------------------------------------- @@ -1125,9 +1217,25 @@ EXECUTE merge_on_stmt; 10 | val_10 (10 rows) +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Merge Method: sorted merge + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(11 rows) + DEALLOCATE merge_on_stmt; -- G2b: PREPARE with merge OFF, EXECUTE after turning ON --- Cached plan has Sort node — must still return sorted results +-- Cached plan has Sort node — must still return sorted results. SET citus.enable_sorted_merge TO off; PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; EXECUTE merge_off_stmt; @@ -1145,7 +1253,101 @@ EXECUTE merge_off_stmt; 10 | val_10 (10 rows) +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Verify plan shape after caching — Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + SET citus.enable_sorted_merge TO on; +-- Cached plan retains the non-merge decision from planning time EXECUTE merge_off_stmt; id | val --------------------------------------------------------------------- @@ -1161,6 +1363,23 @@ EXECUTE merge_off_stmt; 10 | val_10 (10 rows) +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + DEALLOCATE merge_off_stmt; -- G3: Cursor with backward scan SET citus.enable_sorted_merge TO on; @@ -1218,30 +1437,30 @@ CLOSE sorted_scroll_cursor; COMMIT; -- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 80 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 20 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id -(19 rows) +(20 rows) -- G5: ORDER BY aggregate + LIMIT — crash regression test -- Previously caused SIGSEGV when sorted merge was enabled because @@ -1442,37 +1661,36 @@ SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; -- ================================================================= SET citus.enable_sorted_merge TO on; -- H1 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH ordered_cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH ordered_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id ) -SELECT * FROM ordered_cte ORDER BY id LIMIT 5; - QUERY PLAN +SELECT * FROM ordered_cte ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 191 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT ordered_cte.id AS worker_column_1, ordered_cte.val AS worker_column_2 FROM (SELECT sorted_merge_test.id, sorted_merge_test.val FROM public.sorted_merge_test_960000 sorted_merge_test ORDER BY sorted_merge_test.id) ordered_cte) worker_subquery ORDER BY worker_column_1 LIMIT '5'::bigint - Tuple data received from node: 47 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT ordered_cte.id AS worker_column_1, ordered_cte.val AS worker_column_2 FROM (SELECT sorted_merge_test.id, sorted_merge_test.val FROM public.sorted_merge_test_960000 sorted_merge_test ORDER BY sorted_merge_test.id) ordered_cte) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val -(19 rows) +(20 rows) -- H2 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH eligible_cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH eligible_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ), ineligible_cte AS ( @@ -1480,830 +1698,694 @@ ineligible_cte AS ( ) SELECT e.id, e.val, i.cnt FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id -ORDER BY e.id; - QUERY PLAN +ORDER BY e.id'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.cnt -> Distributed Subplan XXX_1 - Intermediate Data Size: 397 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=20 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 791 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 197 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, val - -> Sort (actual rows=20 loops=1) + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val -> Distributed Subplan XXX_2 - Intermediate Data Size: 330 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=15 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.cnt - -> Sort (actual rows=15 loops=1) + -> Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.cnt Sort Key: remote_scan.cnt DESC, remote_scan.id -> Custom Scan (Citus Adaptive) (never executed) Output: remote_scan.id, remote_scan.cnt - Task Count: 4 - Tuple data received from nodes: 720 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, count(*) AS cnt FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (count(*)) DESC, id LIMIT '15'::bigint - Tuple data received from node: 180 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=15 loops=1) + Query: SELECT id, count(*) AS cnt FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (count(*)) DESC, id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, (count(*)) - -> Sort (actual rows=15 loops=1) + -> Sort (actual rows=N loops=N) Output: id, (count(*)) Sort Key: (count(*)) DESC, sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> HashAggregate (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) Output: id, count(*) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts - Task Count: 1 - Tuple data received from nodes: 87 bytes + Task Count: N + Tuple data received from nodes: N bytes Tasks Shown: All -> Task Query: SELECT e.id, e.val, i.cnt FROM ((SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) e JOIN (SELECT intermediate_result.id, intermediate_result.cnt FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer, cnt bigint)) i ON ((e.id OPERATOR(pg_catalog.=) i.id))) ORDER BY e.id - Tuple data received from node: 87 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Merge Join (actual rows=5 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Merge Join (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result_1.cnt Merge Cond: (intermediate_result.id = intermediate_result_1.id) - -> Sort (actual rows=6 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val Sort Key: intermediate_result.id - Sort Method: quicksort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - -> Sort (actual rows=15 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result_1.cnt, intermediate_result_1.id Sort Key: intermediate_result_1.id - Sort Method: quicksort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=15 loops=1) + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) Output: intermediate_result_1.cnt, intermediate_result_1.id Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) -(77 rows) +(78 rows) -- H3 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH top_ids AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH top_ids AS ( SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 ) SELECT t.id, t.val FROM sorted_merge_test t JOIN top_ids ON t.id = top_ids.id ORDER BY t.id -LIMIT 10; - QUERY PLAN +LIMIT 10'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=10 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=10 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val -> Distributed Subplan XXX_1 - Intermediate Data Size: 200 bytes - Result destination: Send to 2 nodes - -> Limit (actual rows=20 loops=1) + Intermediate Data Size: N bytes + Result destination: Send to N nodes + -> Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 320 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 80 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=20 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id - Task Count: 4 - Tuple data received from nodes: 97 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT t.id AS worker_column_1, t.val AS worker_column_2 FROM (public.sorted_merge_test_960000 t JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) top_ids ON ((t.id OPERATOR(pg_catalog.=) top_ids.id)))) worker_subquery ORDER BY worker_column_1 LIMIT '10'::bigint - Tuple data received from node: 97 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=10 loops=1) + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT t.id AS worker_column_1, t.val AS worker_column_2 FROM (public.sorted_merge_test_960000 t JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) top_ids ON ((t.id OPERATOR(pg_catalog.=) top_ids.id)))) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: t.id, t.val - -> Merge Join (actual rows=10 loops=1) + -> Merge Join (actual rows=N loops=N) Output: t.id, t.val Merge Cond: (intermediate_result.id = t.id) - -> Sort (actual rows=10 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result.id Sort Key: intermediate_result.id - Sort Method: quicksort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - -> Sort (actual rows=10 loops=1) + -> Sort (actual rows=N loops=N) Output: t.id, t.val Sort Key: t.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 t (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 t (actual rows=N loops=N) Output: t.id, t.val -(51 rows) +(53 rows) -- H4 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE id IN ( SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 ) ORDER BY id -LIMIT 5; - QUERY PLAN +LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=3 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=3 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val -> Distributed Subplan XXX_1 - Intermediate Data Size: 100 bytes - Result destination: Send to 2 nodes - -> Limit (actual rows=10 loops=1) + Intermediate Data Size: N bytes + Result destination: Send to N nodes + -> Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=40 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 160 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT '10'::bigint - Tuple data received from node: 40 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=10 loops=1) + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=10 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_events.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=56 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) Output: id - Task Count: 4 - Tuple data received from nodes: 27 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT sorted_merge_test.id AS worker_column_1, sorted_merge_test.val AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (sorted_merge_test.id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)))) worker_subquery ORDER BY worker_column_1 LIMIT '5'::bigint - Tuple data received from node: 27 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=3 loops=1) + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT sorted_merge_test.id AS worker_column_1, sorted_merge_test.val AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (sorted_merge_test.id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)))) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val - -> Sort (actual rows=3 loops=1) + -> Sort (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Hash Semi Join (actual rows=3 loops=1) + Sort Method: quicksort Memory: NkB + -> Hash Semi Join (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val Hash Cond: (sorted_merge_test.id = intermediate_result.id) - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val, sorted_merge_test.num, sorted_merge_test.ts - -> Hash (actual rows=10 loops=1) + -> Hash (actual rows=N loops=N) Output: intermediate_result.id - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=10 loops=1) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) -(50 rows) +(52 rows) -- H5 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH small_cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH small_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ) -SELECT * FROM small_cte ORDER BY id LIMIT 5; - QUERY PLAN +SELECT * FROM small_cte ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val -> Distributed Subplan XXX_1 - Intermediate Data Size: 397 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=20 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 791 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 197 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, val - -> Sort (actual rows=20 loops=1) + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val - Task Count: 1 - Tuple data received from nodes: 47 bytes + Task Count: N + Tuple data received from nodes: N bytes Tasks Shown: All -> Task - Query: SELECT id, val FROM (SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) small_cte ORDER BY id LIMIT 5 - Tuple data received from node: 47 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id, val FROM (SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) small_cte ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val Sort Key: intermediate_result.id - Sort Method: top-N heapsort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) -(40 rows) +(41 rows) -- H6 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT foo.id, bar.id as bar_id +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT foo.id, bar.id as bar_id FROM (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar ORDER BY foo.id, bar.id -LIMIT 5; - QUERY PLAN +LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.bar_id -> Distributed Subplan XXX_1 - Intermediate Data Size: 30 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=3 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=12 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 48 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '3'::bigint - Tuple data received from node: 12 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=3 loops=1) + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=3 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id -> Distributed Subplan XXX_2 - Intermediate Data Size: 30 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=3 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=12 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 48 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT '3'::bigint - Tuple data received from node: 12 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=3 loops=1) + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=3 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_events.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=56 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) Output: id - Task Count: 1 - Tuple data received from nodes: 40 bytes + Task Count: N + Tuple data received from nodes: N bytes Tasks Shown: All -> Task - Query: SELECT foo.id, bar.id AS bar_id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) foo, (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) bar ORDER BY foo.id, bar.id LIMIT 5 - Tuple data received from node: 40 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT foo.id, bar.id AS bar_id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) foo, (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) bar ORDER BY foo.id, bar.id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result_1.id - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result_1.id Sort Key: intermediate_result.id, intermediate_result_1.id - Sort Method: quicksort Memory: 25kB - -> Nested Loop (actual rows=9 loops=1) + Sort Method: quicksort Memory: NkB + -> Nested Loop (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result_1.id - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=3 loops=1) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=3 loops=3) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) Output: intermediate_result_1.id Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) -(67 rows) +(69 rows) -- H7 EXPLAIN — GUC off vs on SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 ) -SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; - QUERY PLAN +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num -> Distributed Subplan XXX_1 - Intermediate Data Size: 691 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=20 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num - -> Sort (actual rows=20 loops=1) + -> Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num Sort Key: remote_scan.id -> Custom Scan (Citus Adaptive) (never executed) Output: remote_scan.id, remote_scan.val, remote_scan.num - Task Count: 4 - Tuple data received from nodes: 1673 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 419 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, val, num - -> Sort (actual rows=20 loops=1) + -> Sort (actual rows=N loops=N) Output: id, val, num Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num - Task Count: 1 - Tuple data received from nodes: 103 bytes + Task Count: N + Tuple data received from nodes: N bytes Tasks Shown: All -> Task - Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (10)::numeric) ORDER BY id LIMIT 5 - Tuple data received from node: 103 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (N)::numeric) ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result.num - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result.num Sort Key: intermediate_result.id - Sort Method: top-N heapsort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=14 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result.num Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - Filter: (intermediate_result.num > '10'::numeric) - Rows Removed by Filter: 6 + Filter: (intermediate_result.num > 'N'::numeric) + Rows Removed by Filter: N (45 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 ) -SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; - QUERY PLAN +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num -> Distributed Subplan XXX_1 - Intermediate Data Size: 699 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=20 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num - -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num - Task Count: 4 - Tuple data received from nodes: 1673 bytes - Tasks Shown: One of 4 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 419 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, val, num - -> Sort (actual rows=20 loops=1) + -> Sort (actual rows=N loops=N) Output: id, val, num Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num - Task Count: 1 - Tuple data received from nodes: 101 bytes + Task Count: N + Tuple data received from nodes: N bytes Tasks Shown: All -> Task - Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (10)::numeric) ORDER BY id LIMIT 5 - Tuple data received from node: 101 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (N)::numeric) ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result.num - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result.num Sort Key: intermediate_result.id - Sort Method: top-N heapsort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=18 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result.num Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - Filter: (intermediate_result.num > '10'::numeric) - Rows Removed by Filter: 2 -(42 rows) + Filter: (intermediate_result.num > 'N'::numeric) + Rows Removed by Filter: N +(43 rows) -- ================================================================= --- Cleanup +-- Category I: Distributed Transactions -- ================================================================= -SET citus.enable_sorted_merge TO off; --- Run 2: streaming adapter -SET citus.enable_streaming_sorted_merge TO on; -\i sql/multi_orderby_pushdown.sql --- --- MULTI_SORTED_MERGE --- --- Tests for the citus.enable_sorted_merge GUC and the sorted merge --- planner eligibility logic. Verifies that enabling the GUC does not --- introduce regressions for any query pattern. --- --- ================================================================= --- 1. GUC basics --- ================================================================= -SHOW citus.enable_sorted_merge; - citus.enable_sorted_merge ---------------------------------------------------------------------- - off -(1 row) - +-- Verify sorted merge correctness within multi-statement transactions +-- where data is modified before the sorted-merge SELECT. SET citus.enable_sorted_merge TO on; -SHOW citus.enable_sorted_merge; - citus.enable_sorted_merge +-- I1: INSERT then SELECT within a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (900, 'txn_insert', 900.0); +SELECT id, val FROM sorted_merge_test WHERE id >= 900 ORDER BY id; + id | val --------------------------------------------------------------------- - on + 900 | txn_insert (1 row) -SET citus.enable_sorted_merge TO off; --- ================================================================= --- Category A: Eligibility — sort IS pushed to workers --- ================================================================= -SET citus.enable_sorted_merge TO on; --- A1: ORDER BY distribution column -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN ---------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) - Output: id, val - Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val -(15 rows) - --- A2: ORDER BY DESC -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id DESC; - QUERY PLAN ---------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 420 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC - Tuple data received from node: 104 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) - Output: id - Sort Key: sorted_merge_test.id DESC - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id -(15 rows) - --- A3: ORDER BY DESC NULLS LAST -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; - QUERY PLAN ---------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.num - Task Count: 4 - Tuple data received from nodes: 1556 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num DESC NULLS LAST - Tuple data received from node: 392 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) - Output: id, num - Sort Key: sorted_merge_test.num DESC NULLS LAST - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, num -(15 rows) - --- A4: ORDER BY non-distribution column -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY val; - QUERY PLAN ---------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY val - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) - Output: id, val - Sort Key: sorted_merge_test.val - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val -(15 rows) - --- A5: Multi-column ORDER BY -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id, val; - QUERY PLAN +ROLLBACK; +-- I2: UPDATE then SELECT within a transaction +BEGIN; +UPDATE sorted_merge_test SET val = 'updated' WHERE id = 1; +SELECT id, val FROM sorted_merge_test WHERE id <= 3 ORDER BY id; + id | val --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, val - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) - Output: id, val - Sort Key: sorted_merge_test.id, sorted_merge_test.val - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val -(15 rows) + 1 | updated + 2 | val_2 + 3 | val_3 +(3 rows) --- A6: Mixed directions -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; - QUERY PLAN +ROLLBACK; +-- I3: DELETE then SELECT within a transaction +BEGIN; +DELETE FROM sorted_merge_test WHERE id <= 5; +SELECT id, val FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + id | val --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.val, remote_scan.num - Task Count: 4 - Tuple data received from nodes: 2163 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, num DESC - Tuple data received from node: 543 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) - Output: id, val, num - Sort Key: sorted_merge_test.id, sorted_merge_test.num DESC - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val, num -(15 rows) + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(5 rows) --- A7: GROUP BY dist_col ORDER BY dist_col -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; - QUERY PLAN ---------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 1260 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id - Tuple data received from node: 312 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) - Output: id, (count(*)) - Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> HashAggregate (actual rows=26 loops=1) - Output: id, count(*) - Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val, num, ts -(19 rows) +ROLLBACK; +-- I4: INSERT + UPDATE + SELECT with multi-column ORDER BY +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (901, 'txn_a', 1.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (902, 'txn_b', 2.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (903, 'txn_c', 3.0); +UPDATE sorted_merge_test SET num = 999.0 WHERE id = 901; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 900 ORDER BY num, id; + id | val | num +--------------------------------------------------------------------- + 902 | txn_b | 2.0 + 903 | txn_c | 3.0 + 901 | txn_a | 999.0 +(3 rows) --- A8: WHERE clause + ORDER BY -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; - QUERY PLAN ---------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=67 loops=1) - Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 671 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (num OPERATOR(pg_catalog.>) '50'::numeric) ORDER BY id - Tuple data received from node: 130 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=13 loops=1) - Output: id, val - Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=13 loops=1) - Output: id, val - Filter: (sorted_merge_test.num > '50'::numeric) - Rows Removed by Filter: 13 -(17 rows) +ROLLBACK; +-- I5: Compare results with GUC off vs on in a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (910, 'cmp_a', 10.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (911, 'cmp_b', 20.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (912, 'cmp_c', 30.0); +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; + id | val | num +--------------------------------------------------------------------- + 910 | cmp_a | 10.0 + 911 | cmp_b | 20.0 + 912 | cmp_c | 30.0 +(3 rows) --- A9: Expression in ORDER BY (non-aggregate) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, num FROM sorted_merge_test ORDER BY id + 1; - QUERY PLAN +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; + id | val | num --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.num, remote_scan.worker_column_3 - Task Count: 4 - Tuple data received from nodes: 1976 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, num, (id OPERATOR(pg_catalog.+) 1) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) 1) - Tuple data received from node: 496 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) - Output: id, num, ((id + 1)) - Sort Key: ((sorted_merge_test.id + 1)) - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, num, (id + 1) -(15 rows) + 910 | cmp_a | 10.0 + 911 | cmp_b | 20.0 + 912 | cmp_c | 30.0 +(3 rows) --- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN +ROLLBACK; +-- I6: DELETE + aggregate in SELECT with ORDER BY +BEGIN; +DELETE FROM sorted_merge_test WHERE id > 100 AND id < 200; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) - Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) - Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 80 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 20 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) - Output: id - -> Sort (actual rows=5 loops=1) - Output: id - Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id -(19 rows) + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) +ROLLBACK; -- ================================================================= --- Category B: Ineligibility — sort NOT pushed for merge +-- Category J: Coordinator expression evaluation exclusion -- ================================================================= +-- Verify that queries with ORDER BY on expressions that need coordinator-side +-- evaluation are correctly excluded from sorted merge (or handled correctly). SET citus.enable_sorted_merge TO on; --- B1: ORDER BY count(*) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*); - QUERY PLAN ---------------------------------------------------------------------- - Sort (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.count - Sort Key: remote_scan.count - Sort Method: quicksort Memory: 28kB - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 1260 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id - Tuple data received from node: 312 bytes - Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate (actual rows=26 loops=1) - Output: id, count(*) - Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val, num, ts -(19 rows) +-- J1: ORDER BY expression on aggregate result (ordinal reference) +-- The ORDER BY references position 2 which is an aggregate — sorted merge +-- must NOT be used because aggregates are rewritten between worker/coordinator. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) AS total FROM sorted_merge_test GROUP BY id ORDER BY 2 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + Sort Key: remote_scan.total + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, sum(num) AS total FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (sum(num)) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (sum(num)) + -> Sort (actual rows=N loops=N) + Output: id, (sum(num)) + Sort Key: (sum(sorted_merge_test.num)) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, sum(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) --- B2: ORDER BY avg(col) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num); - QUERY PLAN ---------------------------------------------------------------------- - Sort (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.avg - Sort Key: remote_scan.avg - Sort Method: quicksort Memory: 28kB - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.avg - Task Count: 4 - Tuple data received from nodes: 1556 bytes - Tasks Shown: One of 4 +-- J2: ORDER BY expression wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) + 1 AS total_plus FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + Sort Key: remote_scan.total_plus + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, (sum(num) OPERATOR(pg_catalog.+) 'N'::numeric) AS total_plus FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (sum(num) OPERATOR(pg_catalog.+) 'N'::numeric) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, ((sum(num) + 'N'::numeric)) + -> Sort (actual rows=N loops=N) + Output: id, ((sum(num) + 'N'::numeric)) + Sort Key: ((sum(sorted_merge_test.num) + 'N'::numeric)) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, (sum(num) + 'N'::numeric) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J3: ORDER BY a non-aggregate expression that can be pushed to workers +-- This should be eligible for sorted merge — the expression is evaluated +-- on the worker side and sort order is preserved. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id + 0'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, (id OPERATOR(pg_catalog.+) N) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) N) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, ((id + N)) + Sort Key: ((sorted_merge_test.id + N)) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, (id + N) +(16 rows) + +-- J4: ORDER BY with CASE expression (no aggregates) — eligible +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY CASE WHEN id < 50 THEN 0 ELSE 1 END, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, CASE WHEN (id OPERATOR(pg_catalog.<) N) THEN N ELSE N END AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY CASE WHEN (id OPERATOR(pg_catalog.<) N) THEN N ELSE N END, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (CASE WHEN (id < N) THEN N ELSE N END) + Sort Key: (CASE WHEN (sorted_merge_test.id < N) THEN N ELSE N END), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, CASE WHEN (id < N) THEN N ELSE N END +(16 rows) + +-- J5: ORDER BY on an expression that mixes aggregate and non-aggregate +-- Should be ineligible because the expression contains an aggregate. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id + count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Sort Key: remote_scan.worker_column_3 + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, avg(num) AS avg FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id - Tuple data received from node: 392 bytes - Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate (actual rows=26 loops=1) - Output: id, avg(num) + Query: SELECT id, count(*) AS count, (id OPERATOR(pg_catalog.+) count(*)) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*), (id + count(*)) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (19 rows) --- B3: GROUP BY non-dist col, ORDER BY non-dist col -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val; - QUERY PLAN ---------------------------------------------------------------------- - Sort (actual rows=104 loops=1) - Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) - Sort Key: remote_scan.val - Sort Method: quicksort Memory: 28kB - -> HashAggregate (actual rows=104 loops=1) - Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) - Group Key: remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.val, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 1447 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val - Tuple data received from node: 359 bytes - Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate (actual rows=26 loops=1) - Output: val, count(*) - Group Key: sorted_merge_test.val - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val, num, ts -(23 rows) - --- B4: GROUP BY non-dist col, ORDER BY aggregate -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*); - QUERY PLAN ---------------------------------------------------------------------- - Sort (actual rows=104 loops=1) - Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) - Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) - Sort Method: quicksort Memory: 28kB - -> HashAggregate (actual rows=104 loops=1) - Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) - Group Key: remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.val, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 1447 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val - Tuple data received from node: 359 bytes - Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate (actual rows=26 loops=1) - Output: val, count(*) - Group Key: sorted_merge_test.val - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val, num, ts -(23 rows) - --- ================================================================= --- Category C: Correctness — results match GUC off vs on --- ================================================================= --- C1: Simple ORDER BY +-- J6: Correctness comparison — expression ORDER BY, GUC off vs on SET citus.enable_sorted_merge TO off; -SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; id | val --------------------------------------------------------------------- 1 | val_1 @@ -2311,15 +2393,10 @@ SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; 3 | val_3 4 | val_4 5 | val_5 - 6 | val_6 - 7 | val_7 - 8 | val_8 - 9 | val_9 - 10 | val_10 -(10 rows) +(5 rows) SET citus.enable_sorted_merge TO on; -SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; id | val --------------------------------------------------------------------- 1 | val_1 @@ -2327,50 +2404,91 @@ SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; 3 | val_3 4 | val_4 5 | val_5 - 6 | val_6 - 7 | val_7 - 8 | val_8 - 9 | val_9 - 10 | val_10 -(10 rows) - --- C2: ORDER BY DESC -SET citus.enable_sorted_merge TO off; -SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; - id ---------------------------------------------------------------------- - 202 - 201 - 200 - 102 - 101 (5 rows) +-- ----------------------------------------------------------------- +-- J7–J12: Additional pushable expressions (no aggregates) +-- ----------------------------------------------------------------- SET citus.enable_sorted_merge TO on; -SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; - id +-- J7: ORDER BY function call on column +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val --------------------------------------------------------------------- - 202 - 201 - 200 - 102 - 101 + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 (5 rows) --- C3: Multi-column ORDER BY -SET citus.enable_sorted_merge TO off; -SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; - id | num +-- J8: ORDER BY COALESCE +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0) LIMIT 5; + id | num --------------------------------------------------------------------- - 1 | 1.5 - 2 | 3.0 - 3 | 4.5 - 4 | 6.0 - 5 | 7.5 + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 (5 rows) -SET citus.enable_sorted_merge TO on; -SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; +-- J9: ORDER BY negation +SELECT id, num FROM sorted_merge_test ORDER BY -num LIMIT 5; + id | num +--------------------------------------------------------------------- + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 +(5 rows) + +-- J10: ORDER BY concatenation +SELECT id, val FROM sorted_merge_test ORDER BY val || '_suffix' LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 100 | val_100 + 10 | val_10 +(5 rows) + +-- J11: ORDER BY mathematical function (abs distance) +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +-- J12: ORDER BY expression not in SELECT list +SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +-- J13: ORDER BY expression referencing multiple columns +SELECT id, val FROM sorted_merge_test ORDER BY id * num LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- J14: ORDER BY with type cast +SELECT id, num FROM sorted_merge_test ORDER BY num::int LIMIT 5; id | num --------------------------------------------------------------------- 1 | 1.5 @@ -2380,9 +2498,34 @@ SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; 5 | 7.5 (5 rows) --- C4: ORDER BY non-distribution column +-- J15: ORDER BY with subexpression in SELECT and different expression in ORDER BY +SELECT id, num + 1 as n1 FROM sorted_merge_test ORDER BY num + 2 LIMIT 5; + id | n1 +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +-- J16: ORDER BY column alias +SELECT id, num * 2 as doubled FROM sorted_merge_test ORDER BY doubled LIMIT 5; + id | doubled +--------------------------------------------------------------------- + 1 | 3.0 + 2 | 6.0 + 3 | 9.0 + 4 | 12.0 + 5 | 15.0 +(5 rows) + +-- ----------------------------------------------------------------- +-- J17–J21: Correctness — GUC off vs on for expression ORDER BY +-- ----------------------------------------------------------------- +-- J17: function call SET citus.enable_sorted_merge TO off; -SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; id | val --------------------------------------------------------------------- 200 | dup_a @@ -2393,7 +2536,7 @@ SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5 (5 rows) SET citus.enable_sorted_merge TO on; -SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; id | val --------------------------------------------------------------------- 200 | dup_a @@ -2403,106 +2546,244 @@ SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5 10 | val_10 (5 rows) --- C5: GROUP BY dist_col ORDER BY dist_col +-- J18: CASE expression SET citus.enable_sorted_merge TO off; -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - id | count ---------------------------------------------------------------------- - 1 | 1 - 2 | 1 - 3 | 1 - 4 | 1 - 5 | 1 -(5 rows) +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + id | cat +--------------------------------------------------------------------- + 34 | high + 35 | high + 36 | high + 37 | high + 38 | high + 39 | high + 40 | high + 41 | high + 42 | high + 43 | high +(10 rows) SET citus.enable_sorted_merge TO on; -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - id | count ---------------------------------------------------------------------- - 1 | 1 - 2 | 1 - 3 | 1 - 4 | 1 - 5 | 1 -(5 rows) +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + id | cat +--------------------------------------------------------------------- + 34 | high + 35 | high + 36 | high + 37 | high + 38 | high + 39 | high + 40 | high + 41 | high + 42 | high + 43 | high +(10 rows) --- C6: Mixed directions +-- J19: COALESCE SET citus.enable_sorted_merge TO off; -SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; - id | num +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + id | num --------------------------------------------------------------------- - 1 | 1.5 - 2 | 3.0 - 3 | 4.5 - 4 | 6.0 - 5 | 7.5 + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 (5 rows) SET citus.enable_sorted_merge TO on; -SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; - id | num +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + id | num --------------------------------------------------------------------- - 1 | 1.5 - 2 | 3.0 - 3 | 4.5 - 4 | 6.0 - 5 | 7.5 + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 (5 rows) --- C7: WHERE + ORDER BY +-- J20: abs() distance function SET citus.enable_sorted_merge TO off; -SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; - id | val +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num --------------------------------------------------------------------- - 67 | val_67 - 68 | val_68 - 69 | val_69 - 70 | val_70 - 71 | val_71 + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 (5 rows) SET citus.enable_sorted_merge TO on; -SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; - id | val +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num --------------------------------------------------------------------- - 67 | val_67 - 68 | val_68 - 69 | val_69 - 70 | val_70 - 71 | val_71 + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 (5 rows) --- C8: Aggregates in SELECT, ORDER BY on dist_col (GROUP BY dist_col) +-- ----------------------------------------------------------------- +-- J21–J22: More ineligibility — aggregate inside expressions +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J21: ORDER BY CASE wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY CASE WHEN count(*) > 1 THEN 0 ELSE 1 END, id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Sort Key: remote_scan.worker_column_3, remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count, CASE WHEN (count(*) OPERATOR(pg_catalog.>) N) THEN N ELSE N END AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY CASE WHEN (count(*) OPERATOR(pg_catalog.>) N) THEN N ELSE N END, id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)), (CASE WHEN (count(*) > N) THEN N ELSE N END) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)), (CASE WHEN (count(*) > N) THEN N ELSE N END) + Sort Key: (CASE WHEN (count(*) > N) THEN N ELSE N END), sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*), CASE WHEN (count(*) > N) THEN N ELSE N END + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J22: ORDER BY aggregate expression (sum + 1) — correctness SET citus.enable_sorted_merge TO off; -SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - id | count | sum | avg ---------------------------------------------------------------------- - 1 | 1 | 1.5 | 1.50000000000000000000 - 2 | 1 | 3.0 | 3.0000000000000000 - 3 | 1 | 4.5 | 4.5000000000000000 - 4 | 1 | 6.0 | 6.0000000000000000 - 5 | 1 | 7.5 | 7.5000000000000000 +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + id | s +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 (5 rows) SET citus.enable_sorted_merge TO on; -SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - id | count | sum | avg ---------------------------------------------------------------------- - 1 | 1 | 1.5 | 1.50000000000000000000 - 2 | 1 | 3.0 | 3.0000000000000000 - 3 | 1 | 4.5 | 4.5000000000000000 - 4 | 1 | 6.0 | 6.0000000000000000 - 5 | 1 | 7.5 | 7.5000000000000000 +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + id | s +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 (5 rows) +-- ----------------------------------------------------------------- +-- J23–J24: EXPLAIN plans for pushable expression patterns +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J23: Does function-call ORDER BY get pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, upper(val) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (upper(val)) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, (upper(val)) + -> Sort (actual rows=N loops=N) + Output: id, val, (upper(val)) + Sort Key: (upper(sorted_merge_test.val)) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, upper(val) +(20 rows) + +-- J24: ORDER BY expression not in SELECT list — pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.worker_column_2 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, (num OPERATOR(pg_catalog.+) 'N'::numeric) AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (num OPERATOR(pg_catalog.+) 'N'::numeric) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, ((num + 'N'::numeric)) + -> Sort (actual rows=N loops=N) + Output: id, ((num + 'N'::numeric)) + Sort Key: ((sorted_merge_test.num + 'N'::numeric)) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, (num + 'N'::numeric) +(20 rows) + -- ================================================================= --- Category D: Complex queries — regression guards +-- Category K: Index-based sort avoidance -- ================================================================= +-- When an index exists on the ORDER BY column, PostgreSQL's worker-side +-- planner should choose an Index Scan instead of Sort + Seq Scan, making +-- the worker-side sort essentially free. This is the best-case scenario +-- for sorted merge: zero worker sort cost + zero coordinator sort cost. +-- +-- We disable enable_seqscan to force the worker planner to prefer the +-- index, since the test table is small enough that Seq Scan + Sort +-- would otherwise be cheaper. +CREATE INDEX sorted_merge_test_id_idx ON sorted_merge_test(id); +-- Use a transaction with SET LOCAL to propagate enable_seqscan=off to workers, +-- forcing the worker planner to use the index instead of Seq Scan + Sort. +SET citus.propagate_set_commands TO 'local'; +-- K1: EXPLAIN with index — worker uses Index Scan, no Sort node SET citus.enable_sorted_merge TO on; --- D1: Subquery in FROM with ORDER BY -SELECT * FROM ( - SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 -) sub ORDER BY id; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Scan using sorted_merge_test_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(12 rows) + +COMMIT; +-- K2: Correctness with index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; id | val --------------------------------------------------------------------- 1 | val_1 @@ -2512,11 +2793,8 @@ SELECT * FROM ( 5 | val_5 (5 rows) --- D2: CTE with ORDER BY -WITH top5 AS ( - SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 -) -SELECT * FROM top5 ORDER BY id; +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; id | val --------------------------------------------------------------------- 1 | val_1 @@ -2526,447 +2804,585 @@ SELECT * FROM top5 ORDER BY id; 5 | val_5 (5 rows) --- D3: Co-located JOIN + ORDER BY -SELECT t.id, t.val, e.event_type -FROM sorted_merge_test t -JOIN sorted_merge_events e ON t.id = e.id -WHERE t.id <= 5 -ORDER BY t.id, e.event_type -LIMIT 10; - id | val | event_type ---------------------------------------------------------------------- - 1 | val_1 | buy - 1 | val_1 | buy - 1 | val_1 | click - 1 | val_1 | view - 2 | val_2 | buy - 2 | val_2 | click - 2 | val_2 | view - 2 | val_2 | view - 3 | val_3 | buy - 3 | val_3 | buy -(10 rows) - --- D4: UNION ALL + ORDER BY -SELECT id, val FROM sorted_merge_test WHERE id <= 3 -UNION ALL -SELECT id, val FROM sorted_merge_test WHERE id BETWEEN 98 AND 100 -ORDER BY id; - id | val +COMMIT; +-- K3: Multi-column index +CREATE INDEX sorted_merge_test_num_id_idx ON sorted_merge_test(num, id); +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num, id'); + explain_filter --------------------------------------------------------------------- - 1 | val_1 - 2 | val_2 - 3 | val_3 - 98 | val_98 - 99 | val_99 - 100 | val_100 -(6 rows) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Only Scan using sorted_merge_test_num_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num + Heap Fetches: N +(13 rows) --- D5: DISTINCT + ORDER BY -SELECT DISTINCT id FROM sorted_merge_test WHERE id <= 10 ORDER BY id; - id +COMMIT; +-- K4: Correctness with multi-column index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num --------------------------------------------------------------------- - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 -(10 rows) + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) --- D6: DISTINCT ON + ORDER BY -SELECT DISTINCT ON (id) id, val, num -FROM sorted_merge_test -WHERE id <= 5 -ORDER BY id, num DESC; - id | val | num +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num --------------------------------------------------------------------- - 1 | val_1 | 1.5 - 2 | val_2 | 3.0 - 3 | val_3 | 4.5 - 4 | val_4 | 6.0 - 5 | val_5 | 7.5 + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 (5 rows) --- D7: EXISTS subquery + ORDER BY -SELECT id, val FROM sorted_merge_test t -WHERE EXISTS (SELECT 1 FROM sorted_merge_events e WHERE e.id = t.id) -ORDER BY id LIMIT 5; - id | val +COMMIT; +-- K5: DESC ordering with index +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id DESC'); + explain_filter --------------------------------------------------------------------- - 1 | val_1 - 2 | val_2 - 3 | val_3 - 4 | val_4 - 5 | val_5 -(5 rows) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Scan Backward using sorted_merge_test_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(12 rows) --- D8: IN subquery + ORDER BY -SELECT id, val FROM sorted_merge_test -WHERE id IN (SELECT id FROM sorted_merge_events WHERE event_type = 'click') -ORDER BY id LIMIT 5; - id | val ---------------------------------------------------------------------- - 1 | val_1 - 2 | val_2 - 3 | val_3 - 4 | val_4 - 5 | val_5 -(5 rows) +COMMIT; +RESET citus.propagate_set_commands; +DROP INDEX sorted_merge_test_id_idx; +DROP INDEX sorted_merge_test_num_id_idx; +-- ================================================================= +-- Category L: Volatile and stable functions in ORDER BY +-- Tests that ORDER BY with functions works correctly with sorted merge. +-- Volatile functions (random, clock_timestamp, timeofday) are pushed +-- to workers as computed columns — sorted merge uses the materialized +-- worker values, which is semantically equivalent to coordinator Sort. +-- ================================================================= +-- L1: STABLE function — now() in expression with column +-- now() returns the same value on all workers within a transaction, +-- so the merge is globally consistent. Sorted merge should be used. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY now() - ts, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, (now() OPERATOR(pg_catalog.-) ts) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (now() OPERATOR(pg_catalog.-) ts), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, ((now() - ts)) + Sort Key: ((now() - sorted_merge_test.ts)), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, (now() - ts) +(16 rows) + +-- L2: VOLATILE function — random() in ORDER BY +-- random() is pushed to workers as worker_column_3; each worker sorts +-- by its own random values. The merge interleaves using materialized +-- values — semantically equivalent to coordinator Sort on worker_column_3. +-- Test plan shape only (result is non-deterministic). +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY random(), id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, random() AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (random()), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (random()) + Sort Key: (random()), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, random() +(16 rows) + +-- L3: VOLATILE function — clock_timestamp() in ORDER BY +-- Same mechanics as random(): pushed to workers, sorted locally, merged. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY clock_timestamp(), id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, clock_timestamp() AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (clock_timestamp()), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (clock_timestamp()) + Sort Key: (clock_timestamp()), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, clock_timestamp() +(16 rows) + +-- L4: nextval() in ORDER BY with sorted merge ON — expected ERROR +-- nextval() cannot be pushed to workers (CanPushDownExpression blocks it). +-- The sort clause references a target entry missing from the worker target +-- list, causing a plan-time error. This is a pre-existing Citus limitation. +CREATE SEQUENCE sorted_merge_test_seq; +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq'); +psql:sql/multi_orderby_pushdown.sql:777: ERROR: ORDER/GROUP BY expression not found in targetlist +-- L4b: nextval() in ORDER BY with sorted merge OFF but LIMIT present +-- Same error — demonstrates this is NOT a sorted merge regression. +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq') LIMIT 5; +psql:sql/multi_orderby_pushdown.sql:782: ERROR: ORDER/GROUP BY expression not found in targetlist +DROP SEQUENCE sorted_merge_test_seq; +-- L5: STABLE function alone (constant-fold case) +-- current_timestamp is constant-folded by the planner; the sort key +-- effectively becomes just 'id'. Sorted merge should be used. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY current_timestamp, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, CURRENT_TIMESTAMP AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY CURRENT_TIMESTAMP, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (CURRENT_TIMESTAMP) + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, CURRENT_TIMESTAMP +(16 rows) --- D9: Multiple aggregates, GROUP BY dist_col, ORDER BY dist_col -SELECT id, count(*), sum(num), avg(num), min(val), max(val) -FROM sorted_merge_test -GROUP BY id -ORDER BY id -LIMIT 5; - id | count | sum | avg | min | max +SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Cleanup +-- ================================================================= +SET citus.enable_sorted_merge TO off; +-- Run 2: streaming adapter +SET citus.enable_streaming_sorted_merge TO on; +\i sql/multi_orderby_pushdown.sql +-- +-- MULTI_SORTED_MERGE +-- +-- Tests for the citus.enable_sorted_merge GUC and the sorted merge +-- planner eligibility logic. Verifies that enabling the GUC does not +-- introduce regressions for any query pattern. +-- +-- MX verification: this test has been verified to pass with zero diffs +-- under check-base-mx (MX mode), confirming sorted merge works correctly +-- when any node in the cluster acts as coordinator. +-- +-- ================================================================= +-- 1. GUC basics +-- ================================================================= +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge --------------------------------------------------------------------- - 1 | 1 | 1.5 | 1.50000000000000000000 | val_1 | val_1 - 2 | 1 | 3.0 | 3.0000000000000000 | val_2 | val_2 - 3 | 1 | 4.5 | 4.5000000000000000 | val_3 | val_3 - 4 | 1 | 6.0 | 6.0000000000000000 | val_4 | val_4 - 5 | 1 | 7.5 | 7.5000000000000000 | val_5 | val_5 -(5 rows) + off +(1 row) --- D10: CASE expression in SELECT + ORDER BY -SELECT id, - CASE WHEN num > 75 THEN 'high' WHEN num > 25 THEN 'mid' ELSE 'low' END as bucket -FROM sorted_merge_test -WHERE num IS NOT NULL -ORDER BY id -LIMIT 10; - id | bucket +SET citus.enable_sorted_merge TO on; +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge --------------------------------------------------------------------- - 1 | low - 2 | low - 3 | low - 4 | low - 5 | low - 6 | low - 7 | low - 8 | low - 9 | low - 10 | low -(10 rows) + on +(1 row) --- D11: NULL values ordering -SELECT id, num FROM sorted_merge_test ORDER BY num NULLS FIRST, id LIMIT 5; - id | num +SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Category A: Eligibility — sort IS pushed to workers +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- A1: ORDER BY distribution column +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter --------------------------------------------------------------------- - 101 | - 102 | - 1 | 1.5 - 2 | 3.0 - 3 | 4.5 -(5 rows) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) -SELECT id, num FROM sorted_merge_test ORDER BY num NULLS LAST, id LIMIT 5; - id | num +-- A2: ORDER BY DESC +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC'); + explain_filter --------------------------------------------------------------------- - 1 | 1.5 - 2 | 3.0 - 3 | 4.5 - 4 | 6.0 - 5 | 7.5 -(5 rows) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id DESC + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(16 rows) -SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS FIRST, id LIMIT 5; - id | num +-- A3: ORDER BY DESC NULLS LAST +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST'); + explain_filter --------------------------------------------------------------------- - 101 | - 102 | - 100 | 150.0 - 99 | 148.5 - 98 | 147.0 -(5 rows) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num DESC NULLS LAST + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, num + Sort Key: sorted_merge_test.num DESC NULLS LAST + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num +(16 rows) -SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST, id DESC LIMIT 5; - id | num +-- A4: ORDER BY non-distribution column +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY val'); + explain_filter --------------------------------------------------------------------- - 100 | 150.0 - 99 | 148.5 - 98 | 147.0 - 97 | 145.5 - 96 | 144.0 -(5 rows) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.val + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) --- D12: Large OFFSET -SELECT id FROM sorted_merge_test ORDER BY id OFFSET 100 LIMIT 5; - id +-- A5: Multi-column ORDER BY +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val'); + explain_filter --------------------------------------------------------------------- - 101 - 102 - 200 - 201 - 202 -(5 rows) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id, sorted_merge_test.val + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) --- D13: ORDER BY ordinal position -SELECT id, val FROM sorted_merge_test ORDER BY 2, 1 LIMIT 5; - id | val +-- A6: Mixed directions +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC'); + explain_filter --------------------------------------------------------------------- - 200 | dup_a - 201 | dup_b - 202 | dup_c - 1 | val_1 - 10 | val_10 -(5 rows) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, num DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, num + Sort Key: sorted_merge_test.id, sorted_merge_test.num DESC + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num +(16 rows) --- ================================================================= --- Category E: Edge cases --- ================================================================= -SET citus.enable_sorted_merge TO on; --- E1: Empty result set -SELECT id FROM sorted_merge_test WHERE id < 0 ORDER BY id; - id +-- A7: GROUP BY dist_col ORDER BY dist_col +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id'); + explain_filter --------------------------------------------------------------------- -(0 rows) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> GroupAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(19 rows) --- E2: Single row (may go through router planner) -SELECT id, val FROM sorted_merge_test WHERE id = 42 ORDER BY id; - id | val +-- A8: WHERE clause + ORDER BY +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id'); + explain_filter --------------------------------------------------------------------- - 42 | val_42 -(1 row) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (num OPERATOR(pg_catalog.>) 'N'::numeric) ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val + Filter: (sorted_merge_test.num > 'N'::numeric) + Rows Removed by Filter: N +(18 rows) --- E3: All rows with same sort value -SELECT id, num FROM sorted_merge_test WHERE num = 10.5 ORDER BY num, id; - id | num +-- A9: Expression in ORDER BY (non-aggregate) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1'); + explain_filter --------------------------------------------------------------------- - 7 | 10.5 - 200 | 10.5 - 201 | 10.5 - 202 | 10.5 -(4 rows) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num, (id OPERATOR(pg_catalog.+) N) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) N) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, num, ((id + N)) + Sort Key: ((sorted_merge_test.id + N)) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num, (id + N) +(16 rows) --- E4: Wide sort key (4 columns) -SELECT id, val, num FROM sorted_merge_test -WHERE id <= 5 -ORDER BY num, val, id -LIMIT 5; - id | val | num +-- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - 1 | val_1 | 1.5 - 2 | val_2 | 3.0 - 3 | val_3 | 4.5 - 4 | val_4 | 6.0 - 5 | val_5 | 7.5 -(5 rows) - --- E5: Zero-task defensive path --- CreatePerTaskDispatchDest handles taskCount=0 gracefully (returns a no-op --- destination). This cannot be triggered via normal SQL because distributed --- tables always have at least one shard. The closest we can test is an --- empty-result query through the sorted merge path to verify no crash. -SELECT id FROM sorted_merge_test WHERE false ORDER BY id; - id ---------------------------------------------------------------------- -(0 rows) + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(20 rows) -- ================================================================= --- Category F: Existing LIMIT pushdown stability +-- Category B: Ineligibility — sort NOT pushed for merge -- ================================================================= --- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on -SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN ---------------------------------------------------------------------- - Limit (actual rows=5 loops=1) - Output: remote_scan.id - -> Sort (actual rows=5 loops=1) - Output: remote_scan.id - Sort Key: remote_scan.id - Sort Method: top-N heapsort Memory: 25kB - -> Custom Scan (Citus Adaptive) (actual rows=20 loops=1) - Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 80 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 20 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) - Output: id - -> Sort (actual rows=5 loops=1) - Output: id - Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id -(23 rows) - SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN ---------------------------------------------------------------------- - Limit (actual rows=5 loops=1) - Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) - Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 80 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 20 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) - Output: id - -> Sort (actual rows=5 loops=1) - Output: id - Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id -(19 rows) - --- F2: GROUP BY dist_col + ORDER BY + LIMIT -SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - QUERY PLAN +-- B1: ORDER BY count(*) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*)'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - -> Sort (actual rows=5 loops=1) + Sort Key: remote_scan.count + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - Sort Key: remote_scan.id - Sort Method: top-N heapsort Memory: 25kB - -> Custom Scan (Citus Adaptive) (actual rows=20 loops=1) - Output: remote_scan.id, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 240 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 60 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) - Output: id, (count(*)) - -> Sort (actual rows=5 loops=1) - Output: id, (count(*)) - Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> HashAggregate (actual rows=26 loops=1) - Output: id, count(*) - Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val, num, ts -(27 rows) + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) -SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - QUERY PLAN +-- B2: ORDER BY avg(col) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num)'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) - Output: remote_scan.id, remote_scan.count - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) - Output: remote_scan.id, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 240 bytes - Tasks Shown: One of 4 + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.avg + Sort Key: remote_scan.avg + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.avg + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 60 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) - Output: id, (count(*)) - -> Sort (actual rows=5 loops=1) - Output: id, (count(*)) - Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> HashAggregate (actual rows=26 loops=1) - Output: id, count(*) - Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val, num, ts -(23 rows) + Query: SELECT id, avg(num) AS avg FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, avg(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) --- F3: ORDER BY aggregate + LIMIT (not eligible for merge) -SET citus.enable_sorted_merge TO off; -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; - id | count +-- B3: GROUP BY non-dist col, ORDER BY non-dist col +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val'); + explain_filter --------------------------------------------------------------------- - 1 | 1 - 2 | 1 - 3 | 1 - 4 | 1 - 5 | 1 -(5 rows) + Sort (actual rows=N loops=N) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Key: remote_scan.val + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint) + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.val, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(23 rows) -SET citus.enable_sorted_merge TO on; -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; - id | count ---------------------------------------------------------------------- - 1 | 1 - 2 | 1 - 3 | 1 - 4 | 1 - 5 | 1 -(5 rows) +-- B4: GROUP BY non-dist col, ORDER BY aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint) + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.val, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(23 rows) -- ================================================================= --- Category G: Phase 4 — Sort elision and advanced scenarios +-- Category C: Correctness — results match GUC off vs on -- ================================================================= --- G1: Sort elision verification — coordinator Sort node absent +-- C1: Simple ORDER BY SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN ---------------------------------------------------------------------- - Sort (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.val - Sort Key: remote_scan.id - Sort Method: quicksort Memory: 28kB - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val -(15 rows) - -SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN ---------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) - Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) - Output: id, val - Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val -(15 rows) - --- G2a: PREPARE with merge ON, EXECUTE after turning OFF --- Plan-time decision is baked in — cached plan must still merge correctly -SET citus.enable_sorted_merge TO on; -PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; -EXECUTE merge_on_stmt; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; id | val --------------------------------------------------------------------- 1 | val_1 @@ -2981,8 +3397,8 @@ EXECUTE merge_on_stmt; 10 | val_10 (10 rows) -SET citus.enable_sorted_merge TO off; -EXECUTE merge_on_stmt; +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; id | val --------------------------------------------------------------------- 1 | val_1 @@ -2997,133 +3413,236 @@ EXECUTE merge_on_stmt; 10 | val_10 (10 rows) -DEALLOCATE merge_on_stmt; --- G2b: PREPARE with merge OFF, EXECUTE after turning ON --- Cached plan has Sort node — must still return sorted results +-- C2: ORDER BY DESC SET citus.enable_sorted_merge TO off; -PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; -EXECUTE merge_off_stmt; - id | val +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id --------------------------------------------------------------------- - 1 | val_1 - 2 | val_2 - 3 | val_3 - 4 | val_4 - 5 | val_5 - 6 | val_6 - 7 | val_7 - 8 | val_8 - 9 | val_9 - 10 | val_10 -(10 rows) + 202 + 201 + 200 + 102 + 101 +(5 rows) SET citus.enable_sorted_merge TO on; -EXECUTE merge_off_stmt; - id | val +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id --------------------------------------------------------------------- - 1 | val_1 - 2 | val_2 - 3 | val_3 - 4 | val_4 - 5 | val_5 - 6 | val_6 - 7 | val_7 - 8 | val_8 - 9 | val_9 - 10 | val_10 -(10 rows) + 202 + 201 + 200 + 102 + 101 +(5 rows) -DEALLOCATE merge_off_stmt; --- G3: Cursor with backward scan -SET citus.enable_sorted_merge TO on; -BEGIN; -DECLARE sorted_cursor CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; -FETCH 3 FROM sorted_cursor; - id +-- C3: Multi-column ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num --------------------------------------------------------------------- - 1 - 2 - 3 -(3 rows) + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) -FETCH BACKWARD 1 FROM sorted_cursor; -psql:sql/multi_orderby_pushdown.sql:319: ERROR: cursor can only scan forward -HINT: Declare it with SCROLL option to enable backward scan. -FETCH 2 FROM sorted_cursor; -psql:sql/multi_orderby_pushdown.sql:320: ERROR: current transaction is aborted, commands ignored until end of transaction block -CLOSE sorted_cursor; -psql:sql/multi_orderby_pushdown.sql:321: ERROR: current transaction is aborted, commands ignored until end of transaction block -COMMIT; --- G3b: SCROLL cursor with backward scan SET citus.enable_sorted_merge TO on; -BEGIN; -DECLARE sorted_scroll_cursor SCROLL CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; -FETCH 3 FROM sorted_scroll_cursor; - id +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num --------------------------------------------------------------------- - 1 - 2 - 3 -(3 rows) + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) -FETCH BACKWARD 1 FROM sorted_scroll_cursor; - id +-- C4: ORDER BY non-distribution column +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val --------------------------------------------------------------------- - 2 -(1 row) + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) -FETCH 2 FROM sorted_scroll_cursor; - id +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val --------------------------------------------------------------------- - 3 - 4 -(2 rows) + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) -CLOSE sorted_scroll_cursor; -COMMIT; --- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) -SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN +-- C5: GROUP BY dist_col ORDER BY dist_col +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) - Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) - Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 80 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 20 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) - Output: id - -> Sort (actual rows=5 loops=1) - Output: id - Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id -(19 rows) + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) --- G5: ORDER BY aggregate + LIMIT — crash regression test --- Previously caused SIGSEGV when sorted merge was enabled because --- aggregate ORDER BY was erroneously tagged as merge-eligible. SET citus.enable_sorted_merge TO on; -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 3; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; id | count --------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 -(3 rows) + 4 | 1 + 5 | 1 +(5 rows) + +-- C6: Mixed directions +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) --- G6: Small work_mem with many tasks (32 shards) SET citus.enable_sorted_merge TO on; -SET work_mem TO '64kB'; -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- C7: WHERE + ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +-- C8: Aggregates in SELECT, ORDER BY on dist_col (GROUP BY dist_col) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +-- ================================================================= +-- Category D: Complex queries — regression guards +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- D1: Subquery in FROM with ORDER BY +SELECT * FROM ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) sub ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D2: CTE with ORDER BY +WITH top5 AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) +SELECT * FROM top5 ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D3: Co-located JOIN + ORDER BY +SELECT t.id, t.val, e.event_type +FROM sorted_merge_test t +JOIN sorted_merge_events e ON t.id = e.id +WHERE t.id <= 5 +ORDER BY t.id, e.event_type +LIMIT 10; + id | val | event_type +--------------------------------------------------------------------- + 1 | val_1 | buy + 1 | val_1 | buy + 1 | val_1 | click + 1 | val_1 | view + 2 | val_2 | buy + 2 | val_2 | click + 2 | val_2 | view + 2 | val_2 | view + 3 | val_3 | buy + 3 | val_3 | buy +(10 rows) + +-- D4: UNION ALL + ORDER BY +SELECT id, val FROM sorted_merge_test WHERE id <= 3 +UNION ALL +SELECT id, val FROM sorted_merge_test WHERE id BETWEEN 98 AND 100 +ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 98 | val_98 + 99 | val_99 + 100 | val_100 +(6 rows) + +-- D5: DISTINCT + ORDER BY +SELECT DISTINCT id FROM sorted_merge_test WHERE id <= 10 ORDER BY id; id --------------------------------------------------------------------- 1 @@ -3138,12 +3657,24 @@ SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; 10 (10 rows) -RESET work_mem; --- G7: max_intermediate_result_size with CTE subplan -SET citus.enable_sorted_merge TO on; -SET citus.max_intermediate_result_size TO '4kB'; -WITH cte AS (SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 50) -SELECT * FROM cte ORDER BY id LIMIT 5; +-- D6: DISTINCT ON + ORDER BY +SELECT DISTINCT ON (id) id, val, num +FROM sorted_merge_test +WHERE id <= 5 +ORDER BY id, num DESC; + id | val | num +--------------------------------------------------------------------- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) + +-- D7: EXISTS subquery + ORDER BY +SELECT id, val FROM sorted_merge_test t +WHERE EXISTS (SELECT 1 FROM sorted_merge_events e WHERE e.id = t.id) +ORDER BY id LIMIT 5; id | val --------------------------------------------------------------------- 1 | val_1 @@ -3153,17 +3684,10 @@ SELECT * FROM cte ORDER BY id LIMIT 5; 5 | val_5 (5 rows) -RESET citus.max_intermediate_result_size; --- ================================================================= --- Category H: Subplan + Sorted Merge interactions --- ================================================================= -SET citus.enable_sorted_merge TO on; --- H1: CTE subplan with simple ORDER BY — eligible for sorted merge --- The CTE becomes a subplan; its DistributedPlan may have useSortedMerge=true -WITH ordered_cte AS ( - SELECT id, val FROM sorted_merge_test ORDER BY id -) -SELECT * FROM ordered_cte ORDER BY id LIMIT 5; +-- D8: IN subquery + ORDER BY +SELECT id, val FROM sorted_merge_test +WHERE id IN (SELECT id FROM sorted_merge_events WHERE event_type = 'click') +ORDER BY id LIMIT 5; id | val --------------------------------------------------------------------- 1 | val_1 @@ -3173,622 +3697,2256 @@ SELECT * FROM ordered_cte ORDER BY id LIMIT 5; 5 | val_5 (5 rows) --- H2: Multiple CTEs — one eligible (ORDER BY col), one ineligible (ORDER BY agg) -WITH eligible_cte AS ( - SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 -), -ineligible_cte AS ( - SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 -) -SELECT e.id, e.val, i.cnt -FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id -ORDER BY e.id; - id | val | cnt +-- D9: Multiple aggregates, GROUP BY dist_col, ORDER BY dist_col +SELECT id, count(*), sum(num), avg(num), min(val), max(val) +FROM sorted_merge_test +GROUP BY id +ORDER BY id +LIMIT 5; + id | count | sum | avg | min | max --------------------------------------------------------------------- - 1 | val_1 | 1 - 2 | val_2 | 1 - 3 | val_3 | 1 - 4 | val_4 | 1 - 5 | val_5 | 1 - 6 | val_6 | 1 - 7 | val_7 | 1 - 8 | val_8 | 1 - 9 | val_9 | 1 - 10 | val_10 | 1 - 11 | val_11 | 1 - 12 | val_12 | 1 - 13 | val_13 | 1 - 14 | val_14 | 1 - 15 | val_15 | 1 -(15 rows) + 1 | 1 | 1.5 | 1.50000000000000000000 | val_1 | val_1 + 2 | 1 | 3.0 | 3.0000000000000000 | val_2 | val_2 + 3 | 1 | 4.5 | 4.5000000000000000 | val_3 | val_3 + 4 | 1 | 6.0 | 6.0000000000000000 | val_4 | val_4 + 5 | 1 | 7.5 | 7.5000000000000000 | val_5 | val_5 +(5 rows) --- H3: CTE subplan feeding outer ORDER BY — both levels may merge independently -WITH top_ids AS ( - SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 -) -SELECT t.id, t.val -FROM sorted_merge_test t -JOIN top_ids ON t.id = top_ids.id -ORDER BY t.id +-- D10: CASE expression in SELECT + ORDER BY +SELECT id, + CASE WHEN num > 75 THEN 'high' WHEN num > 25 THEN 'mid' ELSE 'low' END as bucket +FROM sorted_merge_test +WHERE num IS NOT NULL +ORDER BY id LIMIT 10; - id | val + id | bucket --------------------------------------------------------------------- - 1 | val_1 - 2 | val_2 - 3 | val_3 - 4 | val_4 - 5 | val_5 - 6 | val_6 - 7 | val_7 - 8 | val_8 - 9 | val_9 - 10 | val_10 + 1 | low + 2 | low + 3 | low + 4 | low + 5 | low + 6 | low + 7 | low + 8 | low + 9 | low + 10 | low (10 rows) --- H4: Subquery in WHERE with ORDER BY + LIMIT — becomes subplan with merge -SELECT id, val FROM sorted_merge_test -WHERE id IN ( - SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 -) -ORDER BY id -LIMIT 5; - id | val +-- D11: NULL values ordering +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS FIRST, id LIMIT 5; + id | num --------------------------------------------------------------------- - 1 | val_1 - 2 | val_2 - 3 | val_3 -(3 rows) + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) --- H5: CTE subplan with max_intermediate_result_size enforcement --- Tests that EnsureIntermediateSizeLimitNotExceeded works through per-task dispatch -SET citus.max_intermediate_result_size TO '4kB'; -WITH small_cte AS ( - SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 -) -SELECT * FROM small_cte ORDER BY id LIMIT 5; - id | val +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS LAST, id LIMIT 5; + id | num --------------------------------------------------------------------- - 1 | val_1 - 2 | val_2 - 3 | val_3 - 4 | val_4 - 5 | val_5 + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 (5 rows) -RESET citus.max_intermediate_result_size; --- H6: Cross-join subplan with non-aggregate ORDER BY (crash regression variant) --- Similar pattern to subquery_complex_target_list but without aggregate ORDER BY -SELECT foo.id, bar.id as bar_id -FROM - (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, - (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar -ORDER BY foo.id, bar.id -LIMIT 5; - id | bar_id +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS FIRST, id LIMIT 5; + id | num --------------------------------------------------------------------- - 1 | 1 - 1 | 1 - 1 | 1 - 2 | 1 - 2 | 1 + 101 | + 102 | + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 (5 rows) --- H7: CTE correctness comparison — GUC off vs on must produce identical results -SET citus.enable_sorted_merge TO off; -WITH cte AS ( - SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 -) -SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; - id | val | num +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST, id DESC LIMIT 5; + id | num --------------------------------------------------------------------- - 7 | val_7 | 10.5 - 8 | val_8 | 12.0 - 9 | val_9 | 13.5 - 10 | val_10 | 15.0 - 11 | val_11 | 16.5 + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 (5 rows) -SET citus.enable_sorted_merge TO on; -WITH cte AS ( - SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 -) -SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; - id | val | num +-- D12: Large OFFSET +SELECT id FROM sorted_merge_test ORDER BY id OFFSET 100 LIMIT 5; + id --------------------------------------------------------------------- - 7 | val_7 | 10.5 - 8 | val_8 | 12.0 - 9 | val_9 | 13.5 - 10 | val_10 | 15.0 - 11 | val_11 | 16.5 + 101 + 102 + 200 + 201 + 202 +(5 rows) + +-- D13: ORDER BY ordinal position +SELECT id, val FROM sorted_merge_test ORDER BY 2, 1 LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 (5 rows) -- ================================================================= --- Category H EXPLAIN: Query plans for subplan + sorted merge +-- Category E: Edge cases -- ================================================================= SET citus.enable_sorted_merge TO on; --- H1 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH ordered_cte AS ( - SELECT id, val FROM sorted_merge_test ORDER BY id -) -SELECT * FROM ordered_cte ORDER BY id LIMIT 5; - QUERY PLAN +-- E1: Empty result set +SELECT id FROM sorted_merge_test WHERE id < 0 ORDER BY id; + id --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) - Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) - Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 191 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT ordered_cte.id AS worker_column_1, ordered_cte.val AS worker_column_2 FROM (SELECT sorted_merge_test.id, sorted_merge_test.val FROM public.sorted_merge_test_960000 sorted_merge_test ORDER BY sorted_merge_test.id) ordered_cte) worker_subquery ORDER BY worker_column_1 LIMIT '5'::bigint - Tuple data received from node: 47 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) - Output: sorted_merge_test.id, sorted_merge_test.val - -> Sort (actual rows=5 loops=1) - Output: sorted_merge_test.id, sorted_merge_test.val - Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: sorted_merge_test.id, sorted_merge_test.val -(19 rows) +(0 rows) --- H2 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH eligible_cte AS ( - SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 -), -ineligible_cte AS ( - SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 -) -SELECT e.id, e.val, i.cnt -FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id -ORDER BY e.id; - QUERY PLAN +-- E2: Single row (may go through router planner) +SELECT id, val FROM sorted_merge_test WHERE id = 42 ORDER BY id; + id | val --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) - Output: remote_scan.id, remote_scan.val, remote_scan.cnt - -> Distributed Subplan XXX_1 - Intermediate Data Size: 397 bytes - Result destination: Write locally - -> Limit (actual rows=20 loops=1) - Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) - Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 791 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 197 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) - Output: id, val - -> Sort (actual rows=20 loops=1) - Output: id, val - Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val - -> Distributed Subplan XXX_2 - Intermediate Data Size: 330 bytes - Result destination: Write locally - -> Limit (actual rows=15 loops=1) - Output: remote_scan.id, remote_scan.cnt - -> Sort (actual rows=15 loops=1) - Output: remote_scan.id, remote_scan.cnt - Sort Key: remote_scan.cnt DESC, remote_scan.id - -> Custom Scan (Citus Adaptive) (never executed) - Output: remote_scan.id, remote_scan.cnt - Task Count: 4 - Tuple data received from nodes: 720 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, count(*) AS cnt FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (count(*)) DESC, id LIMIT '15'::bigint - Tuple data received from node: 180 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=15 loops=1) - Output: id, (count(*)) - -> Sort (actual rows=15 loops=1) - Output: id, (count(*)) - Sort Key: (count(*)) DESC, sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> HashAggregate (actual rows=26 loops=1) - Output: id, count(*) - Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val, num, ts - Task Count: 1 - Tuple data received from nodes: 87 bytes - Tasks Shown: All - -> Task - Query: SELECT e.id, e.val, i.cnt FROM ((SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) e JOIN (SELECT intermediate_result.id, intermediate_result.cnt FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer, cnt bigint)) i ON ((e.id OPERATOR(pg_catalog.=) i.id))) ORDER BY e.id - Tuple data received from node: 87 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Merge Join (actual rows=5 loops=1) - Output: intermediate_result.id, intermediate_result.val, intermediate_result_1.cnt - Merge Cond: (intermediate_result.id = intermediate_result_1.id) - -> Sort (actual rows=6 loops=1) - Output: intermediate_result.id, intermediate_result.val - Sort Key: intermediate_result.id - Sort Method: quicksort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) - Output: intermediate_result.id, intermediate_result.val - Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - -> Sort (actual rows=15 loops=1) - Output: intermediate_result_1.cnt, intermediate_result_1.id - Sort Key: intermediate_result_1.id - Sort Method: quicksort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=15 loops=1) - Output: intermediate_result_1.cnt, intermediate_result_1.id - Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) -(77 rows) + 42 | val_42 +(1 row) --- H3 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH top_ids AS ( - SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 -) -SELECT t.id, t.val -FROM sorted_merge_test t -JOIN top_ids ON t.id = top_ids.id -ORDER BY t.id -LIMIT 10; - QUERY PLAN +-- E3: All rows with same sort value +SELECT id, num FROM sorted_merge_test WHERE num = 10.5 ORDER BY num, id; + id | num --------------------------------------------------------------------- - Limit (actual rows=10 loops=1) - Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=10 loops=1) - Output: remote_scan.id, remote_scan.val - -> Distributed Subplan XXX_1 - Intermediate Data Size: 200 bytes - Result destination: Send to 2 nodes - -> Limit (actual rows=20 loops=1) - Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) - Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 320 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 80 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) - Output: id - -> Sort (actual rows=20 loops=1) - Output: id - Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id - Task Count: 4 - Tuple data received from nodes: 97 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT t.id AS worker_column_1, t.val AS worker_column_2 FROM (public.sorted_merge_test_960000 t JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) top_ids ON ((t.id OPERATOR(pg_catalog.=) top_ids.id)))) worker_subquery ORDER BY worker_column_1 LIMIT '10'::bigint - Tuple data received from node: 97 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=10 loops=1) - Output: t.id, t.val - -> Merge Join (actual rows=10 loops=1) - Output: t.id, t.val - Merge Cond: (intermediate_result.id = t.id) - -> Sort (actual rows=10 loops=1) - Output: intermediate_result.id - Sort Key: intermediate_result.id - Sort Method: quicksort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) - Output: intermediate_result.id - Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - -> Sort (actual rows=10 loops=1) - Output: t.id, t.val - Sort Key: t.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 t (actual rows=26 loops=1) - Output: t.id, t.val -(51 rows) + 7 | 10.5 + 200 | 10.5 + 201 | 10.5 + 202 | 10.5 +(4 rows) --- H4 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test -WHERE id IN ( - SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 -) -ORDER BY id +-- E4: Wide sort key (4 columns) +SELECT id, val, num FROM sorted_merge_test +WHERE id <= 5 +ORDER BY num, val, id LIMIT 5; - QUERY PLAN + id | val | num --------------------------------------------------------------------- - Limit (actual rows=3 loops=1) - Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=3 loops=1) - Output: remote_scan.id, remote_scan.val - -> Distributed Subplan XXX_1 - Intermediate Data Size: 100 bytes - Result destination: Send to 2 nodes - -> Limit (actual rows=10 loops=1) - Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=40 loops=1) - Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 160 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT '10'::bigint - Tuple data received from node: 40 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=10 loops=1) - Output: id - -> Sort (actual rows=10 loops=1) - Output: id - Sort Key: sorted_merge_events.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=56 loops=1) - Output: id - Task Count: 4 - Tuple data received from nodes: 27 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT sorted_merge_test.id AS worker_column_1, sorted_merge_test.val AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (sorted_merge_test.id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)))) worker_subquery ORDER BY worker_column_1 LIMIT '5'::bigint - Tuple data received from node: 27 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=3 loops=1) - Output: sorted_merge_test.id, sorted_merge_test.val - -> Sort (actual rows=3 loops=1) - Output: sorted_merge_test.id, sorted_merge_test.val - Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Hash Semi Join (actual rows=3 loops=1) - Output: sorted_merge_test.id, sorted_merge_test.val - Hash Cond: (sorted_merge_test.id = intermediate_result.id) - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: sorted_merge_test.id, sorted_merge_test.val, sorted_merge_test.num, sorted_merge_test.ts - -> Hash (actual rows=10 loops=1) - Output: intermediate_result.id - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=10 loops=1) - Output: intermediate_result.id - Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) -(50 rows) + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) --- H5 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH small_cte AS ( - SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 -) -SELECT * FROM small_cte ORDER BY id LIMIT 5; - QUERY PLAN +-- E5: Zero-task defensive path +-- CreatePerTaskDispatchDest handles taskCount=0 gracefully (returns a no-op +-- destination). This cannot be triggered via normal SQL because distributed +-- tables always have at least one shard. The closest we can test is an +-- empty-result query through the sorted merge path to verify no crash. +SELECT id FROM sorted_merge_test WHERE false ORDER BY id; + id --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) - Output: remote_scan.id, remote_scan.val - -> Distributed Subplan XXX_1 - Intermediate Data Size: 397 bytes - Result destination: Write locally - -> Limit (actual rows=20 loops=1) - Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) - Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 791 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 197 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) - Output: id, val - -> Sort (actual rows=20 loops=1) - Output: id, val - Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val - Task Count: 1 - Tuple data received from nodes: 47 bytes - Tasks Shown: All - -> Task - Query: SELECT id, val FROM (SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) small_cte ORDER BY id LIMIT 5 - Tuple data received from node: 47 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) - Output: intermediate_result.id, intermediate_result.val - -> Sort (actual rows=5 loops=1) - Output: intermediate_result.id, intermediate_result.val - Sort Key: intermediate_result.id - Sort Method: top-N heapsort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) - Output: intermediate_result.id, intermediate_result.val - Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) -(40 rows) +(0 rows) --- H6 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT foo.id, bar.id as bar_id -FROM - (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, - (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar -ORDER BY foo.id, bar.id -LIMIT 5; - QUERY PLAN +-- ================================================================= +-- Category F: Existing LIMIT pushdown stability +-- ================================================================= +-- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) - Output: remote_scan.id, remote_scan.bar_id - -> Distributed Subplan XXX_1 - Intermediate Data Size: 30 bytes - Result destination: Write locally - -> Limit (actual rows=3 loops=1) + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Sort (actual rows=N loops=N) + Output: remote_scan.id + Sort Key: remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=12 loops=1) - Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 48 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '3'::bigint - Tuple data received from node: 12 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=3 loops=1) + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(23 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id - -> Sort (actual rows=3 loops=1) +(20 rows) + +-- F2: GROUP BY dist_col + ORDER BY + LIMIT +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Sort Key: remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)) + -> GroupAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id - -> Distributed Subplan XXX_2 - Intermediate Data Size: 30 bytes - Result destination: Write locally - -> Limit (actual rows=3 loops=1) - Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=12 loops=1) - Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 48 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT '3'::bigint - Tuple data received from node: 12 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=3 loops=1) +(26 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)) + -> GroupAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Sort (actual rows=N loops=N) Output: id - -> Sort (actual rows=3 loops=1) + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id - Sort Key: sorted_merge_events.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=56 loops=1) - Output: id - Task Count: 1 - Tuple data received from nodes: 40 bytes - Tasks Shown: All - -> Task - Query: SELECT foo.id, bar.id AS bar_id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) foo, (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) bar ORDER BY foo.id, bar.id LIMIT 5 - Tuple data received from node: 40 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) - Output: intermediate_result.id, intermediate_result_1.id - -> Sort (actual rows=5 loops=1) - Output: intermediate_result.id, intermediate_result_1.id - Sort Key: intermediate_result.id, intermediate_result_1.id - Sort Method: quicksort Memory: 25kB - -> Nested Loop (actual rows=9 loops=1) - Output: intermediate_result.id, intermediate_result_1.id - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=3 loops=1) - Output: intermediate_result.id - Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=3 loops=3) - Output: intermediate_result_1.id - Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) -(67 rows) +(23 rows) --- H7 EXPLAIN — GUC off vs on +-- F3: ORDER BY aggregate + LIMIT (not eligible for merge) SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH cte AS ( - SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 -) -SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; - QUERY PLAN +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + id | count --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) - Output: remote_scan.id, remote_scan.val, remote_scan.num - -> Distributed Subplan XXX_1 - Intermediate Data Size: 691 bytes - Result destination: Write locally - -> Limit (actual rows=20 loops=1) - Output: remote_scan.id, remote_scan.val, remote_scan.num - -> Sort (actual rows=20 loops=1) - Output: remote_scan.id, remote_scan.val, remote_scan.num - Sort Key: remote_scan.id - -> Custom Scan (Citus Adaptive) (never executed) - Output: remote_scan.id, remote_scan.val, remote_scan.num - Task Count: 4 - Tuple data received from nodes: 1673 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 419 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) - Output: id, val, num - -> Sort (actual rows=20 loops=1) - Output: id, val, num - Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val, num - Task Count: 1 - Tuple data received from nodes: 103 bytes - Tasks Shown: All - -> Task - Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (10)::numeric) ORDER BY id LIMIT 5 - Tuple data received from node: 103 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) - Output: intermediate_result.id, intermediate_result.val, intermediate_result.num - -> Sort (actual rows=5 loops=1) - Output: intermediate_result.id, intermediate_result.val, intermediate_result.num - Sort Key: intermediate_result.id - Sort Method: top-N heapsort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=14 loops=1) - Output: intermediate_result.id, intermediate_result.val, intermediate_result.num - Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - Filter: (intermediate_result.num > '10'::numeric) - Rows Removed by Filter: 6 -(45 rows) + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH cte AS ( - SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 -) -SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; - QUERY PLAN +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + id | count --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) - Output: remote_scan.id, remote_scan.val, remote_scan.num - -> Distributed Subplan XXX_1 - Intermediate Data Size: 699 bytes - Result destination: Write locally - -> Limit (actual rows=20 loops=1) - Output: remote_scan.id, remote_scan.val, remote_scan.num - -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) - Output: remote_scan.id, remote_scan.val, remote_scan.num - Task Count: 4 - Tuple data received from nodes: 1673 bytes - Tasks Shown: One of 4 - -> Task - Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 419 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) - Output: id, val, num - -> Sort (actual rows=20 loops=1) - Output: id, val, num - Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, val, num - Task Count: 1 - Tuple data received from nodes: 101 bytes - Tasks Shown: All + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- ================================================================= +-- Category G: Phase 4 — Sort elision and advanced scenarios +-- ================================================================= +-- G1: Sort elision verification — coordinator Sort node absent +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Sort Key: remote_scan.id + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(15 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (10)::numeric) ORDER BY id LIMIT 5 - Tuple data received from node: 101 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) - Output: intermediate_result.id, intermediate_result.val, intermediate_result.num - -> Sort (actual rows=5 loops=1) - Output: intermediate_result.id, intermediate_result.val, intermediate_result.num - Sort Key: intermediate_result.id - Sort Method: top-N heapsort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=18 loops=1) - Output: intermediate_result.id, intermediate_result.val, intermediate_result.num - Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - Filter: (intermediate_result.num > '10'::numeric) - Rows Removed by Filter: 2 -(42 rows) + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) +-- G2a: PREPARE with merge ON, EXECUTE after turning OFF +-- Plan-time decision is baked in — cached plan must still merge correctly. +-- Execute 6+ times to trigger PostgreSQL's generic plan caching, then +-- verify the plan shape is preserved after toggling the GUC. +SET citus.enable_sorted_merge TO on; +PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Verify plan shape after caching — no Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Merge Method: sorted merge + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(11 rows) + +SET citus.enable_sorted_merge TO off; +-- Cached plan retains the sorted merge decision from planning time +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Merge Method: sorted merge + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(11 rows) + +DEALLOCATE merge_on_stmt; +-- G2b: PREPARE with merge OFF, EXECUTE after turning ON +-- Cached plan has Sort node — must still return sorted results. +SET citus.enable_sorted_merge TO off; +PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Verify plan shape after caching — Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +SET citus.enable_sorted_merge TO on; +-- Cached plan retains the non-merge decision from planning time +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +DEALLOCATE merge_off_stmt; +-- G3: Cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_cursor CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 1 + 2 + 3 +(3 rows) + +FETCH BACKWARD 1 FROM sorted_cursor; +psql:sql/multi_orderby_pushdown.sql:323: ERROR: cursor can only scan forward +HINT: Declare it with SCROLL option to enable backward scan. +FETCH 2 FROM sorted_cursor; +psql:sql/multi_orderby_pushdown.sql:324: ERROR: current transaction is aborted, commands ignored until end of transaction block +CLOSE sorted_cursor; +psql:sql/multi_orderby_pushdown.sql:325: ERROR: current transaction is aborted, commands ignored until end of transaction block +COMMIT; +-- G3b: SCROLL cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_scroll_cursor SCROLL CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 1 + 2 + 3 +(3 rows) + +FETCH BACKWARD 1 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 2 +(1 row) + +FETCH 2 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 3 + 4 +(2 rows) + +CLOSE sorted_scroll_cursor; +COMMIT; +-- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(20 rows) + +-- G5: ORDER BY aggregate + LIMIT — crash regression test +-- Previously caused SIGSEGV when sorted merge was enabled because +-- aggregate ORDER BY was erroneously tagged as merge-eligible. +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 3; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 +(3 rows) + +-- G6: Small work_mem with many tasks (32 shards) +SET citus.enable_sorted_merge TO on; +SET work_mem TO '64kB'; +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +RESET work_mem; +-- G7: max_intermediate_result_size with CTE subplan +SET citus.enable_sorted_merge TO on; +SET citus.max_intermediate_result_size TO '4kB'; +WITH cte AS (SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 50) +SELECT * FROM cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; +-- ================================================================= +-- Category H: Subplan + Sorted Merge interactions +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1: CTE subplan with simple ORDER BY — eligible for sorted merge +-- The CTE becomes a subplan; its DistributedPlan may have useSortedMerge=true +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- H2: Multiple CTEs — one eligible (ORDER BY col), one ineligible (ORDER BY agg) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + id | val | cnt +--------------------------------------------------------------------- + 1 | val_1 | 1 + 2 | val_2 | 1 + 3 | val_3 | 1 + 4 | val_4 | 1 + 5 | val_5 | 1 + 6 | val_6 | 1 + 7 | val_7 | 1 + 8 | val_8 | 1 + 9 | val_9 | 1 + 10 | val_10 | 1 + 11 | val_11 | 1 + 12 | val_12 | 1 + 13 | val_13 | 1 + 14 | val_14 | 1 + 15 | val_15 | 1 +(15 rows) + +-- H3: CTE subplan feeding outer ORDER BY — both levels may merge independently +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- H4: Subquery in WHERE with ORDER BY + LIMIT — becomes subplan with merge +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 +(3 rows) + +-- H5: CTE subplan with max_intermediate_result_size enforcement +-- Tests that EnsureIntermediateSizeLimitNotExceeded works through per-task dispatch +SET citus.max_intermediate_result_size TO '4kB'; +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; +-- H6: Cross-join subplan with non-aggregate ORDER BY (crash regression variant) +-- Similar pattern to subquery_complex_target_list but without aggregate ORDER BY +SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + id | bar_id +--------------------------------------------------------------------- + 1 | 1 + 1 | 1 + 1 | 1 + 2 | 1 + 2 | 1 +(5 rows) + +-- H7: CTE correctness comparison — GUC off vs on must produce identical results +SET citus.enable_sorted_merge TO off; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +-- ================================================================= +-- Category H EXPLAIN: Query plans for subplan + sorted merge +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT ordered_cte.id AS worker_column_1, ordered_cte.val AS worker_column_2 FROM (SELECT sorted_merge_test.id, sorted_merge_test.val FROM public.sorted_merge_test_960000 sorted_merge_test ORDER BY sorted_merge_test.id) ordered_cte) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val +(20 rows) + +-- H2 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.cnt + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val + -> Distributed Subplan XXX_2 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.cnt + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.cnt + Sort Key: remote_scan.cnt DESC, remote_scan.id + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.cnt + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS cnt FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (count(*)) DESC, id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)) + Sort Key: (count(*)) DESC, sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT e.id, e.val, i.cnt FROM ((SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) e JOIN (SELECT intermediate_result.id, intermediate_result.cnt FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer, cnt bigint)) i ON ((e.id OPERATOR(pg_catalog.=) i.id))) ORDER BY e.id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Merge Join (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result_1.cnt + Merge Cond: (intermediate_result.id = intermediate_result_1.id) + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Sort Key: intermediate_result.id + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Sort (actual rows=N loops=N) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Sort Key: intermediate_result_1.id + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(78 rows) + +-- H3 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Send to N nodes + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT t.id AS worker_column_1, t.val AS worker_column_2 FROM (public.sorted_merge_test_960000 t JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) top_ids ON ((t.id OPERATOR(pg_catalog.=) top_ids.id)))) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: t.id, t.val + -> Merge Join (actual rows=N loops=N) + Output: t.id, t.val + Merge Cond: (t.id = intermediate_result.id) + -> Sort (actual rows=N loops=N) + Output: t.id, t.val + Sort Key: t.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 t (actual rows=N loops=N) + Output: t.id, t.val + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id + Sort Key: intermediate_result.id + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(53 rows) + +-- H4 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Send to N nodes + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) + Output: id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT sorted_merge_test.id AS worker_column_1, sorted_merge_test.val AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (sorted_merge_test.id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)))) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Hash Semi Join (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + Hash Cond: (sorted_merge_test.id = intermediate_result.id) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val, sorted_merge_test.num, sorted_merge_test.ts + -> Hash (actual rows=N loops=N) + Output: intermediate_result.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(52 rows) + +-- H5 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT id, val FROM (SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) small_cte ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(41 rows) + +-- H6 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.bar_id + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id + -> Distributed Subplan XXX_2 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) + Output: id + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT foo.id, bar.id AS bar_id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) foo, (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) bar ORDER BY foo.id, bar.id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result_1.id + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result_1.id + Sort Key: intermediate_result.id, intermediate_result_1.id + Sort Method: quicksort Memory: NkB + -> Nested Loop (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result_1.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) + Output: intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(69 rows) + +-- H7 EXPLAIN — GUC off vs on +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, num + -> Sort (actual rows=N loops=N) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (N)::numeric) ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > 'N'::numeric) + Rows Removed by Filter: N +(45 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, num + -> Sort (actual rows=N loops=N) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (N)::numeric) ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > 'N'::numeric) + Rows Removed by Filter: N +(43 rows) + +-- ================================================================= +-- Category I: Distributed Transactions +-- ================================================================= +-- Verify sorted merge correctness within multi-statement transactions +-- where data is modified before the sorted-merge SELECT. +SET citus.enable_sorted_merge TO on; +-- I1: INSERT then SELECT within a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (900, 'txn_insert', 900.0); +SELECT id, val FROM sorted_merge_test WHERE id >= 900 ORDER BY id; + id | val +--------------------------------------------------------------------- + 900 | txn_insert +(1 row) + +ROLLBACK; +-- I2: UPDATE then SELECT within a transaction +BEGIN; +UPDATE sorted_merge_test SET val = 'updated' WHERE id = 1; +SELECT id, val FROM sorted_merge_test WHERE id <= 3 ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | updated + 2 | val_2 + 3 | val_3 +(3 rows) + +ROLLBACK; +-- I3: DELETE then SELECT within a transaction +BEGIN; +DELETE FROM sorted_merge_test WHERE id <= 5; +SELECT id, val FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + id | val +--------------------------------------------------------------------- + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(5 rows) + +ROLLBACK; +-- I4: INSERT + UPDATE + SELECT with multi-column ORDER BY +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (901, 'txn_a', 1.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (902, 'txn_b', 2.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (903, 'txn_c', 3.0); +UPDATE sorted_merge_test SET num = 999.0 WHERE id = 901; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 900 ORDER BY num, id; + id | val | num +--------------------------------------------------------------------- + 902 | txn_b | 2.0 + 903 | txn_c | 3.0 + 901 | txn_a | 999.0 +(3 rows) + +ROLLBACK; +-- I5: Compare results with GUC off vs on in a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (910, 'cmp_a', 10.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (911, 'cmp_b', 20.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (912, 'cmp_c', 30.0); +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; + id | val | num +--------------------------------------------------------------------- + 910 | cmp_a | 10.0 + 911 | cmp_b | 20.0 + 912 | cmp_c | 30.0 +(3 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; + id | val | num +--------------------------------------------------------------------- + 910 | cmp_a | 10.0 + 911 | cmp_b | 20.0 + 912 | cmp_c | 30.0 +(3 rows) + +ROLLBACK; +-- I6: DELETE + aggregate in SELECT with ORDER BY +BEGIN; +DELETE FROM sorted_merge_test WHERE id > 100 AND id < 200; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +ROLLBACK; +-- ================================================================= +-- Category J: Coordinator expression evaluation exclusion +-- ================================================================= +-- Verify that queries with ORDER BY on expressions that need coordinator-side +-- evaluation are correctly excluded from sorted merge (or handled correctly). +SET citus.enable_sorted_merge TO on; +-- J1: ORDER BY expression on aggregate result (ordinal reference) +-- The ORDER BY references position 2 which is an aggregate — sorted merge +-- must NOT be used because aggregates are rewritten between worker/coordinator. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) AS total FROM sorted_merge_test GROUP BY id ORDER BY 2 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + Sort Key: remote_scan.total + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, sum(num) AS total FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (sum(num)) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (sum(num)) + -> Sort (actual rows=N loops=N) + Output: id, (sum(num)) + Sort Key: (sum(sorted_merge_test.num)) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, sum(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J2: ORDER BY expression wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) + 1 AS total_plus FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + Sort Key: remote_scan.total_plus + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, (sum(num) OPERATOR(pg_catalog.+) 'N'::numeric) AS total_plus FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (sum(num) OPERATOR(pg_catalog.+) 'N'::numeric) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, ((sum(num) + 'N'::numeric)) + -> Sort (actual rows=N loops=N) + Output: id, ((sum(num) + 'N'::numeric)) + Sort Key: ((sum(sorted_merge_test.num) + 'N'::numeric)) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, (sum(num) + 'N'::numeric) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J3: ORDER BY a non-aggregate expression that can be pushed to workers +-- This should be eligible for sorted merge — the expression is evaluated +-- on the worker side and sort order is preserved. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id + 0'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, (id OPERATOR(pg_catalog.+) N) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) N) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, ((id + N)) + Sort Key: ((sorted_merge_test.id + N)) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, (id + N) +(16 rows) + +-- J4: ORDER BY with CASE expression (no aggregates) — eligible +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY CASE WHEN id < 50 THEN 0 ELSE 1 END, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, CASE WHEN (id OPERATOR(pg_catalog.<) N) THEN N ELSE N END AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY CASE WHEN (id OPERATOR(pg_catalog.<) N) THEN N ELSE N END, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (CASE WHEN (id < N) THEN N ELSE N END) + Sort Key: (CASE WHEN (sorted_merge_test.id < N) THEN N ELSE N END), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, CASE WHEN (id < N) THEN N ELSE N END +(16 rows) + +-- J5: ORDER BY on an expression that mixes aggregate and non-aggregate +-- Should be ineligible because the expression contains an aggregate. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id + count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Sort Key: remote_scan.worker_column_3 + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count, (id OPERATOR(pg_catalog.+) count(*)) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*), (id + count(*)) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) + +-- J6: Correctness comparison — expression ORDER BY, GUC off vs on +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J7–J12: Additional pushable expressions (no aggregates) +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J7: ORDER BY function call on column +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- J8: ORDER BY COALESCE +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0) LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +-- J9: ORDER BY negation +SELECT id, num FROM sorted_merge_test ORDER BY -num LIMIT 5; + id | num +--------------------------------------------------------------------- + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 +(5 rows) + +-- J10: ORDER BY concatenation +SELECT id, val FROM sorted_merge_test ORDER BY val || '_suffix' LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 100 | val_100 + 10 | val_10 +(5 rows) + +-- J11: ORDER BY mathematical function (abs distance) +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +-- J12: ORDER BY expression not in SELECT list +SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +-- J13: ORDER BY expression referencing multiple columns +SELECT id, val FROM sorted_merge_test ORDER BY id * num LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- J14: ORDER BY with type cast +SELECT id, num FROM sorted_merge_test ORDER BY num::int LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- J15: ORDER BY with subexpression in SELECT and different expression in ORDER BY +SELECT id, num + 1 as n1 FROM sorted_merge_test ORDER BY num + 2 LIMIT 5; + id | n1 +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +-- J16: ORDER BY column alias +SELECT id, num * 2 as doubled FROM sorted_merge_test ORDER BY doubled LIMIT 5; + id | doubled +--------------------------------------------------------------------- + 1 | 3.0 + 2 | 6.0 + 3 | 9.0 + 4 | 12.0 + 5 | 15.0 +(5 rows) + +-- ----------------------------------------------------------------- +-- J17–J21: Correctness — GUC off vs on for expression ORDER BY +-- ----------------------------------------------------------------- +-- J17: function call +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- J18: CASE expression +SET citus.enable_sorted_merge TO off; +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + id | cat +--------------------------------------------------------------------- + 34 | high + 35 | high + 36 | high + 37 | high + 38 | high + 39 | high + 40 | high + 41 | high + 42 | high + 43 | high +(10 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + id | cat +--------------------------------------------------------------------- + 34 | high + 35 | high + 36 | high + 37 | high + 38 | high + 39 | high + 40 | high + 41 | high + 42 | high + 43 | high +(10 rows) + +-- J19: COALESCE +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +-- J20: abs() distance function +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J21–J22: More ineligibility — aggregate inside expressions +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J21: ORDER BY CASE wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY CASE WHEN count(*) > 1 THEN 0 ELSE 1 END, id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Sort Key: remote_scan.worker_column_3, remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count, CASE WHEN (count(*) OPERATOR(pg_catalog.>) N) THEN N ELSE N END AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY CASE WHEN (count(*) OPERATOR(pg_catalog.>) N) THEN N ELSE N END, id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)), (CASE WHEN (count(*) > N) THEN N ELSE N END) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)), (CASE WHEN (count(*) > N) THEN N ELSE N END) + Sort Key: (CASE WHEN (count(*) > N) THEN N ELSE N END), sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*), CASE WHEN (count(*) > N) THEN N ELSE N END + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J22: ORDER BY aggregate expression (sum + 1) — correctness +SET citus.enable_sorted_merge TO off; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + id | s +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + id | s +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J23–J24: EXPLAIN plans for pushable expression patterns +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J23: Does function-call ORDER BY get pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, upper(val) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (upper(val)) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, (upper(val)) + -> Sort (actual rows=N loops=N) + Output: id, val, (upper(val)) + Sort Key: (upper(sorted_merge_test.val)) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, upper(val) +(20 rows) + +-- J24: ORDER BY expression not in SELECT list — pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.worker_column_2 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, (num OPERATOR(pg_catalog.+) 'N'::numeric) AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (num OPERATOR(pg_catalog.+) 'N'::numeric) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, ((num + 'N'::numeric)) + -> Sort (actual rows=N loops=N) + Output: id, ((num + 'N'::numeric)) + Sort Key: ((sorted_merge_test.num + 'N'::numeric)) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, (num + 'N'::numeric) +(20 rows) + +-- ================================================================= +-- Category K: Index-based sort avoidance +-- ================================================================= +-- When an index exists on the ORDER BY column, PostgreSQL's worker-side +-- planner should choose an Index Scan instead of Sort + Seq Scan, making +-- the worker-side sort essentially free. This is the best-case scenario +-- for sorted merge: zero worker sort cost + zero coordinator sort cost. +-- +-- We disable enable_seqscan to force the worker planner to prefer the +-- index, since the test table is small enough that Seq Scan + Sort +-- would otherwise be cheaper. +CREATE INDEX sorted_merge_test_id_idx ON sorted_merge_test(id); +-- Use a transaction with SET LOCAL to propagate enable_seqscan=off to workers, +-- forcing the worker planner to use the index instead of Seq Scan + Sort. +SET citus.propagate_set_commands TO 'local'; +-- K1: EXPLAIN with index — worker uses Index Scan, no Sort node +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Scan using sorted_merge_test_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(12 rows) + +COMMIT; +-- K2: Correctness with index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +COMMIT; +-- K3: Multi-column index +CREATE INDEX sorted_merge_test_num_id_idx ON sorted_merge_test(num, id); +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Only Scan using sorted_merge_test_num_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num + Heap Fetches: N +(13 rows) + +COMMIT; +-- K4: Correctness with multi-column index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +COMMIT; +-- K5: DESC ordering with index +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id DESC'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Scan Backward using sorted_merge_test_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(12 rows) + +COMMIT; +RESET citus.propagate_set_commands; +DROP INDEX sorted_merge_test_id_idx; +DROP INDEX sorted_merge_test_num_id_idx; +-- ================================================================= +-- Category L: Volatile and stable functions in ORDER BY +-- Tests that ORDER BY with functions works correctly with sorted merge. +-- Volatile functions (random, clock_timestamp, timeofday) are pushed +-- to workers as computed columns — sorted merge uses the materialized +-- worker values, which is semantically equivalent to coordinator Sort. +-- ================================================================= +-- L1: STABLE function — now() in expression with column +-- now() returns the same value on all workers within a transaction, +-- so the merge is globally consistent. Sorted merge should be used. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY now() - ts, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, (now() OPERATOR(pg_catalog.-) ts) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (now() OPERATOR(pg_catalog.-) ts), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, ((now() - ts)) + Sort Key: ((now() - sorted_merge_test.ts)), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, (now() - ts) +(16 rows) + +-- L2: VOLATILE function — random() in ORDER BY +-- random() is pushed to workers as worker_column_3; each worker sorts +-- by its own random values. The merge interleaves using materialized +-- values — semantically equivalent to coordinator Sort on worker_column_3. +-- Test plan shape only (result is non-deterministic). +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY random(), id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, random() AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (random()), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (random()) + Sort Key: (random()), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, random() +(16 rows) + +-- L3: VOLATILE function — clock_timestamp() in ORDER BY +-- Same mechanics as random(): pushed to workers, sorted locally, merged. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY clock_timestamp(), id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, clock_timestamp() AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (clock_timestamp()), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (clock_timestamp()) + Sort Key: (clock_timestamp()), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, clock_timestamp() +(16 rows) + +-- L4: nextval() in ORDER BY with sorted merge ON — expected ERROR +-- nextval() cannot be pushed to workers (CanPushDownExpression blocks it). +-- The sort clause references a target entry missing from the worker target +-- list, causing a plan-time error. This is a pre-existing Citus limitation. +CREATE SEQUENCE sorted_merge_test_seq; +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq'); +psql:sql/multi_orderby_pushdown.sql:777: ERROR: ORDER/GROUP BY expression not found in targetlist +-- L4b: nextval() in ORDER BY with sorted merge OFF but LIMIT present +-- Same error — demonstrates this is NOT a sorted merge regression. +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq') LIMIT 5; +psql:sql/multi_orderby_pushdown.sql:782: ERROR: ORDER/GROUP BY expression not found in targetlist +DROP SEQUENCE sorted_merge_test_seq; +-- L5: STABLE function alone (constant-fold case) +-- current_timestamp is constant-folded by the planner; the sort key +-- effectively becomes just 'id'. Sorted merge should be used. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY current_timestamp, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, CURRENT_TIMESTAMP AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY CURRENT_TIMESTAMP, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (CURRENT_TIMESTAMP) + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, CURRENT_TIMESTAMP +(16 rows) + +SET citus.enable_sorted_merge TO off; -- ================================================================= -- Cleanup -- ================================================================= From 1c11bf9da3889d8c1617d7f696fe46846cd1441f Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Tue, 14 Apr 2026 21:47:42 +0000 Subject: [PATCH 7/7] Make style checks happy --- .../distributed/executor/sorted_merge.c | 2 +- src/backend/distributed/shared_library_init.c | 30 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/backend/distributed/executor/sorted_merge.c b/src/backend/distributed/executor/sorted_merge.c index 75e09f92baa..397b1cc9620 100644 --- a/src/backend/distributed/executor/sorted_merge.c +++ b/src/backend/distributed/executor/sorted_merge.c @@ -382,7 +382,7 @@ CreateSortedMergeAdapter(Tuplestorestate **perTaskStores, * On each call after the first, we advance the previous winner's store * and update the heap before selecting the new winner. This matches the * MergeAppend pattern in nodeMergeAppend.c. - * + * * Possible perf optimizations to explore in the future: * Avoid copying the winning tuple into the scan slot by returning a pointer to the winner's slot instead. * This would require changes to the caller to not modify the returned slot and to understand that it's owned by the adapter until the next call. diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 8b4b366ace3..039e8ee5c51 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -1617,21 +1617,6 @@ RegisterCitusConfigVariables(void) GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, NULL, NULL, NULL); - DefineCustomBoolVariable( - "citus.enable_streaming_sorted_merge", - gettext_noop("Use streaming adapter instead of eager merge for sorted merge."), - gettext_noop("When enabled alongside citus.enable_sorted_merge, the coordinator " - "streams merged tuples directly from per-task stores via a binary " - "heap instead of eagerly copying all tuples into a final tuplestore. " - "This reduces memory usage and improves time-to-first-tuple, " - "especially for LIMIT queries. Requires citus.enable_sorted_merge " - "to also be enabled. This is an experimental feature."), - &EnableStreamingSortedMerge, - false, - PGC_USERSET, - GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - DefineCustomBoolVariable( "citus.enable_stat_counters", gettext_noop("Enables the collection of statistic counters for Citus."), @@ -1656,6 +1641,21 @@ RegisterCitusConfigVariables(void) GUC_SUPERUSER_ONLY, NULL, NULL, NULL); + DefineCustomBoolVariable( + "citus.enable_streaming_sorted_merge", + gettext_noop("Use streaming adapter instead of eager merge for sorted merge."), + gettext_noop("When enabled alongside citus.enable_sorted_merge, the coordinator " + "streams merged tuples directly from per-task stores via a binary " + "heap instead of eagerly copying all tuples into a final tuplestore. " + "This reduces memory usage and improves time-to-first-tuple, " + "especially for LIMIT queries. Requires citus.enable_sorted_merge " + "to also be enabled. This is an experimental feature."), + &EnableStreamingSortedMerge, + false, + PGC_USERSET, + GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + DefineCustomBoolVariable( "citus.enable_unique_job_ids", gettext_noop("Enables unique job IDs by prepending the local process ID and "