diff --git a/src/backend/distributed/executor/adaptive_executor.c b/src/backend/distributed/executor/adaptive_executor.c index a4e5461e51e..53a6c4c7b46 100644 --- a/src/backend/distributed/executor/adaptive_executor.c +++ b/src/backend/distributed/executor/adaptive_executor.c @@ -947,23 +947,45 @@ AdaptiveExecutor(CitusScanState *scanState) * When sorted merge is active, k-way merge the per-task stores into * the final tuplestore. This produces globally sorted output that the * existing ReturnTupleFromTuplestore() path can read unchanged. + * + * When streaming sorted merge is enabled, create an adapter instead + * that delivers tuples one at a time without a final tuplestore. */ if (execution->useSortedMerge && execution->perTaskStoreCount > 0) { - scanState->tuplestorestate = - tuplestore_begin_heap(randomAccess, interTransactions, work_mem); - - MergePerTaskStoresIntoFinalStore(scanState->tuplestorestate, - execution->perTaskStores, - execution->perTaskStoreCount, - distributedPlan->sortedMergeKeys, - distributedPlan->sortedMergeKeyCount, - tupleDescriptor); - - /* free per-task stores — they are no longer needed */ - for (int i = 0; i < execution->perTaskStoreCount; i++) + if (EnableStreamingSortedMerge) + { + /* + * Streaming mode: create an adapter that delivers tuples one + * at a time from the per-task stores via a binary heap. The + * adapter takes ownership of the per-task stores. 
+ */ + scanState->mergeAdapter = CreateSortedMergeAdapter( + execution->perTaskStores, + execution->perTaskStoreCount, + distributedPlan->sortedMergeKeys, + distributedPlan->sortedMergeKeyCount, + tupleDescriptor, + true); + } + else { - tuplestore_end(execution->perTaskStores[i]); + /* Eager mode (default): merge all tuples into a final tuplestore */ + scanState->tuplestorestate = + tuplestore_begin_heap(randomAccess, interTransactions, work_mem); + + MergePerTaskStoresIntoFinalStore(scanState->tuplestorestate, + execution->perTaskStores, + execution->perTaskStoreCount, + distributedPlan->sortedMergeKeys, + distributedPlan->sortedMergeKeyCount, + tupleDescriptor); + + /* free per-task stores — they are no longer needed */ + for (int i = 0; i < execution->perTaskStoreCount; i++) + { + tuplestore_end(execution->perTaskStores[i]); + } } } diff --git a/src/backend/distributed/executor/citus_custom_scan.c b/src/backend/distributed/executor/citus_custom_scan.c index db7e4f725ff..4b1c4701d08 100644 --- a/src/backend/distributed/executor/citus_custom_scan.c +++ b/src/backend/distributed/executor/citus_custom_scan.c @@ -46,6 +46,7 @@ #include "distributed/multi_router_planner.h" #include "distributed/multi_server_executor.h" #include "distributed/shard_utils.h" +#include "distributed/sorted_merge.h" #include "distributed/stats/query_stats.h" #include "distributed/stats/stat_counters.h" #include "distributed/subplan_execution.h" @@ -835,6 +836,12 @@ CitusEndScan(CustomScanState *node) CitusQueryStatsExecutorsEntry(queryId, executorType, partitionKeyString); } + if (scanState->mergeAdapter) + { + FreeSortedMergeAdapter(scanState->mergeAdapter); + scanState->mergeAdapter = NULL; + } + if (scanState->tuplestorestate) { tuplestore_end(scanState->tuplestorestate); @@ -857,7 +864,12 @@ CitusReScan(CustomScanState *node) ExecScanReScan(&node->ss); CitusScanState *scanState = (CitusScanState *) node; - if (scanState->tuplestorestate) + + if (scanState->mergeAdapter) + { + 
SortedMergeAdapterRescan(scanState->mergeAdapter); + } + else if (scanState->tuplestorestate) { tuplestore_rescan(scanState->tuplestorestate); } diff --git a/src/backend/distributed/executor/multi_executor.c b/src/backend/distributed/executor/multi_executor.c index 8661d367345..e9857fda136 100644 --- a/src/backend/distributed/executor/multi_executor.c +++ b/src/backend/distributed/executor/multi_executor.c @@ -50,6 +50,7 @@ #include "distributed/multi_server_executor.h" #include "distributed/relation_access_tracking.h" #include "distributed/resource_lock.h" +#include "distributed/sorted_merge.h" #include "distributed/transaction_management.h" #include "distributed/version_compat.h" #include "distributed/worker_protocol.h" @@ -88,6 +89,9 @@ bool SortReturning = false; /* when true at planning time, enables coordinator sorted merge for ORDER BY */ bool EnableSortedMerge = false; +/* when true, uses streaming adapter instead of eager merge for sorted merge */ +bool EnableStreamingSortedMerge = false; + /* * How many nested executors have we started? This can happen for SQL * UDF calls. The outer query starts an executor, then postgres opens @@ -343,21 +347,60 @@ CitusCustomScanStateWalker(PlanState *planState, List **citusCustomScanStates) /* - * ReturnTupleFromTuplestore reads the next tuple from the tuple store of the - * given Citus scan node and returns it. It returns null if all tuples are read - * from the tuple store. + * FetchNextScanTuple loads the next tuple into the scan slot. + * Returns true if a tuple was loaded, false if exhausted. + * + * When a merge adapter is active, it streams from the adapter. + * Otherwise, it reads from the tuplestore in the given direction. 
*/ -TupleTableSlot * -ReturnTupleFromTuplestore(CitusScanState *scanState) +static inline bool +FetchNextScanTuple(CitusScanState *scanState, bool forward, TupleTableSlot *slot) { - Tuplestorestate *tupleStore = scanState->tuplestorestate; - bool forwardScanDirection = true; + if (scanState->mergeAdapter != NULL) + { + /* + * The streaming merge adapter is forward-only. + * + * Citus replaces the entire plan tree after standard_planner() + * returns, so PostgreSQL's cursor-time materialize_finished_plan() + * check does not see the Citus CustomScan. That means SCROLL + * cursors can reach here with a backward scan request even though + * the adapter cannot satisfy it. Report a user-facing error + * rather than crashing. + */ + if (!forward) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("streaming sorted merge does not support " + "backward scan"), + errhint("Use SET citus.enable_streaming_sorted_merge " + "TO off to allow backward scan."))); + } + return SortedMergeAdapterNext(scanState->mergeAdapter, slot); + } + Tuplestorestate *tupleStore = scanState->tuplestorestate; if (tupleStore == NULL) { - return NULL; + ExecClearTuple(slot); + return false; } + return tuplestore_gettupleslot(tupleStore, forward, false, slot); +} + + +/* + * ReturnTupleFromTuplestore reads the next tuple from the tuple store (or + * streaming merge adapter) of the given Citus scan node and returns it. + * It returns null if all tuples are read. + */ +TupleTableSlot * +ReturnTupleFromTuplestore(CitusScanState *scanState) +{ + bool forwardScanDirection = true; + EState *executorState = ScanStateGetExecutorState(scanState); ScanDirection scanDirection = executorState->es_direction; Assert(ScanDirectionIsValid(scanDirection)); @@ -373,9 +416,9 @@ ReturnTupleFromTuplestore(CitusScanState *scanState) if (!qual && !projInfo) { - /* no quals, nor projections return directly from the tuple store. */ + /* no quals, nor projections return directly from the tuple source. 
*/ TupleTableSlot *slot = scanState->customScanState.ss.ss_ScanTupleSlot; - tuplestore_gettupleslot(tupleStore, forwardScanDirection, false, slot); + FetchNextScanTuple(scanState, forwardScanDirection, slot); return slot; } @@ -394,12 +437,10 @@ ReturnTupleFromTuplestore(CitusScanState *scanState) ResetExprContext(econtext); TupleTableSlot *slot = scanState->customScanState.ss.ss_ScanTupleSlot; - tuplestore_gettupleslot(tupleStore, forwardScanDirection, false, slot); - - if (TupIsNull(slot)) + if (!FetchNextScanTuple(scanState, forwardScanDirection, slot)) { /* - * When the tuple is null we have reached the end of the tuplestore. We will + * When the tuple is null we have reached the end of the source. We will * return a null tuple, however, depending on the existence of a projection we * need to either return the scan tuple or the projected tuple. */ diff --git a/src/backend/distributed/executor/sorted_merge.c b/src/backend/distributed/executor/sorted_merge.c index f3514dfdb38..397b1cc9620 100644 --- a/src/backend/distributed/executor/sorted_merge.c +++ b/src/backend/distributed/executor/sorted_merge.c @@ -66,6 +66,32 @@ typedef struct MergeContext } MergeContext; +/* + * SortedMergeAdapter streams tuples from K pre-sorted per-task stores + * via a binary heap, returning one globally-sorted tuple per call. + * + * Used both as the streaming replacement for MergePerTaskStoresIntoFinalStore() + * and internally by that function itself (to avoid duplicating the merge logic). + * + * Modeled after PostgreSQL's MergeAppend (nodeMergeAppend.c), which uses + * the same binary-heap-over-sorted-inputs pattern. 
+ */ +struct SortedMergeAdapter +{ + Tuplestorestate **perTaskStores; /* K per-task stores (not owned in eager mode) */ + int nstores; + bool ownsStores; /* if true, FreeSortedMergeAdapter frees stores */ + + binaryheap *heap; + + MergeContext mergeCtx; /* embedded — passed to heap as bh_arg */ + + TupleDesc tupleDesc; + bool exhausted; + bool initialized; +}; + + /* forward declarations */ static void PerTaskDispatchPutTuple(TupleDestination *self, Task *task, int placementIndex, int queryNumber, @@ -213,7 +239,9 @@ PerTaskDispatchTupleDescForQuery(TupleDestination *self, int queryNumber) * Each per-task store must contain tuples sorted by the given merge keys. * The output tuplestore will contain all tuples in globally sorted order. * - * Uses PostgreSQL's public binaryheap and SortSupport APIs. + * Implemented by creating a temporary SortedMergeAdapter, draining it into + * the final store, and freeing the adapter. The per-task stores are NOT + * freed by this function — the caller is responsible for that. */ void MergePerTaskStoresIntoFinalStore(Tuplestorestate *finalStore, @@ -228,7 +256,88 @@ MergePerTaskStoresIntoFinalStore(Tuplestorestate *finalStore, return; } - /* allocate one reusable slot per task store */ + SortedMergeAdapter *adapter = CreateSortedMergeAdapter(perTaskStores, + nstores, mergeKeys, + nkeys, tupleDesc, + false); + + TupleTableSlot *slot = MakeSingleTupleTableSlot(tupleDesc, + &TTSOpsMinimalTuple); + + while (SortedMergeAdapterNext(adapter, slot)) + { + tuplestore_puttupleslot(finalStore, slot); + } + + ExecDropSingleTupleTableSlot(slot); + FreeSortedMergeAdapter(adapter); +} + + +/* + * MergeHeapComparator compares tuples from two task stores by the merge keys. + * Returns negative if a < b, positive if a > b, zero if equal. + * The binary heap is a max-heap, so we negate to get min-heap behavior. + * + * This is modeled after heap_compare_slots() in nodeMergeAppend.c. 
+ */ +static int +MergeHeapComparator(Datum a, Datum b, void *arg) +{ + MergeContext *ctx = (MergeContext *) arg; + int slot1 = DatumGetInt32(a); + int slot2 = DatumGetInt32(b); + TupleTableSlot *s1 = ctx->slots[slot1]; + TupleTableSlot *s2 = ctx->slots[slot2]; + + for (int i = 0; i < ctx->nkeys; i++) + { + SortSupport sortKey = &ctx->sortKeys[i]; + AttrNumber attno = sortKey->ssup_attno; + bool isNull1, isNull2; + + Datum datum1 = slot_getattr(s1, attno, &isNull1); + Datum datum2 = slot_getattr(s2, attno, &isNull2); + + int compare = ApplySortComparator(datum1, isNull1, + datum2, isNull2, + sortKey); + if (compare != 0) + { + /* binaryheap is a max-heap, negate for min-heap behavior */ + return -compare; + } + } + + return 0; +} + + +/* + * CreateSortedMergeAdapter builds a streaming merge adapter over K per-task + * stores. When ownsStores is true, FreeSortedMergeAdapter() will call + * tuplestore_end() on each per-task store; when false, the caller retains + * ownership and must free them separately. + * + * All memory is allocated in CurrentMemoryContext. The caller must ensure + * this context outlives the adapter (the AdaptiveExecutor local context + * already satisfies this — see adaptive_executor.c). 
+ */ +SortedMergeAdapter * +CreateSortedMergeAdapter(Tuplestorestate **perTaskStores, + int nstores, + SortedMergeKey *mergeKeys, + int nkeys, + TupleDesc tupleDesc, + bool ownsStores) +{ + SortedMergeAdapter *adapter = palloc0(sizeof(SortedMergeAdapter)); + adapter->perTaskStores = perTaskStores; + adapter->nstores = nstores; + adapter->ownsStores = ownsStores; + adapter->tupleDesc = tupleDesc; + + /* one comparison slot per store — owned via mergeCtx.slots */ TupleTableSlot **slots = palloc(nstores * sizeof(TupleTableSlot *)); for (int i = 0; i < nstores; i++) { @@ -247,87 +356,151 @@ MergePerTaskStoresIntoFinalStore(Tuplestorestate *finalStore, PrepareSortSupportFromOrderingOp(mergeKeys[i].sortop, sk); } - /* set up merge context for heap comparisons */ - MergeContext ctx; - ctx.slots = slots; - ctx.sortKeys = sortKeys; - ctx.nkeys = nkeys; + /* set up embedded merge context for heap comparisons */ + adapter->mergeCtx.slots = slots; + adapter->mergeCtx.sortKeys = sortKeys; + adapter->mergeCtx.nkeys = nkeys; - binaryheap *heap = binaryheap_allocate(nstores, MergeHeapComparator, &ctx); + /* allocate heap with embedded context as comparator arg */ + adapter->heap = binaryheap_allocate(nstores, MergeHeapComparator, + &adapter->mergeCtx); - /* seed the heap with the first tuple from each non-empty store */ - for (int i = 0; i < nstores; i++) + return adapter; +} + + +/* + * SortedMergeAdapterNext returns the next globally-sorted tuple from the + * adapter by copying it into the provided scanSlot. Returns true if a tuple + * was returned, false if all stores are exhausted. + * + * The heap uses per-store comparison slots (mergeCtx.slots). After + * identifying the winner, we ExecCopySlot from the winner's comparison + * slot into the scan slot. This is a MinimalTuple copy, comparable in + * cost to the tuplestore_puttupleslot write in the eager merge path. 
+ * + * On each call after the first, we advance the previous winner's store + * and update the heap before selecting the new winner. This matches the + * MergeAppend pattern in nodeMergeAppend.c. + * + * Possible perf optimizations to explore in the future: + * Avoid copying the winning tuple into the scan slot by returning a pointer to the winner's slot instead. + * This would require changes to the caller to not modify the returned slot and to understand that it's owned by the adapter until the next call. + * It would save a copy per tuple at the cost of a more complex API and potential lifetime management issues. + */ +bool +SortedMergeAdapterNext(SortedMergeAdapter *adapter, TupleTableSlot *scanSlot) +{ + if (adapter->exhausted) + { + ExecClearTuple(scanSlot); + return false; + } + + if (!adapter->initialized) { - tuplestore_rescan(perTaskStores[i]); - if (tuplestore_gettupleslot(perTaskStores[i], true, false, slots[i])) + /* first call: seed the heap with the first tuple from each store */ + for (int i = 0; i < adapter->nstores; i++) { - binaryheap_add_unordered(heap, Int32GetDatum(i)); + tuplestore_rescan(adapter->perTaskStores[i]); + if (tuplestore_gettupleslot(adapter->perTaskStores[i], true, false, + adapter->mergeCtx.slots[i])) + { + binaryheap_add_unordered(adapter->heap, Int32GetDatum(i)); + } } + binaryheap_build(adapter->heap); + adapter->initialized = true; } - binaryheap_build(heap); - - /* merge loop: extract min, write to final store, advance winner */ - while (!binaryheap_empty(heap)) + else { - int winner = DatumGetInt32(binaryheap_first(heap)); - tuplestore_puttupleslot(finalStore, slots[winner]); - - if (tuplestore_gettupleslot(perTaskStores[winner], true, false, - slots[winner])) + /* advance the previous winner and update the heap */ + int prevWinner = DatumGetInt32(binaryheap_first(adapter->heap)); + if (tuplestore_gettupleslot(adapter->perTaskStores[prevWinner], true, + false, adapter->mergeCtx.slots[prevWinner])) { - 
binaryheap_replace_first(heap, Int32GetDatum(winner)); + binaryheap_replace_first(adapter->heap, Int32GetDatum(prevWinner)); } else { - (void) binaryheap_remove_first(heap); + (void) binaryheap_remove_first(adapter->heap); } } - /* free merge-local resources */ - binaryheap_free(heap); - for (int i = 0; i < nstores; i++) + if (binaryheap_empty(adapter->heap)) { - ExecDropSingleTupleTableSlot(slots[i]); + adapter->exhausted = true; + ExecClearTuple(scanSlot); + return false; } - pfree(slots); - pfree(sortKeys); + + int winner = DatumGetInt32(binaryheap_first(adapter->heap)); + ExecCopySlot(scanSlot, adapter->mergeCtx.slots[winner]); + + return true; } /* - * MergeHeapComparator compares tuples from two task stores by the merge keys. - * Returns negative if a < b, positive if a > b, zero if equal. - * The binary heap is a max-heap, so we negate to get min-heap behavior. + * SortedMergeAdapterRescan resets the adapter to re-read from the beginning. + * Called from CitusReScan() for cursor WITH HOLD patterns. * - * This is modeled after heap_compare_slots() in nodeMergeAppend.c. + * Cost is O(K log K) to rebuild the heap, which is negligible for typical + * shard counts (4-64). Both binaryheap_reset() and tuplestore_rescan() + * are proven APIs used by PostgreSQL's ExecReScanMergeAppend. 
*/ -static int -MergeHeapComparator(Datum a, Datum b, void *arg) +void +SortedMergeAdapterRescan(SortedMergeAdapter *adapter) { - MergeContext *ctx = (MergeContext *) arg; - int slot1 = DatumGetInt32(a); - int slot2 = DatumGetInt32(b); - TupleTableSlot *s1 = ctx->slots[slot1]; - TupleTableSlot *s2 = ctx->slots[slot2]; + binaryheap_reset(adapter->heap); - for (int i = 0; i < ctx->nkeys; i++) + for (int i = 0; i < adapter->nstores; i++) { - SortSupport sortKey = &ctx->sortKeys[i]; - AttrNumber attno = sortKey->ssup_attno; - bool isNull1, isNull2; + tuplestore_rescan(adapter->perTaskStores[i]); + if (tuplestore_gettupleslot(adapter->perTaskStores[i], true, false, + adapter->mergeCtx.slots[i])) + { + binaryheap_add_unordered(adapter->heap, Int32GetDatum(i)); + } + } + binaryheap_build(adapter->heap); - Datum datum1 = slot_getattr(s1, attno, &isNull1); - Datum datum2 = slot_getattr(s2, attno, &isNull2); + adapter->exhausted = false; + adapter->initialized = true; +} - int compare = ApplySortComparator(datum1, isNull1, - datum2, isNull2, - sortKey); - if (compare != 0) + +/* + * FreeSortedMergeAdapter releases all adapter resources including + * per-task stores, comparison slots, sort keys, and the heap. + * Called from CitusEndScan() for deterministic cleanup. 
+ */ +void +FreeSortedMergeAdapter(SortedMergeAdapter *adapter) +{ + if (adapter == NULL) + { + return; + } + + for (int i = 0; i < adapter->nstores; i++) + { + if (adapter->ownsStores) { - /* binaryheap is a max-heap, negate for min-heap behavior */ - return -compare; + tuplestore_end(adapter->perTaskStores[i]); } + ExecDropSingleTupleTableSlot(adapter->mergeCtx.slots[i]); } - return 0; + binaryheap_free(adapter->heap); + pfree(adapter->mergeCtx.slots); + pfree(adapter->mergeCtx.sortKeys); + + if (adapter->ownsStores) + { + pfree(adapter->perTaskStores); + } + + /* mergeCtx is embedded in adapter, freed with the adapter itself */ + pfree(adapter); } diff --git a/src/backend/distributed/planner/distributed_planner.c b/src/backend/distributed/planner/distributed_planner.c index d80216b3682..0db8cf338cf 100644 --- a/src/backend/distributed/planner/distributed_planner.c +++ b/src/backend/distributed/planner/distributed_planner.c @@ -855,6 +855,22 @@ CreateDistributedPlannedStmt(DistributedPlanningContext *planContext) /* create final plan by combining local plan with distributed plan */ resultPlan = FinalizePlan(planContext->plan, distributedPlan); + /* + * When the streaming sorted merge adapter is active, the CustomScan + * does not support backward scan. If the query is a SCROLL cursor, + * insert a Material node above the plan tree so backward fetches work. + * + * Normally standard_planner() handles this (planner.c:447-451), but + * Citus replaces the plan tree after standard_planner returns via + * FinalizePlan(), losing any Material node it inserted. 
+ */ + if ((planContext->cursorOptions & CURSOR_OPT_SCROLL) && + distributedPlan->useSortedMerge && EnableStreamingSortedMerge && + !ExecSupportsBackwardScan(resultPlan->planTree)) + { + resultPlan->planTree = materialize_finished_plan(resultPlan->planTree); + } + /* * As explained above, force planning costs to be unrealistically high if * query planning failed (possibly) due to prepared statement parameters or @@ -1499,7 +1515,18 @@ FinalizePlan(PlannedStmt *localPlan, DistributedPlan *distributedPlan) customScan->custom_private = list_make1(distributedPlanData); /* necessary to avoid extra Result node in PG15 */ - customScan->flags = CUSTOMPATH_SUPPORT_BACKWARD_SCAN | CUSTOMPATH_SUPPORT_PROJECTION; + int customFlags = CUSTOMPATH_SUPPORT_PROJECTION; + if (!(distributedPlan->useSortedMerge && EnableStreamingSortedMerge)) + { + /* + * Advertise backward-scan support unless both sorted merge and + * the streaming adapter are active. When streaming, the adapter + * is forward-only; PostgreSQL's planner will insert a Material + * node above us for scrollable cursors. + */ + customFlags |= CUSTOMPATH_SUPPORT_BACKWARD_SCAN; + } + customScan->flags = customFlags; /* * Fast path queries cannot have any subplans by definition, so skip diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 48842050c3b..039e8ee5c51 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -1641,6 +1641,21 @@ RegisterCitusConfigVariables(void) GUC_SUPERUSER_ONLY, NULL, NULL, NULL); + DefineCustomBoolVariable( + "citus.enable_streaming_sorted_merge", + gettext_noop("Use streaming adapter instead of eager merge for sorted merge."), + gettext_noop("When enabled alongside citus.enable_sorted_merge, the coordinator " + "streams merged tuples directly from per-task stores via a binary " + "heap instead of eagerly copying all tuples into a final tuplestore. 
" + "This reduces memory usage and improves time-to-first-tuple, " + "especially for LIMIT queries. Requires citus.enable_sorted_merge " + "to also be enabled. This is an experimental feature."), + &EnableStreamingSortedMerge, + false, + PGC_USERSET, + GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + DefineCustomBoolVariable( "citus.enable_unique_job_ids", gettext_noop("Enables unique job IDs by prepending the local process ID and " diff --git a/src/include/distributed/citus_custom_scan.h b/src/include/distributed/citus_custom_scan.h index db1f0ce1f2a..dbe71df0856 100644 --- a/src/include/distributed/citus_custom_scan.h +++ b/src/include/distributed/citus_custom_scan.h @@ -28,6 +28,9 @@ typedef struct CitusScanState MultiExecutorType executorType; /* distributed executor type */ bool finishedRemoteScan; /* flag to check if remote scan is finished */ Tuplestorestate *tuplestorestate; /* tuple store to store distributed results */ + + /* streaming sorted merge adapter (NULL when not using sorted merge) */ + struct SortedMergeAdapter *mergeAdapter; } CitusScanState; diff --git a/src/include/distributed/multi_executor.h b/src/include/distributed/multi_executor.h index c18067b5499..da30bd0c838 100644 --- a/src/include/distributed/multi_executor.h +++ b/src/include/distributed/multi_executor.h @@ -71,6 +71,7 @@ extern int MaxAdaptiveExecutorPoolSize; extern int ExecutorSlowStartInterval; extern bool SortReturning; extern bool EnableSortedMerge; +extern bool EnableStreamingSortedMerge; extern int ExecutorLevel; diff --git a/src/include/distributed/sorted_merge.h b/src/include/distributed/sorted_merge.h index eeb3e690d35..d82fd626030 100644 --- a/src/include/distributed/sorted_merge.h +++ b/src/include/distributed/sorted_merge.h @@ -18,6 +18,10 @@ #include "distributed/tuple_destination.h" +/* opaque streaming merge adapter — full definition in sorted_merge.c */ +typedef struct SortedMergeAdapter SortedMergeAdapter; + + extern TupleDestination * 
CreatePerTaskDispatchDest(List *taskList, TupleDesc tupleDesc, TupleDestinationStats *sharedStats, @@ -31,4 +35,15 @@ extern void MergePerTaskStoresIntoFinalStore(Tuplestorestate *finalStore, int nkeys, TupleDesc tupleDesc); +extern SortedMergeAdapter * CreateSortedMergeAdapter(Tuplestorestate **perTaskStores, + int nstores, + SortedMergeKey *mergeKeys, + int nkeys, + TupleDesc tupleDesc, + bool ownsStores); +extern bool SortedMergeAdapterNext(SortedMergeAdapter *adapter, + TupleTableSlot *scanSlot); +extern void SortedMergeAdapterRescan(SortedMergeAdapter *adapter); +extern void FreeSortedMergeAdapter(SortedMergeAdapter *adapter); + #endif /* SORTED_MERGE_H */ diff --git a/src/test/regress/expected/multi_orderby_pushdown_streaming.out b/src/test/regress/expected/multi_orderby_pushdown_streaming.out new file mode 100644 index 00000000000..263e93f08f5 --- /dev/null +++ b/src/test/regress/expected/multi_orderby_pushdown_streaming.out @@ -0,0 +1,5957 @@ +-- +-- MULTI_ORDERBY_PUSHDOWN_STREAMING +-- +-- Runs the sorted merge test suite (multi_orderby_pushdown.sql) twice: +-- first with the default eager-merge path, then with the streaming +-- adapter enabled via citus.enable_streaming_sorted_merge. Both runs +-- share the same setup tables and must produce identical results +-- (except for the G3 backward-scan test, where the streaming adapter's +-- forward-only cursor correctly errors on FETCH BACKWARD). +-- +\i sql/setup_multi_orderby_pushdown.sql +-- +-- SETUP_MULTI_ORDERBY_PUSHDOWN +-- +-- Creates the test tables and data used by multi_orderby_pushdown.sql +-- and its variants (e.g., multi_orderby_pushdown_streaming.sql). +-- This file is meant to be included via \i from test files that need +-- these tables. 
+-- +SET citus.next_shard_id TO 960000; +-- ================================================================= +-- Setup: create test tables +-- ================================================================= +CREATE TABLE sorted_merge_test ( + id int, + val text, + num numeric, + ts timestamptz DEFAULT now() +); +SELECT create_distributed_table('sorted_merge_test', 'id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- Insert 100 rows + NULLs + duplicates +INSERT INTO sorted_merge_test (id, val, num) +SELECT i, 'val_' || i, (i * 1.5)::numeric +FROM generate_series(1, 100) i; +INSERT INTO sorted_merge_test (id, val, num) VALUES (101, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (102, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (200, 'dup_a', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (201, 'dup_b', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (202, 'dup_c', 10.5); +-- Second table for join tests +CREATE TABLE sorted_merge_events ( + id int, + event_type text, + event_val int +); +SELECT create_distributed_table('sorted_merge_events', 'id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +INSERT INTO sorted_merge_events +SELECT i % 50 + 1, CASE WHEN i % 3 = 0 THEN 'click' WHEN i % 3 = 1 THEN 'view' ELSE 'buy' END, i +FROM generate_series(1, 200) i; +-- Run 1: eager merge (default) +\i sql/multi_orderby_pushdown.sql +-- +-- MULTI_SORTED_MERGE +-- +-- Tests for the citus.enable_sorted_merge GUC and the sorted merge +-- planner eligibility logic. Verifies that enabling the GUC does not +-- introduce regressions for any query pattern. +-- +-- MX verification: this test has been verified to pass with zero diffs +-- under check-base-mx (MX mode), confirming sorted merge works correctly +-- when any node in the cluster acts as coordinator. 
+-- +-- ================================================================= +-- 1. GUC basics +-- ================================================================= +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge +--------------------------------------------------------------------- + off +(1 row) + +SET citus.enable_sorted_merge TO on; +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge +--------------------------------------------------------------------- + on +(1 row) + +SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Category A: Eligibility — sort IS pushed to workers +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- A1: ORDER BY distribution column +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) + +-- A2: ORDER BY DESC +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC'); + explain_filter +--------------------------------------------------------------------- + Custom 
Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id DESC + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(16 rows) + +-- A3: ORDER BY DESC NULLS LAST +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num DESC NULLS LAST + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, num + Sort Key: sorted_merge_test.num DESC NULLS LAST + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num +(16 rows) + +-- A4: ORDER BY non-distribution column +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY val'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, 
remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.val + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) + +-- A5: Multi-column ORDER BY +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id, sorted_merge_test.val + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) + +-- A6: Mixed directions +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: N + Merge Method: sorted 
merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, num DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, num + Sort Key: sorted_merge_test.id, sorted_merge_test.num DESC + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num +(16 rows) + +-- A7: GROUP BY dist_col ORDER BY dist_col +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(20 rows) + +-- A8: WHERE clause + ORDER BY +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan 
(Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (num OPERATOR(pg_catalog.>) 'N'::numeric) ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val + Filter: (sorted_merge_test.num > 'N'::numeric) + Rows Removed by Filter: N +(18 rows) + +-- A9: Expression in ORDER BY (non-aggregate) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num, (id OPERATOR(pg_catalog.+) N) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) N) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, num, ((id + N)) + Sort Key: ((sorted_merge_test.id + N)) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num, (id + N) +(16 rows) + +-- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING 
OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(20 rows) + +-- ================================================================= +-- Category B: Ineligibility — sort NOT pushed for merge +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- B1: ORDER BY count(*) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Sort Key: remote_scan.count + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N 
dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) + +-- B2: ORDER BY avg(col) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.avg + Sort Key: remote_scan.avg + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.avg + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, avg(num) AS avg FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, avg(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) + +-- B3: GROUP BY non-dist col, ORDER BY non-dist col +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Key: remote_scan.val + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint) + Group Key: 
remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.val, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(23 rows) + +-- B4: GROUP BY non-dist col, ORDER BY aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint) + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.val, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, 
val, num, ts +(23 rows) + +-- ================================================================= +-- Category C: Correctness — results match GUC off vs on +-- ================================================================= +-- C1: Simple ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- C2: ORDER BY DESC +SET citus.enable_sorted_merge TO off; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id +--------------------------------------------------------------------- + 202 + 201 + 200 + 102 + 101 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id +--------------------------------------------------------------------- + 202 + 201 + 200 + 102 + 101 +(5 rows) + +-- C3: Multi-column ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- C4: ORDER BY non-distribution column +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT 
NULL ORDER BY val LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- C5: GROUP BY dist_col ORDER BY dist_col +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- C6: Mixed directions +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- C7: WHERE + ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE num > 100 
ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +-- C8: Aggregates in SELECT, ORDER BY on dist_col (GROUP BY dist_col) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +-- ================================================================= +-- Category D: Complex queries — regression guards +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- D1: Subquery in FROM with ORDER BY +SELECT * FROM ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) sub ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D2: CTE with ORDER BY +WITH top5 AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) +SELECT * FROM top5 ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D3: Co-located JOIN + ORDER BY +SELECT t.id, t.val, e.event_type +FROM 
sorted_merge_test t +JOIN sorted_merge_events e ON t.id = e.id +WHERE t.id <= 5 +ORDER BY t.id, e.event_type +LIMIT 10; + id | val | event_type +--------------------------------------------------------------------- + 1 | val_1 | buy + 1 | val_1 | buy + 1 | val_1 | click + 1 | val_1 | view + 2 | val_2 | buy + 2 | val_2 | click + 2 | val_2 | view + 2 | val_2 | view + 3 | val_3 | buy + 3 | val_3 | buy +(10 rows) + +-- D4: UNION ALL + ORDER BY +SELECT id, val FROM sorted_merge_test WHERE id <= 3 +UNION ALL +SELECT id, val FROM sorted_merge_test WHERE id BETWEEN 98 AND 100 +ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 98 | val_98 + 99 | val_99 + 100 | val_100 +(6 rows) + +-- D5: DISTINCT + ORDER BY +SELECT DISTINCT id FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +-- D6: DISTINCT ON + ORDER BY +SELECT DISTINCT ON (id) id, val, num +FROM sorted_merge_test +WHERE id <= 5 +ORDER BY id, num DESC; + id | val | num +--------------------------------------------------------------------- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) + +-- D7: EXISTS subquery + ORDER BY +SELECT id, val FROM sorted_merge_test t +WHERE EXISTS (SELECT 1 FROM sorted_merge_events e WHERE e.id = t.id) +ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D8: IN subquery + ORDER BY +SELECT id, val FROM sorted_merge_test +WHERE id IN (SELECT id FROM sorted_merge_events WHERE event_type = 'click') +ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D9: Multiple aggregates, GROUP BY 
dist_col, ORDER BY dist_col +SELECT id, count(*), sum(num), avg(num), min(val), max(val) +FROM sorted_merge_test +GROUP BY id +ORDER BY id +LIMIT 5; + id | count | sum | avg | min | max +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 | val_1 | val_1 + 2 | 1 | 3.0 | 3.0000000000000000 | val_2 | val_2 + 3 | 1 | 4.5 | 4.5000000000000000 | val_3 | val_3 + 4 | 1 | 6.0 | 6.0000000000000000 | val_4 | val_4 + 5 | 1 | 7.5 | 7.5000000000000000 | val_5 | val_5 +(5 rows) + +-- D10: CASE expression in SELECT + ORDER BY +SELECT id, + CASE WHEN num > 75 THEN 'high' WHEN num > 25 THEN 'mid' ELSE 'low' END as bucket +FROM sorted_merge_test +WHERE num IS NOT NULL +ORDER BY id +LIMIT 10; + id | bucket +--------------------------------------------------------------------- + 1 | low + 2 | low + 3 | low + 4 | low + 5 | low + 6 | low + 7 | low + 8 | low + 9 | low + 10 | low +(10 rows) + +-- D11: NULL values ordering +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS FIRST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS LAST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS FIRST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST, id DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 +(5 rows) + +-- D12: Large OFFSET +SELECT id FROM sorted_merge_test ORDER BY id OFFSET 100 LIMIT 5; + id 
+--------------------------------------------------------------------- + 101 + 102 + 200 + 201 + 202 +(5 rows) + +-- D13: ORDER BY ordinal position +SELECT id, val FROM sorted_merge_test ORDER BY 2, 1 LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- ================================================================= +-- Category E: Edge cases +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- E1: Empty result set +SELECT id FROM sorted_merge_test WHERE id < 0 ORDER BY id; + id +--------------------------------------------------------------------- +(0 rows) + +-- E2: Single row (may go through router planner) +SELECT id, val FROM sorted_merge_test WHERE id = 42 ORDER BY id; + id | val +--------------------------------------------------------------------- + 42 | val_42 +(1 row) + +-- E3: All rows with same sort value +SELECT id, num FROM sorted_merge_test WHERE num = 10.5 ORDER BY num, id; + id | num +--------------------------------------------------------------------- + 7 | 10.5 + 200 | 10.5 + 201 | 10.5 + 202 | 10.5 +(4 rows) + +-- E4: Wide sort key (3 columns) +SELECT id, val, num FROM sorted_merge_test +WHERE id <= 5 +ORDER BY num, val, id +LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) + +-- E5: Zero-task defensive path +-- CreatePerTaskDispatchDest handles taskCount=0 gracefully (returns a no-op +-- destination). This cannot be triggered via normal SQL because distributed +-- tables always have at least one shard. The closest we can test is an +-- empty-result query through the sorted merge path to verify no crash. 
+SELECT id FROM sorted_merge_test WHERE false ORDER BY id; + id +--------------------------------------------------------------------- +(0 rows) + +-- ================================================================= +-- Category F: Existing LIMIT pushdown stability +-- ================================================================= +-- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Sort (actual rows=N loops=N) + Output: remote_scan.id + Sort Key: remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(23 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task 
Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(20 rows) + +-- F2: GROUP BY dist_col + ORDER BY + LIMIT +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Sort Key: remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N 
loops=N) + Output: id, val, num, ts +(27 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(24 rows) + +-- F3: ORDER BY aggregate + LIMIT (not eligible for merge) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- ================================================================= +-- 
Category G: Phase 4 — Sort elision and advanced scenarios +-- ================================================================= +-- G1: Sort elision verification — coordinator Sort node absent +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Sort Key: remote_scan.id + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(15 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 
sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) + +-- G2a: PREPARE with merge ON, EXECUTE after turning OFF +-- Plan-time decision is baked in — cached plan must still merge correctly. +-- Execute 6+ times to trigger PostgreSQL's generic plan caching, then +-- verify the plan shape is preserved after toggling the GUC. +SET citus.enable_sorted_merge TO on; +PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Verify plan shape after caching — no Sort above CustomScan +EXPLAIN (COSTS OFF) 
EXECUTE merge_on_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Merge Method: sorted merge + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(11 rows) + +SET citus.enable_sorted_merge TO off; +-- Cached plan retains the sorted merge decision from planning time +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Merge Method: sorted merge + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(11 rows) + +DEALLOCATE merge_on_stmt; +-- G2b: PREPARE with merge OFF, EXECUTE after turning ON +-- Cached plan has Sort node — must still return sorted results. 
+SET citus.enable_sorted_merge TO off; +PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Verify plan shape after caching — Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan 
on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +SET citus.enable_sorted_merge TO on; +-- Cached plan retains the non-merge decision from planning time +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +DEALLOCATE merge_off_stmt; +-- G3: Cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_cursor CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 1 + 2 + 3 +(3 rows) + +FETCH BACKWARD 1 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 2 +(1 row) + +FETCH 2 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 3 + 4 +(2 rows) + +CLOSE sorted_cursor; +COMMIT; +-- G3b: SCROLL cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_scroll_cursor SCROLL CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 1 + 2 + 3 +(3 rows) + +FETCH BACKWARD 1 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 2 +(1 row) + +FETCH 2 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 3 + 4 
+(2 rows) + +CLOSE sorted_scroll_cursor; +COMMIT; +-- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(20 rows) + +-- G5: ORDER BY aggregate + LIMIT — crash regression test +-- Previously caused SIGSEGV when sorted merge was enabled because +-- aggregate ORDER BY was erroneously tagged as merge-eligible. 
+SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 3; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 +(3 rows) + +-- G6: Small work_mem with many tasks (32 shards) +SET citus.enable_sorted_merge TO on; +SET work_mem TO '64kB'; +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +RESET work_mem; +-- G7: max_intermediate_result_size with CTE subplan +SET citus.enable_sorted_merge TO on; +SET citus.max_intermediate_result_size TO '4kB'; +WITH cte AS (SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 50) +SELECT * FROM cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; +-- ================================================================= +-- Category H: Subplan + Sorted Merge interactions +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1: CTE subplan with simple ORDER BY — eligible for sorted merge +-- The CTE becomes a subplan; its DistributedPlan may have useSortedMerge=true +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- H2: Multiple CTEs — one eligible (ORDER BY col), one ineligible (ORDER BY agg) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, 
i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + id | val | cnt +--------------------------------------------------------------------- + 1 | val_1 | 1 + 2 | val_2 | 1 + 3 | val_3 | 1 + 4 | val_4 | 1 + 5 | val_5 | 1 + 6 | val_6 | 1 + 7 | val_7 | 1 + 8 | val_8 | 1 + 9 | val_9 | 1 + 10 | val_10 | 1 + 11 | val_11 | 1 + 12 | val_12 | 1 + 13 | val_13 | 1 + 14 | val_14 | 1 + 15 | val_15 | 1 +(15 rows) + +-- H3: CTE subplan feeding outer ORDER BY — both levels may merge independently +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- H4: Subquery in WHERE with ORDER BY + LIMIT — becomes subplan with merge +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 +(3 rows) + +-- H5: CTE subplan with max_intermediate_result_size enforcement +-- Tests that EnsureIntermediateSizeLimitNotExceeded works through per-task dispatch +SET citus.max_intermediate_result_size TO '4kB'; +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; +-- H6: Cross-join subplan with non-aggregate ORDER BY (crash regression variant) +-- Similar pattern to subquery_complex_target_list but without aggregate ORDER BY +SELECT foo.id, bar.id as bar_id +FROM + 
(SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + id | bar_id +--------------------------------------------------------------------- + 1 | 1 + 1 | 1 + 1 | 1 + 2 | 1 + 2 | 1 +(5 rows) + +-- H7: CTE correctness comparison — GUC off vs on must produce identical results +SET citus.enable_sorted_merge TO off; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +-- ================================================================= +-- Category H EXPLAIN: Query plans for subplan + sorted merge +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One 
of N + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT ordered_cte.id AS worker_column_1, ordered_cte.val AS worker_column_2 FROM (SELECT sorted_merge_test.id, sorted_merge_test.val FROM public.sorted_merge_test_960000 sorted_merge_test ORDER BY sorted_merge_test.id) ordered_cte) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val +(20 rows) + +-- H2 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.cnt + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test 
WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val + -> Distributed Subplan XXX_2 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.cnt + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.cnt + Sort Key: remote_scan.cnt DESC, remote_scan.id + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.cnt + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS cnt FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (count(*)) DESC, id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)) + Sort Key: (count(*)) DESC, sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT e.id, e.val, i.cnt FROM ((SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) e JOIN (SELECT intermediate_result.id, intermediate_result.cnt FROM read_intermediate_result('XXX_2'::text, 
'binary'::citus_copy_format) intermediate_result(id integer, cnt bigint)) i ON ((e.id OPERATOR(pg_catalog.=) i.id))) ORDER BY e.id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Merge Join (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result_1.cnt + Merge Cond: (intermediate_result.id = intermediate_result_1.id) + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Sort Key: intermediate_result.id + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Sort (actual rows=N loops=N) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Sort Key: intermediate_result_1.id + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(78 rows) + +-- H3 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Send to N nodes + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> 
Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT t.id AS worker_column_1, t.val AS worker_column_2 FROM (public.sorted_merge_test_960000 t JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) top_ids ON ((t.id OPERATOR(pg_catalog.=) top_ids.id)))) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: t.id, t.val + -> Merge Join (actual rows=N loops=N) + Output: t.id, t.val + Merge Cond: (intermediate_result.id = t.id) + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id + Sort Key: intermediate_result.id + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Sort (actual rows=N loops=N) + Output: t.id, t.val + Sort Key: t.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 t (actual 
rows=N loops=N) + Output: t.id, t.val +(53 rows) + +-- H4 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Send to N nodes + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) + Output: id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT sorted_merge_test.id AS worker_column_1, sorted_merge_test.val AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (sorted_merge_test.id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)))) worker_subquery ORDER BY 
worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Hash Semi Join (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + Hash Cond: (sorted_merge_test.id = intermediate_result.id) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val, sorted_merge_test.num, sorted_merge_test.ts + -> Hash (actual rows=N loops=N) + Output: intermediate_result.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(52 rows) + +-- H5 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data 
received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT id, val FROM (SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) small_cte ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(41 rows) + +-- H6 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.bar_id + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus 
Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id + -> Distributed Subplan XXX_2 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) + Output: id + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT foo.id, bar.id AS bar_id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) foo, (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) bar ORDER BY foo.id, bar.id LIMIT N + Tuple data received from node: N 
bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result_1.id + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result_1.id + Sort Key: intermediate_result.id, intermediate_result_1.id + Sort Method: quicksort Memory: NkB + -> Nested Loop (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result_1.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) + Output: intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(69 rows) + +-- H7 EXPLAIN — GUC off vs on +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, num FROM 
public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, num + -> Sort (actual rows=N loops=N) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (N)::numeric) ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > 'N'::numeric) + Rows Removed by Filter: N +(45 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + explain_filter 
+--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, num + -> Sort (actual rows=N loops=N) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (N)::numeric) ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result 
intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > 'N'::numeric) + Rows Removed by Filter: N +(43 rows) + +-- ================================================================= +-- Category I: Distributed Transactions +-- ================================================================= +-- Verify sorted merge correctness within multi-statement transactions +-- where data is modified before the sorted-merge SELECT. +SET citus.enable_sorted_merge TO on; +-- I1: INSERT then SELECT within a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (900, 'txn_insert', 900.0); +SELECT id, val FROM sorted_merge_test WHERE id >= 900 ORDER BY id; + id | val +--------------------------------------------------------------------- + 900 | txn_insert +(1 row) + +ROLLBACK; +-- I2: UPDATE then SELECT within a transaction +BEGIN; +UPDATE sorted_merge_test SET val = 'updated' WHERE id = 1; +SELECT id, val FROM sorted_merge_test WHERE id <= 3 ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | updated + 2 | val_2 + 3 | val_3 +(3 rows) + +ROLLBACK; +-- I3: DELETE then SELECT within a transaction +BEGIN; +DELETE FROM sorted_merge_test WHERE id <= 5; +SELECT id, val FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + id | val +--------------------------------------------------------------------- + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(5 rows) + +ROLLBACK; +-- I4: INSERT + UPDATE + SELECT with multi-column ORDER BY +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (901, 'txn_a', 1.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (902, 'txn_b', 2.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (903, 'txn_c', 3.0); +UPDATE sorted_merge_test SET num = 999.0 WHERE id = 901; +SELECT 
id, val, num FROM sorted_merge_test WHERE id >= 900 ORDER BY num, id; + id | val | num +--------------------------------------------------------------------- + 902 | txn_b | 2.0 + 903 | txn_c | 3.0 + 901 | txn_a | 999.0 +(3 rows) + +ROLLBACK; +-- I5: Compare results with GUC off vs on in a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (910, 'cmp_a', 10.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (911, 'cmp_b', 20.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (912, 'cmp_c', 30.0); +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; + id | val | num +--------------------------------------------------------------------- + 910 | cmp_a | 10.0 + 911 | cmp_b | 20.0 + 912 | cmp_c | 30.0 +(3 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; + id | val | num +--------------------------------------------------------------------- + 910 | cmp_a | 10.0 + 911 | cmp_b | 20.0 + 912 | cmp_c | 30.0 +(3 rows) + +ROLLBACK; +-- I6: DELETE + aggregate in SELECT with ORDER BY +BEGIN; +DELETE FROM sorted_merge_test WHERE id > 100 AND id < 200; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +ROLLBACK; +-- ================================================================= +-- Category J: Coordinator expression evaluation exclusion +-- ================================================================= +-- Verify that queries with ORDER BY on expressions that need coordinator-side +-- evaluation are correctly excluded from sorted merge (or handled correctly). 
+SET citus.enable_sorted_merge TO on; +-- J1: ORDER BY expression on aggregate result (ordinal reference) +-- The ORDER BY references position 2 which is an aggregate — sorted merge +-- must NOT be used because aggregates are rewritten between worker/coordinator. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) AS total FROM sorted_merge_test GROUP BY id ORDER BY 2 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + Sort Key: remote_scan.total + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, sum(num) AS total FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (sum(num)) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (sum(num)) + -> Sort (actual rows=N loops=N) + Output: id, (sum(num)) + Sort Key: (sum(sorted_merge_test.num)) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, sum(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J2: ORDER BY expression wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) + 1 AS total_plus FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5'); + explain_filter 
+--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + Sort Key: remote_scan.total_plus + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, (sum(num) OPERATOR(pg_catalog.+) 'N'::numeric) AS total_plus FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (sum(num) OPERATOR(pg_catalog.+) 'N'::numeric) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, ((sum(num) + 'N'::numeric)) + -> Sort (actual rows=N loops=N) + Output: id, ((sum(num) + 'N'::numeric)) + Sort Key: ((sum(sorted_merge_test.num) + 'N'::numeric)) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, (sum(num) + 'N'::numeric) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J3: ORDER BY a non-aggregate expression that can be pushed to workers +-- This should be eligible for sorted merge — the expression is evaluated +-- on the worker side and sort order is preserved. 
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id + 0'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, (id OPERATOR(pg_catalog.+) N) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) N) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, ((id + N)) + Sort Key: ((sorted_merge_test.id + N)) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, (id + N) +(16 rows) + +-- J4: ORDER BY with CASE expression (no aggregates) — eligible +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY CASE WHEN id < 50 THEN 0 ELSE 1 END, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, CASE WHEN (id OPERATOR(pg_catalog.<) N) THEN N ELSE N END AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY CASE WHEN (id OPERATOR(pg_catalog.<) N) THEN N ELSE N END, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual 
rows=N loops=N) + Output: id, val, (CASE WHEN (id < N) THEN N ELSE N END) + Sort Key: (CASE WHEN (sorted_merge_test.id < N) THEN N ELSE N END), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, CASE WHEN (id < N) THEN N ELSE N END +(16 rows) + +-- J5: ORDER BY on an expression that mixes aggregate and non-aggregate +-- Should be ineligible because the expression contains an aggregate. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id + count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Sort Key: remote_scan.worker_column_3 + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count, (id OPERATOR(pg_catalog.+) count(*)) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*), (id + count(*)) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) + +-- J6: Correctness comparison — expression ORDER BY, GUC off vs on +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | 
val_4 + 5 | val_5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J7–J12: Additional pushable expressions (no aggregates) +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J7: ORDER BY function call on column +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- J8: ORDER BY COALESCE +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0) LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +-- J9: ORDER BY negation +SELECT id, num FROM sorted_merge_test ORDER BY -num LIMIT 5; + id | num +--------------------------------------------------------------------- + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 +(5 rows) + +-- J10: ORDER BY concatenation +SELECT id, val FROM sorted_merge_test ORDER BY val || '_suffix' LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 100 | val_100 + 10 | val_10 +(5 rows) + +-- J11: ORDER BY mathematical function (abs distance) +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +-- J12: ORDER BY expression not in SELECT list +SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5; + id 
+--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +-- J13: ORDER BY expression referencing multiple columns +SELECT id, val FROM sorted_merge_test ORDER BY id * num LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- J14: ORDER BY with type cast +SELECT id, num FROM sorted_merge_test ORDER BY num::int LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- J15: ORDER BY with subexpression in SELECT and different expression in ORDER BY +SELECT id, num + 1 as n1 FROM sorted_merge_test ORDER BY num + 2 LIMIT 5; + id | n1 +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +-- J16: ORDER BY column alias +SELECT id, num * 2 as doubled FROM sorted_merge_test ORDER BY doubled LIMIT 5; + id | doubled +--------------------------------------------------------------------- + 1 | 3.0 + 2 | 6.0 + 3 | 9.0 + 4 | 12.0 + 5 | 15.0 +(5 rows) + +-- ----------------------------------------------------------------- +-- J17–J21: Correctness — GUC off vs on for expression ORDER BY +-- ----------------------------------------------------------------- +-- J17: function call +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- J18: CASE expression +SET 
citus.enable_sorted_merge TO off; +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + id | cat +--------------------------------------------------------------------- + 34 | high + 35 | high + 36 | high + 37 | high + 38 | high + 39 | high + 40 | high + 41 | high + 42 | high + 43 | high +(10 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + id | cat +--------------------------------------------------------------------- + 34 | high + 35 | high + 36 | high + 37 | high + 38 | high + 39 | high + 40 | high + 41 | high + 42 | high + 43 | high +(10 rows) + +-- J19: COALESCE +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +-- J20: abs() distance function +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J21–J22: 
More ineligibility — aggregate inside expressions +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J21: ORDER BY CASE wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY CASE WHEN count(*) > 1 THEN 0 ELSE 1 END, id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Sort Key: remote_scan.worker_column_3, remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count, CASE WHEN (count(*) OPERATOR(pg_catalog.>) N) THEN N ELSE N END AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY CASE WHEN (count(*) OPERATOR(pg_catalog.>) N) THEN N ELSE N END, id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)), (CASE WHEN (count(*) > N) THEN N ELSE N END) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)), (CASE WHEN (count(*) > N) THEN N ELSE N END) + Sort Key: (CASE WHEN (count(*) > N) THEN N ELSE N END), sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*), CASE WHEN (count(*) > N) THEN N ELSE N END + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N 
loops=N) + Output: id, val, num, ts +(27 rows) + +-- J22: ORDER BY aggregate expression (sum + 1) — correctness +SET citus.enable_sorted_merge TO off; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + id | s +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + id | s +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J23–J24: EXPLAIN plans for pushable expression patterns +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J23: Does function-call ORDER BY get pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, upper(val) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (upper(val)) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, (upper(val)) + -> Sort (actual rows=N loops=N) + Output: id, val, (upper(val)) + Sort Key: 
(upper(sorted_merge_test.val)) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, upper(val) +(20 rows) + +-- J24: ORDER BY expression not in SELECT list — pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.worker_column_2 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, (num OPERATOR(pg_catalog.+) 'N'::numeric) AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (num OPERATOR(pg_catalog.+) 'N'::numeric) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, ((num + 'N'::numeric)) + -> Sort (actual rows=N loops=N) + Output: id, ((num + 'N'::numeric)) + Sort Key: ((sorted_merge_test.num + 'N'::numeric)) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, (num + 'N'::numeric) +(20 rows) + +-- ================================================================= +-- Category K: Index-based sort avoidance +-- ================================================================= +-- When an index exists on the ORDER BY column, PostgreSQL's worker-side +-- planner should choose an Index Scan instead of Sort + Seq Scan, making +-- the worker-side sort essentially free. 
This is the best-case scenario +-- for sorted merge: zero worker sort cost + zero coordinator sort cost. +-- +-- We disable enable_seqscan to force the worker planner to prefer the +-- index, since the test table is small enough that Seq Scan + Sort +-- would otherwise be cheaper. +CREATE INDEX sorted_merge_test_id_idx ON sorted_merge_test(id); +-- Use a transaction with SET LOCAL to propagate enable_seqscan=off to workers, +-- forcing the worker planner to use the index instead of Seq Scan + Sort. +SET citus.propagate_set_commands TO 'local'; +-- K1: EXPLAIN with index — worker uses Index Scan, no Sort node +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Scan using sorted_merge_test_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(12 rows) + +COMMIT; +-- K2: Correctness with index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; + id | val 
+--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +COMMIT; +-- K3: Multi-column index +CREATE INDEX sorted_merge_test_num_id_idx ON sorted_merge_test(num, id); +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Only Scan using sorted_merge_test_num_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num + Heap Fetches: N +(13 rows) + +COMMIT; +-- K4: Correctness with multi-column index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +COMMIT; +-- K5: DESC ordering with index +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, 
COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id DESC'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Scan Backward using sorted_merge_test_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(12 rows) + +COMMIT; +RESET citus.propagate_set_commands; +DROP INDEX sorted_merge_test_id_idx; +DROP INDEX sorted_merge_test_num_id_idx; +-- ================================================================= +-- Category L: Volatile and stable functions in ORDER BY +-- Tests that ORDER BY with functions works correctly with sorted merge. +-- Volatile functions (random, clock_timestamp, timeofday) are pushed +-- to workers as computed columns — sorted merge uses the materialized +-- worker values, which is semantically equivalent to coordinator Sort. +-- ================================================================= +-- L1: STABLE function — now() in expression with column +-- now() returns the same value on all workers within a transaction, +-- so the merge is globally consistent. Sorted merge should be used. 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY now() - ts, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, (now() OPERATOR(pg_catalog.-) ts) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (now() OPERATOR(pg_catalog.-) ts), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, ((now() - ts)) + Sort Key: ((now() - sorted_merge_test.ts)), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, (now() - ts) +(16 rows) + +-- L2: VOLATILE function — random() in ORDER BY +-- random() is pushed to workers as worker_column_3; each worker sorts +-- by its own random values. The merge interleaves using materialized +-- values — semantically equivalent to coordinator Sort on worker_column_3. +-- Test plan shape only (result is non-deterministic). 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY random(), id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, random() AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (random()), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (random()) + Sort Key: (random()), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, random() +(16 rows) + +-- L3: VOLATILE function — clock_timestamp() in ORDER BY +-- Same mechanics as random(): pushed to workers, sorted locally, merged. 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY clock_timestamp(), id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, clock_timestamp() AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (clock_timestamp()), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (clock_timestamp()) + Sort Key: (clock_timestamp()), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, clock_timestamp() +(16 rows) + +-- L4: nextval() in ORDER BY with sorted merge ON — expected ERROR +-- nextval() cannot be pushed to workers (CanPushDownExpression blocks it). +-- The sort clause references a target entry missing from the worker target +-- list, causing a plan-time error. This is a pre-existing Citus limitation. +CREATE SEQUENCE sorted_merge_test_seq; +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq'); +psql:sql/multi_orderby_pushdown.sql:777: ERROR: ORDER/GROUP BY expression not found in targetlist +-- L4b: nextval() in ORDER BY with sorted merge OFF but LIMIT present +-- Same error — demonstrates this is NOT a sorted merge regression. 
+SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq') LIMIT 5; +psql:sql/multi_orderby_pushdown.sql:782: ERROR: ORDER/GROUP BY expression not found in targetlist +DROP SEQUENCE sorted_merge_test_seq; +-- L5: STABLE function alone (constant-fold case) +-- current_timestamp is constant-folded by the planner; the sort key +-- effectively becomes just 'id'. Sorted merge should be used. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY current_timestamp, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, CURRENT_TIMESTAMP AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY CURRENT_TIMESTAMP, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (CURRENT_TIMESTAMP) + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, CURRENT_TIMESTAMP +(16 rows) + +SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Cleanup +-- ================================================================= +SET citus.enable_sorted_merge TO off; +-- Run 2: streaming adapter +SET citus.enable_streaming_sorted_merge TO on; +\i sql/multi_orderby_pushdown.sql +-- +-- MULTI_SORTED_MERGE +-- +-- Tests for the citus.enable_sorted_merge GUC and the sorted merge +-- planner 
eligibility logic. Verifies that enabling the GUC does not +-- introduce regressions for any query pattern. +-- +-- MX verification: this test has been verified to pass with zero diffs +-- under check-base-mx (MX mode), confirming sorted merge works correctly +-- when any node in the cluster acts as coordinator. +-- +-- ================================================================= +-- 1. GUC basics +-- ================================================================= +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge +--------------------------------------------------------------------- + off +(1 row) + +SET citus.enable_sorted_merge TO on; +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge +--------------------------------------------------------------------- + on +(1 row) + +SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Category A: Eligibility — sort IS pushed to workers +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- A1: ORDER BY distribution column +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + 
Output: id, val +(16 rows) + +-- A2: ORDER BY DESC +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id DESC + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(16 rows) + +-- A3: ORDER BY DESC NULLS LAST +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num DESC NULLS LAST + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, num + Sort Key: sorted_merge_test.num DESC NULLS LAST + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num +(16 rows) + +-- A4: ORDER BY non-distribution column +SELECT 
public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY val'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.val + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) + +-- A5: Multi-column ORDER BY +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id, sorted_merge_test.val + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) + +-- A6: Mixed directions +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, 
SUMMARY OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, num DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, num + Sort Key: sorted_merge_test.id, sorted_merge_test.num DESC + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num +(16 rows) + +-- A7: GROUP BY dist_col ORDER BY dist_col +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> GroupAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(19 rows) + +-- A8: WHERE 
clause + ORDER BY +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (num OPERATOR(pg_catalog.>) 'N'::numeric) ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val + Filter: (sorted_merge_test.num > 'N'::numeric) + Rows Removed by Filter: N +(18 rows) + +-- A9: Expression in ORDER BY (non-aggregate) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num, (id OPERATOR(pg_catalog.+) N) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) N) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, num, ((id + N)) + Sort Key: ((sorted_merge_test.id + N)) + Sort Method: 
quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num, (id + N) +(16 rows) + +-- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(20 rows) + +-- ================================================================= +-- Category B: Ineligibility — sort NOT pushed for merge +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- B1: ORDER BY count(*) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Sort Key: remote_scan.count + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, 
remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) + +-- B2: ORDER BY avg(col) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.avg + Sort Key: remote_scan.avg + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.avg + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, avg(num) AS avg FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, avg(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) + +-- B3: GROUP BY non-dist col, ORDER BY non-dist col +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.val, 
(COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Key: remote_scan.val + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint) + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.val, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(23 rows) + +-- B4: GROUP BY non-dist col, ORDER BY aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint) + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.val, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple 
data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(23 rows) + +-- ================================================================= +-- Category C: Correctness — results match GUC off vs on +-- ================================================================= +-- C1: Simple ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- C2: ORDER BY DESC +SET citus.enable_sorted_merge TO off; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id +--------------------------------------------------------------------- + 202 + 201 + 200 + 102 + 101 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id +--------------------------------------------------------------------- + 202 + 201 + 200 + 102 + 101 +(5 rows) + +-- C3: Multi-column ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id 
LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- C4: ORDER BY non-distribution column +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- C5: GROUP BY dist_col ORDER BY dist_col +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- C6: Mixed directions +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- C7: WHERE + ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE 
num > 100 ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +-- C8: Aggregates in SELECT, ORDER BY on dist_col (GROUP BY dist_col) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +-- ================================================================= +-- Category D: Complex queries — regression guards +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- D1: Subquery in FROM with ORDER BY +SELECT * FROM ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) sub ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D2: CTE with ORDER BY +WITH top5 AS ( + SELECT id, val FROM sorted_merge_test ORDER BY 
id LIMIT 5 +) +SELECT * FROM top5 ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D3: Co-located JOIN + ORDER BY +SELECT t.id, t.val, e.event_type +FROM sorted_merge_test t +JOIN sorted_merge_events e ON t.id = e.id +WHERE t.id <= 5 +ORDER BY t.id, e.event_type +LIMIT 10; + id | val | event_type +--------------------------------------------------------------------- + 1 | val_1 | buy + 1 | val_1 | buy + 1 | val_1 | click + 1 | val_1 | view + 2 | val_2 | buy + 2 | val_2 | click + 2 | val_2 | view + 2 | val_2 | view + 3 | val_3 | buy + 3 | val_3 | buy +(10 rows) + +-- D4: UNION ALL + ORDER BY +SELECT id, val FROM sorted_merge_test WHERE id <= 3 +UNION ALL +SELECT id, val FROM sorted_merge_test WHERE id BETWEEN 98 AND 100 +ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 98 | val_98 + 99 | val_99 + 100 | val_100 +(6 rows) + +-- D5: DISTINCT + ORDER BY +SELECT DISTINCT id FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +-- D6: DISTINCT ON + ORDER BY +SELECT DISTINCT ON (id) id, val, num +FROM sorted_merge_test +WHERE id <= 5 +ORDER BY id, num DESC; + id | val | num +--------------------------------------------------------------------- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) + +-- D7: EXISTS subquery + ORDER BY +SELECT id, val FROM sorted_merge_test t +WHERE EXISTS (SELECT 1 FROM sorted_merge_events e WHERE e.id = t.id) +ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D8: IN subquery + ORDER BY +SELECT id, val FROM sorted_merge_test +WHERE id IN 
(SELECT id FROM sorted_merge_events WHERE event_type = 'click') +ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D9: Multiple aggregates, GROUP BY dist_col, ORDER BY dist_col +SELECT id, count(*), sum(num), avg(num), min(val), max(val) +FROM sorted_merge_test +GROUP BY id +ORDER BY id +LIMIT 5; + id | count | sum | avg | min | max +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 | val_1 | val_1 + 2 | 1 | 3.0 | 3.0000000000000000 | val_2 | val_2 + 3 | 1 | 4.5 | 4.5000000000000000 | val_3 | val_3 + 4 | 1 | 6.0 | 6.0000000000000000 | val_4 | val_4 + 5 | 1 | 7.5 | 7.5000000000000000 | val_5 | val_5 +(5 rows) + +-- D10: CASE expression in SELECT + ORDER BY +SELECT id, + CASE WHEN num > 75 THEN 'high' WHEN num > 25 THEN 'mid' ELSE 'low' END as bucket +FROM sorted_merge_test +WHERE num IS NOT NULL +ORDER BY id +LIMIT 10; + id | bucket +--------------------------------------------------------------------- + 1 | low + 2 | low + 3 | low + 4 | low + 5 | low + 6 | low + 7 | low + 8 | low + 9 | low + 10 | low +(10 rows) + +-- D11: NULL values ordering +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS FIRST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS LAST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS FIRST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST, id DESC LIMIT 
5; + id | num +--------------------------------------------------------------------- + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 +(5 rows) + +-- D12: Large OFFSET +SELECT id FROM sorted_merge_test ORDER BY id OFFSET 100 LIMIT 5; + id +--------------------------------------------------------------------- + 101 + 102 + 200 + 201 + 202 +(5 rows) + +-- D13: ORDER BY ordinal position +SELECT id, val FROM sorted_merge_test ORDER BY 2, 1 LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- ================================================================= +-- Category E: Edge cases +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- E1: Empty result set +SELECT id FROM sorted_merge_test WHERE id < 0 ORDER BY id; + id +--------------------------------------------------------------------- +(0 rows) + +-- E2: Single row (may go through router planner) +SELECT id, val FROM sorted_merge_test WHERE id = 42 ORDER BY id; + id | val +--------------------------------------------------------------------- + 42 | val_42 +(1 row) + +-- E3: All rows with same sort value +SELECT id, num FROM sorted_merge_test WHERE num = 10.5 ORDER BY num, id; + id | num +--------------------------------------------------------------------- + 7 | 10.5 + 200 | 10.5 + 201 | 10.5 + 202 | 10.5 +(4 rows) + +-- E4: Wide sort key (4 columns) +SELECT id, val, num FROM sorted_merge_test +WHERE id <= 5 +ORDER BY num, val, id +LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) + +-- E5: Zero-task defensive path +-- CreatePerTaskDispatchDest handles taskCount=0 gracefully (returns a no-op +-- destination). 
This cannot be triggered via normal SQL because distributed +-- tables always have at least one shard. The closest we can test is an +-- empty-result query through the sorted merge path to verify no crash. +SELECT id FROM sorted_merge_test WHERE false ORDER BY id; + id +--------------------------------------------------------------------- +(0 rows) + +-- ================================================================= +-- Category F: Existing LIMIT pushdown stability +-- ================================================================= +-- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Sort (actual rows=N loops=N) + Output: remote_scan.id + Sort Key: remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(23 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter 
+--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(20 rows) + +-- F2: GROUP BY dist_col + ORDER BY + LIMIT +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Sort Key: remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)) + -> GroupAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id 
+ -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(26 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)) + -> GroupAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(23 rows) + +-- F3: ORDER BY aggregate + LIMIT (not eligible for merge) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + id | count 
+--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- ================================================================= +-- Category G: Phase 4 — Sort elision and advanced scenarios +-- ================================================================= +-- G1: Sort elision verification — coordinator Sort node absent +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Sort Key: remote_scan.id + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(15 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N 
dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) + +-- G2a: PREPARE with merge ON, EXECUTE after turning OFF +-- Plan-time decision is baked in — cached plan must still merge correctly. +-- Execute 6+ times to trigger PostgreSQL's generic plan caching, then +-- verify the plan shape is preserved after toggling the GUC. +SET citus.enable_sorted_merge TO on; +PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | 
val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Verify plan shape after caching — no Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Merge Method: sorted merge + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(11 rows) + +SET citus.enable_sorted_merge TO off; +-- Cached plan retains the sorted merge decision from planning time +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Merge Method: sorted merge + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(11 rows) + +DEALLOCATE merge_on_stmt; +-- G2b: PREPARE with merge OFF, EXECUTE after turning ON +-- Cached plan has Sort node — must still return sorted results. 
+SET citus.enable_sorted_merge TO off; +PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Verify plan shape after caching — Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan 
on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +SET citus.enable_sorted_merge TO on; +-- Cached plan retains the non-merge decision from planning time +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +DEALLOCATE merge_off_stmt; +-- G3: Cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_cursor CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 1 + 2 + 3 +(3 rows) + +FETCH BACKWARD 1 FROM sorted_cursor; +psql:sql/multi_orderby_pushdown.sql:323: ERROR: cursor can only scan forward +HINT: Declare it with SCROLL option to enable backward scan. 
+FETCH 2 FROM sorted_cursor; +psql:sql/multi_orderby_pushdown.sql:324: ERROR: current transaction is aborted, commands ignored until end of transaction block +CLOSE sorted_cursor; +psql:sql/multi_orderby_pushdown.sql:325: ERROR: current transaction is aborted, commands ignored until end of transaction block +COMMIT; +-- G3b: SCROLL cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_scroll_cursor SCROLL CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 1 + 2 + 3 +(3 rows) + +FETCH BACKWARD 1 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 2 +(1 row) + +FETCH 2 FROM sorted_scroll_cursor; + id +--------------------------------------------------------------------- + 3 + 4 +(2 rows) + +CLOSE sorted_scroll_cursor; +COMMIT; +-- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 
sorted_merge_test (actual rows=N loops=N) + Output: id +(20 rows) + +-- G5: ORDER BY aggregate + LIMIT — crash regression test +-- Previously caused SIGSEGV when sorted merge was enabled because +-- aggregate ORDER BY was erroneously tagged as merge-eligible. +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 3; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 +(3 rows) + +-- G6: Small work_mem with many tasks (32 shards) +SET citus.enable_sorted_merge TO on; +SET work_mem TO '64kB'; +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +RESET work_mem; +-- G7: max_intermediate_result_size with CTE subplan +SET citus.enable_sorted_merge TO on; +SET citus.max_intermediate_result_size TO '4kB'; +WITH cte AS (SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 50) +SELECT * FROM cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; +-- ================================================================= +-- Category H: Subplan + Sorted Merge interactions +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1: CTE subplan with simple ORDER BY — eligible for sorted merge +-- The CTE becomes a subplan; its DistributedPlan may have useSortedMerge=true +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- H2: Multiple CTEs — one eligible (ORDER BY col), one 
ineligible (ORDER BY agg) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + id | val | cnt +--------------------------------------------------------------------- + 1 | val_1 | 1 + 2 | val_2 | 1 + 3 | val_3 | 1 + 4 | val_4 | 1 + 5 | val_5 | 1 + 6 | val_6 | 1 + 7 | val_7 | 1 + 8 | val_8 | 1 + 9 | val_9 | 1 + 10 | val_10 | 1 + 11 | val_11 | 1 + 12 | val_12 | 1 + 13 | val_13 | 1 + 14 | val_14 | 1 + 15 | val_15 | 1 +(15 rows) + +-- H3: CTE subplan feeding outer ORDER BY — both levels may merge independently +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- H4: Subquery in WHERE with ORDER BY + LIMIT — becomes subplan with merge +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 +(3 rows) + +-- H5: CTE subplan with max_intermediate_result_size enforcement +-- Tests that EnsureIntermediateSizeLimitNotExceeded works through per-task dispatch +SET citus.max_intermediate_result_size TO '4kB'; +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 
rows) + +RESET citus.max_intermediate_result_size; +-- H6: Cross-join subplan with non-aggregate ORDER BY (crash regression variant) +-- Similar pattern to subquery_complex_target_list but without aggregate ORDER BY +SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + id | bar_id +--------------------------------------------------------------------- + 1 | 1 + 1 | 1 + 1 | 1 + 2 | 1 + 2 | 1 +(5 rows) + +-- H7: CTE correctness comparison — GUC off vs on must produce identical results +SET citus.enable_sorted_merge TO off; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +-- ================================================================= +-- Category H EXPLAIN: Query plans for subplan + sorted merge +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N 
loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT ordered_cte.id AS worker_column_1, ordered_cte.val AS worker_column_2 FROM (SELECT sorted_merge_test.id, sorted_merge_test.val FROM public.sorted_merge_test_960000 sorted_merge_test ORDER BY sorted_merge_test.id) ordered_cte) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val +(20 rows) + +-- H2 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.cnt + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual 
rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val + -> Distributed Subplan XXX_2 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.cnt + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.cnt + Sort Key: remote_scan.cnt DESC, remote_scan.id + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.cnt + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS cnt FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (count(*)) DESC, id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)) + Sort Key: (count(*)) DESC, sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT e.id, e.val, i.cnt FROM ((SELECT intermediate_result.id, 
intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) e JOIN (SELECT intermediate_result.id, intermediate_result.cnt FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer, cnt bigint)) i ON ((e.id OPERATOR(pg_catalog.=) i.id))) ORDER BY e.id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Merge Join (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result_1.cnt + Merge Cond: (intermediate_result.id = intermediate_result_1.id) + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Sort Key: intermediate_result.id + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Sort (actual rows=N loops=N) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Sort Key: intermediate_result_1.id + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(78 rows) + +-- H3 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan 
(Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Send to N nodes + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT t.id AS worker_column_1, t.val AS worker_column_2 FROM (public.sorted_merge_test_960000 t JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) top_ids ON ((t.id OPERATOR(pg_catalog.=) top_ids.id)))) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: t.id, t.val + -> Merge Join (actual rows=N loops=N) + Output: t.id, t.val + Merge Cond: (t.id = intermediate_result.id) + -> Sort (actual rows=N loops=N) + Output: t.id, t.val + Sort Key: t.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 t (actual rows=N loops=N) + Output: t.id, t.val + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id + 
Sort Key: intermediate_result.id + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(53 rows) + +-- H4 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Send to N nodes + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) + Output: id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT sorted_merge_test.id AS worker_column_1, sorted_merge_test.val AS worker_column_2 FROM 
public.sorted_merge_test_960000 sorted_merge_test WHERE (sorted_merge_test.id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)))) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Hash Semi Join (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + Hash Cond: (sorted_merge_test.id = intermediate_result.id) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val, sorted_merge_test.num, sorted_merge_test.ts + -> Hash (actual rows=N loops=N) + Output: intermediate_result.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(52 rows) + +-- H5 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, 
remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT id, val FROM (SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) small_cte ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(41 rows) + +-- H6 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan 
(Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.bar_id + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id + -> Distributed Subplan XXX_2 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) + Output: id + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT foo.id, bar.id AS bar_id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 
'binary'::citus_copy_format) intermediate_result(id integer)) foo, (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) bar ORDER BY foo.id, bar.id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result_1.id + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result_1.id + Sort Key: intermediate_result.id, intermediate_result_1.id + Sort Method: quicksort Memory: NkB + -> Nested Loop (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result_1.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) + Output: intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(69 rows) + +-- H7 EXPLAIN — GUC off vs on +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Sort Key: 
remote_scan.id + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, num + -> Sort (actual rows=N loops=N) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (N)::numeric) ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > 'N'::numeric) + Rows Removed by Filter: N +(45 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, 
COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, num + -> Sort (actual rows=N loops=N) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (N)::numeric) ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=N loops=N) + Output: 
intermediate_result.id, intermediate_result.val, intermediate_result.num + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > 'N'::numeric) + Rows Removed by Filter: N +(43 rows) + +-- ================================================================= +-- Category I: Distributed Transactions +-- ================================================================= +-- Verify sorted merge correctness within multi-statement transactions +-- where data is modified before the sorted-merge SELECT. +SET citus.enable_sorted_merge TO on; +-- I1: INSERT then SELECT within a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (900, 'txn_insert', 900.0); +SELECT id, val FROM sorted_merge_test WHERE id >= 900 ORDER BY id; + id | val +--------------------------------------------------------------------- + 900 | txn_insert +(1 row) + +ROLLBACK; +-- I2: UPDATE then SELECT within a transaction +BEGIN; +UPDATE sorted_merge_test SET val = 'updated' WHERE id = 1; +SELECT id, val FROM sorted_merge_test WHERE id <= 3 ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | updated + 2 | val_2 + 3 | val_3 +(3 rows) + +ROLLBACK; +-- I3: DELETE then SELECT within a transaction +BEGIN; +DELETE FROM sorted_merge_test WHERE id <= 5; +SELECT id, val FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + id | val +--------------------------------------------------------------------- + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(5 rows) + +ROLLBACK; +-- I4: INSERT + UPDATE + SELECT with multi-column ORDER BY +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (901, 'txn_a', 1.0); 
+INSERT INTO sorted_merge_test (id, val, num) VALUES (902, 'txn_b', 2.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (903, 'txn_c', 3.0); +UPDATE sorted_merge_test SET num = 999.0 WHERE id = 901; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 900 ORDER BY num, id; + id | val | num +--------------------------------------------------------------------- + 902 | txn_b | 2.0 + 903 | txn_c | 3.0 + 901 | txn_a | 999.0 +(3 rows) + +ROLLBACK; +-- I5: Compare results with GUC off vs on in a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (910, 'cmp_a', 10.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (911, 'cmp_b', 20.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (912, 'cmp_c', 30.0); +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; + id | val | num +--------------------------------------------------------------------- + 910 | cmp_a | 10.0 + 911 | cmp_b | 20.0 + 912 | cmp_c | 30.0 +(3 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; + id | val | num +--------------------------------------------------------------------- + 910 | cmp_a | 10.0 + 911 | cmp_b | 20.0 + 912 | cmp_c | 30.0 +(3 rows) + +ROLLBACK; +-- I6: DELETE + aggregate in SELECT with ORDER BY +BEGIN; +DELETE FROM sorted_merge_test WHERE id > 100 AND id < 200; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +ROLLBACK; +-- ================================================================= +-- Category J: Coordinator expression evaluation exclusion +-- ================================================================= +-- Verify that queries with ORDER BY on expressions that need coordinator-side +-- evaluation are correctly excluded from sorted 
merge (or handled correctly). +SET citus.enable_sorted_merge TO on; +-- J1: ORDER BY expression on aggregate result (ordinal reference) +-- The ORDER BY references position 2 which is an aggregate — sorted merge +-- must NOT be used because aggregates are rewritten between worker/coordinator. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) AS total FROM sorted_merge_test GROUP BY id ORDER BY 2 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + Sort Key: remote_scan.total + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, sum(num) AS total FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (sum(num)) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (sum(num)) + -> Sort (actual rows=N loops=N) + Output: id, (sum(num)) + Sort Key: (sum(sorted_merge_test.num)) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, sum(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J2: ORDER BY expression wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) + 1 AS total_plus FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5'); + explain_filter 
+--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + Sort Key: remote_scan.total_plus + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, (sum(num) OPERATOR(pg_catalog.+) 'N'::numeric) AS total_plus FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (sum(num) OPERATOR(pg_catalog.+) 'N'::numeric) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, ((sum(num) + 'N'::numeric)) + -> Sort (actual rows=N loops=N) + Output: id, ((sum(num) + 'N'::numeric)) + Sort Key: ((sum(sorted_merge_test.num) + 'N'::numeric)) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, (sum(num) + 'N'::numeric) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J3: ORDER BY a non-aggregate expression that can be pushed to workers +-- This should be eligible for sorted merge — the expression is evaluated +-- on the worker side and sort order is preserved. 
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id + 0'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, (id OPERATOR(pg_catalog.+) N) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) N) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, ((id + N)) + Sort Key: ((sorted_merge_test.id + N)) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, (id + N) +(16 rows) + +-- J4: ORDER BY with CASE expression (no aggregates) — eligible +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY CASE WHEN id < 50 THEN 0 ELSE 1 END, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, CASE WHEN (id OPERATOR(pg_catalog.<) N) THEN N ELSE N END AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY CASE WHEN (id OPERATOR(pg_catalog.<) N) THEN N ELSE N END, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual 
rows=N loops=N) + Output: id, val, (CASE WHEN (id < N) THEN N ELSE N END) + Sort Key: (CASE WHEN (sorted_merge_test.id < N) THEN N ELSE N END), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, CASE WHEN (id < N) THEN N ELSE N END +(16 rows) + +-- J5: ORDER BY on an expression that mixes aggregate and non-aggregate +-- Should be ineligible because the expression contains an aggregate. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id + count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Sort Key: remote_scan.worker_column_3 + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count, (id OPERATOR(pg_catalog.+) count(*)) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*), (id + count(*)) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) + +-- J6: Correctness comparison — expression ORDER BY, GUC off vs on +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | 
val_4 + 5 | val_5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J7–J12: Additional pushable expressions (no aggregates) +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J7: ORDER BY function call on column +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- J8: ORDER BY COALESCE +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0) LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +-- J9: ORDER BY negation +SELECT id, num FROM sorted_merge_test ORDER BY -num LIMIT 5; + id | num +--------------------------------------------------------------------- + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 +(5 rows) + +-- J10: ORDER BY concatenation +SELECT id, val FROM sorted_merge_test ORDER BY val || '_suffix' LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 100 | val_100 + 10 | val_10 +(5 rows) + +-- J11: ORDER BY mathematical function (abs distance) +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +-- J12: ORDER BY expression not in SELECT list +SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5; + id 
+--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +-- J13: ORDER BY expression referencing multiple columns +SELECT id, val FROM sorted_merge_test ORDER BY id * num LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- J14: ORDER BY with type cast +SELECT id, num FROM sorted_merge_test ORDER BY num::int LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- J15: ORDER BY with subexpression in SELECT and different expression in ORDER BY +SELECT id, num + 1 as n1 FROM sorted_merge_test ORDER BY num + 2 LIMIT 5; + id | n1 +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +-- J16: ORDER BY column alias +SELECT id, num * 2 as doubled FROM sorted_merge_test ORDER BY doubled LIMIT 5; + id | doubled +--------------------------------------------------------------------- + 1 | 3.0 + 2 | 6.0 + 3 | 9.0 + 4 | 12.0 + 5 | 15.0 +(5 rows) + +-- ----------------------------------------------------------------- +-- J17–J21: Correctness — GUC off vs on for expression ORDER BY +-- ----------------------------------------------------------------- +-- J17: function call +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- J18: CASE expression +SET 
citus.enable_sorted_merge TO off; +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + id | cat +--------------------------------------------------------------------- + 34 | high + 35 | high + 36 | high + 37 | high + 38 | high + 39 | high + 40 | high + 41 | high + 42 | high + 43 | high +(10 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + id | cat +--------------------------------------------------------------------- + 34 | high + 35 | high + 36 | high + 37 | high + 38 | high + 39 | high + 40 | high + 41 | high + 42 | high + 43 | high +(10 rows) + +-- J19: COALESCE +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +-- J20: abs() distance function +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J21–J22: 
More ineligibility — aggregate inside expressions +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J21: ORDER BY CASE wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY CASE WHEN count(*) > 1 THEN 0 ELSE 1 END, id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Sort Key: remote_scan.worker_column_3, remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count, CASE WHEN (count(*) OPERATOR(pg_catalog.>) N) THEN N ELSE N END AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY CASE WHEN (count(*) OPERATOR(pg_catalog.>) N) THEN N ELSE N END, id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)), (CASE WHEN (count(*) > N) THEN N ELSE N END) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)), (CASE WHEN (count(*) > N) THEN N ELSE N END) + Sort Key: (CASE WHEN (count(*) > N) THEN N ELSE N END), sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*), CASE WHEN (count(*) > N) THEN N ELSE N END + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N 
loops=N) + Output: id, val, num, ts +(27 rows) + +-- J22: ORDER BY aggregate expression (sum + 1) — correctness +SET citus.enable_sorted_merge TO off; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + id | s +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + id | s +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J23–J24: EXPLAIN plans for pushable expression patterns +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J23: Does function-call ORDER BY get pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, upper(val) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (upper(val)) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, (upper(val)) + -> Sort (actual rows=N loops=N) + Output: id, val, (upper(val)) + Sort Key: 
(upper(sorted_merge_test.val)) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, upper(val) +(20 rows) + +-- J24: ORDER BY expression not in SELECT list — pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.worker_column_2 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, (num OPERATOR(pg_catalog.+) 'N'::numeric) AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (num OPERATOR(pg_catalog.+) 'N'::numeric) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, ((num + 'N'::numeric)) + -> Sort (actual rows=N loops=N) + Output: id, ((num + 'N'::numeric)) + Sort Key: ((sorted_merge_test.num + 'N'::numeric)) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, (num + 'N'::numeric) +(20 rows) + +-- ================================================================= +-- Category K: Index-based sort avoidance +-- ================================================================= +-- When an index exists on the ORDER BY column, PostgreSQL's worker-side +-- planner should choose an Index Scan instead of Sort + Seq Scan, making +-- the worker-side sort essentially free. 
This is the best-case scenario +-- for sorted merge: zero worker sort cost + zero coordinator sort cost. +-- +-- We disable enable_seqscan to force the worker planner to prefer the +-- index, since the test table is small enough that Seq Scan + Sort +-- would otherwise be cheaper. +CREATE INDEX sorted_merge_test_id_idx ON sorted_merge_test(id); +-- Use a transaction with SET LOCAL to propagate enable_seqscan=off to workers, +-- forcing the worker planner to use the index instead of Seq Scan + Sort. +SET citus.propagate_set_commands TO 'local'; +-- K1: EXPLAIN with index — worker uses Index Scan, no Sort node +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Scan using sorted_merge_test_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(12 rows) + +COMMIT; +-- K2: Correctness with index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; + id | val 
+--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +COMMIT; +-- K3: Multi-column index +CREATE INDEX sorted_merge_test_num_id_idx ON sorted_merge_test(num, id); +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Only Scan using sorted_merge_test_num_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num + Heap Fetches: N +(13 rows) + +COMMIT; +-- K4: Correctness with multi-column index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +COMMIT; +-- K5: DESC ordering with index +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, 
COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id DESC'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Scan Backward using sorted_merge_test_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(12 rows) + +COMMIT; +RESET citus.propagate_set_commands; +DROP INDEX sorted_merge_test_id_idx; +DROP INDEX sorted_merge_test_num_id_idx; +-- ================================================================= +-- Category L: Volatile and stable functions in ORDER BY +-- Tests that ORDER BY with functions works correctly with sorted merge. +-- Volatile functions (random, clock_timestamp, timeofday) are pushed +-- to workers as computed columns — sorted merge uses the materialized +-- worker values, which is semantically equivalent to coordinator Sort. +-- ================================================================= +-- L1: STABLE function — now() in expression with column +-- now() returns the same value on all workers within a transaction, +-- so the merge is globally consistent. Sorted merge should be used. 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY now() - ts, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, (now() OPERATOR(pg_catalog.-) ts) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (now() OPERATOR(pg_catalog.-) ts), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, ((now() - ts)) + Sort Key: ((now() - sorted_merge_test.ts)), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, (now() - ts) +(16 rows) + +-- L2: VOLATILE function — random() in ORDER BY +-- random() is pushed to workers as worker_column_3; each worker sorts +-- by its own random values. The merge interleaves using materialized +-- values — semantically equivalent to coordinator Sort on worker_column_3. +-- Test plan shape only (result is non-deterministic). 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY random(), id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, random() AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (random()), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (random()) + Sort Key: (random()), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, random() +(16 rows) + +-- L3: VOLATILE function — clock_timestamp() in ORDER BY +-- Same mechanics as random(): pushed to workers, sorted locally, merged. 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY clock_timestamp(), id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, clock_timestamp() AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (clock_timestamp()), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (clock_timestamp()) + Sort Key: (clock_timestamp()), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, clock_timestamp() +(16 rows) + +-- L4: nextval() in ORDER BY with sorted merge ON — expected ERROR +-- nextval() cannot be pushed to workers (CanPushDownExpression blocks it). +-- The sort clause references a target entry missing from the worker target +-- list, causing a plan-time error. This is a pre-existing Citus limitation. +CREATE SEQUENCE sorted_merge_test_seq; +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq'); +psql:sql/multi_orderby_pushdown.sql:777: ERROR: ORDER/GROUP BY expression not found in targetlist +-- L4b: nextval() in ORDER BY with sorted merge OFF but LIMIT present +-- Same error — demonstrates this is NOT a sorted merge regression. 
+SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq') LIMIT 5; +psql:sql/multi_orderby_pushdown.sql:782: ERROR: ORDER/GROUP BY expression not found in targetlist +DROP SEQUENCE sorted_merge_test_seq; +-- L5: STABLE function alone (constant-fold case) +-- current_timestamp is constant-folded by the planner; the sort key +-- effectively becomes just 'id'. Sorted merge should be used. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY current_timestamp, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, CURRENT_TIMESTAMP AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY CURRENT_TIMESTAMP, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (CURRENT_TIMESTAMP) + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, CURRENT_TIMESTAMP +(16 rows) + +SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Cleanup +-- ================================================================= +SET citus.enable_sorted_merge TO off; +RESET citus.enable_streaming_sorted_merge; +-- Cleanup +DROP TABLE sorted_merge_test; +DROP TABLE sorted_merge_events; diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 
06b482ff5c7..6d18db9094c 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -93,7 +93,7 @@ test: multi_reference_table multi_select_for_update relation_access_tracking pg1 test: custom_aggregate_support aggregate_support tdigest_aggregate_support test: multi_average_expression multi_working_columns multi_having_pushdown having_subquery test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown -test: multi_orderby_pushdown +test: multi_orderby_pushdown_streaming test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg ch_bench_having chbenchmark_all_queries expression_reference_join anonymous_columns test: ch_bench_subquery_repartition test: subscripting_op diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql index cc2bb87377f..4fb4f0cab32 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -10,42 +10,6 @@ -- when any node in the cluster acts as coordinator. 
-- -SET citus.next_shard_id TO 960000; - --- ================================================================= --- Setup: create test tables --- ================================================================= - -CREATE TABLE sorted_merge_test ( - id int, - val text, - num numeric, - ts timestamptz DEFAULT now() -); -SELECT create_distributed_table('sorted_merge_test', 'id'); - --- Insert 100 rows + NULLs + duplicates -INSERT INTO sorted_merge_test (id, val, num) -SELECT i, 'val_' || i, (i * 1.5)::numeric -FROM generate_series(1, 100) i; - -INSERT INTO sorted_merge_test (id, val, num) VALUES (101, NULL, NULL); -INSERT INTO sorted_merge_test (id, val, num) VALUES (102, NULL, NULL); -INSERT INTO sorted_merge_test (id, val, num) VALUES (200, 'dup_a', 10.5); -INSERT INTO sorted_merge_test (id, val, num) VALUES (201, 'dup_b', 10.5); -INSERT INTO sorted_merge_test (id, val, num) VALUES (202, 'dup_c', 10.5); - --- Second table for join tests -CREATE TABLE sorted_merge_events ( - id int, - event_type text, - event_val int -); -SELECT create_distributed_table('sorted_merge_events', 'id'); - -INSERT INTO sorted_merge_events -SELECT i % 50 + 1, CASE WHEN i % 3 = 0 THEN 'click' WHEN i % 3 = 1 THEN 'view' ELSE 'buy' END, i -FROM generate_series(1, 200) i; -- ================================================================= -- 1. 
GUC basics @@ -361,6 +325,16 @@ FETCH 2 FROM sorted_cursor; CLOSE sorted_cursor; COMMIT; +-- G3b: SCROLL cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_scroll_cursor SCROLL CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_scroll_cursor; +FETCH BACKWARD 1 FROM sorted_scroll_cursor; +FETCH 2 FROM sorted_scroll_cursor; +CLOSE sorted_scroll_cursor; +COMMIT; + -- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) SET citus.enable_sorted_merge TO on; SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); @@ -821,5 +795,3 @@ SET citus.enable_sorted_merge TO off; -- ================================================================= SET citus.enable_sorted_merge TO off; -DROP TABLE sorted_merge_test; -DROP TABLE sorted_merge_events; diff --git a/src/test/regress/sql/multi_orderby_pushdown_streaming.sql b/src/test/regress/sql/multi_orderby_pushdown_streaming.sql new file mode 100644 index 00000000000..10c20e26c81 --- /dev/null +++ b/src/test/regress/sql/multi_orderby_pushdown_streaming.sql @@ -0,0 +1,24 @@ +-- +-- MULTI_ORDERBY_PUSHDOWN_STREAMING +-- +-- Runs the sorted merge test suite (multi_orderby_pushdown.sql) twice: +-- first with the default eager-merge path, then with the streaming +-- adapter enabled via citus.enable_streaming_sorted_merge. Both runs +-- share the same setup tables and must produce identical results +-- (except for the G3 backward-scan test, where the streaming adapter's +-- forward-only cursor correctly errors on FETCH BACKWARD). 
+-- + +\i sql/setup_multi_orderby_pushdown.sql + +-- Run 1: eager merge (default) +\i sql/multi_orderby_pushdown.sql + +-- Run 2: streaming adapter +SET citus.enable_streaming_sorted_merge TO on; +\i sql/multi_orderby_pushdown.sql +RESET citus.enable_streaming_sorted_merge; + +-- Cleanup +DROP TABLE sorted_merge_test; +DROP TABLE sorted_merge_events; diff --git a/src/test/regress/sql/setup_multi_orderby_pushdown.sql b/src/test/regress/sql/setup_multi_orderby_pushdown.sql new file mode 100644 index 00000000000..a1c6e6c5976 --- /dev/null +++ b/src/test/regress/sql/setup_multi_orderby_pushdown.sql @@ -0,0 +1,45 @@ +-- +-- SETUP_MULTI_ORDERBY_PUSHDOWN +-- +-- Creates the test tables and data used by multi_orderby_pushdown.sql +-- and its variants (e.g., multi_orderby_pushdown_streaming.sql). +-- This file is meant to be included via \i from test files that need +-- these tables. +-- + +SET citus.next_shard_id TO 960000; + +-- ================================================================= +-- Setup: create test tables +-- ================================================================= + +CREATE TABLE sorted_merge_test ( + id int, + val text, + num numeric, + ts timestamptz DEFAULT now() +); +SELECT create_distributed_table('sorted_merge_test', 'id'); + +-- Insert 100 rows + NULLs + duplicates +INSERT INTO sorted_merge_test (id, val, num) +SELECT i, 'val_' || i, (i * 1.5)::numeric +FROM generate_series(1, 100) i; + +INSERT INTO sorted_merge_test (id, val, num) VALUES (101, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (102, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (200, 'dup_a', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (201, 'dup_b', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (202, 'dup_c', 10.5); + +-- Second table for join tests +CREATE TABLE sorted_merge_events ( + id int, + event_type text, + event_val int +); +SELECT create_distributed_table('sorted_merge_events', 'id'); + +INSERT 
INTO sorted_merge_events +SELECT i % 50 + 1, CASE WHEN i % 3 = 0 THEN 'click' WHEN i % 3 = 1 THEN 'view' ELSE 'buy' END, i +FROM generate_series(1, 200) i;