diff --git a/src/backend/distributed/executor/adaptive_executor.c b/src/backend/distributed/executor/adaptive_executor.c index 83e561c9376..f8713f5725b 100644 --- a/src/backend/distributed/executor/adaptive_executor.c +++ b/src/backend/distributed/executor/adaptive_executor.c @@ -171,6 +171,7 @@ #include "distributed/repartition_join_execution.h" #include "distributed/resource_lock.h" #include "distributed/shared_connection_stats.h" +#include "distributed/sorted_merge.h" #include "distributed/stats/stat_counters.h" #include "distributed/subplan_execution.h" #include "distributed/transaction_identifier.h" @@ -315,6 +316,15 @@ typedef struct DistributedExecution * fail, such as CREATE INDEX CONCURRENTLY. */ bool localExecutionSupported; + + /* + * Sorted merge: when useSortedMerge is true, worker results are routed + * to per-task tuple stores. After execution completes, these stores are + * k-way merged into the final scanState->tuplestorestate. + */ + bool useSortedMerge; + Tuplestorestate **perTaskStores; + int perTaskStoreCount; } DistributedExecution; @@ -641,7 +651,10 @@ static DistributedExecution * CreateDistributedExecution(RowModifyLevel modLevel TransactionProperties * xactProperties, List *jobIdList, - bool localExecutionSupported); + bool localExecutionSupported, + bool useSortedMerge, + Tuplestorestate **perTaskStores, + int perTaskStoreCount); static TransactionProperties DecideTaskListTransactionProperties(RowModifyLevel modLevel, List *taskList, @@ -799,12 +812,45 @@ AdaptiveExecutor(CitusScanState *scanState) /* Reset Task fields that are only valid for a single execution */ ResetExplainAnalyzeData(taskList); - scanState->tuplestorestate = - tuplestore_begin_heap(randomAccess, interTransactions, work_mem); - TupleDesc tupleDescriptor = ScanStateGetTupleDescriptor(scanState); - TupleDestination *defaultTupleDest = - CreateTupleStoreTupleDest(scanState->tuplestorestate, tupleDescriptor); + TupleDestination *defaultTupleDest = NULL; + + /* + * When 
sorted merge is active, route worker results into per-task tuple + * stores. After execution completes, these stores are k-way merged into + * the final scanState->tuplestorestate. + * + * useSortedMerge is a plan-time decision — if the plan says merge, the + * executor must merge, because the combine query plan has no Sort node + * above us. Skipping the merge would produce silently unsorted output. + * + * This applies even under EXPLAIN ANALYZE: the ExplainAnalyzeDestination + * wrapper forwards data tuples (queryNumber == 0) to the per-task + * dispatch, which routes them to the correct per-task store. Plan-fetch + * tuples (queryNumber == 1) are handled entirely within + * ExplainAnalyzeDestPutTuple and never reach the per-task dispatch. + */ + Tuplestorestate **perTaskStores = NULL; + int perTaskStoreCount = 0; + + if (distributedPlan->useSortedMerge) + { + TupleDestinationStats *sharedStats = palloc0(sizeof(TupleDestinationStats)); + defaultTupleDest = CreatePerTaskDispatchDest(taskList, tupleDescriptor, + sharedStats, + &perTaskStores, + &perTaskStoreCount); + + /* final tuplestore created after merge */ + scanState->tuplestorestate = NULL; + } + else + { + scanState->tuplestorestate = + tuplestore_begin_heap(randomAccess, interTransactions, work_mem); + defaultTupleDest = + CreateTupleStoreTupleDest(scanState->tuplestorestate, tupleDescriptor); + } bool localExecutionSupported = true; @@ -865,7 +911,10 @@ AdaptiveExecutor(CitusScanState *scanState) defaultTupleDest, &xactProperties, jobIdList, - localExecutionSupported); + localExecutionSupported, + distributedPlan->useSortedMerge, + perTaskStores, + perTaskStoreCount); /* * Make sure that we acquire the appropriate locks even if the local tasks @@ -897,6 +946,30 @@ AdaptiveExecutor(CitusScanState *scanState) FinishDistributedExecution(execution); + /* + * When sorted merge is active, k-way merge the per-task stores into + * the final tuplestore. 
This produces globally sorted output that the + * existing ReturnTupleFromTuplestore() path can read unchanged. + */ + if (execution->useSortedMerge && execution->perTaskStoreCount > 0) + { + scanState->tuplestorestate = + tuplestore_begin_heap(randomAccess, interTransactions, work_mem); + + MergePerTaskStoresIntoFinalStore(scanState->tuplestorestate, + execution->perTaskStores, + execution->perTaskStoreCount, + distributedPlan->sortedMergeKeys, + distributedPlan->sortedMergeKeyCount, + tupleDescriptor); + + /* free per-task stores — they are no longer needed */ + for (int i = 0; i < execution->perTaskStoreCount; i++) + { + tuplestore_end(execution->perTaskStores[i]); + } + } + if (SortReturning && distributedPlan->expectResults && commandType != CMD_SELECT) { SortTupleStore(scanState); @@ -1105,7 +1178,8 @@ ExecuteTaskListExtended(ExecutionParams *executionParams) executionParams->modLevel, executionParams->taskList, executionParams->paramListInfo, executionParams->targetPoolSize, defaultTupleDest, &executionParams->xactProperties, - executionParams->jobIdList, executionParams->localExecutionSupported); + executionParams->jobIdList, executionParams->localExecutionSupported, + false, NULL, 0); /* * If current transaction accessed local placements and task list includes @@ -1170,7 +1244,10 @@ CreateDistributedExecution(RowModifyLevel modLevel, List *taskList, ParamListInfo paramListInfo, int targetPoolSize, TupleDestination *defaultTupleDest, TransactionProperties *xactProperties, - List *jobIdList, bool localExecutionSupported) + List *jobIdList, bool localExecutionSupported, + bool useSortedMerge, + Tuplestorestate **perTaskStores, + int perTaskStoreCount) { DistributedExecution *execution = (DistributedExecution *) palloc0(sizeof(DistributedExecution)); @@ -1200,6 +1277,10 @@ CreateDistributedExecution(RowModifyLevel modLevel, List *taskList, execution->localExecutionSupported = localExecutionSupported; + execution->useSortedMerge = useSortedMerge; + 
execution->perTaskStores = perTaskStores; + execution->perTaskStoreCount = perTaskStoreCount; + /* * Since task can have multiple queries, we are not sure how many columns we should * allocate for. We start with 16, and reallocate when we need more. diff --git a/src/backend/distributed/executor/multi_executor.c b/src/backend/distributed/executor/multi_executor.c index 1893e262765..8661d367345 100644 --- a/src/backend/distributed/executor/multi_executor.c +++ b/src/backend/distributed/executor/multi_executor.c @@ -85,6 +85,9 @@ ParamListInfo executorBoundParams = NULL; /* sort the returning to get consistent outputs, used only for testing */ bool SortReturning = false; +/* when true at planning time, enables coordinator sorted merge for ORDER BY */ +bool EnableSortedMerge = false; + /* * How many nested executors have we started? This can happen for SQL * UDF calls. The outer query starts an executor, then postgres opens diff --git a/src/backend/distributed/executor/sorted_merge.c b/src/backend/distributed/executor/sorted_merge.c new file mode 100644 index 00000000000..585a90c5188 --- /dev/null +++ b/src/backend/distributed/executor/sorted_merge.c @@ -0,0 +1,338 @@ +/*------------------------------------------------------------------------- + * + * sorted_merge.c + * Implements coordinator-side sorted merge of pre-sorted worker results. + * + * CreatePerTaskDispatchDest() creates per-task tuple stores and returns + * a TupleDestination that routes incoming tuples to the correct store + * based on task->taskId. The only Task field written is + * totalReceivedTupleData (execution-time reporting, reset each execution). + * + * MergePerTaskStoresIntoFinalStore() performs a k-way merge of the + * per-task stores into a single output tuplestore using a binary heap + * and PostgreSQL's SortSupport infrastructure. + * + * Copyright (c) Citus Data, Inc. 
+ *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" + +#include "executor/tuptable.h" +#include "lib/binaryheap.h" +#include "utils/hsearch.h" +#include "utils/sortsupport.h" + +#include "distributed/listutils.h" +#include "distributed/sorted_merge.h" +#include "distributed/subplan_execution.h" + + +/* + * PerTaskDispatchTupleDest routes tuples to per-task tuple stores + * based on the task's taskId. This is an execution-local object that + * is never attached to a reusable Task node. + */ +typedef struct PerTaskDispatchTupleDest +{ + TupleDestination pub; + Tuplestorestate **perTaskStores; + int taskCount; + TupleDesc tupleDesc; + HTAB *taskIdToIndex; /* maps uint32 taskId -> int array index */ +} PerTaskDispatchTupleDest; + + +/* + * TaskIdIndexEntry is a hash table entry mapping taskId to per-task store index. + */ +typedef struct TaskIdIndexEntry +{ + uint32 taskId; /* hash key */ + int index; /* index into perTaskStores array */ +} TaskIdIndexEntry; + + +/* + * MergeContext holds the state needed by the binary heap comparator. + */ +typedef struct MergeContext +{ + TupleTableSlot **slots; + SortSupportData *sortKeys; + int nkeys; +} MergeContext; + + +/* forward declarations */ +static void PerTaskDispatchPutTuple(TupleDestination *self, Task *task, + int placementIndex, int queryNumber, + HeapTuple heapTuple, uint64 tupleLibpqSize); +static TupleDesc PerTaskDispatchTupleDescForQuery(TupleDestination *self, + int queryNumber); +static int MergeHeapComparator(Datum a, Datum b, void *arg); + + +/* + * CreatePerTaskDispatchDest creates per-task tuple stores and returns a + * TupleDestination that routes incoming tuples to the correct store based + * on task->taskId. + * + * The per-task stores and their count are returned via out parameters so + * the caller can pass them to MergePerTaskStoresIntoFinalStore() later. 
+ * + * All memory is allocated in CurrentMemoryContext (expected to be the + * AdaptiveExecutor local context). + */ +TupleDestination * +CreatePerTaskDispatchDest(List *taskList, TupleDesc tupleDesc, + TupleDestinationStats *sharedStats, + Tuplestorestate ***perTaskStoresOut, + int *perTaskStoreCountOut) +{ + int taskCount = list_length(taskList); + if (taskCount == 0) + { + *perTaskStoresOut = NULL; + *perTaskStoreCountOut = 0; + return CreateTupleDestNone(); + } + + /* + * Allocate per-task tuple stores. Each store gets work_mem / taskCount, + * with a floor of 64 kB. Note: this means the aggregate in-memory budget + * for per-task stores can exceed a single work_mem allocation when + * taskCount is large (e.g., 128 tasks × 64 kB = 8 MB floor). The final + * output tuplestore also gets a full work_mem allocation. This is a + * deliberate trade-off: per-task stores spill to disk automatically, + * and they are freed before the final tuplestore is consumed. The + * temporary memory amplification is bounded and short-lived. 
+ */ + Tuplestorestate **perTaskStores = palloc(taskCount * sizeof(Tuplestorestate *)); + int perTaskWorkMem = Max(work_mem / Max(taskCount, 1), 64); + + for (int i = 0; i < taskCount; i++) + { + perTaskStores[i] = tuplestore_begin_heap(false, false, perTaskWorkMem); + } + + /* build taskId -> array index hash table */ + HASHCTL hashInfo; + memset(&hashInfo, 0, sizeof(hashInfo)); + hashInfo.keysize = sizeof(uint32); + hashInfo.entrysize = sizeof(TaskIdIndexEntry); + hashInfo.hcxt = CurrentMemoryContext; + HTAB *taskIdToIndex = hash_create("PerTaskDispatchHash", taskCount, + &hashInfo, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + int index = 0; + Task *task = NULL; + foreach_declared_ptr(task, taskList) + { + bool found = false; + TaskIdIndexEntry *entry = hash_search(taskIdToIndex, &task->taskId, + HASH_ENTER, &found); + Assert(!found); + entry->index = index; + index++; + } + + /* build the dispatch TupleDestination */ + PerTaskDispatchTupleDest *dispatch = palloc0(sizeof(PerTaskDispatchTupleDest)); + dispatch->pub.putTuple = PerTaskDispatchPutTuple; + dispatch->pub.tupleDescForQuery = PerTaskDispatchTupleDescForQuery; + dispatch->pub.tupleDestinationStats = sharedStats; + dispatch->perTaskStores = perTaskStores; + dispatch->taskCount = taskCount; + dispatch->tupleDesc = tupleDesc; + dispatch->taskIdToIndex = taskIdToIndex; + + *perTaskStoresOut = perTaskStores; + *perTaskStoreCountOut = taskCount; + + return (TupleDestination *) dispatch; +} + + +/* + * PerTaskDispatchPutTuple routes a tuple to the per-task store identified + * by the task's taskId. Matches the behavior of TupleStoreTupleDestPutTuple + * for intermediate-result accounting and totalReceivedTupleData tracking. 
+ */ +static void +PerTaskDispatchPutTuple(TupleDestination *self, Task *task, + int placementIndex, int queryNumber, + HeapTuple heapTuple, uint64 tupleLibpqSize) +{ + PerTaskDispatchTupleDest *dispatch = (PerTaskDispatchTupleDest *) self; + + /* look up the per-task store index */ + bool found = false; + TaskIdIndexEntry *entry = hash_search(dispatch->taskIdToIndex, &task->taskId, + HASH_FIND, &found); + Assert(found); + tuplestore_puttuple(dispatch->perTaskStores[entry->index], heapTuple); + + /* intermediate-result size accounting (matches TupleStoreTupleDestPutTuple) */ + uint64 tupleSize = tupleLibpqSize; + if (tupleSize == 0) + { + tupleSize = heapTuple->t_len; + } + + TupleDestinationStats *stats = self->tupleDestinationStats; + if (SubPlanLevel > 0 && stats != NULL) + { + stats->totalIntermediateResultSize += tupleSize; + EnsureIntermediateSizeLimitNotExceeded(stats); + } + + /* track network transfer size (matches TupleStoreTupleDestPutTuple) */ + task->totalReceivedTupleData += tupleLibpqSize; +} + + +/* + * PerTaskDispatchTupleDescForQuery returns the tuple descriptor. + * + * Only queryNumber == 0 (data tuples) is expected to reach the per-task + * dispatch directly. Under EXPLAIN ANALYZE, the ExplainAnalyzeDestination + * wrapper intercepts plan-fetch tuples (queryNumber == 1) before they + * reach us, but may call tupleDescForQuery with queryNumber == 0 or 1. + * We return the same data tuple descriptor in all cases. + */ +static TupleDesc +PerTaskDispatchTupleDescForQuery(TupleDestination *self, int queryNumber) +{ + PerTaskDispatchTupleDest *dispatch = (PerTaskDispatchTupleDest *) self; + return dispatch->tupleDesc; +} + + +/* + * MergePerTaskStoresIntoFinalStore performs a k-way merge of pre-sorted + * per-task tuple stores into a single output tuplestore using a binary heap. + * + * Each per-task store must contain tuples sorted by the given merge keys. + * The output tuplestore will contain all tuples in globally sorted order. 
+ * + * Uses PostgreSQL's public binaryheap and SortSupport APIs. + */ +void +MergePerTaskStoresIntoFinalStore(Tuplestorestate *finalStore, + Tuplestorestate **perTaskStores, + int nstores, + SortedMergeKey *mergeKeys, + int nkeys, + TupleDesc tupleDesc) +{ + if (nstores == 0 || nkeys == 0) + { + return; + } + + /* allocate one reusable slot per task store */ + TupleTableSlot **slots = palloc(nstores * sizeof(TupleTableSlot *)); + for (int i = 0; i < nstores; i++) + { + slots[i] = MakeSingleTupleTableSlot(tupleDesc, &TTSOpsMinimalTuple); + } + + /* build SortSupport from serialized merge keys */ + SortSupportData *sortKeys = palloc0(nkeys * sizeof(SortSupportData)); + for (int i = 0; i < nkeys; i++) + { + SortSupport sk = &sortKeys[i]; + sk->ssup_cxt = CurrentMemoryContext; + sk->ssup_collation = mergeKeys[i].collation; + sk->ssup_nulls_first = mergeKeys[i].nullsFirst; + sk->ssup_attno = mergeKeys[i].attno; + PrepareSortSupportFromOrderingOp(mergeKeys[i].sortop, sk); + } + + /* set up merge context for heap comparisons */ + MergeContext ctx; + ctx.slots = slots; + ctx.sortKeys = sortKeys; + ctx.nkeys = nkeys; + + binaryheap *heap = binaryheap_allocate(nstores, MergeHeapComparator, &ctx); + + /* seed the heap with the first tuple from each non-empty store */ + for (int i = 0; i < nstores; i++) + { + tuplestore_rescan(perTaskStores[i]); + if (tuplestore_gettupleslot(perTaskStores[i], true, false, slots[i])) + { + binaryheap_add_unordered(heap, Int32GetDatum(i)); + } + } + binaryheap_build(heap); + + /* merge loop: extract min, write to final store, advance winner */ + while (!binaryheap_empty(heap)) + { + int winner = DatumGetInt32(binaryheap_first(heap)); + tuplestore_puttupleslot(finalStore, slots[winner]); + + if (tuplestore_gettupleslot(perTaskStores[winner], true, false, + slots[winner])) + { + binaryheap_replace_first(heap, Int32GetDatum(winner)); + } + else + { + (void) binaryheap_remove_first(heap); + } + } + + /* free merge-local resources */ + 
binaryheap_free(heap); + for (int i = 0; i < nstores; i++) + { + ExecDropSingleTupleTableSlot(slots[i]); + } + pfree(slots); + pfree(sortKeys); +} + + +/* + * MergeHeapComparator compares tuples from two task stores by the merge keys. + * Returns negative if a < b, positive if a > b, zero if equal. + * The binary heap is a max-heap, so we negate to get min-heap behavior. + * + * This is modeled after heap_compare_slots() in nodeMergeAppend.c. + */ +static int +MergeHeapComparator(Datum a, Datum b, void *arg) +{ + MergeContext *ctx = (MergeContext *) arg; + int slot1 = DatumGetInt32(a); + int slot2 = DatumGetInt32(b); + TupleTableSlot *s1 = ctx->slots[slot1]; + TupleTableSlot *s2 = ctx->slots[slot2]; + + for (int i = 0; i < ctx->nkeys; i++) + { + SortSupport sortKey = &ctx->sortKeys[i]; + AttrNumber attno = sortKey->ssup_attno; + bool isNull1, isNull2; + + Datum datum1 = slot_getattr(s1, attno, &isNull1); + Datum datum2 = slot_getattr(s2, attno, &isNull2); + + int compare = ApplySortComparator(datum1, isNull1, + datum2, isNull2, + sortKey); + if (compare != 0) + { + /* binaryheap is a max-heap, negate for min-heap behavior */ + return -compare; + } + } + + return 0; +} diff --git a/src/backend/distributed/executor/tuple_destination.c b/src/backend/distributed/executor/tuple_destination.c index b3c4b509c2f..78e1f98887d 100644 --- a/src/backend/distributed/executor/tuple_destination.c +++ b/src/backend/distributed/executor/tuple_destination.c @@ -49,8 +49,6 @@ typedef struct TupleDestDestReceiver static void TupleStoreTupleDestPutTuple(TupleDestination *self, Task *task, int placementIndex, int queryNumber, HeapTuple heapTuple, uint64 tupleLibpqSize); -static void EnsureIntermediateSizeLimitNotExceeded(TupleDestinationStats * - tupleDestinationStats); static TupleDesc TupleStoreTupleDestTupleDescForQuery(TupleDestination *self, int queryNumber); static void TupleDestNonePutTuple(TupleDestination *self, Task *task, @@ -135,7 +133,7 @@ 
TupleStoreTupleDestPutTuple(TupleDestination *self, Task *task, * EnsureIntermediateSizeLimitNotExceeded is a helper function for checking the current * state of the tupleDestinationStats and throws error if necessary. */ -static void +void EnsureIntermediateSizeLimitNotExceeded(TupleDestinationStats *tupleDestinationStats) { if (!tupleDestinationStats) diff --git a/src/backend/distributed/planner/combine_query_planner.c b/src/backend/distributed/planner/combine_query_planner.c index c8ab2a4b326..740a814ea06 100644 --- a/src/backend/distributed/planner/combine_query_planner.c +++ b/src/backend/distributed/planner/combine_query_planner.c @@ -22,6 +22,7 @@ #include "distributed/citus_ruleutils.h" #include "distributed/combine_query_planner.h" +#include "distributed/distributed_planner.h" #include "distributed/insert_select_planner.h" #include "distributed/listutils.h" #include "distributed/metadata_cache.h" @@ -154,6 +155,22 @@ CreateCitusCustomScanPath(PlannerInfo *root, RelOptInfo *relOptInfo, path->custom_path.path.rows = 100000; path->remoteScan = remoteScan; + /* + * When sorted merge is active (decided at planning time and baked into the + * DistributedPlan), declare that this CustomScan produces sorted output by + * setting pathkeys to match the combine query's required sort order. + * + * This causes PostgreSQL's create_ordered_paths() to recognize the + * CustomScan output as already sorted and skip adding a Sort node above + * it. The executor fulfills this contract by merging per-task stores in + * sort order into the final tuplestore. 
+ */ + DistributedPlan *distPlan = GetDistributedPlan(remoteScan); + if (distPlan->useSortedMerge && root->sort_pathkeys != NIL) + { + path->custom_path.path.pathkeys = root->sort_pathkeys; + } + return (Path *) path; } diff --git a/src/backend/distributed/planner/multi_explain.c b/src/backend/distributed/planner/multi_explain.c index 52e56030eb7..8c9fd8a720c 100644 --- a/src/backend/distributed/planner/multi_explain.c +++ b/src/backend/distributed/planner/multi_explain.c @@ -710,6 +710,13 @@ ExplainJob(CitusScanState *scanState, Job *job, ExplainState *es, ExplainOpenGroup("Job", "Job", true, es); ExplainPropertyInteger("Task Count", NULL, taskCount, es); + + DistributedPlan *distributedPlan = scanState->distributedPlan; + if (distributedPlan->useSortedMerge) + { + ExplainPropertyText("Merge Method", "sorted merge", es); + } + if (ShowReceivedTupleData(scanState, es)) { Task *task = NULL; diff --git a/src/backend/distributed/planner/multi_logical_optimizer.c b/src/backend/distributed/planner/multi_logical_optimizer.c index cd117b3545a..cf04a7e7f58 100644 --- a/src/backend/distributed/planner/multi_logical_optimizer.c +++ b/src/backend/distributed/planner/multi_logical_optimizer.c @@ -50,6 +50,7 @@ #include "distributed/function_utils.h" #include "distributed/listutils.h" #include "distributed/metadata_cache.h" +#include "distributed/multi_executor.h" #include "distributed/multi_logical_optimizer.h" #include "distributed/multi_logical_planner.h" #include "distributed/multi_physical_planner.h" @@ -341,6 +342,7 @@ static bool ShouldProcessDistinctOrderAndLimitForWorker(ExtendedOpNodeProperties bool pushingDownOriginalGrouping, Node *havingQual); static bool IsIndexInRange(const List *list, int index); +static bool SortClauseListsMatch(List *workerClauses, List *originalClauses); /* * MultiLogicalPlanOptimize applies multi-relational algebra optimizations on @@ -2549,6 +2551,31 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode, */ 
workerExtendedOpNode->limitOption = originalOpNode->limitOption; + /* + * Determine sorted-merge eligibility. This is a plan-time-only decision. + * The worker sort clause list is the output of the existing safety analysis + * in WorkerSortClauseList(). If it matches the original sort clause, workers + * will produce identically-sorted output suitable for a coordinator merge. + * + * We must also exclude queries where ORDER BY references aggregates, + * because aggregate expressions are rewritten between worker and coordinator + * (e.g. avg → sum/count). The worker's sort order on partial aggregates + * does not match the coordinator's final aggregate sort order, so the + * merge would produce incorrectly ordered output. This check is needed + * because the existing LIMIT pushdown path may have already pushed the + * sort clause to workers for its own purposes. + */ + if (EnableSortedMerge && + queryOrderByLimit.workerSortClauseList != NIL && + originalSortClauseList != NIL && + !extendedOpNodeProperties->pullUpIntermediateRows && + !HasOrderByAggregate(originalSortClauseList, originalTargetEntryList) && + SortClauseListsMatch(queryOrderByLimit.workerSortClauseList, + originalSortClauseList)) + { + workerExtendedOpNode->sortedMergeEligible = true; + } + return workerExtendedOpNode; } @@ -5158,6 +5185,12 @@ WorkerLimitCount(Node *limitCount, Node *limitOffset, OrderByLimitReference * checks if we need to add any sorting and grouping clauses to the sort list we * push down for the limit. If we do, the function adds these clauses and * returns them. Otherwise, the function returns null. + * + * When citus.enable_sorted_merge is enabled, we also push down the sort + * clause to workers even without a LIMIT, for queries where the sort + * is safe to push (no aggregates in ORDER BY, no non-pushable window + * functions, and either no GROUP BY or GROUP BY on partition column). + * This enables the coordinator to merge pre-sorted worker results. 
*/ static List * WorkerSortClauseList(Node *limitCount, List *groupClauseList, List *sortClauseList, @@ -5165,6 +5198,22 @@ WorkerSortClauseList(Node *limitCount, List *groupClauseList, List *sortClauseLi { List *workerSortClauseList = NIL; + /* + * When sorted merge is enabled, push the sort clause to workers even + * without a LIMIT. The coordinator will merge the sorted streams + * instead of doing a full re-sort. + */ + if (EnableSortedMerge && sortClauseList != NIL && + orderByLimitReference.onlyPushableWindowFunctions && + !orderByLimitReference.hasOrderByAggregate) + { + if (orderByLimitReference.groupClauseIsEmpty || + orderByLimitReference.groupedByDisjointPartitionColumn) + { + return copyObject(sortClauseList); + } + } + /* if no limit node and no hasDistinctOn, no need to push down sort clauses */ if (limitCount == NULL && !orderByLimitReference.hasDistinctOn) { @@ -5473,3 +5522,45 @@ IsGroupBySubsetOfDistinct(List *groupClauses, List *distinctClauses) return true; } + + +/* + * SortClauseListsMatch checks whether two SortGroupClause lists represent + * semantically identical sort orderings. Compares tleSortGroupRef, sortop, + * nulls_first, and eqop for each corresponding entry. 
+ */ +static bool +SortClauseListsMatch(List *workerClauses, List *originalClauses) +{ + if (list_length(workerClauses) != list_length(originalClauses)) + { + return false; + } + + ListCell *wc; + ListCell *oc; + forboth(wc, workerClauses, oc, originalClauses) + { + SortGroupClause *w = lfirst_node(SortGroupClause, wc); + SortGroupClause *o = lfirst_node(SortGroupClause, oc); + + if (w->tleSortGroupRef != o->tleSortGroupRef) + { + return false; + } + if (w->sortop != o->sortop) + { + return false; + } + if (w->nulls_first != o->nulls_first) + { + return false; + } + if (w->eqop != o->eqop) + { + return false; + } + } + + return true; +} diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index f7d49ab5115..59967373abe 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -162,6 +162,10 @@ static MapMergeJob * BuildMapMergeJob(Query *jobQuery, List *dependentJobList, Var *partitionKey, PartitionType partitionType, Oid baseRelationId, BoundaryNodeJobType boundaryNodeJobType); +static SortedMergeKey * BuildSortedMergeKeys(List *sortClauseList, + List *targetList, int *nkeys); +static void SetSortedMergeFields(MultiTreeRoot *multiTree, Job *workerJob, + DistributedPlan *distributedPlan); static uint32 HashPartitionCount(void); /* Local functions forward declarations for task list creation and helper functions */ @@ -270,6 +274,9 @@ CreatePhysicalDistributedPlan(MultiTreeRoot *multiTree, distributedPlan->modLevel = ROW_MODIFY_READONLY; distributedPlan->expectResults = true; + /* check sorted merge eligibility and populate merge-key metadata */ + SetSortedMergeFields(multiTree, workerJob, distributedPlan); + return distributedPlan; } @@ -2035,6 +2042,97 @@ BuildMapMergeJob(Query *jobQuery, List *dependentJobList, Var *partitionKey, } +/* + * SetSortedMergeFields checks whether the logical optimizer tagged the + * 
worker extended op node as eligible for a coordinator-side sorted merge. + * If so, the function builds merge-key metadata from the worker job query's + * sort clause and target list, and sets useSortedMerge on the plan. + * + * This is a plan-time decision: the executor reads only the plan fields, + * never the GUC. + * + * We directly walk the tree structure rather than using FindNodesOfType, + * which would traverse into subquery subtrees and could find unrelated + * MultiExtendedOp nodes. After MultiLogicalPlanOptimize the tree is: + * MultiTreeRoot -> MasterExtendedOp -> MultiCollect -> WorkerExtendedOp + */ +static void +SetSortedMergeFields(MultiTreeRoot *multiTree, Job *workerJob, + DistributedPlan *distributedPlan) +{ + MultiNode *masterChild = ChildNode((MultiUnaryNode *) multiTree); + if (!CitusIsA(masterChild, MultiExtendedOp)) + { + return; + } + + MultiNode *collectNode = ChildNode((MultiUnaryNode *) masterChild); + if (!CitusIsA(collectNode, MultiCollect)) + { + return; + } + + MultiNode *workerNode = ChildNode((MultiUnaryNode *) collectNode); + if (!CitusIsA(workerNode, MultiExtendedOp)) + { + return; + } + + MultiExtendedOp *workerExtOp = (MultiExtendedOp *) workerNode; + if (!workerExtOp->sortedMergeEligible) + { + return; + } + + Query *jobQuery = workerJob->jobQuery; + int nkeys = 0; + SortedMergeKey *keys = BuildSortedMergeKeys(jobQuery->sortClause, + jobQuery->targetList, + &nkeys); + if (nkeys > 0) + { + distributedPlan->useSortedMerge = true; + distributedPlan->sortedMergeKeyCount = nkeys; + distributedPlan->sortedMergeKeys = keys; + } +} + + +/* + * BuildSortedMergeKeys constructs an array of SortedMergeKey from a sort clause + * list and its corresponding target list. The resulting keys are used by the + * executor to set up SortSupport structures for the k-way merge. + * + * The attribute numbers in the keys correspond to worker output column positions, + * which align with the 1-based non-junk ordering of the worker target list. 
+ */ +static SortedMergeKey * +BuildSortedMergeKeys(List *sortClauseList, List *targetList, int *nkeys) +{ + *nkeys = list_length(sortClauseList); + if (*nkeys == 0) + { + return NULL; + } + + SortedMergeKey *keys = palloc(*nkeys * sizeof(SortedMergeKey)); + + int i = 0; + SortGroupClause *sgc = NULL; + foreach_declared_ptr(sgc, sortClauseList) + { + TargetEntry *tle = get_sortgroupclause_tle(sgc, targetList); + keys[i].attno = tle->resno; + keys[i].sortop = sgc->sortop; + keys[i].collation = exprCollation((Node *) tle->expr); + keys[i].nullsFirst = sgc->nulls_first; + i++; + } + + return keys; +} + + /* * HashPartitionCount returns the number of partition files we create for a hash * partition task. The function follows Hadoop's method for picking the number diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 3dbd81abb32..48842050c3b 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -1604,6 +1604,19 @@ RegisterCitusConfigVariables(void) GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, NULL, NULL, NULL); + DefineCustomBoolVariable( + "citus.enable_sorted_merge", + gettext_noop("Enables sorted merge of worker results for ORDER BY queries."), + gettext_noop("When enabled during planning, Citus pushes ORDER BY to workers " + "and merges the pre-sorted results on the coordinator using a " + "binary heap, eliminating the Sort node in the combine query. 
" + "This is an experimental feature."), + &EnableSortedMerge, + false, + PGC_SUSET, + GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + DefineCustomBoolVariable( "citus.enable_stat_counters", gettext_noop("Enables the collection of statistic counters for Citus."), diff --git a/src/backend/distributed/utils/citus_copyfuncs.c b/src/backend/distributed/utils/citus_copyfuncs.c index 74496151c0f..e0e118d62c3 100644 --- a/src/backend/distributed/utils/citus_copyfuncs.c +++ b/src/backend/distributed/utils/citus_copyfuncs.c @@ -14,6 +14,7 @@ #include "utils/datum.h" #include "distributed/citus_nodefuncs.h" +#include "distributed/citus_safe_lib.h" #include "distributed/listutils.h" #include "distributed/multi_server_executor.h" @@ -140,6 +141,20 @@ CopyNodeDistributedPlan(COPYFUNC_ARGS) COPY_SCALAR_FIELD(sourceResultRepartitionColumnIndex); COPY_SCALAR_FIELD(disableTrackingQueryCounters); + + COPY_SCALAR_FIELD(useSortedMerge); + COPY_SCALAR_FIELD(sortedMergeKeyCount); + if (from->sortedMergeKeyCount > 0 && from->sortedMergeKeys != NULL) + { + Size keySize = from->sortedMergeKeyCount * sizeof(SortedMergeKey); + newnode->sortedMergeKeys = (SortedMergeKey *) palloc(keySize); + memcpy_s(newnode->sortedMergeKeys, keySize, + from->sortedMergeKeys, keySize); + } + else + { + newnode->sortedMergeKeys = NULL; + } } diff --git a/src/backend/distributed/utils/citus_outfuncs.c b/src/backend/distributed/utils/citus_outfuncs.c index 4782cab5de4..9c30d38be6f 100644 --- a/src/backend/distributed/utils/citus_outfuncs.c +++ b/src/backend/distributed/utils/citus_outfuncs.c @@ -205,6 +205,18 @@ OutDistributedPlan(OUTFUNC_ARGS) WRITE_NODE_FIELD(planningError); WRITE_INT_FIELD(sourceResultRepartitionColumnIndex); WRITE_BOOL_FIELD(disableTrackingQueryCounters); + + WRITE_BOOL_FIELD(useSortedMerge); + WRITE_INT_FIELD(sortedMergeKeyCount); + for (int i = 0; i < node->sortedMergeKeyCount; i++) + { + appendStringInfoString(str, " :sortedMergeKey"); + appendStringInfo(str, " :attno %d", 
node->sortedMergeKeys[i].attno); + appendStringInfo(str, " :sortop %u", node->sortedMergeKeys[i].sortop); + appendStringInfo(str, " :collation %u", node->sortedMergeKeys[i].collation); + appendStringInfo(str, " :nullsFirst %s", + booltostr(node->sortedMergeKeys[i].nullsFirst)); + } } @@ -376,6 +388,7 @@ OutMultiExtendedOp(OUTFUNC_ARGS) WRITE_BOOL_FIELD(hasWindowFuncs); WRITE_BOOL_FIELD(onlyPushableWindowFunctions); WRITE_NODE_FIELD(windowClause); + WRITE_BOOL_FIELD(sortedMergeEligible); OutMultiUnaryNodeFields(str, (const MultiUnaryNode *) node); } diff --git a/src/include/distributed/multi_executor.h b/src/include/distributed/multi_executor.h index b0b0288de87..c18067b5499 100644 --- a/src/include/distributed/multi_executor.h +++ b/src/include/distributed/multi_executor.h @@ -70,6 +70,7 @@ extern bool ForceMaxQueryParallelization; extern int MaxAdaptiveExecutorPoolSize; extern int ExecutorSlowStartInterval; extern bool SortReturning; +extern bool EnableSortedMerge; extern int ExecutorLevel; diff --git a/src/include/distributed/multi_logical_planner.h b/src/include/distributed/multi_logical_planner.h index edeae6a59ba..4843453b120 100644 --- a/src/include/distributed/multi_logical_planner.h +++ b/src/include/distributed/multi_logical_planner.h @@ -186,6 +186,7 @@ typedef struct MultiExtendedOp bool hasDistinctOn; bool hasWindowFuncs; bool onlyPushableWindowFunctions; + bool sortedMergeEligible; } MultiExtendedOp; diff --git a/src/include/distributed/multi_physical_planner.h b/src/include/distributed/multi_physical_planner.h index 3f13b0df97a..d9f5e6b68bc 100644 --- a/src/include/distributed/multi_physical_planner.h +++ b/src/include/distributed/multi_physical_planner.h @@ -393,6 +393,21 @@ typedef enum ModifyWithSelectMethod } ModifyWithSelectMethod; +/* + * SortedMergeKey describes one sort key for the coordinator-side + * k-way merge of pre-sorted worker results. 
These are serialized + * on DistributedPlan at planning time so the executor can build + * SortSupport structures without consulting the combine query. + */ +typedef struct SortedMergeKey +{ + AttrNumber attno; /* 1-based attribute in the worker output */ + Oid sortop; /* ordering operator OID */ + Oid collation; /* collation OID */ + bool nullsFirst; /* NULLS FIRST? */ +} SortedMergeKey; + + /* * DistributedPlan contains all information necessary to execute a * distribute query. @@ -492,6 +507,16 @@ typedef struct DistributedPlan * Disables tracking query stat counters if true. */ bool disableTrackingQueryCounters; + + /* + * Sorted merge: when true, the coordinator performs a k-way merge + * of pre-sorted worker results instead of relying on an upper Sort node. + * This is a plan-time decision — the executor reads only this flag and + * the associated merge keys, never the GUC. + */ + bool useSortedMerge; + int sortedMergeKeyCount; + SortedMergeKey *sortedMergeKeys; } DistributedPlan; diff --git a/src/include/distributed/sorted_merge.h b/src/include/distributed/sorted_merge.h new file mode 100644 index 00000000000..eeb3e690d35 --- /dev/null +++ b/src/include/distributed/sorted_merge.h @@ -0,0 +1,34 @@ +/*------------------------------------------------------------------------- + * + * sorted_merge.h + * Declarations for coordinator-side sorted merge of pre-sorted + * worker results using a binary heap. + * + * Copyright (c) Citus Data, Inc. 
+ *------------------------------------------------------------------------- + */ + +#ifndef SORTED_MERGE_H +#define SORTED_MERGE_H + +#include "access/tupdesc.h" +#include "utils/tuplestore.h" + +#include "distributed/multi_physical_planner.h" +#include "distributed/tuple_destination.h" + + +extern TupleDestination * CreatePerTaskDispatchDest(List *taskList, + TupleDesc tupleDesc, + TupleDestinationStats *sharedStats, + Tuplestorestate ***perTaskStoresOut, + int *perTaskStoreCountOut); + +extern void MergePerTaskStoresIntoFinalStore(Tuplestorestate *finalStore, + Tuplestorestate **perTaskStores, + int nstores, + SortedMergeKey *mergeKeys, + int nkeys, + TupleDesc tupleDesc); + +#endif /* SORTED_MERGE_H */ diff --git a/src/include/distributed/tuple_destination.h b/src/include/distributed/tuple_destination.h index 5b4f649835f..e4659ed25f9 100644 --- a/src/include/distributed/tuple_destination.h +++ b/src/include/distributed/tuple_destination.h @@ -65,5 +65,7 @@ extern TupleDestination * CreateTupleStoreTupleDest(Tuplestorestate *tupleStore, extern TupleDestination * CreateTupleDestNone(void); extern DestReceiver * CreateTupleDestDestReceiver(TupleDestination *tupleDest, Task *task, int placementIndex); +extern void EnsureIntermediateSizeLimitNotExceeded(TupleDestinationStats * + tupleDestinationStats); #endif diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out new file mode 100644 index 00000000000..6c930ae8d3e --- /dev/null +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -0,0 +1,3135 @@ +-- +-- MULTI_SORTED_MERGE +-- +-- Tests for the citus.enable_sorted_merge GUC and the sorted merge +-- planner eligibility logic. Verifies that enabling the GUC does not +-- introduce regressions for any query pattern. 
+-- +-- MX verification: this test has been verified to pass with zero diffs +-- under check-base-mx (MX mode), confirming sorted merge works correctly +-- when any node in the cluster acts as coordinator. +-- +SET citus.next_shard_id TO 960000; +-- ================================================================= +-- Setup: create test tables +-- ================================================================= +CREATE TABLE sorted_merge_test ( + id int, + val text, + num numeric, + ts timestamptz DEFAULT now() +); +SELECT create_distributed_table('sorted_merge_test', 'id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- Insert 100 rows + NULLs + duplicates +INSERT INTO sorted_merge_test (id, val, num) +SELECT i, 'val_' || i, (i * 1.5)::numeric +FROM generate_series(1, 100) i; +INSERT INTO sorted_merge_test (id, val, num) VALUES (101, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (102, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (200, 'dup_a', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (201, 'dup_b', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (202, 'dup_c', 10.5); +-- Second table for join tests +CREATE TABLE sorted_merge_events ( + id int, + event_type text, + event_val int +); +SELECT create_distributed_table('sorted_merge_events', 'id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +INSERT INTO sorted_merge_events +SELECT i % 50 + 1, CASE WHEN i % 3 = 0 THEN 'click' WHEN i % 3 = 1 THEN 'view' ELSE 'buy' END, i +FROM generate_series(1, 200) i; +-- ================================================================= +-- 1. 
GUC basics +-- ================================================================= +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge +--------------------------------------------------------------------- + off +(1 row) + +SET citus.enable_sorted_merge TO on; +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge +--------------------------------------------------------------------- + on +(1 row) + +SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Category A: Eligibility — sort IS pushed to workers +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- A1: ORDER BY distribution column +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) + +-- A2: ORDER BY DESC +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task 
Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id DESC + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(16 rows) + +-- A3: ORDER BY DESC NULLS LAST +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num DESC NULLS LAST + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, num + Sort Key: sorted_merge_test.num DESC NULLS LAST + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num +(16 rows) + +-- A4: ORDER BY non-distribution column +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY val'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from 
nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.val + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) + +-- A5: Multi-column ORDER BY +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id, sorted_merge_test.val + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) + +-- A6: Mixed directions +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: 
SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, num DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, num + Sort Key: sorted_merge_test.id, sorted_merge_test.num DESC + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num +(16 rows) + +-- A7: GROUP BY dist_col ORDER BY dist_col +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(20 rows) + +-- A8: WHERE clause + ORDER BY +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task 
Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (num OPERATOR(pg_catalog.>) 'N'::numeric) ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val + Filter: (sorted_merge_test.num > 'N'::numeric) + Rows Removed by Filter: N +(18 rows) + +-- A9: Expression in ORDER BY (non-aggregate) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num, (id OPERATOR(pg_catalog.+) N) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) N) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, num, ((id + N)) + Sort Key: ((sorted_merge_test.id + N)) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num, (id + N) +(16 rows) + +-- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + 
explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(20 rows) + +-- ================================================================= +-- Category B: Ineligibility — sort NOT pushed for merge +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- B1: ORDER BY count(*) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Sort Key: remote_scan.count + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: 
sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) + +-- B2: ORDER BY avg(col) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.avg + Sort Key: remote_scan.avg + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.avg + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, avg(num) AS avg FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, avg(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) + +-- B3: GROUP BY non-dist col, ORDER BY non-dist col +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Key: remote_scan.val + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint) + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.val, 
remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(23 rows) + +-- B4: GROUP BY non-dist col, ORDER BY aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint) + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.val, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(23 rows) + +-- ================================================================= +-- 
Category C: Correctness — results match GUC off vs on +-- ================================================================= +-- C1: Simple ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- C2: ORDER BY DESC +SET citus.enable_sorted_merge TO off; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id +--------------------------------------------------------------------- + 202 + 201 + 200 + 102 + 101 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id +--------------------------------------------------------------------- + 202 + 201 + 200 + 102 + 101 +(5 rows) + +-- C3: Multi-column ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- C4: ORDER BY non-distribution column +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val 
+--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- C5: GROUP BY dist_col ORDER BY dist_col +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- C6: Mixed directions +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- C7: WHERE + ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + id | val 
+--------------------------------------------------------------------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +-- C8: Aggregates in SELECT, ORDER BY on dist_col (GROUP BY dist_col) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +-- ================================================================= +-- Category D: Complex queries — regression guards +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- D1: Subquery in FROM with ORDER BY +SELECT * FROM ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) sub ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D2: CTE with ORDER BY +WITH top5 AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) +SELECT * FROM top5 ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D3: Co-located JOIN + ORDER BY +SELECT t.id, t.val, e.event_type +FROM sorted_merge_test t +JOIN 
sorted_merge_events e ON t.id = e.id +WHERE t.id <= 5 +ORDER BY t.id, e.event_type +LIMIT 10; + id | val | event_type +--------------------------------------------------------------------- + 1 | val_1 | buy + 1 | val_1 | buy + 1 | val_1 | click + 1 | val_1 | view + 2 | val_2 | buy + 2 | val_2 | click + 2 | val_2 | view + 2 | val_2 | view + 3 | val_3 | buy + 3 | val_3 | buy +(10 rows) + +-- D4: UNION ALL + ORDER BY +SELECT id, val FROM sorted_merge_test WHERE id <= 3 +UNION ALL +SELECT id, val FROM sorted_merge_test WHERE id BETWEEN 98 AND 100 +ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 98 | val_98 + 99 | val_99 + 100 | val_100 +(6 rows) + +-- D5: DISTINCT + ORDER BY +SELECT DISTINCT id FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +-- D6: DISTINCT ON + ORDER BY +SELECT DISTINCT ON (id) id, val, num +FROM sorted_merge_test +WHERE id <= 5 +ORDER BY id, num DESC; + id | val | num +--------------------------------------------------------------------- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) + +-- D7: EXISTS subquery + ORDER BY +SELECT id, val FROM sorted_merge_test t +WHERE EXISTS (SELECT 1 FROM sorted_merge_events e WHERE e.id = t.id) +ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D8: IN subquery + ORDER BY +SELECT id, val FROM sorted_merge_test +WHERE id IN (SELECT id FROM sorted_merge_events WHERE event_type = 'click') +ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- D9: Multiple aggregates, GROUP BY dist_col, ORDER BY dist_col 
+SELECT id, count(*), sum(num), avg(num), min(val), max(val) +FROM sorted_merge_test +GROUP BY id +ORDER BY id +LIMIT 5; + id | count | sum | avg | min | max +--------------------------------------------------------------------- + 1 | 1 | 1.5 | 1.50000000000000000000 | val_1 | val_1 + 2 | 1 | 3.0 | 3.0000000000000000 | val_2 | val_2 + 3 | 1 | 4.5 | 4.5000000000000000 | val_3 | val_3 + 4 | 1 | 6.0 | 6.0000000000000000 | val_4 | val_4 + 5 | 1 | 7.5 | 7.5000000000000000 | val_5 | val_5 +(5 rows) + +-- D10: CASE expression in SELECT + ORDER BY +SELECT id, + CASE WHEN num > 75 THEN 'high' WHEN num > 25 THEN 'mid' ELSE 'low' END as bucket +FROM sorted_merge_test +WHERE num IS NOT NULL +ORDER BY id +LIMIT 10; + id | bucket +--------------------------------------------------------------------- + 1 | low + 2 | low + 3 | low + 4 | low + 5 | low + 6 | low + 7 | low + 8 | low + 9 | low + 10 | low +(10 rows) + +-- D11: NULL values ordering +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS FIRST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS LAST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS FIRST, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST, id DESC LIMIT 5; + id | num +--------------------------------------------------------------------- + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 +(5 rows) + +-- D12: Large OFFSET +SELECT id FROM sorted_merge_test ORDER BY id OFFSET 100 LIMIT 5; + id 
+--------------------------------------------------------------------- + 101 + 102 + 200 + 201 + 202 +(5 rows) + +-- D13: ORDER BY ordinal position +SELECT id, val FROM sorted_merge_test ORDER BY 2, 1 LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- ================================================================= +-- Category E: Edge cases +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- E1: Empty result set +SELECT id FROM sorted_merge_test WHERE id < 0 ORDER BY id; + id +--------------------------------------------------------------------- +(0 rows) + +-- E2: Single row (may go through router planner) +SELECT id, val FROM sorted_merge_test WHERE id = 42 ORDER BY id; + id | val +--------------------------------------------------------------------- + 42 | val_42 +(1 row) + +-- E3: All rows with same sort value +SELECT id, num FROM sorted_merge_test WHERE num = 10.5 ORDER BY num, id; + id | num +--------------------------------------------------------------------- + 7 | 10.5 + 200 | 10.5 + 201 | 10.5 + 202 | 10.5 +(4 rows) + +-- E4: Wide sort key (4 columns) +SELECT id, val, num FROM sorted_merge_test +WHERE id <= 5 +ORDER BY num, val, id +LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) + +-- E5: Zero-task defensive path +-- CreatePerTaskDispatchDest handles taskCount=0 gracefully (returns a no-op +-- destination). This cannot be triggered via normal SQL because distributed +-- tables always have at least one shard. The closest we can test is an +-- empty-result query through the sorted merge path to verify no crash. 
+SELECT id FROM sorted_merge_test WHERE false ORDER BY id; + id +--------------------------------------------------------------------- +(0 rows) + +-- ================================================================= +-- Category F: Existing LIMIT pushdown stability +-- ================================================================= +-- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Sort (actual rows=N loops=N) + Output: remote_scan.id + Sort Key: remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(23 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task 
Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(20 rows) + +-- F2: GROUP BY dist_col + ORDER BY + LIMIT +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Sort Key: remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N 
loops=N) + Output: id, val, num, ts +(27 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(24 rows) + +-- F3: ORDER BY aggregate + LIMIT (not eligible for merge) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- ================================================================= +-- 
Category G: Phase 4 — Sort elision and advanced scenarios +-- ================================================================= +-- G1: Sort elision verification — coordinator Sort node absent +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Sort Key: remote_scan.id + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(15 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 
sorted_merge_test (actual rows=N loops=N) + Output: id, val +(16 rows) + +-- G2a: PREPARE with merge ON, EXECUTE after turning OFF +-- Plan-time decision is baked in — cached plan must still merge correctly. +-- Execute 6+ times to trigger PostgreSQL's generic plan caching, then +-- verify the plan shape is preserved after toggling the GUC. +SET citus.enable_sorted_merge TO on; +PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Verify plan shape after caching — no Sort above CustomScan +EXPLAIN (COSTS OFF) 
EXECUTE merge_on_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Merge Method: sorted merge + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(11 rows) + +SET citus.enable_sorted_merge TO off; +-- Cached plan retains the sorted merge decision from planning time +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Merge Method: sorted merge + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(11 rows) + +DEALLOCATE merge_on_stmt; +-- G2b: PREPARE with merge OFF, EXECUTE after turning ON +-- Cached plan has Sort node — must still return sorted results. 
+SET citus.enable_sorted_merge TO off; +PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Verify plan shape after caching — Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan 
on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +SET citus.enable_sorted_merge TO on; +-- Cached plan retains the non-merge decision from planning time +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +DEALLOCATE merge_off_stmt; +-- G3: Cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_cursor CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 1 + 2 + 3 +(3 rows) + +FETCH BACKWARD 1 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 2 +(1 row) + +FETCH 2 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 3 + 4 +(2 rows) + +CLOSE sorted_cursor; +COMMIT; +-- G4: EXPLAIN ANALYZE (sorted merge remains active under EXPLAIN ANALYZE) +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from 
nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(20 rows) + +-- G5: ORDER BY aggregate + LIMIT — crash regression test +-- Previously caused SIGSEGV when sorted merge was enabled because +-- aggregate ORDER BY was erroneously tagged as merge-eligible. +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 3; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 +(3 rows) + +-- G6: Small work_mem with many tasks (32 shards) +SET citus.enable_sorted_merge TO on; +SET work_mem TO '64kB'; +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +RESET work_mem; +-- G7: max_intermediate_result_size with CTE subplan +SET citus.enable_sorted_merge TO on; +SET citus.max_intermediate_result_size TO '4kB'; +WITH cte AS (SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 50) +SELECT * FROM cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; +-- ================================================================= +-- Category H: Subplan + Sorted Merge interactions +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1: CTE 
subplan with simple ORDER BY — eligible for sorted merge +-- The CTE becomes a subplan; its DistributedPlan may have useSortedMerge=true +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- H2: Multiple CTEs — one eligible (ORDER BY col), one ineligible (ORDER BY agg) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + id | val | cnt +--------------------------------------------------------------------- + 1 | val_1 | 1 + 2 | val_2 | 1 + 3 | val_3 | 1 + 4 | val_4 | 1 + 5 | val_5 | 1 + 6 | val_6 | 1 + 7 | val_7 | 1 + 8 | val_8 | 1 + 9 | val_9 | 1 + 10 | val_10 | 1 + 11 | val_11 | 1 + 12 | val_12 | 1 + 13 | val_13 | 1 + 14 | val_14 | 1 + 15 | val_15 | 1 +(15 rows) + +-- H3: CTE subplan feeding outer ORDER BY — both levels may merge independently +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- H4: Subquery in WHERE with ORDER BY + LIMIT — becomes subplan with merge +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 +(3 rows) + +-- 
H5: CTE subplan with max_intermediate_result_size enforcement +-- Tests that EnsureIntermediateSizeLimitNotExceeded works through per-task dispatch +SET citus.max_intermediate_result_size TO '4kB'; +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; +-- H6: Cross-join subplan with non-aggregate ORDER BY (crash regression variant) +-- Similar pattern to subquery_complex_target_list but without aggregate ORDER BY +SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + id | bar_id +--------------------------------------------------------------------- + 1 | 1 + 1 | 1 + 1 | 1 + 2 | 1 + 2 | 1 +(5 rows) + +-- H7: CTE correctness comparison — GUC off vs on must produce identical results +SET citus.enable_sorted_merge TO off; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +-- ================================================================= +-- Category H EXPLAIN: Query plans for subplan + sorted 
merge +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT ordered_cte.id AS worker_column_1, ordered_cte.val AS worker_column_2 FROM (SELECT sorted_merge_test.id, sorted_merge_test.val FROM public.sorted_merge_test_960000 sorted_merge_test ORDER BY sorted_merge_test.id) ordered_cte) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val +(20 rows) + +-- H2 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e 
JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.cnt + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val + -> Distributed Subplan XXX_2 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.cnt + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.cnt + Sort Key: remote_scan.cnt DESC, remote_scan.id + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.cnt + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS cnt FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (count(*)) DESC, id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)) + -> Sort (actual rows=N loops=N) + Output: id, 
(count(*)) + Sort Key: (count(*)) DESC, sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT e.id, e.val, i.cnt FROM ((SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) e JOIN (SELECT intermediate_result.id, intermediate_result.cnt FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer, cnt bigint)) i ON ((e.id OPERATOR(pg_catalog.=) i.id))) ORDER BY e.id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Merge Join (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result_1.cnt + Merge Cond: (intermediate_result.id = intermediate_result_1.id) + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Sort Key: intermediate_result.id + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Sort (actual rows=N loops=N) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Sort Key: intermediate_result_1.id + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(78 rows) + +-- H3 EXPLAIN 
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Send to N nodes + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT t.id AS worker_column_1, t.val AS worker_column_2 FROM (public.sorted_merge_test_960000 t JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) top_ids ON ((t.id OPERATOR(pg_catalog.=) top_ids.id)))) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: 
host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: t.id, t.val + -> Merge Join (actual rows=N loops=N) + Output: t.id, t.val + Merge Cond: (intermediate_result.id = t.id) + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id + Sort Key: intermediate_result.id + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Sort (actual rows=N loops=N) + Output: t.id, t.val + Sort Key: t.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 t (actual rows=N loops=N) + Output: t.id, t.val +(53 rows) + +-- H4 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Send to N nodes + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: 
sorted_merge_events.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) + Output: id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT sorted_merge_test.id AS worker_column_1, sorted_merge_test.val AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (sorted_merge_test.id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)))) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Hash Semi Join (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val + Hash Cond: (sorted_merge_test.id = intermediate_result.id) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: sorted_merge_test.id, sorted_merge_test.val, sorted_merge_test.num, sorted_merge_test.ts + -> Hash (actual rows=N loops=N) + Output: intermediate_result.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(52 rows) + +-- H5 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id 
LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val + -> Sort (actual rows=N loops=N) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT id, val FROM (SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) small_cte ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 
'binary'::citus_copy_format) +(41 rows) + +-- H6 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.bar_id + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id + -> Distributed Subplan XXX_2 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual 
rows=N loops=N) + Output: id + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) + Output: id + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT foo.id, bar.id AS bar_id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) foo, (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) bar ORDER BY foo.id, bar.id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result_1.id + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result_1.id + Sort Key: intermediate_result.id, intermediate_result_1.id + Sort Method: quicksort Memory: NkB + -> Nested Loop (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result_1.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) + Output: intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(69 rows) + +-- H7 EXPLAIN — GUC off vs on +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + explain_filter 
+--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, num + -> Sort (actual rows=N loops=N) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (N)::numeric) ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Sort Key: intermediate_result.id + Sort Method: top-N 
heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > 'N'::numeric) + Rows Removed by Filter: N +(45 rows) + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Distributed Subplan XXX_1 + Intermediate Data Size: N bytes + Result destination: Write locally + -> Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, num + -> Sort (actual rows=N loops=N) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: All + -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, 
intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (N)::numeric) ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Sort Key: intermediate_result.id + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > 'N'::numeric) + Rows Removed by Filter: N +(43 rows) + +-- ================================================================= +-- Category I: Distributed Transactions +-- ================================================================= +-- Verify sorted merge correctness within multi-statement transactions +-- where data is modified before the sorted-merge SELECT. 
+SET citus.enable_sorted_merge TO on; +-- I1: INSERT then SELECT within a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (900, 'txn_insert', 900.0); +SELECT id, val FROM sorted_merge_test WHERE id >= 900 ORDER BY id; + id | val +--------------------------------------------------------------------- + 900 | txn_insert +(1 row) + +ROLLBACK; +-- I2: UPDATE then SELECT within a transaction +BEGIN; +UPDATE sorted_merge_test SET val = 'updated' WHERE id = 1; +SELECT id, val FROM sorted_merge_test WHERE id <= 3 ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | updated + 2 | val_2 + 3 | val_3 +(3 rows) + +ROLLBACK; +-- I3: DELETE then SELECT within a transaction +BEGIN; +DELETE FROM sorted_merge_test WHERE id <= 5; +SELECT id, val FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + id | val +--------------------------------------------------------------------- + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(5 rows) + +ROLLBACK; +-- I4: INSERT + UPDATE + SELECT with multi-column ORDER BY +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (901, 'txn_a', 1.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (902, 'txn_b', 2.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (903, 'txn_c', 3.0); +UPDATE sorted_merge_test SET num = 999.0 WHERE id = 901; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 900 ORDER BY num, id; + id | val | num +--------------------------------------------------------------------- + 902 | txn_b | 2.0 + 903 | txn_c | 3.0 + 901 | txn_a | 999.0 +(3 rows) + +ROLLBACK; +-- I5: Compare results with GUC off vs on in a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (910, 'cmp_a', 10.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (911, 'cmp_b', 20.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (912, 'cmp_c', 30.0); +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val, num FROM 
sorted_merge_test WHERE id >= 910 ORDER BY id; + id | val | num +--------------------------------------------------------------------- + 910 | cmp_a | 10.0 + 911 | cmp_b | 20.0 + 912 | cmp_c | 30.0 +(3 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; + id | val | num +--------------------------------------------------------------------- + 910 | cmp_a | 10.0 + 911 | cmp_b | 20.0 + 912 | cmp_c | 30.0 +(3 rows) + +ROLLBACK; +-- I6: DELETE + aggregate in SELECT with ORDER BY +BEGIN; +DELETE FROM sorted_merge_test WHERE id > 100 AND id < 200; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +ROLLBACK; +-- ================================================================= +-- Category J: Coordinator expression evaluation exclusion +-- ================================================================= +-- Verify that queries with ORDER BY on expressions that need coordinator-side +-- evaluation are correctly excluded from sorted merge (or handled correctly). +SET citus.enable_sorted_merge TO on; +-- J1: ORDER BY expression on aggregate result (ordinal reference) +-- The ORDER BY references position 2 which is an aggregate — sorted merge +-- must NOT be used because aggregates are rewritten between worker/coordinator. 
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) AS total FROM sorted_merge_test GROUP BY id ORDER BY 2 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + Sort Key: remote_scan.total + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, sum(num) AS total FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (sum(num)) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (sum(num)) + -> Sort (actual rows=N loops=N) + Output: id, (sum(num)) + Sort Key: (sum(sorted_merge_test.num)) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, sum(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J2: ORDER BY expression wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) + 1 AS total_plus FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + Sort Key: remote_scan.total_plus + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan 
(Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, (sum(num) OPERATOR(pg_catalog.+) 'N'::numeric) AS total_plus FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (sum(num) OPERATOR(pg_catalog.+) 'N'::numeric) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, ((sum(num) + 'N'::numeric)) + -> Sort (actual rows=N loops=N) + Output: id, ((sum(num) + 'N'::numeric)) + Sort Key: ((sum(sorted_merge_test.num) + 'N'::numeric)) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, (sum(num) + 'N'::numeric) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J3: ORDER BY a non-aggregate expression that can be pushed to workers +-- This should be eligible for sorted merge — the expression is evaluated +-- on the worker side and sort order is preserved. 
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id + 0'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, (id OPERATOR(pg_catalog.+) N) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) N) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, ((id + N)) + Sort Key: ((sorted_merge_test.id + N)) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, (id + N) +(16 rows) + +-- J4: ORDER BY with CASE expression (no aggregates) — eligible +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY CASE WHEN id < 50 THEN 0 ELSE 1 END, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, CASE WHEN (id OPERATOR(pg_catalog.<) N) THEN N ELSE N END AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY CASE WHEN (id OPERATOR(pg_catalog.<) N) THEN N ELSE N END, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual 
rows=N loops=N) + Output: id, val, (CASE WHEN (id < N) THEN N ELSE N END) + Sort Key: (CASE WHEN (sorted_merge_test.id < N) THEN N ELSE N END), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, CASE WHEN (id < N) THEN N ELSE N END +(16 rows) + +-- J5: ORDER BY on an expression that mixes aggregate and non-aggregate +-- Should be ineligible because the expression contains an aggregate. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id + count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Sort Key: remote_scan.worker_column_3 + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count, (id OPERATOR(pg_catalog.+) count(*)) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*), (id + count(*)) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) + +-- J6: Correctness comparison — expression ORDER BY, GUC off vs on +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | 
val_4 + 5 | val_5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J7–J12: Additional pushable expressions (no aggregates) +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J7: ORDER BY function call on column +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- J8: ORDER BY COALESCE +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0) LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +-- J9: ORDER BY negation +SELECT id, num FROM sorted_merge_test ORDER BY -num LIMIT 5; + id | num +--------------------------------------------------------------------- + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 +(5 rows) + +-- J10: ORDER BY concatenation +SELECT id, val FROM sorted_merge_test ORDER BY val || '_suffix' LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 100 | val_100 + 10 | val_10 +(5 rows) + +-- J11: ORDER BY mathematical function (abs distance) +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +-- J12: ORDER BY expression not in SELECT list +SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5; + id 
+--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +-- J13: ORDER BY expression referencing multiple columns +SELECT id, val FROM sorted_merge_test ORDER BY id * num LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- J14: ORDER BY with type cast +SELECT id, num FROM sorted_merge_test ORDER BY num::int LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- J15: ORDER BY with subexpression in SELECT and different expression in ORDER BY +SELECT id, num + 1 as n1 FROM sorted_merge_test ORDER BY num + 2 LIMIT 5; + id | n1 +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +-- J16: ORDER BY column alias +SELECT id, num * 2 as doubled FROM sorted_merge_test ORDER BY doubled LIMIT 5; + id | doubled +--------------------------------------------------------------------- + 1 | 3.0 + 2 | 6.0 + 3 | 9.0 + 4 | 12.0 + 5 | 15.0 +(5 rows) + +-- ----------------------------------------------------------------- +-- J17–J21: Correctness — GUC off vs on for expression ORDER BY +-- ----------------------------------------------------------------- +-- J17: function call +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- J18: CASE expression +SET 
citus.enable_sorted_merge TO off; +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + id | cat +--------------------------------------------------------------------- + 34 | high + 35 | high + 36 | high + 37 | high + 38 | high + 39 | high + 40 | high + 41 | high + 42 | high + 43 | high +(10 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + id | cat +--------------------------------------------------------------------- + 34 | high + 35 | high + 36 | high + 37 | high + 38 | high + 39 | high + 40 | high + 41 | high + 42 | high + 43 | high +(10 rows) + +-- J19: COALESCE +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +-- J20: abs() distance function +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J21–J22: 
More ineligibility — aggregate inside expressions +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J21: ORDER BY CASE wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY CASE WHEN count(*) > 1 THEN 0 ELSE 1 END, id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Sort Key: remote_scan.worker_column_3, remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count, CASE WHEN (count(*) OPERATOR(pg_catalog.>) N) THEN N ELSE N END AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY CASE WHEN (count(*) OPERATOR(pg_catalog.>) N) THEN N ELSE N END, id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)), (CASE WHEN (count(*) > N) THEN N ELSE N END) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)), (CASE WHEN (count(*) > N) THEN N ELSE N END) + Sort Key: (CASE WHEN (count(*) > N) THEN N ELSE N END), sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*), CASE WHEN (count(*) > N) THEN N ELSE N END + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N 
loops=N) + Output: id, val, num, ts +(27 rows) + +-- J22: ORDER BY aggregate expression (sum + 1) — correctness +SET citus.enable_sorted_merge TO off; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + id | s +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + id | s +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J23–J24: EXPLAIN plans for pushable expression patterns +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J23: Does function-call ORDER BY get pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, upper(val) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (upper(val)) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, (upper(val)) + -> Sort (actual rows=N loops=N) + Output: id, val, (upper(val)) + Sort Key: 
(upper(sorted_merge_test.val)) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, upper(val) +(20 rows) + +-- J24: ORDER BY expression not in SELECT list — pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.worker_column_2 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, (num OPERATOR(pg_catalog.+) 'N'::numeric) AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (num OPERATOR(pg_catalog.+) 'N'::numeric) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, ((num + 'N'::numeric)) + -> Sort (actual rows=N loops=N) + Output: id, ((num + 'N'::numeric)) + Sort Key: ((sorted_merge_test.num + 'N'::numeric)) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, (num + 'N'::numeric) +(20 rows) + +-- ================================================================= +-- Category K: Index-based sort avoidance +-- ================================================================= +-- When an index exists on the ORDER BY column, PostgreSQL's worker-side +-- planner should choose an Index Scan instead of Sort + Seq Scan, making +-- the worker-side sort essentially free. 
This is the best-case scenario +-- for sorted merge: zero worker sort cost + zero coordinator sort cost. +-- +-- We disable enable_seqscan to force the worker planner to prefer the +-- index, since the test table is small enough that Seq Scan + Sort +-- would otherwise be cheaper. +CREATE INDEX sorted_merge_test_id_idx ON sorted_merge_test(id); +-- Use a transaction with SET LOCAL to propagate enable_seqscan=off to workers, +-- forcing the worker planner to use the index instead of Seq Scan + Sort. +SET citus.propagate_set_commands TO 'local'; +-- K1: EXPLAIN with index — worker uses Index Scan, no Sort node +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Scan using sorted_merge_test_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(12 rows) + +COMMIT; +-- K2: Correctness with index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; + id | val 
+--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +COMMIT; +-- K3: Multi-column index +CREATE INDEX sorted_merge_test_num_id_idx ON sorted_merge_test(num, id); +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Only Scan using sorted_merge_test_num_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num + Heap Fetches: N +(13 rows) + +COMMIT; +-- K4: Correctness with multi-column index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +COMMIT; +-- K5: DESC ordering with index +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, 
COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id DESC'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Scan Backward using sorted_merge_test_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(12 rows) + +COMMIT; +RESET citus.propagate_set_commands; +DROP INDEX sorted_merge_test_id_idx; +DROP INDEX sorted_merge_test_num_id_idx; +-- ================================================================= +-- Category L: Volatile and stable functions in ORDER BY +-- Tests that ORDER BY with functions works correctly with sorted merge. +-- Volatile functions (random, clock_timestamp, timeofday) are pushed +-- to workers as computed columns — sorted merge uses the materialized +-- worker values, which is semantically equivalent to coordinator Sort. +-- ================================================================= +-- L1: STABLE function — now() in expression with column +-- now() returns the same value on all workers within a transaction, +-- so the merge is globally consistent. Sorted merge should be used. 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY now() - ts, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, (now() OPERATOR(pg_catalog.-) ts) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (now() OPERATOR(pg_catalog.-) ts), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, ((now() - ts)) + Sort Key: ((now() - sorted_merge_test.ts)), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, (now() - ts) +(16 rows) + +-- L2: VOLATILE function — random() in ORDER BY +-- random() is pushed to workers as worker_column_3; each worker sorts +-- by its own random values. The merge interleaves using materialized +-- values — semantically equivalent to coordinator Sort on worker_column_3. +-- Test plan shape only (result is non-deterministic). 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY random(), id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, random() AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (random()), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (random()) + Sort Key: (random()), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, random() +(16 rows) + +-- L3: VOLATILE function — clock_timestamp() in ORDER BY +-- Same mechanics as random(): pushed to workers, sorted locally, merged. 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY clock_timestamp(), id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, clock_timestamp() AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (clock_timestamp()), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (clock_timestamp()) + Sort Key: (clock_timestamp()), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, clock_timestamp() +(16 rows) + +-- L4: nextval() in ORDER BY with sorted merge ON — expected ERROR +-- nextval() cannot be pushed to workers (CanPushDownExpression blocks it). +-- The sort clause references a target entry missing from the worker target +-- list, causing a plan-time error. This is a pre-existing Citus limitation. +CREATE SEQUENCE sorted_merge_test_seq; +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq'); +ERROR: ORDER/GROUP BY expression not found in targetlist +-- L4b: nextval() in ORDER BY with sorted merge OFF but LIMIT present +-- Same error — demonstrates this is NOT a sorted merge regression. 
+SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq') LIMIT 5; +ERROR: ORDER/GROUP BY expression not found in targetlist +DROP SEQUENCE sorted_merge_test_seq; +-- L5: STABLE function alone (constant-fold case) +-- current_timestamp is constant-folded by the planner; the sort key +-- effectively becomes just 'id'. Sorted merge should be used. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY current_timestamp, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, CURRENT_TIMESTAMP AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY CURRENT_TIMESTAMP, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (CURRENT_TIMESTAMP) + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, CURRENT_TIMESTAMP +(16 rows) + +SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Category L6: EXPLAIN ANALYZE + sorted merge +-- +-- Verify that sorted merge works correctly when the EXPLAIN ANALYZE +-- code path is active. We test two mechanisms: +-- +-- 1. Plain EXPLAIN ANALYZE: verifies plan structure (no coordinator +-- Sort node, "Merge Method: sorted merge" visible). +-- +-- 2. 
auto_explain with log_analyze: triggers the same executor code +-- path (es_instrument != 0 → RequestedForExplainAnalyze() = true) +-- but returns actual data rows. This directly validates that the +-- k-way merge produces correctly sorted output under the EXPLAIN +-- ANALYZE path — if the merge were skipped, the rows would be +-- visibly unsorted. +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- Verify EXPLAIN ANALYZE plan structure: no Sort node at coordinator +-- level, "Merge Method: sorted merge" visible, and "actual rows" +-- confirms full execution through the merge path. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(16 rows) + +-- Load auto_explain to trigger the EXPLAIN ANALYZE executor path +-- while returning real data rows. auto_explain sets es_instrument, +-- which makes RequestedForExplainAnalyze() return true — the same +-- condition as a real EXPLAIN ANALYZE. +LOAD 'auto_explain'; +SET auto_explain.log_min_duration = 0; +SET auto_explain.log_analyze TO true; +-- ASC sort under auto_explain: these SELECTs go through the EXPLAIN +-- ANALYZE code path but return actual data. 
If the merge were +-- skipped, rows would arrive in arbitrary worker order. +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +-- DESC sort under auto_explain +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 10; + id +--------------------------------------------------------------------- + 202 + 201 + 200 + 102 + 101 + 100 + 99 + 98 + 97 + 96 +(10 rows) + +-- Multi-column sort under auto_explain +SELECT id, val FROM sorted_merge_test ORDER BY id, val LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Single-column sort on num (non-distribution column, has NULLs) +SELECT num FROM sorted_merge_test ORDER BY num LIMIT 10; + num +--------------------------------------------------------------------- + 1.5 + 3.0 + 4.5 + 6.0 + 7.5 + 9.0 + 10.5 + 10.5 + 10.5 + 10.5 +(10 rows) + +-- Multi-column sort with num as first column +SELECT num, id FROM sorted_merge_test ORDER BY num, id LIMIT 10; + num | id +--------------------------------------------------------------------- + 1.5 | 1 + 3.0 | 2 + 4.5 | 3 + 6.0 | 4 + 7.5 | 5 + 9.0 | 6 + 10.5 | 7 + 10.5 | 200 + 10.5 | 201 + 10.5 | 202 +(10 rows) + +-- Multi-column sort with num DESC as first column, id ASC +SELECT num, id FROM sorted_merge_test ORDER BY num DESC, id LIMIT 10; + num | id +--------------------------------------------------------------------- + | 101 + | 102 + 150.0 | 100 + 148.5 | 99 + 147.0 | 98 + 145.5 | 97 + 144.0 | 96 + 142.5 | 95 + 141.0 | 94 + 139.5 | 93 +(10 rows) + +-- Disable auto_explain +SET auto_explain.log_min_duration = -1; +SET auto_explain.log_analyze TO false; +-- Contrast: sorted merge OFF shows a Sort node at coordinator level. 
+SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id + Sort Key: remote_scan.id + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(15 rows) + +SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Cleanup +-- ================================================================= +SET citus.enable_sorted_merge TO off; +DROP TABLE sorted_merge_test; +DROP TABLE sorted_merge_events; diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index a6643e3b768..06b482ff5c7 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -93,6 +93,7 @@ test: multi_reference_table multi_select_for_update relation_access_tracking pg1 test: custom_aggregate_support aggregate_support tdigest_aggregate_support test: multi_average_expression multi_working_columns multi_having_pushdown having_subquery test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown +test: multi_orderby_pushdown test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg ch_bench_having chbenchmark_all_queries expression_reference_join anonymous_columns test: ch_bench_subquery_repartition test: subscripting_op diff --git 
a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql new file mode 100644 index 00000000000..5860fc867f9 --- /dev/null +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -0,0 +1,887 @@ +-- +-- MULTI_SORTED_MERGE +-- +-- Tests for the citus.enable_sorted_merge GUC and the sorted merge +-- planner eligibility logic. Verifies that enabling the GUC does not +-- introduce regressions for any query pattern. +-- +-- MX verification: this test has been verified to pass with zero diffs +-- under check-base-mx (MX mode), confirming sorted merge works correctly +-- when any node in the cluster acts as coordinator. +-- + +SET citus.next_shard_id TO 960000; + +-- ================================================================= +-- Setup: create test tables +-- ================================================================= + +CREATE TABLE sorted_merge_test ( + id int, + val text, + num numeric, + ts timestamptz DEFAULT now() +); +SELECT create_distributed_table('sorted_merge_test', 'id'); + +-- Insert 100 rows + NULLs + duplicates +INSERT INTO sorted_merge_test (id, val, num) +SELECT i, 'val_' || i, (i * 1.5)::numeric +FROM generate_series(1, 100) i; + +INSERT INTO sorted_merge_test (id, val, num) VALUES (101, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (102, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (200, 'dup_a', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (201, 'dup_b', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (202, 'dup_c', 10.5); + +-- Second table for join tests +CREATE TABLE sorted_merge_events ( + id int, + event_type text, + event_val int +); +SELECT create_distributed_table('sorted_merge_events', 'id'); + +INSERT INTO sorted_merge_events +SELECT i % 50 + 1, CASE WHEN i % 3 = 0 THEN 'click' WHEN i % 3 = 1 THEN 'view' ELSE 'buy' END, i +FROM generate_series(1, 200) i; + +-- 
================================================================= +-- 1. GUC basics +-- ================================================================= + +SHOW citus.enable_sorted_merge; + +SET citus.enable_sorted_merge TO on; +SHOW citus.enable_sorted_merge; + +SET citus.enable_sorted_merge TO off; + +-- ================================================================= +-- Category A: Eligibility — sort IS pushed to workers +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- A1: ORDER BY distribution column +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + +-- A2: ORDER BY DESC +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC'); + +-- A3: ORDER BY DESC NULLS LAST +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST'); + +-- A4: ORDER BY non-distribution column +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY val'); + +-- A5: Multi-column ORDER BY +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val'); + +-- A6: Mixed directions +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC'); + +-- A7: GROUP BY dist_col ORDER BY dist_col +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER 
BY id'); + +-- A8: WHERE clause + ORDER BY +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id'); + +-- A9: Expression in ORDER BY (non-aggregate) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1'); + +-- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + +-- ================================================================= +-- Category B: Ineligibility — sort NOT pushed for merge +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- B1: ORDER BY count(*) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*)'); + +-- B2: ORDER BY avg(col) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num)'); + +-- B3: GROUP BY non-dist col, ORDER BY non-dist col +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val'); + +-- B4: GROUP BY non-dist col, ORDER BY aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*)'); + +-- ================================================================= +-- Category C: Correctness — results match GUC off vs on +-- 
================================================================= + +-- C1: Simple ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + +-- C2: ORDER BY DESC +SET citus.enable_sorted_merge TO off; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + +-- C3: Multi-column ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + +-- C4: ORDER BY non-distribution column +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + +-- C5: GROUP BY dist_col ORDER BY dist_col +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + +-- C6: Mixed directions +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + +-- C7: WHERE + ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + +-- C8: Aggregates in SELECT, ORDER BY on dist_col (GROUP BY dist_col) +SET citus.enable_sorted_merge TO off; +SELECT 
id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + +-- ================================================================= +-- Category D: Complex queries — regression guards +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- D1: Subquery in FROM with ORDER BY +SELECT * FROM ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) sub ORDER BY id; + +-- D2: CTE with ORDER BY +WITH top5 AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) +SELECT * FROM top5 ORDER BY id; + +-- D3: Co-located JOIN + ORDER BY +SELECT t.id, t.val, e.event_type +FROM sorted_merge_test t +JOIN sorted_merge_events e ON t.id = e.id +WHERE t.id <= 5 +ORDER BY t.id, e.event_type +LIMIT 10; + +-- D4: UNION ALL + ORDER BY +SELECT id, val FROM sorted_merge_test WHERE id <= 3 +UNION ALL +SELECT id, val FROM sorted_merge_test WHERE id BETWEEN 98 AND 100 +ORDER BY id; + +-- D5: DISTINCT + ORDER BY +SELECT DISTINCT id FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + +-- D6: DISTINCT ON + ORDER BY +SELECT DISTINCT ON (id) id, val, num +FROM sorted_merge_test +WHERE id <= 5 +ORDER BY id, num DESC; + +-- D7: EXISTS subquery + ORDER BY +SELECT id, val FROM sorted_merge_test t +WHERE EXISTS (SELECT 1 FROM sorted_merge_events e WHERE e.id = t.id) +ORDER BY id LIMIT 5; + +-- D8: IN subquery + ORDER BY +SELECT id, val FROM sorted_merge_test +WHERE id IN (SELECT id FROM sorted_merge_events WHERE event_type = 'click') +ORDER BY id LIMIT 5; + +-- D9: Multiple aggregates, GROUP BY dist_col, ORDER BY dist_col +SELECT id, count(*), sum(num), avg(num), min(val), max(val) +FROM sorted_merge_test +GROUP BY id +ORDER BY id +LIMIT 5; + +-- D10: CASE expression in SELECT + ORDER BY +SELECT id, + CASE WHEN num > 75 THEN 'high' WHEN num > 25 THEN 'mid' ELSE 
'low' END as bucket +FROM sorted_merge_test +WHERE num IS NOT NULL +ORDER BY id +LIMIT 10; + +-- D11: NULL values ordering +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS FIRST, id LIMIT 5; +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS LAST, id LIMIT 5; +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS FIRST, id LIMIT 5; +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST, id DESC LIMIT 5; + +-- D12: Large OFFSET +SELECT id FROM sorted_merge_test ORDER BY id OFFSET 100 LIMIT 5; + +-- D13: ORDER BY ordinal position +SELECT id, val FROM sorted_merge_test ORDER BY 2, 1 LIMIT 5; + +-- ================================================================= +-- Category E: Edge cases +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- E1: Empty result set +SELECT id FROM sorted_merge_test WHERE id < 0 ORDER BY id; + +-- E2: Single row (may go through router planner) +SELECT id, val FROM sorted_merge_test WHERE id = 42 ORDER BY id; + +-- E3: All rows with same sort value +SELECT id, num FROM sorted_merge_test WHERE num = 10.5 ORDER BY num, id; + +-- E4: Wide sort key (4 columns) +SELECT id, val, num FROM sorted_merge_test +WHERE id <= 5 +ORDER BY num, val, id +LIMIT 5; + +-- E5: Zero-task defensive path +-- CreatePerTaskDispatchDest handles taskCount=0 gracefully (returns a no-op +-- destination). This cannot be triggered via normal SQL because distributed +-- tables always have at least one shard. The closest we can test is an +-- empty-result query through the sorted merge path to verify no crash. 
+SELECT id FROM sorted_merge_test WHERE false ORDER BY id; + +-- ================================================================= +-- Category F: Existing LIMIT pushdown stability +-- ================================================================= + +-- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + +-- F2: GROUP BY dist_col + ORDER BY + LIMIT +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + +-- F3: ORDER BY aggregate + LIMIT (not eligible for merge) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; + +-- ================================================================= +-- Category G: Phase 4 — Sort elision and advanced scenarios +-- ================================================================= + +-- G1: Sort elision verification — coordinator Sort node absent +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) 
SELECT id, val FROM sorted_merge_test ORDER BY id'); + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + +-- G2a: PREPARE with merge ON, EXECUTE after turning OFF +-- Plan-time decision is baked in — cached plan must still merge correctly. +-- Execute 6+ times to trigger PostgreSQL's generic plan caching, then +-- verify the plan shape is preserved after toggling the GUC. +SET citus.enable_sorted_merge TO on; +PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_on_stmt; +EXECUTE merge_on_stmt; +EXECUTE merge_on_stmt; +EXECUTE merge_on_stmt; +EXECUTE merge_on_stmt; +EXECUTE merge_on_stmt; +-- Verify plan shape after caching — no Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; +SET citus.enable_sorted_merge TO off; +-- Cached plan retains the sorted merge decision from planning time +EXECUTE merge_on_stmt; +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; +DEALLOCATE merge_on_stmt; + +-- G2b: PREPARE with merge OFF, EXECUTE after turning ON +-- Cached plan has Sort node — must still return sorted results. 
+SET citus.enable_sorted_merge TO off;
+PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10;
+EXECUTE merge_off_stmt;
+EXECUTE merge_off_stmt;
+EXECUTE merge_off_stmt;
+EXECUTE merge_off_stmt;
+EXECUTE merge_off_stmt;
+EXECUTE merge_off_stmt;
+-- Verify plan shape after caching — Sort above CustomScan
+EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt;
+SET citus.enable_sorted_merge TO on;
+-- Cached plan retains the non-merge decision from planning time
+EXECUTE merge_off_stmt;
+EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt;
+DEALLOCATE merge_off_stmt;
+
+-- G3: Cursor with backward scan
+SET citus.enable_sorted_merge TO on;
+BEGIN;
+DECLARE sorted_cursor CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id;
+FETCH 3 FROM sorted_cursor;
+FETCH BACKWARD 1 FROM sorted_cursor;
+FETCH 2 FROM sorted_cursor;
+CLOSE sorted_cursor;
+COMMIT;
+
+-- G4: EXPLAIN ANALYZE (sorted merge stays active under EXPLAIN ANALYZE)
+SET citus.enable_sorted_merge TO on;
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5');
+
+-- G5: ORDER BY aggregate + LIMIT — crash regression test
+-- Previously caused SIGSEGV when sorted merge was enabled because
+-- aggregate ORDER BY was erroneously tagged as merge-eligible.
+SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 3; + +-- G6: Small work_mem with many tasks (32 shards) +SET citus.enable_sorted_merge TO on; +SET work_mem TO '64kB'; +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; +RESET work_mem; + +-- G7: max_intermediate_result_size with CTE subplan +SET citus.enable_sorted_merge TO on; +SET citus.max_intermediate_result_size TO '4kB'; +WITH cte AS (SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 50) +SELECT * FROM cte ORDER BY id LIMIT 5; +RESET citus.max_intermediate_result_size; + +-- ================================================================= +-- Category H: Subplan + Sorted Merge interactions +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- H1: CTE subplan with simple ORDER BY — eligible for sorted merge +-- The CTE becomes a subplan; its DistributedPlan may have useSortedMerge=true +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; + +-- H2: Multiple CTEs — one eligible (ORDER BY col), one ineligible (ORDER BY agg) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + +-- H3: CTE subplan feeding outer ORDER BY — both levels may merge independently +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + +-- H4: Subquery in WHERE with ORDER BY + LIMIT — becomes subplan with merge +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER 
BY id +LIMIT 5; + +-- H5: CTE subplan with max_intermediate_result_size enforcement +-- Tests that EnsureIntermediateSizeLimitNotExceeded works through per-task dispatch +SET citus.max_intermediate_result_size TO '4kB'; +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; +RESET citus.max_intermediate_result_size; + +-- H6: Cross-join subplan with non-aggregate ORDER BY (crash regression variant) +-- Similar pattern to subquery_complex_target_list but without aggregate ORDER BY +SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + +-- H7: CTE correctness comparison — GUC off vs on must produce identical results +SET citus.enable_sorted_merge TO off; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + +-- ================================================================= +-- Category H EXPLAIN: Query plans for subplan + sorted merge +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- H1 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte ORDER BY id LIMIT 5'); + +-- H2 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test 
GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id'); + +-- H3 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10'); + +-- H4 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5'); + +-- H5 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5'); + +-- H6 EXPLAIN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5'); + +-- H7 EXPLAIN — GUC off vs on +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + +-- 
================================================================= +-- Category I: Distributed Transactions +-- ================================================================= +-- Verify sorted merge correctness within multi-statement transactions +-- where data is modified before the sorted-merge SELECT. + +SET citus.enable_sorted_merge TO on; + +-- I1: INSERT then SELECT within a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (900, 'txn_insert', 900.0); +SELECT id, val FROM sorted_merge_test WHERE id >= 900 ORDER BY id; +ROLLBACK; + +-- I2: UPDATE then SELECT within a transaction +BEGIN; +UPDATE sorted_merge_test SET val = 'updated' WHERE id = 1; +SELECT id, val FROM sorted_merge_test WHERE id <= 3 ORDER BY id; +ROLLBACK; + +-- I3: DELETE then SELECT within a transaction +BEGIN; +DELETE FROM sorted_merge_test WHERE id <= 5; +SELECT id, val FROM sorted_merge_test WHERE id <= 10 ORDER BY id; +ROLLBACK; + +-- I4: INSERT + UPDATE + SELECT with multi-column ORDER BY +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (901, 'txn_a', 1.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (902, 'txn_b', 2.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (903, 'txn_c', 3.0); +UPDATE sorted_merge_test SET num = 999.0 WHERE id = 901; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 900 ORDER BY num, id; +ROLLBACK; + +-- I5: Compare results with GUC off vs on in a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (910, 'cmp_a', 10.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (911, 'cmp_b', 20.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (912, 'cmp_c', 30.0); +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; +ROLLBACK; + +-- I6: DELETE + aggregate in SELECT with ORDER BY +BEGIN; +DELETE FROM 
sorted_merge_test WHERE id > 100 AND id < 200; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; +ROLLBACK; + +-- ================================================================= +-- Category J: Coordinator expression evaluation exclusion +-- ================================================================= +-- Verify that queries with ORDER BY on expressions that need coordinator-side +-- evaluation are correctly excluded from sorted merge (or handled correctly). + +SET citus.enable_sorted_merge TO on; + +-- J1: ORDER BY expression on aggregate result (ordinal reference) +-- The ORDER BY references position 2 which is an aggregate — sorted merge +-- must NOT be used because aggregates are rewritten between worker/coordinator. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) AS total FROM sorted_merge_test GROUP BY id ORDER BY 2 LIMIT 5'); + +-- J2: ORDER BY expression wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) + 1 AS total_plus FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5'); + +-- J3: ORDER BY a non-aggregate expression that can be pushed to workers +-- This should be eligible for sorted merge — the expression is evaluated +-- on the worker side and sort order is preserved. 
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id + 0');
+
+-- J4: ORDER BY with CASE expression (no aggregates) — eligible
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY CASE WHEN id < 50 THEN 0 ELSE 1 END, id');
+
+-- J5: ORDER BY on an expression that mixes aggregate and non-aggregate
+-- Should be ineligible because the expression contains an aggregate.
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id + count(*)');
+
+-- J6: Correctness comparison — expression ORDER BY, GUC off vs on
+SET citus.enable_sorted_merge TO off;
+SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5;
+SET citus.enable_sorted_merge TO on;
+SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5;
+
+-- -----------------------------------------------------------------
+-- J7–J16: Additional pushable expressions (no aggregates)
+-- -----------------------------------------------------------------
+
+SET citus.enable_sorted_merge TO on;
+
+-- J7: ORDER BY function call on column
+SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5;
+
+-- J8: ORDER BY COALESCE
+SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0) LIMIT 5;
+
+-- J9: ORDER BY negation
+SELECT id, num FROM sorted_merge_test ORDER BY -num LIMIT 5;
+
+-- J10: ORDER BY concatenation
+SELECT id, val FROM sorted_merge_test ORDER BY val || '_suffix' LIMIT 5;
+
+-- J11: ORDER BY mathematical function (abs distance)
+SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5;
+
+-- J12: ORDER BY expression not in SELECT list
+SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5;
+
+-- J13: ORDER BY expression referencing multiple columns
+SELECT id, val FROM sorted_merge_test ORDER BY id * num LIMIT 5;
+
+-- J14: ORDER BY with type cast
+SELECT id, num FROM sorted_merge_test ORDER BY num::int LIMIT 5;
+
+-- J15: ORDER BY with subexpression in SELECT and different expression in ORDER BY
+SELECT id, num + 1 as n1 FROM sorted_merge_test ORDER BY num + 2 LIMIT 5;
+
+-- J16: ORDER BY column alias
+SELECT id, num * 2 as doubled FROM sorted_merge_test ORDER BY doubled LIMIT 5;
+
+-- -----------------------------------------------------------------
+-- J17–J20: Correctness — GUC off vs on for expression ORDER BY
+-- -----------------------------------------------------------------
+
+-- J17: function call
+SET citus.enable_sorted_merge TO off;
+SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5;
+SET citus.enable_sorted_merge TO on;
+SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5;
+
+-- J18: CASE expression
+SET citus.enable_sorted_merge TO off;
+SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat
+FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10;
+SET citus.enable_sorted_merge TO on;
+SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat
+FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10;
+
+-- J19: COALESCE
+SET citus.enable_sorted_merge TO off;
+SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5;
+SET citus.enable_sorted_merge TO on;
+SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5;
+
+-- J20: abs() distance function
+SET citus.enable_sorted_merge TO off;
+SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5;
+SET citus.enable_sorted_merge TO on;
+SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5;
+
+-- -----------------------------------------------------------------
+-- J21–J22: More ineligibility — aggregate inside expressions
+-- 
----------------------------------------------------------------- + +SET citus.enable_sorted_merge TO on; + +-- J21: ORDER BY CASE wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY CASE WHEN count(*) > 1 THEN 0 ELSE 1 END, id LIMIT 5'); + +-- J22: ORDER BY aggregate expression (sum + 1) — correctness +SET citus.enable_sorted_merge TO off; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; +SET citus.enable_sorted_merge TO on; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + +-- ----------------------------------------------------------------- +-- J23–J24: EXPLAIN plans for pushable expression patterns +-- ----------------------------------------------------------------- + +SET citus.enable_sorted_merge TO on; + +-- J23: Does function-call ORDER BY get pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5'); + +-- J24: ORDER BY expression not in SELECT list — pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5'); + +-- ================================================================= +-- Category K: Index-based sort avoidance +-- ================================================================= +-- When an index exists on the ORDER BY column, PostgreSQL's worker-side +-- planner should choose an Index Scan instead of Sort + Seq Scan, making +-- the worker-side sort essentially free. This is the best-case scenario +-- for sorted merge: zero worker sort cost + zero coordinator sort cost. 
+-- +-- We disable enable_seqscan to force the worker planner to prefer the +-- index, since the test table is small enough that Seq Scan + Sort +-- would otherwise be cheaper. + +CREATE INDEX sorted_merge_test_id_idx ON sorted_merge_test(id); + +-- Use a transaction with SET LOCAL to propagate enable_seqscan=off to workers, +-- forcing the worker planner to use the index instead of Seq Scan + Sort. +SET citus.propagate_set_commands TO 'local'; + +-- K1: EXPLAIN with index — worker uses Index Scan, no Sort node +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); +COMMIT; + +-- K2: Correctness with index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; +COMMIT; + +-- K3: Multi-column index +CREATE INDEX sorted_merge_test_num_id_idx ON sorted_merge_test(num, id); + +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num, id'); +COMMIT; + +-- K4: Correctness with multi-column index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; +COMMIT; + +-- K5: DESC ordering with index +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, 
SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id DESC'); +COMMIT; + +RESET citus.propagate_set_commands; +DROP INDEX sorted_merge_test_id_idx; +DROP INDEX sorted_merge_test_num_id_idx; + +-- ================================================================= +-- Category L: Volatile and stable functions in ORDER BY +-- Tests that ORDER BY with functions works correctly with sorted merge. +-- Volatile functions (random, clock_timestamp, timeofday) are pushed +-- to workers as computed columns — sorted merge uses the materialized +-- worker values, which is semantically equivalent to coordinator Sort. +-- ================================================================= + +-- L1: STABLE function — now() in expression with column +-- now() returns the same value on all workers within a transaction, +-- so the merge is globally consistent. Sorted merge should be used. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY now() - ts, id'); + +-- L2: VOLATILE function — random() in ORDER BY +-- random() is pushed to workers as worker_column_3; each worker sorts +-- by its own random values. The merge interleaves using materialized +-- values — semantically equivalent to coordinator Sort on worker_column_3. +-- Test plan shape only (result is non-deterministic). +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY random(), id'); + +-- L3: VOLATILE function — clock_timestamp() in ORDER BY +-- Same mechanics as random(): pushed to workers, sorted locally, merged. 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY clock_timestamp(), id'); + +-- L4: nextval() in ORDER BY with sorted merge ON — expected ERROR +-- nextval() cannot be pushed to workers (CanPushDownExpression blocks it). +-- The sort clause references a target entry missing from the worker target +-- list, causing a plan-time error. This is a pre-existing Citus limitation. +CREATE SEQUENCE sorted_merge_test_seq; +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq'); + +-- L4b: nextval() in ORDER BY with sorted merge OFF but LIMIT present +-- Same error — demonstrates this is NOT a sorted merge regression. +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq') LIMIT 5; +DROP SEQUENCE sorted_merge_test_seq; + +-- L5: STABLE function alone (constant-fold case) +-- current_timestamp is constant-folded by the planner; the sort key +-- effectively becomes just 'id'. Sorted merge should be used. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY current_timestamp, id'); + +SET citus.enable_sorted_merge TO off; + +-- ================================================================= +-- Category L6: EXPLAIN ANALYZE + sorted merge +-- +-- Verify that sorted merge works correctly when the EXPLAIN ANALYZE +-- code path is active. We test two mechanisms: +-- +-- 1. Plain EXPLAIN ANALYZE: verifies plan structure (no coordinator +-- Sort node, "Merge Method: sorted merge" visible). +-- +-- 2. auto_explain with log_analyze: triggers the same executor code +-- path (es_instrument != 0 → RequestedForExplainAnalyze() = true) +-- but returns actual data rows. 
This directly validates that the +-- k-way merge produces correctly sorted output under the EXPLAIN +-- ANALYZE path — if the merge were skipped, the rows would be +-- visibly unsorted. +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- Verify EXPLAIN ANALYZE plan structure: no Sort node at coordinator +-- level, "Merge Method: sorted merge" visible, and "actual rows" +-- confirms full execution through the merge path. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id'); + +-- Load auto_explain to trigger the EXPLAIN ANALYZE executor path +-- while returning real data rows. auto_explain sets es_instrument, +-- which makes RequestedForExplainAnalyze() return true — the same +-- condition as a real EXPLAIN ANALYZE. +LOAD 'auto_explain'; +SET auto_explain.log_min_duration = 0; +SET auto_explain.log_analyze TO true; + +-- ASC sort under auto_explain: these SELECTs go through the EXPLAIN +-- ANALYZE code path but return actual data. If the merge were +-- skipped, rows would arrive in arbitrary worker order. 
+SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; + +-- DESC sort under auto_explain +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 10; + +-- Multi-column sort under auto_explain +SELECT id, val FROM sorted_merge_test ORDER BY id, val LIMIT 10; + +-- Single-column sort on num (non-distribution column, has NULLs) +SELECT num FROM sorted_merge_test ORDER BY num LIMIT 10; + +-- Multi-column sort with num as first column +SELECT num, id FROM sorted_merge_test ORDER BY num, id LIMIT 10; + +-- Multi-column sort with num DESC as first column, id ASC +SELECT num, id FROM sorted_merge_test ORDER BY num DESC, id LIMIT 10; + +-- Disable auto_explain +SET auto_explain.log_min_duration = -1; +SET auto_explain.log_analyze TO false; + +-- Contrast: sorted merge OFF shows a Sort node at coordinator level. +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id'); + +SET citus.enable_sorted_merge TO off; + +-- ================================================================= +-- Cleanup +-- ================================================================= + +SET citus.enable_sorted_merge TO off; +DROP TABLE sorted_merge_test; +DROP TABLE sorted_merge_events;