From 7eb21089155584429662ba529d35acd8316ff5a3 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Tue, 17 Mar 2026 18:05:03 +0000 Subject: [PATCH 01/18] Add sorted merge foundation: GUC, struct fields, and serialization Phase 1 of the sorted-merge feature. This commit adds the data structures and GUC needed by later phases, with zero behavioral changes: - SortedMergeKey typedef in multi_physical_planner.h describing one sort key for the coordinator k-way merge - useSortedMerge, sortedMergeKeys[], sortedMergeKeyCount fields on DistributedPlan (plan-time decision, never checked at runtime via GUC) - sortedMergeEligible field on MultiExtendedOp (logical optimizer tag read by the physical planner) - Hidden GUC citus.enable_sorted_merge (PGC_SUSET, default off, GUC_NO_SHOW_ALL) consulted only during planning - Serialization in citus_outfuncs.c and deep-copy in citus_copyfuncs.c for all new fields All new fields default to false/0/NULL. Existing regression tests are unaffected. Co-authored-by: Copilot --- .../distributed/executor/multi_executor.c | 3 +++ src/backend/distributed/shared_library_init.c | 13 ++++++++++ .../distributed/utils/citus_copyfuncs.c | 13 ++++++++++ .../distributed/utils/citus_outfuncs.c | 13 ++++++++++ src/include/distributed/multi_executor.h | 1 + .../distributed/multi_logical_planner.h | 7 ++++++ .../distributed/multi_physical_planner.h | 25 +++++++++++++++++++ 7 files changed, 75 insertions(+) diff --git a/src/backend/distributed/executor/multi_executor.c b/src/backend/distributed/executor/multi_executor.c index 1893e262765..8661d367345 100644 --- a/src/backend/distributed/executor/multi_executor.c +++ b/src/backend/distributed/executor/multi_executor.c @@ -85,6 +85,9 @@ ParamListInfo executorBoundParams = NULL; /* sort the returning to get consistent outputs, used only for testing */ bool SortReturning = false; +/* when true at planning time, enables coordinator sorted merge for ORDER BY */ +bool EnableSortedMerge = false; + /* * How many nested 
executors have we started? This can happen for SQL * UDF calls. The outer query starts an executor, then postgres opens diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 3dbd81abb32..a6a928fbe75 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -2613,6 +2613,19 @@ RegisterCitusConfigVariables(void) GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, NULL, NULL, NULL); + DefineCustomBoolVariable( + "citus.enable_sorted_merge", + gettext_noop("Enables sorted merge of worker results for ORDER BY queries."), + gettext_noop("When enabled during planning, Citus pushes ORDER BY to workers " + "and merges the pre-sorted results on the coordinator using a " + "binary heap, eliminating the Sort node in the combine query. " + "This is an experimental feature."), + &EnableSortedMerge, + false, + PGC_SUSET, + GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + /* * It takes about 140 bytes of shared memory to store one row, therefore * this setting should be used responsibly. 
setting it to 10M will require diff --git a/src/backend/distributed/utils/citus_copyfuncs.c b/src/backend/distributed/utils/citus_copyfuncs.c index 74496151c0f..65c2a88f3a4 100644 --- a/src/backend/distributed/utils/citus_copyfuncs.c +++ b/src/backend/distributed/utils/citus_copyfuncs.c @@ -140,6 +140,19 @@ CopyNodeDistributedPlan(COPYFUNC_ARGS) COPY_SCALAR_FIELD(sourceResultRepartitionColumnIndex); COPY_SCALAR_FIELD(disableTrackingQueryCounters); + + COPY_SCALAR_FIELD(useSortedMerge); + COPY_SCALAR_FIELD(sortedMergeKeyCount); + if (from->sortedMergeKeyCount > 0 && from->sortedMergeKeys != NULL) + { + Size keySize = from->sortedMergeKeyCount * sizeof(SortedMergeKey); + newnode->sortedMergeKeys = (SortedMergeKey *) palloc(keySize); + memcpy(newnode->sortedMergeKeys, from->sortedMergeKeys, keySize); + } + else + { + newnode->sortedMergeKeys = NULL; + } } diff --git a/src/backend/distributed/utils/citus_outfuncs.c b/src/backend/distributed/utils/citus_outfuncs.c index 4782cab5de4..9c30d38be6f 100644 --- a/src/backend/distributed/utils/citus_outfuncs.c +++ b/src/backend/distributed/utils/citus_outfuncs.c @@ -205,6 +205,18 @@ OutDistributedPlan(OUTFUNC_ARGS) WRITE_NODE_FIELD(planningError); WRITE_INT_FIELD(sourceResultRepartitionColumnIndex); WRITE_BOOL_FIELD(disableTrackingQueryCounters); + + WRITE_BOOL_FIELD(useSortedMerge); + WRITE_INT_FIELD(sortedMergeKeyCount); + for (int i = 0; i < node->sortedMergeKeyCount; i++) + { + appendStringInfoString(str, " :sortedMergeKey"); + appendStringInfo(str, " :attno %d", node->sortedMergeKeys[i].attno); + appendStringInfo(str, " :sortop %u", node->sortedMergeKeys[i].sortop); + appendStringInfo(str, " :collation %u", node->sortedMergeKeys[i].collation); + appendStringInfo(str, " :nullsFirst %s", + booltostr(node->sortedMergeKeys[i].nullsFirst)); + } } @@ -376,6 +388,7 @@ OutMultiExtendedOp(OUTFUNC_ARGS) WRITE_BOOL_FIELD(hasWindowFuncs); WRITE_BOOL_FIELD(onlyPushableWindowFunctions); WRITE_NODE_FIELD(windowClause); + 
WRITE_BOOL_FIELD(sortedMergeEligible); OutMultiUnaryNodeFields(str, (const MultiUnaryNode *) node); } diff --git a/src/include/distributed/multi_executor.h b/src/include/distributed/multi_executor.h index b0b0288de87..c18067b5499 100644 --- a/src/include/distributed/multi_executor.h +++ b/src/include/distributed/multi_executor.h @@ -70,6 +70,7 @@ extern bool ForceMaxQueryParallelization; extern int MaxAdaptiveExecutorPoolSize; extern int ExecutorSlowStartInterval; extern bool SortReturning; +extern bool EnableSortedMerge; extern int ExecutorLevel; diff --git a/src/include/distributed/multi_logical_planner.h b/src/include/distributed/multi_logical_planner.h index edeae6a59ba..9bd68f0dfb0 100644 --- a/src/include/distributed/multi_logical_planner.h +++ b/src/include/distributed/multi_logical_planner.h @@ -186,6 +186,13 @@ typedef struct MultiExtendedOp bool hasDistinctOn; bool hasWindowFuncs; bool onlyPushableWindowFunctions; + + /* + * Set by the logical optimizer when the worker sort clause can support + * a coordinator-side sorted merge. The physical planner reads this to + * populate DistributedPlan.useSortedMerge. + */ + bool sortedMergeEligible; } MultiExtendedOp; diff --git a/src/include/distributed/multi_physical_planner.h b/src/include/distributed/multi_physical_planner.h index 3f13b0df97a..d9f5e6b68bc 100644 --- a/src/include/distributed/multi_physical_planner.h +++ b/src/include/distributed/multi_physical_planner.h @@ -393,6 +393,21 @@ typedef enum ModifyWithSelectMethod } ModifyWithSelectMethod; +/* + * SortedMergeKey describes one sort key for the coordinator-side + * k-way merge of pre-sorted worker results. These are serialized + * on DistributedPlan at planning time so the executor can build + * SortSupport structures without consulting the combine query. 
+ */ +typedef struct SortedMergeKey +{ + AttrNumber attno; /* 1-based attribute in the worker output */ + Oid sortop; /* ordering operator OID */ + Oid collation; /* collation OID */ + bool nullsFirst; /* NULLS FIRST? */ +} SortedMergeKey; + + /* * DistributedPlan contains all information necessary to execute a * distribute query. @@ -492,6 +507,16 @@ typedef struct DistributedPlan * Disables tracking query stat counters if true. */ bool disableTrackingQueryCounters; + + /* + * Sorted merge: when true, the coordinator performs a k-way merge + * of pre-sorted worker results instead of relying on an upper Sort node. + * This is a plan-time decision — the executor reads only this flag and + * the associated merge keys, never the GUC. + */ + bool useSortedMerge; + int sortedMergeKeyCount; + SortedMergeKey *sortedMergeKeys; } DistributedPlan; From 3e3e3b71cb8cd039679fbe16dcad125a7e3df462 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Tue, 17 Mar 2026 23:04:17 +0000 Subject: [PATCH 02/18] Add planner eligibility and worker sort pushdown for sorted merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 of the sorted-merge feature. Workers now sort their results when citus.enable_sorted_merge is enabled at planning time, even for queries without LIMIT. The plan metadata is populated so later phases can execute the merge and set pathkeys. Logical optimizer changes (multi_logical_optimizer.c): - WorkerSortClauseList() gains an early-return path that pushes the sort clause to workers when the GUC is on and the sort is safe (no aggregates in ORDER BY, no non-pushable window functions, and either no GROUP BY or GROUP BY on partition column). - WorkerExtendedOpNode() sets sortedMergeEligible = true when the worker sort clause semantically matches the original sort clause, using the new SortClauseListsMatch() helper. - SortClauseListsMatch() compares tleSortGroupRef, sortop, nulls_first, and eqop for each pair. 
Physical planner changes (multi_physical_planner.c): - CreatePhysicalDistributedPlan() finds the worker MultiExtendedOp with sortedMergeEligible = true, builds SortedMergeKey metadata from the worker job query, and sets useSortedMerge on the plan. - BuildSortedMergeKeys() constructs the key array from the worker query's SortGroupClause list and target list. The coordinator Sort node is still present above the CustomScan (pathkeys not set yet — that is Phase 4). Results are correct because the redundant Sort re-sorts already-sorted data. Co-authored-by: Copilot --- .../planner/multi_logical_optimizer.c | 82 ++ .../planner/multi_physical_planner.c | 98 ++ .../distributed/multi_logical_planner.h | 6 - .../expected/multi_orderby_pushdown.out | 856 ++++++++++++++++++ src/test/regress/multi_schedule | 2 +- .../regress/sql/multi_orderby_pushdown.sql | 318 +++++++ 6 files changed, 1355 insertions(+), 7 deletions(-) create mode 100644 src/test/regress/expected/multi_orderby_pushdown.out create mode 100644 src/test/regress/sql/multi_orderby_pushdown.sql diff --git a/src/backend/distributed/planner/multi_logical_optimizer.c b/src/backend/distributed/planner/multi_logical_optimizer.c index cd117b3545a..d5b4b00d42d 100644 --- a/src/backend/distributed/planner/multi_logical_optimizer.c +++ b/src/backend/distributed/planner/multi_logical_optimizer.c @@ -53,6 +53,7 @@ #include "distributed/multi_logical_optimizer.h" #include "distributed/multi_logical_planner.h" #include "distributed/multi_physical_planner.h" +#include "distributed/multi_executor.h" #include "distributed/pg_dist_partition.h" #include "distributed/query_pushdown_planning.h" #include "distributed/string_utils.h" @@ -341,6 +342,7 @@ static bool ShouldProcessDistinctOrderAndLimitForWorker(ExtendedOpNodeProperties bool pushingDownOriginalGrouping, Node *havingQual); static bool IsIndexInRange(const List *list, int index); +static bool SortClauseListsMatch(List *workerClauses, List *originalClauses); /* * 
MultiLogicalPlanOptimize applies multi-relational algebra optimizations on @@ -2549,6 +2551,22 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode, */ workerExtendedOpNode->limitOption = originalOpNode->limitOption; + /* + * Determine sorted-merge eligibility. This is a plan-time-only decision. + * The worker sort clause list is the output of the existing safety analysis + * in WorkerSortClauseList(). If it matches the original sort clause, workers + * will produce identically-sorted output suitable for a coordinator merge. + */ + if (EnableSortedMerge && + queryOrderByLimit.workerSortClauseList != NIL && + originalSortClauseList != NIL && + !extendedOpNodeProperties->pullUpIntermediateRows && + SortClauseListsMatch(queryOrderByLimit.workerSortClauseList, + originalSortClauseList)) + { + workerExtendedOpNode->sortedMergeEligible = true; + } + return workerExtendedOpNode; } @@ -5158,6 +5176,12 @@ WorkerLimitCount(Node *limitCount, Node *limitOffset, OrderByLimitReference * checks if we need to add any sorting and grouping clauses to the sort list we * push down for the limit. If we do, the function adds these clauses and * returns them. Otherwise, the function returns null. + * + * When citus.enable_sorted_merge is enabled, we also push down the sort + * clause to workers even without a LIMIT, for queries where the sort + * is safe to push (no aggregates in ORDER BY, no non-pushable window + * functions, and either no GROUP BY or GROUP BY on partition column). + * This enables the coordinator to merge pre-sorted worker results. */ static List * WorkerSortClauseList(Node *limitCount, List *groupClauseList, List *sortClauseList, @@ -5165,6 +5189,22 @@ WorkerSortClauseList(Node *limitCount, List *groupClauseList, List *sortClauseLi { List *workerSortClauseList = NIL; + /* + * When sorted merge is enabled, push the sort clause to workers even + * without a LIMIT. The coordinator will merge the sorted streams + * instead of doing a full re-sort. 
+ */ + if (EnableSortedMerge && sortClauseList != NIL && + orderByLimitReference.onlyPushableWindowFunctions && + !orderByLimitReference.hasOrderByAggregate) + { + if (orderByLimitReference.groupClauseIsEmpty || + orderByLimitReference.groupedByDisjointPartitionColumn) + { + return copyObject(sortClauseList); + } + } + /* if no limit node and no hasDistinctOn, no need to push down sort clauses */ if (limitCount == NULL && !orderByLimitReference.hasDistinctOn) { @@ -5473,3 +5513,45 @@ IsGroupBySubsetOfDistinct(List *groupClauses, List *distinctClauses) return true; } + + +/* + * SortClauseListsMatch checks whether two SortGroupClause lists represent + * semantically identical sort orderings. Compares tleSortGroupRef, sortop, + * nulls_first, and eqop for each corresponding entry. + */ +static bool +SortClauseListsMatch(List *workerClauses, List *originalClauses) +{ + if (list_length(workerClauses) != list_length(originalClauses)) + { + return false; + } + + ListCell *wc; + ListCell *oc; + forboth(wc, workerClauses, oc, originalClauses) + { + SortGroupClause *w = lfirst_node(SortGroupClause, wc); + SortGroupClause *o = lfirst_node(SortGroupClause, oc); + + if (w->tleSortGroupRef != o->tleSortGroupRef) + { + return false; + } + if (w->sortop != o->sortop) + { + return false; + } + if (w->nulls_first != o->nulls_first) + { + return false; + } + if (w->eqop != o->eqop) + { + return false; + } + } + + return true; +} diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index f7d49ab5115..59967373abe 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -162,6 +162,10 @@ static MapMergeJob * BuildMapMergeJob(Query *jobQuery, List *dependentJobList, Var *partitionKey, PartitionType partitionType, Oid baseRelationId, BoundaryNodeJobType boundaryNodeJobType); +static SortedMergeKey * BuildSortedMergeKeys(List 
*sortClauseList, + List *targetList, int *nkeys); +static void SetSortedMergeFields(MultiTreeRoot *multiTree, Job *workerJob, + DistributedPlan *distributedPlan); static uint32 HashPartitionCount(void); /* Local functions forward declarations for task list creation and helper functions */ @@ -270,6 +274,9 @@ CreatePhysicalDistributedPlan(MultiTreeRoot *multiTree, distributedPlan->modLevel = ROW_MODIFY_READONLY; distributedPlan->expectResults = true; + /* check sorted merge eligibility and populate merge-key metadata */ + SetSortedMergeFields(multiTree, workerJob, distributedPlan); + return distributedPlan; } @@ -2035,6 +2042,97 @@ BuildMapMergeJob(Query *jobQuery, List *dependentJobList, Var *partitionKey, } +/* + * SetSortedMergeFields checks whether the logical optimizer tagged the + * worker extended op node as eligible for a coordinator-side sorted merge. + * If so, the function builds merge-key metadata from the worker job query's + * sort clause and target list, and sets useSortedMerge on the plan. + * + * This is a plan-time decision: the executor reads only the plan fields, + * never the GUC. + * + * We directly walk the tree structure rather than using FindNodesOfType, + * which would traverse into subquery subtrees and could find unrelated + * MultiExtendedOp nodes. 
After MultiLogicalPlanOptimize the tree is: + * MultiTreeRoot -> MasterExtendedOp -> MultiCollect -> WorkerExtendedOp + */ +static void +SetSortedMergeFields(MultiTreeRoot *multiTree, Job *workerJob, + DistributedPlan *distributedPlan) +{ + MultiNode *masterChild = ChildNode((MultiUnaryNode *) multiTree); + if (!CitusIsA(masterChild, MultiExtendedOp)) + { + return; + } + + MultiNode *collectNode = ChildNode((MultiUnaryNode *) masterChild); + if (!CitusIsA(collectNode, MultiCollect)) + { + return; + } + + MultiNode *workerNode = ChildNode((MultiUnaryNode *) collectNode); + if (!CitusIsA(workerNode, MultiExtendedOp)) + { + return; + } + + MultiExtendedOp *workerExtOp = (MultiExtendedOp *) workerNode; + if (!workerExtOp->sortedMergeEligible) + { + return; + } + + Query *jobQuery = workerJob->jobQuery; + int nkeys = 0; + SortedMergeKey *keys = BuildSortedMergeKeys(jobQuery->sortClause, + jobQuery->targetList, + &nkeys); + if (nkeys > 0) + { + distributedPlan->useSortedMerge = true; + distributedPlan->sortedMergeKeyCount = nkeys; + distributedPlan->sortedMergeKeys = keys; + } +} + + +/* + * BuildSortedMergeKeys constructs an array of SortedMergeKey from a sort clause + * list and its corresponding target list. The resulting keys are used by the + * executor to set up SortSupport structures for the k-way merge. + * + * The attribute numbers in the keys correspond to worker output column positions, + * which align with the 1-based non-junk ordering of the worker target list. 
+ */ +static SortedMergeKey * +BuildSortedMergeKeys(List *sortClauseList, List *targetList, int *nkeys) +{ + *nkeys = list_length(sortClauseList); + if (*nkeys == 0) + { + return NULL; + } + + SortedMergeKey *keys = palloc(*nkeys * sizeof(SortedMergeKey)); + + int i = 0; + SortGroupClause *sgc = NULL; + foreach_declared_ptr(sgc, sortClauseList) + { + TargetEntry *tle = get_sortgroupclause_tle(sgc, targetList); + keys[i].attno = tle->resno; + keys[i].sortop = sgc->sortop; + keys[i].collation = exprCollation((Node *) tle->expr); + keys[i].nullsFirst = sgc->nulls_first; + i++; + } + + return keys; +} + + /* * HashPartitionCount returns the number of partition files we create for a hash * partition task. The function follows Hadoop's method for picking the number diff --git a/src/include/distributed/multi_logical_planner.h b/src/include/distributed/multi_logical_planner.h index 9bd68f0dfb0..4843453b120 100644 --- a/src/include/distributed/multi_logical_planner.h +++ b/src/include/distributed/multi_logical_planner.h @@ -186,12 +186,6 @@ typedef struct MultiExtendedOp bool hasDistinctOn; bool hasWindowFuncs; bool onlyPushableWindowFunctions; - - /* - * Set by the logical optimizer when the worker sort clause can support - * a coordinator-side sorted merge. The physical planner reads this to - * populate DistributedPlan.useSortedMerge. 
- */ bool sortedMergeEligible; } MultiExtendedOp; diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out new file mode 100644 index 00000000000..adbd4f1dde0 --- /dev/null +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -0,0 +1,856 @@ +SET citus.next_shard_id TO 960000; +SET +CREATE TABLE sorted_merge_test ( + id int, + val text, + num numeric, + ts timestamptz DEFAULT now() +); +CREATE TABLE +SELECT create_distributed_table('sorted_merge_test', 'id'); + create_distributed_table +-------------------------- + +(1 row) + +INSERT INTO sorted_merge_test (id, val, num) +SELECT i, 'val_' || i, (i * 1.5)::numeric +FROM generate_series(1, 100) i; +INSERT 0 100 +INSERT INTO sorted_merge_test (id, val, num) VALUES (101, NULL, NULL); +INSERT 0 1 +INSERT INTO sorted_merge_test (id, val, num) VALUES (102, NULL, NULL); +INSERT 0 1 +INSERT INTO sorted_merge_test (id, val, num) VALUES (200, 'dup_a', 10.5); +INSERT 0 1 +INSERT INTO sorted_merge_test (id, val, num) VALUES (201, 'dup_b', 10.5); +INSERT 0 1 +INSERT INTO sorted_merge_test (id, val, num) VALUES (202, 'dup_c', 10.5); +INSERT 0 1 +CREATE TABLE sorted_merge_events ( + id int, + event_type text, + event_val int +); +CREATE TABLE +SELECT create_distributed_table('sorted_merge_events', 'id'); + create_distributed_table +-------------------------- + +(1 row) + +INSERT INTO sorted_merge_events +SELECT i % 50 + 1, CASE WHEN i % 3 = 0 THEN 'click' WHEN i % 3 = 1 THEN 'view' ELSE 'buy' END, i +FROM generate_series(1, 200) i; +INSERT 0 200 +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge +--------------------------- + off +(1 row) + +SET citus.enable_sorted_merge TO on; +SET +SHOW citus.enable_sorted_merge; + citus.enable_sorted_merge +--------------------------- + on +(1 row) + +SET citus.enable_sorted_merge TO off; +SET +SET citus.enable_sorted_merge TO on; +SET +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id; + 
QUERY PLAN +-------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(10 rows) + +EXPLAIN (COSTS OFF) +SELECT id FROM sorted_merge_test ORDER BY id DESC; + QUERY PLAN +-------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.id DESC + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id DESC + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(10 rows) + +EXPLAIN (COSTS OFF) +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; + QUERY PLAN +-------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.num DESC NULLS LAST + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: num DESC NULLS LAST + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(10 rows) + +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test ORDER BY val; + QUERY PLAN +-------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.val + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: val + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(10 rows) + +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id, val; + QUERY PLAN +-------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) 
+ Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id, val + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(10 rows) + +EXPLAIN (COSTS OFF) +SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; + QUERY PLAN +-------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.id, remote_scan.num DESC + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id, num DESC + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(10 rows) + +EXPLAIN (COSTS OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; + QUERY PLAN +-------------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id + -> HashAggregate + Group Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; + QUERY PLAN +-------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + Filter: (num > '50'::numeric) +(11 rows) + +EXPLAIN (COSTS OFF) +SELECT id, num FROM sorted_merge_test ORDER BY id + 1; + QUERY PLAN +-------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: 
host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: ((id + 1)) + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(10 rows) + +EXPLAIN (COSTS OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN +-------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +SET citus.enable_sorted_merge TO on; +SET +EXPLAIN (COSTS OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*); + QUERY PLAN +-------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.count + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(10 rows) + +EXPLAIN (COSTS OFF) +SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num); + QUERY PLAN +-------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.avg + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(10 rows) + +EXPLAIN (COSTS OFF) +SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val; + QUERY PLAN +-------------------------------------------------------------------------------------- + Sort + Sort Key: remote_scan.val + -> HashAggregate + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost 
port=xxxxx dbname=regression + -> HashAggregate + Group Key: val + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*); + QUERY PLAN +-------------------------------------------------------------------------------------- + Sort + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + -> HashAggregate + Group Key: remote_scan.val + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> HashAggregate + Group Key: val + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +SET citus.enable_sorted_merge TO off; +SET +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + id | val +----+-------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO on; +SET +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + id | val +----+-------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO off; +SET +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id +----- + 202 + 201 + 200 + 102 + 101 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SET +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + id +----- + 202 + 201 + 200 + 102 + 101 +(5 rows) + +SET citus.enable_sorted_merge TO off; +SET +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +----+----- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SET +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +----+----- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO off; +SET 
+SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val +-----+-------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SET +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + id | val +-----+-------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO off; +SET +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +----+------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SET +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +----+------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO off; +SET +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +----+----- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SET +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + id | num +----+----- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO off; +SET +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + id | val +----+-------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SET +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + id | val +----+-------- + 67 | val_67 + 68 | val_68 + 69 | val_69 + 70 | val_70 + 71 | val_71 +(5 rows) + +SET citus.enable_sorted_merge TO off; +SET +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +----+-------+-----+------------------------ + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 
3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SET +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count | sum | avg +----+-------+-----+------------------------ + 1 | 1 | 1.5 | 1.50000000000000000000 + 2 | 1 | 3.0 | 3.0000000000000000 + 3 | 1 | 4.5 | 4.5000000000000000 + 4 | 1 | 6.0 | 6.0000000000000000 + 5 | 1 | 7.5 | 7.5000000000000000 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SET +SELECT * FROM ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) sub ORDER BY id; + id | val +----+------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +WITH top5 AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) +SELECT * FROM top5 ORDER BY id; + id | val +----+------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +SELECT t.id, t.val, e.event_type +FROM sorted_merge_test t +JOIN sorted_merge_events e ON t.id = e.id +WHERE t.id <= 5 +ORDER BY t.id, e.event_type +LIMIT 10; + id | val | event_type +----+-------+------------ + 1 | val_1 | buy + 1 | val_1 | buy + 1 | val_1 | click + 1 | val_1 | view + 2 | val_2 | buy + 2 | val_2 | click + 2 | val_2 | view + 2 | val_2 | view + 3 | val_3 | buy + 3 | val_3 | buy +(10 rows) + +SELECT id, val FROM sorted_merge_test WHERE id <= 3 +UNION ALL +SELECT id, val FROM sorted_merge_test WHERE id BETWEEN 98 AND 100 +ORDER BY id; + id | val +-----+--------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 98 | val_98 + 99 | val_99 + 100 | val_100 +(6 rows) + +SELECT DISTINCT id FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + id +---- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +SELECT DISTINCT ON (id) id, val, num +FROM sorted_merge_test +WHERE id <= 5 +ORDER BY id, num DESC; + id | val | num +----+-------+----- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 
| 6.0 + 5 | val_5 | 7.5 +(5 rows) + +SELECT id, val FROM sorted_merge_test t +WHERE EXISTS (SELECT 1 FROM sorted_merge_events e WHERE e.id = t.id) +ORDER BY id LIMIT 5; + id | val +----+------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +SELECT id, val FROM sorted_merge_test +WHERE id IN (SELECT id FROM sorted_merge_events WHERE event_type = 'click') +ORDER BY id LIMIT 5; + id | val +----+------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +SELECT id, count(*), sum(num), avg(num), min(val), max(val) +FROM sorted_merge_test +GROUP BY id +ORDER BY id +LIMIT 5; + id | count | sum | avg | min | max +----+-------+-----+------------------------+-------+------- + 1 | 1 | 1.5 | 1.50000000000000000000 | val_1 | val_1 + 2 | 1 | 3.0 | 3.0000000000000000 | val_2 | val_2 + 3 | 1 | 4.5 | 4.5000000000000000 | val_3 | val_3 + 4 | 1 | 6.0 | 6.0000000000000000 | val_4 | val_4 + 5 | 1 | 7.5 | 7.5000000000000000 | val_5 | val_5 +(5 rows) + +SELECT id, + CASE WHEN num > 75 THEN 'high' WHEN num > 25 THEN 'mid' ELSE 'low' END as bucket +FROM sorted_merge_test +WHERE num IS NOT NULL +ORDER BY id +LIMIT 10; + id | bucket +----+-------- + 1 | low + 2 | low + 3 | low + 4 | low + 5 | low + 6 | low + 7 | low + 8 | low + 9 | low + 10 | low +(10 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS FIRST, id LIMIT 5; + id | num +-----+----- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS LAST, id LIMIT 5; + id | num +----+----- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS FIRST, id LIMIT 5; + id | num +-----+------- + 101 | + 102 | + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 +(5 rows) + +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST, id DESC LIMIT 5; + id | num +-----+------- + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 +(5 rows) + +SELECT id 
FROM sorted_merge_test ORDER BY id OFFSET 100 LIMIT 5; + id +----- + 101 + 102 + 200 + 201 + 202 +(5 rows) + +SELECT id, val FROM sorted_merge_test ORDER BY 2, 1 LIMIT 5; + id | val +-----+-------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SET +SELECT id FROM sorted_merge_test WHERE id < 0 ORDER BY id; + id +---- +(0 rows) + +SELECT id, val FROM sorted_merge_test WHERE id = 42 ORDER BY id; + id | val +----+-------- + 42 | val_42 +(1 row) + +SELECT id, num FROM sorted_merge_test WHERE num = 10.5 ORDER BY num, id; + id | num +-----+------ + 7 | 10.5 + 200 | 10.5 + 201 | 10.5 + 202 | 10.5 +(4 rows) + +SELECT id, val, num FROM sorted_merge_test +WHERE id <= 5 +ORDER BY num, val, id +LIMIT 5; + id | val | num +----+-------+----- + 1 | val_1 | 1.5 + 2 | val_2 | 3.0 + 3 | val_3 | 4.5 + 4 | val_4 | 6.0 + 5 | val_5 | 7.5 +(5 rows) + +SET citus.enable_sorted_merge TO off; +SET +EXPLAIN (COSTS OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN +-------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +SET citus.enable_sorted_merge TO on; +SET +EXPLAIN (COSTS OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN +-------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + +SET citus.enable_sorted_merge TO off; +SET 
+EXPLAIN (COSTS OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> HashAggregate + Group Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(14 rows) + +SET citus.enable_sorted_merge TO on; +SET +EXPLAIN (COSTS OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 32 + Tasks Shown: One of 32 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> HashAggregate + Group Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(14 rows) + +SET citus.enable_sorted_merge TO off; +SET +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 5; + id | count +----+------- + 20 | 1 + 8 | 1 + 82 | 1 + 15 | 1 + 60 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SET +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 5; + id | count +----+------- + 20 | 1 + 8 | 1 + 82 | 1 + 15 | 1 + 60 | 1 +(5 rows) + +SET citus.enable_sorted_merge TO off; +SET +DROP TABLE sorted_merge_test; +DROP TABLE +DROP TABLE sorted_merge_events; +DROP TABLE diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index a6643e3b768..d3fb728d271 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -92,7 +92,7 @@ test: multi_limit_clause_approximate multi_outer_join_reference multi_outer_join test: multi_reference_table 
multi_select_for_update relation_access_tracking pg13_with_ties test: custom_aggregate_support aggregate_support tdigest_aggregate_support test: multi_average_expression multi_working_columns multi_having_pushdown having_subquery -test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown +test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown multi_orderby_pushdown test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg ch_bench_having chbenchmark_all_queries expression_reference_join anonymous_columns test: ch_bench_subquery_repartition test: subscripting_op diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql new file mode 100644 index 00000000000..86504708cd3 --- /dev/null +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -0,0 +1,318 @@ +-- +-- MULTI_SORTED_MERGE +-- +-- Tests for the citus.enable_sorted_merge GUC and the sorted merge +-- planner eligibility logic. Verifies that enabling the GUC does not +-- introduce regressions for any query pattern. 
+-- + +SET citus.next_shard_id TO 960000; + +-- ================================================================= +-- Setup: create test tables +-- ================================================================= + +CREATE TABLE sorted_merge_test ( + id int, + val text, + num numeric, + ts timestamptz DEFAULT now() +); +SELECT create_distributed_table('sorted_merge_test', 'id'); + +-- Insert 100 rows + NULLs + duplicates +INSERT INTO sorted_merge_test (id, val, num) +SELECT i, 'val_' || i, (i * 1.5)::numeric +FROM generate_series(1, 100) i; + +INSERT INTO sorted_merge_test (id, val, num) VALUES (101, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (102, NULL, NULL); +INSERT INTO sorted_merge_test (id, val, num) VALUES (200, 'dup_a', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (201, 'dup_b', 10.5); +INSERT INTO sorted_merge_test (id, val, num) VALUES (202, 'dup_c', 10.5); + +-- Second table for join tests +CREATE TABLE sorted_merge_events ( + id int, + event_type text, + event_val int +); +SELECT create_distributed_table('sorted_merge_events', 'id'); + +INSERT INTO sorted_merge_events +SELECT i % 50 + 1, CASE WHEN i % 3 = 0 THEN 'click' WHEN i % 3 = 1 THEN 'view' ELSE 'buy' END, i +FROM generate_series(1, 200) i; + +-- ================================================================= +-- 1. 
GUC basics +-- ================================================================= + +SHOW citus.enable_sorted_merge; + +SET citus.enable_sorted_merge TO on; +SHOW citus.enable_sorted_merge; + +SET citus.enable_sorted_merge TO off; + +-- ================================================================= +-- Category A: Eligibility — sort IS pushed to workers +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- A1: ORDER BY distribution column +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id; + +-- A2: ORDER BY DESC +EXPLAIN (COSTS OFF) +SELECT id FROM sorted_merge_test ORDER BY id DESC; + +-- A3: ORDER BY DESC NULLS LAST +EXPLAIN (COSTS OFF) +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; + +-- A4: ORDER BY non-distribution column +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test ORDER BY val; + +-- A5: Multi-column ORDER BY +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id, val; + +-- A6: Mixed directions +EXPLAIN (COSTS OFF) +SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; + +-- A7: GROUP BY dist_col ORDER BY dist_col +EXPLAIN (COSTS OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; + +-- A8: WHERE clause + ORDER BY +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; + +-- A9: Expression in ORDER BY (non-aggregate) +EXPLAIN (COSTS OFF) +SELECT id, num FROM sorted_merge_test ORDER BY id + 1; + +-- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) +EXPLAIN (COSTS OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + +-- ================================================================= +-- Category B: Ineligibility — sort NOT pushed for merge +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- B1: ORDER BY count(*) +EXPLAIN (COSTS OFF) +SELECT id, count(*) FROM 
sorted_merge_test GROUP BY id ORDER BY count(*); + +-- B2: ORDER BY avg(col) +EXPLAIN (COSTS OFF) +SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num); + +-- B3: GROUP BY non-dist col, ORDER BY non-dist col +EXPLAIN (COSTS OFF) +SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val; + +-- B4: GROUP BY non-dist col, ORDER BY aggregate +EXPLAIN (COSTS OFF) +SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*); + +-- ================================================================= +-- Category C: Correctness — results match GUC off vs on +-- ================================================================= + +-- C1: Simple ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; + +-- C2: ORDER BY DESC +SET citus.enable_sorted_merge TO off; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; + +-- C3: Multi-column ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + +-- C4: ORDER BY non-distribution column +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; + +-- C5: GROUP BY dist_col ORDER BY dist_col +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + +-- C6: Mixed directions +SET citus.enable_sorted_merge TO off; 
+SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; + +-- C7: WHERE + ORDER BY +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; + +-- C8: Aggregates in SELECT, ORDER BY on dist_col (GROUP BY dist_col) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + +-- ================================================================= +-- Category D: Complex queries — regression guards +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- D1: Subquery in FROM with ORDER BY +SELECT * FROM ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) sub ORDER BY id; + +-- D2: CTE with ORDER BY +WITH top5 AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 +) +SELECT * FROM top5 ORDER BY id; + +-- D3: Co-located JOIN + ORDER BY +SELECT t.id, t.val, e.event_type +FROM sorted_merge_test t +JOIN sorted_merge_events e ON t.id = e.id +WHERE t.id <= 5 +ORDER BY t.id, e.event_type +LIMIT 10; + +-- D4: UNION ALL + ORDER BY +SELECT id, val FROM sorted_merge_test WHERE id <= 3 +UNION ALL +SELECT id, val FROM sorted_merge_test WHERE id BETWEEN 98 AND 100 +ORDER BY id; + +-- D5: DISTINCT + ORDER BY +SELECT DISTINCT id FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + +-- D6: DISTINCT ON + ORDER BY +SELECT DISTINCT ON (id) id, val, num +FROM sorted_merge_test +WHERE id <= 5 +ORDER BY id, num DESC; + +-- D7: EXISTS subquery + ORDER BY 
+SELECT id, val FROM sorted_merge_test t +WHERE EXISTS (SELECT 1 FROM sorted_merge_events e WHERE e.id = t.id) +ORDER BY id LIMIT 5; + +-- D8: IN subquery + ORDER BY +SELECT id, val FROM sorted_merge_test +WHERE id IN (SELECT id FROM sorted_merge_events WHERE event_type = 'click') +ORDER BY id LIMIT 5; + +-- D9: Multiple aggregates, GROUP BY dist_col, ORDER BY dist_col +SELECT id, count(*), sum(num), avg(num), min(val), max(val) +FROM sorted_merge_test +GROUP BY id +ORDER BY id +LIMIT 5; + +-- D10: CASE expression in SELECT + ORDER BY +SELECT id, + CASE WHEN num > 75 THEN 'high' WHEN num > 25 THEN 'mid' ELSE 'low' END as bucket +FROM sorted_merge_test +WHERE num IS NOT NULL +ORDER BY id +LIMIT 10; + +-- D11: NULL values ordering +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS FIRST, id LIMIT 5; +SELECT id, num FROM sorted_merge_test ORDER BY num NULLS LAST, id LIMIT 5; +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS FIRST, id LIMIT 5; +SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST, id DESC LIMIT 5; + +-- D12: Large OFFSET +SELECT id FROM sorted_merge_test ORDER BY id OFFSET 100 LIMIT 5; + +-- D13: ORDER BY ordinal position +SELECT id, val FROM sorted_merge_test ORDER BY 2, 1 LIMIT 5; + +-- ================================================================= +-- Category E: Edge cases +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- E1: Empty result set +SELECT id FROM sorted_merge_test WHERE id < 0 ORDER BY id; + +-- E2: Single row (may go through router planner) +SELECT id, val FROM sorted_merge_test WHERE id = 42 ORDER BY id; + +-- E3: All rows with same sort value +SELECT id, num FROM sorted_merge_test WHERE num = 10.5 ORDER BY num, id; + +-- E4: Wide sort key (4 columns) +SELECT id, val, num FROM sorted_merge_test +WHERE id <= 5 +ORDER BY num, val, id +LIMIT 5; + +-- ================================================================= +-- Category F: 
Existing LIMIT pushdown stability +-- ================================================================= + +-- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on +SET citus.enable_sorted_merge TO off; +EXPLAIN (COSTS OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +EXPLAIN (COSTS OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + +-- F2: GROUP BY dist_col + ORDER BY + LIMIT +SET citus.enable_sorted_merge TO off; +EXPLAIN (COSTS OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +EXPLAIN (COSTS OFF) +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + +-- F3: ORDER BY aggregate + LIMIT (not eligible for merge) +SET citus.enable_sorted_merge TO off; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 5; + +SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 5; + +-- ================================================================= +-- Cleanup +-- ================================================================= + +SET citus.enable_sorted_merge TO off; +DROP TABLE sorted_merge_test; +DROP TABLE sorted_merge_events; From cd997109fbdf4a116f07fcb8b335f138b3d4dcf4 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Thu, 19 Mar 2026 19:55:00 +0000 Subject: [PATCH 03/18] =?UTF-8?q?Add=20per-task=20tuple=20stores=20and=20k?= =?UTF-8?q?-way=20merge=20for=20sorted=20merge=20Phase=203=20of=20the=20so?= =?UTF-8?q?rted-merge=20feature.=20When=20distributedPlan->useSortedMerge?= =?UTF-8?q?=20is=20true=20(set=20at=20planning=20time=20by=20Phase=202),?= =?UTF-8?q?=20the=20adaptive=20executor=20now:=201.=20Routes=20worker=20re?= =?UTF-8?q?sults=20into=20per-task=20tuple=20stores=20via=20a=20new=20=20?= =?UTF-8?q?=20=20PerTaskDispatchTupleDest=20that=20dispatches=20by=20task-?= 
=?UTF-8?q?>taskId=20hash=20lookup.=20=20=20=20No=20Task=20fields=20are=20?= =?UTF-8?q?mutated=20=E2=80=94=20all=20state=20lives=20on=20DistributedExe?= =?UTF-8?q?cution.=202.=20After=20all=20tasks=20complete,=20performs=20a?= =?UTF-8?q?=20k-way=20merge=20of=20the=20per-task=20stores=20=20=20=20into?= =?UTF-8?q?=20the=20final=20scanState->tuplestorestate=20using=20PostgreSQ?= =?UTF-8?q?L's=20public=20=20=20=20binaryheap=20and=20SortSupport=20APIs.?= =?UTF-8?q?=203.=20Frees=20per-task=20stores=20after=20the=20merge.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing CitusExecScan/ReturnTupleFromTuplestore/CitusEndScan/ CitusReScan code paths are completely unchanged — they read from the final tuplestore exactly as before. New files: - sorted_merge.h: CreatePerTaskDispatchDest, MergePerTaskStoresIntoFinalStore - sorted_merge.c: PerTaskDispatchTupleDest with taskId->index hash routing, MergePerTaskStoresIntoFinalStore with binaryheap merge, MergeHeapComparator modeled after PG's heap_compare_slots in nodeMergeAppend.c Modified: - adaptive_executor.c: DistributedExecution gains useSortedMerge/perTaskStores/ perTaskStoreCount fields. AdaptiveExecutor() branches on useSortedMerge to create per-task stores, then merges post-execution. EXPLAIN ANALYZE falls back to existing single-tuplestore path. Safety: - Shared TupleDestinationStats preserves citus.max_intermediate_result_size - Per-task stores allocated in AdaptiveExecutor local memory context (auto-cleanup on error via PG memory context teardown) - task->totalReceivedTupleData tracking preserved The coordinator Sort node is still present above the CustomScan (pathkeys not set until Phase 4). Results are correct because the redundant Sort re-sorts already-sorted data. 
Co-authored-by: Copilot --- .../distributed/executor/adaptive_executor.c | 73 +++- .../distributed/executor/sorted_merge.c | 333 +++++++++++++++++ .../distributed/executor/tuple_destination.c | 4 +- src/include/distributed/sorted_merge.h | 34 ++ src/include/distributed/tuple_destination.h | 2 + .../expected/multi_orderby_pushdown.out | 345 ++++++++++-------- .../regress/sql/multi_orderby_pushdown.sql | 11 +- 7 files changed, 643 insertions(+), 159 deletions(-) create mode 100644 src/backend/distributed/executor/sorted_merge.c create mode 100644 src/include/distributed/sorted_merge.h diff --git a/src/backend/distributed/executor/adaptive_executor.c b/src/backend/distributed/executor/adaptive_executor.c index 83e561c9376..6fb75c9db0e 100644 --- a/src/backend/distributed/executor/adaptive_executor.c +++ b/src/backend/distributed/executor/adaptive_executor.c @@ -176,6 +176,7 @@ #include "distributed/transaction_identifier.h" #include "distributed/transaction_management.h" #include "distributed/tuple_destination.h" +#include "distributed/sorted_merge.h" #include "distributed/version_compat.h" #include "distributed/worker_protocol.h" @@ -315,6 +316,15 @@ typedef struct DistributedExecution * fail, such as CREATE INDEX CONCURRENTLY. */ bool localExecutionSupported; + + /* + * Sorted merge: when useSortedMerge is true, worker results are routed + * to per-task tuple stores. After execution completes, these stores are + * k-way merged into the final scanState->tuplestorestate. 
+ */ + bool useSortedMerge; + Tuplestorestate **perTaskStores; + int perTaskStoreCount; } DistributedExecution; @@ -799,12 +809,36 @@ AdaptiveExecutor(CitusScanState *scanState) /* Reset Task fields that are only valid for a single execution */ ResetExplainAnalyzeData(taskList); - scanState->tuplestorestate = - tuplestore_begin_heap(randomAccess, interTransactions, work_mem); - TupleDesc tupleDescriptor = ScanStateGetTupleDescriptor(scanState); - TupleDestination *defaultTupleDest = - CreateTupleStoreTupleDest(scanState->tuplestorestate, tupleDescriptor); + TupleDestination *defaultTupleDest = NULL; + + /* + * When sorted merge is active and we're not doing EXPLAIN ANALYZE, + * route worker results into per-task tuple stores. The final tuplestore + * will be created later after the k-way merge. + */ + bool useSortedMerge = distributedPlan->useSortedMerge && + !RequestedForExplainAnalyze(scanState); + Tuplestorestate **perTaskStores = NULL; + int perTaskStoreCount = 0; + + if (useSortedMerge) + { + TupleDestinationStats *sharedStats = palloc0(sizeof(TupleDestinationStats)); + defaultTupleDest = CreatePerTaskDispatchDest(taskList, tupleDescriptor, + sharedStats, + &perTaskStores, + &perTaskStoreCount); + /* final tuplestore created after merge */ + scanState->tuplestorestate = NULL; + } + else + { + scanState->tuplestorestate = + tuplestore_begin_heap(randomAccess, interTransactions, work_mem); + defaultTupleDest = + CreateTupleStoreTupleDest(scanState->tuplestorestate, tupleDescriptor); + } bool localExecutionSupported = true; @@ -867,6 +901,11 @@ AdaptiveExecutor(CitusScanState *scanState) jobIdList, localExecutionSupported); + /* save sorted merge state on execution for post-merge */ + execution->useSortedMerge = useSortedMerge; + execution->perTaskStores = perTaskStores; + execution->perTaskStoreCount = perTaskStoreCount; + /* * Make sure that we acquire the appropriate locks even if the local tasks * are going to be executed with local execution. 
@@ -897,6 +936,30 @@ AdaptiveExecutor(CitusScanState *scanState) FinishDistributedExecution(execution); + /* + * When sorted merge is active, k-way merge the per-task stores into + * the final tuplestore. This produces globally sorted output that the + * existing ReturnTupleFromTuplestore() path can read unchanged. + */ + if (execution->useSortedMerge && execution->perTaskStoreCount > 0) + { + scanState->tuplestorestate = + tuplestore_begin_heap(randomAccess, interTransactions, work_mem); + + MergePerTaskStoresIntoFinalStore(scanState->tuplestorestate, + execution->perTaskStores, + execution->perTaskStoreCount, + distributedPlan->sortedMergeKeys, + distributedPlan->sortedMergeKeyCount, + tupleDescriptor); + + /* free per-task stores — they are no longer needed */ + for (int i = 0; i < execution->perTaskStoreCount; i++) + { + tuplestore_end(execution->perTaskStores[i]); + } + } + if (SortReturning && distributedPlan->expectResults && commandType != CMD_SELECT) { SortTupleStore(scanState); diff --git a/src/backend/distributed/executor/sorted_merge.c b/src/backend/distributed/executor/sorted_merge.c new file mode 100644 index 00000000000..f3514dfdb38 --- /dev/null +++ b/src/backend/distributed/executor/sorted_merge.c @@ -0,0 +1,333 @@ +/*------------------------------------------------------------------------- + * + * sorted_merge.c + * Implements coordinator-side sorted merge of pre-sorted worker results. + * + * CreatePerTaskDispatchDest() creates per-task tuple stores and returns + * a TupleDestination that routes incoming tuples to the correct store + * based on task->taskId. The only Task field written is + * totalReceivedTupleData (execution-time reporting, reset each execution). + * + * MergePerTaskStoresIntoFinalStore() performs a k-way merge of the + * per-task stores into a single output tuplestore using a binary heap + * and PostgreSQL's SortSupport infrastructure. + * + * Copyright (c) Citus Data, Inc. 
+ *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" + +#include "executor/tuptable.h" +#include "lib/binaryheap.h" +#include "utils/hsearch.h" +#include "utils/sortsupport.h" + +#include "distributed/listutils.h" +#include "distributed/sorted_merge.h" +#include "distributed/subplan_execution.h" + + +/* + * PerTaskDispatchTupleDest routes tuples to per-task tuple stores + * based on the task's taskId. This is an execution-local object that + * is never attached to a reusable Task node. + */ +typedef struct PerTaskDispatchTupleDest +{ + TupleDestination pub; + Tuplestorestate **perTaskStores; + int taskCount; + TupleDesc tupleDesc; + HTAB *taskIdToIndex; /* maps uint32 taskId -> int array index */ +} PerTaskDispatchTupleDest; + + +/* + * TaskIdIndexEntry is a hash table entry mapping taskId to per-task store index. + */ +typedef struct TaskIdIndexEntry +{ + uint32 taskId; /* hash key */ + int index; /* index into perTaskStores array */ +} TaskIdIndexEntry; + + +/* + * MergeContext holds the state needed by the binary heap comparator. + */ +typedef struct MergeContext +{ + TupleTableSlot **slots; + SortSupportData *sortKeys; + int nkeys; +} MergeContext; + + +/* forward declarations */ +static void PerTaskDispatchPutTuple(TupleDestination *self, Task *task, + int placementIndex, int queryNumber, + HeapTuple heapTuple, uint64 tupleLibpqSize); +static TupleDesc PerTaskDispatchTupleDescForQuery(TupleDestination *self, + int queryNumber); +static int MergeHeapComparator(Datum a, Datum b, void *arg); + + +/* + * CreatePerTaskDispatchDest creates per-task tuple stores and returns a + * TupleDestination that routes incoming tuples to the correct store based + * on task->taskId. + * + * The per-task stores and their count are returned via out parameters so + * the caller can pass them to MergePerTaskStoresIntoFinalStore() later. 
+ * + * All memory is allocated in CurrentMemoryContext (expected to be the + * AdaptiveExecutor local context). + */ +TupleDestination * +CreatePerTaskDispatchDest(List *taskList, TupleDesc tupleDesc, + TupleDestinationStats *sharedStats, + Tuplestorestate ***perTaskStoresOut, + int *perTaskStoreCountOut) +{ + int taskCount = list_length(taskList); + if (taskCount == 0) + { + *perTaskStoresOut = NULL; + *perTaskStoreCountOut = 0; + return CreateTupleDestNone(); + } + + /* + * Allocate per-task tuple stores. Each store gets work_mem / taskCount, + * with a floor of 64 kB. Note: this means the aggregate in-memory budget + * for per-task stores can exceed a single work_mem allocation when + * taskCount is large (e.g., 128 tasks × 64 kB = 8 MB floor). The final + * output tuplestore also gets a full work_mem allocation. This is a + * deliberate trade-off: per-task stores spill to disk automatically, + * and they are freed before the final tuplestore is consumed. The + * temporary memory amplification is bounded and short-lived. 
+ */ + Tuplestorestate **perTaskStores = palloc(taskCount * sizeof(Tuplestorestate *)); + int perTaskWorkMem = Max(work_mem / Max(taskCount, 1), 64); + + for (int i = 0; i < taskCount; i++) + { + perTaskStores[i] = tuplestore_begin_heap(false, false, perTaskWorkMem); + } + + /* build taskId -> array index hash table */ + HASHCTL hashInfo; + memset(&hashInfo, 0, sizeof(hashInfo)); + hashInfo.keysize = sizeof(uint32); + hashInfo.entrysize = sizeof(TaskIdIndexEntry); + hashInfo.hcxt = CurrentMemoryContext; + HTAB *taskIdToIndex = hash_create("PerTaskDispatchHash", taskCount, + &hashInfo, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + int index = 0; + Task *task = NULL; + foreach_declared_ptr(task, taskList) + { + bool found = false; + TaskIdIndexEntry *entry = hash_search(taskIdToIndex, &task->taskId, + HASH_ENTER, &found); + Assert(!found); + entry->index = index; + index++; + } + + /* build the dispatch TupleDestination */ + PerTaskDispatchTupleDest *dispatch = palloc0(sizeof(PerTaskDispatchTupleDest)); + dispatch->pub.putTuple = PerTaskDispatchPutTuple; + dispatch->pub.tupleDescForQuery = PerTaskDispatchTupleDescForQuery; + dispatch->pub.tupleDestinationStats = sharedStats; + dispatch->perTaskStores = perTaskStores; + dispatch->taskCount = taskCount; + dispatch->tupleDesc = tupleDesc; + dispatch->taskIdToIndex = taskIdToIndex; + + *perTaskStoresOut = perTaskStores; + *perTaskStoreCountOut = taskCount; + + return (TupleDestination *) dispatch; +} + + +/* + * PerTaskDispatchPutTuple routes a tuple to the per-task store identified + * by the task's taskId. Matches the behavior of TupleStoreTupleDestPutTuple + * for intermediate-result accounting and totalReceivedTupleData tracking. 
+ */ +static void +PerTaskDispatchPutTuple(TupleDestination *self, Task *task, + int placementIndex, int queryNumber, + HeapTuple heapTuple, uint64 tupleLibpqSize) +{ + PerTaskDispatchTupleDest *dispatch = (PerTaskDispatchTupleDest *) self; + + /* look up the per-task store index; NOTE(review): an unregistered taskId would dereference a NULL entry in non-assert builds */ + bool found = false; + TaskIdIndexEntry *entry = hash_search(dispatch->taskIdToIndex, &task->taskId, + HASH_FIND, &found); + Assert(found); + tuplestore_puttuple(dispatch->perTaskStores[entry->index], heapTuple); + + /* intermediate-result size accounting (matches TupleStoreTupleDestPutTuple) */ + uint64 tupleSize = tupleLibpqSize; + if (tupleSize == 0) + { + tupleSize = heapTuple->t_len; + } + + TupleDestinationStats *stats = self->tupleDestinationStats; + if (SubPlanLevel > 0 && stats != NULL) + { + stats->totalIntermediateResultSize += tupleSize; + EnsureIntermediateSizeLimitNotExceeded(stats); + } + + /* track network transfer size (matches TupleStoreTupleDestPutTuple) */ + task->totalReceivedTupleData += tupleLibpqSize; +} + + +/* + * PerTaskDispatchTupleDescForQuery returns the tuple descriptor. + */ +static TupleDesc +PerTaskDispatchTupleDescForQuery(TupleDestination *self, int queryNumber) +{ + Assert(queryNumber == 0); + PerTaskDispatchTupleDest *dispatch = (PerTaskDispatchTupleDest *) self; + return dispatch->tupleDesc; +} + + +/* + * MergePerTaskStoresIntoFinalStore performs a k-way merge of pre-sorted + * per-task tuple stores into a single output tuplestore using a binary heap. + * + * Each per-task store must contain tuples sorted by the given merge keys. + * The output tuplestore will contain all tuples in globally sorted order. + * + * Uses PostgreSQL's public binaryheap and SortSupport APIs. 
+ */ +void +MergePerTaskStoresIntoFinalStore(Tuplestorestate *finalStore, + Tuplestorestate **perTaskStores, + int nstores, + SortedMergeKey *mergeKeys, + int nkeys, + TupleDesc tupleDesc) +{ + if (nstores == 0 || nkeys == 0) + { + return; + } + + /* allocate one reusable slot per task store */ + TupleTableSlot **slots = palloc(nstores * sizeof(TupleTableSlot *)); + for (int i = 0; i < nstores; i++) + { + slots[i] = MakeSingleTupleTableSlot(tupleDesc, &TTSOpsMinimalTuple); + } + + /* build SortSupport from serialized merge keys */ + SortSupportData *sortKeys = palloc0(nkeys * sizeof(SortSupportData)); + for (int i = 0; i < nkeys; i++) + { + SortSupport sk = &sortKeys[i]; + sk->ssup_cxt = CurrentMemoryContext; + sk->ssup_collation = mergeKeys[i].collation; + sk->ssup_nulls_first = mergeKeys[i].nullsFirst; + sk->ssup_attno = mergeKeys[i].attno; + PrepareSortSupportFromOrderingOp(mergeKeys[i].sortop, sk); + } + + /* set up merge context for heap comparisons */ + MergeContext ctx; + ctx.slots = slots; + ctx.sortKeys = sortKeys; + ctx.nkeys = nkeys; + + binaryheap *heap = binaryheap_allocate(nstores, MergeHeapComparator, &ctx); + + /* seed the heap with the first tuple from each non-empty store */ + for (int i = 0; i < nstores; i++) + { + tuplestore_rescan(perTaskStores[i]); + if (tuplestore_gettupleslot(perTaskStores[i], true, false, slots[i])) + { + binaryheap_add_unordered(heap, Int32GetDatum(i)); + } + } + binaryheap_build(heap); + + /* merge loop: extract min, write to final store, advance winner */ + while (!binaryheap_empty(heap)) + { + int winner = DatumGetInt32(binaryheap_first(heap)); + tuplestore_puttupleslot(finalStore, slots[winner]); + + if (tuplestore_gettupleslot(perTaskStores[winner], true, false, + slots[winner])) + { + binaryheap_replace_first(heap, Int32GetDatum(winner)); + } + else + { + (void) binaryheap_remove_first(heap); + } + } + + /* free merge-local resources */ + binaryheap_free(heap); + for (int i = 0; i < nstores; i++) + { + 
ExecDropSingleTupleTableSlot(slots[i]); + } + pfree(slots); + pfree(sortKeys); +} + + +/* + * MergeHeapComparator compares tuples from two task stores by the merge keys. + * binaryheap is a max-heap, so the raw sort comparison is negated: the + * result is positive when a sorts before b, putting the smallest tuple on top. + * + * This is modeled after heap_compare_slots() in nodeMergeAppend.c. + */ +static int +MergeHeapComparator(Datum a, Datum b, void *arg) +{ + MergeContext *ctx = (MergeContext *) arg; + int slot1 = DatumGetInt32(a); + int slot2 = DatumGetInt32(b); + TupleTableSlot *s1 = ctx->slots[slot1]; + TupleTableSlot *s2 = ctx->slots[slot2]; + + for (int i = 0; i < ctx->nkeys; i++) + { + SortSupport sortKey = &ctx->sortKeys[i]; + AttrNumber attno = sortKey->ssup_attno; + bool isNull1, isNull2; + + Datum datum1 = slot_getattr(s1, attno, &isNull1); + Datum datum2 = slot_getattr(s2, attno, &isNull2); + + int compare = ApplySortComparator(datum1, isNull1, + datum2, isNull2, + sortKey); + if (compare != 0) + { + /* binaryheap is a max-heap, negate for min-heap behavior */ + return -compare; + } + } + + return 0; +} diff --git a/src/backend/distributed/executor/tuple_destination.c b/src/backend/distributed/executor/tuple_destination.c index b3c4b509c2f..78e1f98887d 100644 --- a/src/backend/distributed/executor/tuple_destination.c +++ b/src/backend/distributed/executor/tuple_destination.c @@ -49,8 +49,6 @@ typedef struct TupleDestDestReceiver static void TupleStoreTupleDestPutTuple(TupleDestination *self, Task *task, int placementIndex, int queryNumber, HeapTuple heapTuple, uint64 tupleLibpqSize); -static void EnsureIntermediateSizeLimitNotExceeded(TupleDestinationStats * - tupleDestinationStats); static TupleDesc TupleStoreTupleDestTupleDescForQuery(TupleDestination *self, int queryNumber); static void TupleDestNonePutTuple(TupleDestination *self, Task *task, @@ -135,7 +133,7 @@ TupleStoreTupleDestPutTuple(TupleDestination *self, Task *task, * 
EnsureIntermediateSizeLimitNotExceeded is a helper function for checking the current * state of the tupleDestinationStats and throws error if necessary. */ -static void +void EnsureIntermediateSizeLimitNotExceeded(TupleDestinationStats *tupleDestinationStats) { if (!tupleDestinationStats) diff --git a/src/include/distributed/sorted_merge.h b/src/include/distributed/sorted_merge.h new file mode 100644 index 00000000000..eeb3e690d35 --- /dev/null +++ b/src/include/distributed/sorted_merge.h @@ -0,0 +1,34 @@ +/*------------------------------------------------------------------------- + * + * sorted_merge.h + * Declarations for coordinator-side sorted merge of pre-sorted + * worker results using a binary heap. + * + * Copyright (c) Citus Data, Inc. + *------------------------------------------------------------------------- + */ + +#ifndef SORTED_MERGE_H +#define SORTED_MERGE_H + +#include "access/tupdesc.h" +#include "utils/tuplestore.h" + +#include "distributed/multi_physical_planner.h" +#include "distributed/tuple_destination.h" + + +extern TupleDestination * CreatePerTaskDispatchDest(List *taskList, + TupleDesc tupleDesc, + TupleDestinationStats *sharedStats, + Tuplestorestate ***perTaskStoresOut, + int *perTaskStoreCountOut); + +extern void MergePerTaskStoresIntoFinalStore(Tuplestorestate *finalStore, + Tuplestorestate **perTaskStores, + int nstores, + SortedMergeKey *mergeKeys, + int nkeys, + TupleDesc tupleDesc); + +#endif /* SORTED_MERGE_H */ diff --git a/src/include/distributed/tuple_destination.h b/src/include/distributed/tuple_destination.h index 5b4f649835f..c502fd2aa1a 100644 --- a/src/include/distributed/tuple_destination.h +++ b/src/include/distributed/tuple_destination.h @@ -65,5 +65,7 @@ extern TupleDestination * CreateTupleStoreTupleDest(Tuplestorestate *tupleStore, extern TupleDestination * CreateTupleDestNone(void); extern DestReceiver * CreateTupleDestDestReceiver(TupleDestination *tupleDest, Task *task, int placementIndex); +extern void 
EnsureIntermediateSizeLimitNotExceeded( + TupleDestinationStats *tupleDestinationStats); #endif diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out index adbd4f1dde0..5a87ad12e11 100644 --- a/src/test/regress/expected/multi_orderby_pushdown.out +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -1,75 +1,81 @@ +-- +-- MULTI_SORTED_MERGE +-- +-- Tests for the citus.enable_sorted_merge GUC and the sorted merge +-- planner eligibility logic. Verifies that enabling the GUC does not +-- introduce regressions for any query pattern. +-- SET citus.next_shard_id TO 960000; -SET +-- ================================================================= +-- Setup: create test tables +-- ================================================================= CREATE TABLE sorted_merge_test ( id int, val text, num numeric, ts timestamptz DEFAULT now() ); -CREATE TABLE SELECT create_distributed_table('sorted_merge_test', 'id'); create_distributed_table --------------------------- +--------------------------------------------------------------------- (1 row) +-- Insert 100 rows + NULLs + duplicates INSERT INTO sorted_merge_test (id, val, num) SELECT i, 'val_' || i, (i * 1.5)::numeric FROM generate_series(1, 100) i; -INSERT 0 100 INSERT INTO sorted_merge_test (id, val, num) VALUES (101, NULL, NULL); -INSERT 0 1 INSERT INTO sorted_merge_test (id, val, num) VALUES (102, NULL, NULL); -INSERT 0 1 INSERT INTO sorted_merge_test (id, val, num) VALUES (200, 'dup_a', 10.5); -INSERT 0 1 INSERT INTO sorted_merge_test (id, val, num) VALUES (201, 'dup_b', 10.5); -INSERT 0 1 INSERT INTO sorted_merge_test (id, val, num) VALUES (202, 'dup_c', 10.5); -INSERT 0 1 +-- Second table for join tests CREATE TABLE sorted_merge_events ( id int, event_type text, event_val int ); -CREATE TABLE SELECT create_distributed_table('sorted_merge_events', 'id'); create_distributed_table --------------------------- 
+--------------------------------------------------------------------- (1 row) INSERT INTO sorted_merge_events SELECT i % 50 + 1, CASE WHEN i % 3 = 0 THEN 'click' WHEN i % 3 = 1 THEN 'view' ELSE 'buy' END, i FROM generate_series(1, 200) i; -INSERT 0 200 +-- ================================================================= +-- 1. GUC basics +-- ================================================================= SHOW citus.enable_sorted_merge; citus.enable_sorted_merge ---------------------------- +--------------------------------------------------------------------- off (1 row) SET citus.enable_sorted_merge TO on; -SET SHOW citus.enable_sorted_merge; citus.enable_sorted_merge ---------------------------- +--------------------------------------------------------------------- on (1 row) SET citus.enable_sorted_merge TO off; -SET +-- ================================================================= +-- Category A: Eligibility — sort IS pushed to workers +-- ================================================================= SET citus.enable_sorted_merge TO on; -SET +-- A1: ORDER BY distribution column EXPLAIN (COSTS OFF) SELECT id, val FROM sorted_merge_test ORDER BY id; QUERY PLAN --------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: remote_scan.id -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Sort @@ -77,15 +83,16 @@ SELECT id, val FROM sorted_merge_test ORDER BY id; -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (10 rows) +-- A2: ORDER BY DESC EXPLAIN (COSTS OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC; QUERY PLAN --------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: remote_scan.id DESC -> Custom 
Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Sort @@ -93,15 +100,16 @@ SELECT id FROM sorted_merge_test ORDER BY id DESC; -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (10 rows) +-- A3: ORDER BY DESC NULLS LAST EXPLAIN (COSTS OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; QUERY PLAN --------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: remote_scan.num DESC NULLS LAST -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Sort @@ -109,15 +117,16 @@ SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (10 rows) +-- A4: ORDER BY non-distribution column EXPLAIN (COSTS OFF) SELECT id, val FROM sorted_merge_test ORDER BY val; QUERY PLAN --------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: remote_scan.val -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Sort @@ -125,15 +134,16 @@ SELECT id, val FROM sorted_merge_test ORDER BY val; -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (10 rows) +-- A5: Multi-column ORDER BY EXPLAIN (COSTS OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val; QUERY PLAN --------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: remote_scan.id, remote_scan.val -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 
32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Sort @@ -141,15 +151,16 @@ SELECT id, val FROM sorted_merge_test ORDER BY id, val; -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (10 rows) +-- A6: Mixed directions EXPLAIN (COSTS OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; QUERY PLAN --------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: remote_scan.id, remote_scan.num DESC -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Sort @@ -157,15 +168,16 @@ SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (10 rows) +-- A7: GROUP BY dist_col ORDER BY dist_col EXPLAIN (COSTS OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; QUERY PLAN --------------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: remote_scan.id -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Sort @@ -175,15 +187,16 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (12 rows) +-- A8: WHERE clause + ORDER BY EXPLAIN (COSTS OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; QUERY PLAN --------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: remote_scan.id -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + 
Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Sort @@ -192,15 +205,16 @@ SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; Filter: (num > '50'::numeric) (11 rows) +-- A9: Expression in ORDER BY (non-aggregate) EXPLAIN (COSTS OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1; QUERY PLAN --------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: remote_scan.worker_column_3 -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Sort @@ -208,16 +222,17 @@ SELECT id, num FROM sorted_merge_test ORDER BY id + 1; -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (10 rows) +-- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) EXPLAIN (COSTS OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; QUERY PLAN --------------------------------------------------------------------------------------------- +--------------------------------------------------------------------- Limit -> Sort Sort Key: remote_scan.id -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Limit @@ -226,17 +241,20 @@ SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (12 rows) +-- ================================================================= +-- Category B: Ineligibility — sort NOT pushed for merge +-- ================================================================= SET citus.enable_sorted_merge TO on; -SET +-- B1: ORDER BY count(*) EXPLAIN (COSTS OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*); QUERY PLAN 
--------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: remote_scan.count -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> HashAggregate @@ -244,15 +262,16 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*); -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (10 rows) +-- B2: ORDER BY avg(col) EXPLAIN (COSTS OFF) SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num); QUERY PLAN --------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: remote_scan.avg -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> HashAggregate @@ -260,17 +279,18 @@ SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num); -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (10 rows) +-- B3: GROUP BY non-dist col, ORDER BY non-dist col EXPLAIN (COSTS OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val; QUERY PLAN --------------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: remote_scan.val -> HashAggregate Group Key: remote_scan.val -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> HashAggregate @@ -278,17 +298,18 @@ SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val; -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (12 rows) +-- B4: GROUP BY non-dist col, ORDER BY aggregate 
EXPLAIN (COSTS OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*); QUERY PLAN --------------------------------------------------------------------------------------- +--------------------------------------------------------------------- Sort Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) -> HashAggregate Group Key: remote_scan.val -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> HashAggregate @@ -296,11 +317,14 @@ SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*); -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (12 rows) +-- ================================================================= +-- Category C: Correctness — results match GUC off vs on +-- ================================================================= +-- C1: Simple ORDER BY SET citus.enable_sorted_merge TO off; -SET SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; id | val -----+-------- +--------------------------------------------------------------------- 1 | val_1 2 | val_2 3 | val_3 @@ -314,10 +338,9 @@ SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; (10 rows) SET citus.enable_sorted_merge TO on; -SET SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; id | val -----+-------- +--------------------------------------------------------------------- 1 | val_1 2 | val_2 3 | val_3 @@ -330,11 +353,11 @@ SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; 10 | val_10 (10 rows) +-- C2: ORDER BY DESC SET citus.enable_sorted_merge TO off; -SET SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; id ------ +--------------------------------------------------------------------- 202 201 200 @@ -343,10 +366,9 @@ SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; (5 rows) SET citus.enable_sorted_merge TO on; -SET SELECT id FROM 
sorted_merge_test ORDER BY id DESC LIMIT 5; id ------ +--------------------------------------------------------------------- 202 201 200 @@ -354,11 +376,11 @@ SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 5; 101 (5 rows) +-- C3: Multi-column ORDER BY SET citus.enable_sorted_merge TO off; -SET SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; id | num -----+----- +--------------------------------------------------------------------- 1 | 1.5 2 | 3.0 3 | 4.5 @@ -367,10 +389,9 @@ SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; (5 rows) SET citus.enable_sorted_merge TO on; -SET SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; id | num -----+----- +--------------------------------------------------------------------- 1 | 1.5 2 | 3.0 3 | 4.5 @@ -378,11 +399,11 @@ SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; 5 | 7.5 (5 rows) +-- C4: ORDER BY non-distribution column SET citus.enable_sorted_merge TO off; -SET SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; id | val ------+-------- +--------------------------------------------------------------------- 200 | dup_a 201 | dup_b 202 | dup_c @@ -391,10 +412,9 @@ SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5 (5 rows) SET citus.enable_sorted_merge TO on; -SET SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5; id | val ------+-------- +--------------------------------------------------------------------- 200 | dup_a 201 | dup_b 202 | dup_c @@ -402,11 +422,11 @@ SELECT id, val FROM sorted_merge_test WHERE val IS NOT NULL ORDER BY val LIMIT 5 10 | val_10 (5 rows) +-- C5: GROUP BY dist_col ORDER BY dist_col SET citus.enable_sorted_merge TO off; -SET SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; id | count -----+------- +--------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 @@ -415,10 +435,9 @@ SELECT 
id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; (5 rows) SET citus.enable_sorted_merge TO on; -SET SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; id | count -----+------- +--------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 @@ -426,11 +445,11 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; 5 | 1 (5 rows) +-- C6: Mixed directions SET citus.enable_sorted_merge TO off; -SET SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; id | num -----+----- +--------------------------------------------------------------------- 1 | 1.5 2 | 3.0 3 | 4.5 @@ -439,10 +458,9 @@ SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num (5 rows) SET citus.enable_sorted_merge TO on; -SET SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num DESC LIMIT 5; id | num -----+----- +--------------------------------------------------------------------- 1 | 1.5 2 | 3.0 3 | 4.5 @@ -450,11 +468,11 @@ SELECT id, num FROM sorted_merge_test WHERE num IS NOT NULL ORDER BY id ASC, num 5 | 7.5 (5 rows) +-- C7: WHERE + ORDER BY SET citus.enable_sorted_merge TO off; -SET SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; id | val -----+-------- +--------------------------------------------------------------------- 67 | val_67 68 | val_68 69 | val_69 @@ -463,10 +481,9 @@ SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; (5 rows) SET citus.enable_sorted_merge TO on; -SET SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; id | val -----+-------- +--------------------------------------------------------------------- 67 | val_67 68 | val_68 69 | val_69 @@ -474,11 +491,11 @@ SELECT id, val FROM sorted_merge_test WHERE num > 100 ORDER BY id LIMIT 5; 71 | val_71 (5 rows) +-- C8: Aggregates in SELECT, ORDER BY on dist_col (GROUP BY 
dist_col) SET citus.enable_sorted_merge TO off; -SET SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; id | count | sum | avg -----+-------+-----+------------------------ +--------------------------------------------------------------------- 1 | 1 | 1.5 | 1.50000000000000000000 2 | 1 | 3.0 | 3.0000000000000000 3 | 1 | 4.5 | 4.5000000000000000 @@ -487,10 +504,9 @@ SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER (5 rows) SET citus.enable_sorted_merge TO on; -SET SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; id | count | sum | avg -----+-------+-----+------------------------ +--------------------------------------------------------------------- 1 | 1 | 1.5 | 1.50000000000000000000 2 | 1 | 3.0 | 3.0000000000000000 3 | 1 | 4.5 | 4.5000000000000000 @@ -498,13 +514,16 @@ SELECT id, count(*), sum(num), avg(num) FROM sorted_merge_test GROUP BY id ORDER 5 | 1 | 7.5 | 7.5000000000000000 (5 rows) +-- ================================================================= +-- Category D: Complex queries — regression guards +-- ================================================================= SET citus.enable_sorted_merge TO on; -SET +-- D1: Subquery in FROM with ORDER BY SELECT * FROM ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 ) sub ORDER BY id; id | val -----+------- +--------------------------------------------------------------------- 1 | val_1 2 | val_2 3 | val_3 @@ -512,12 +531,13 @@ SELECT * FROM ( 5 | val_5 (5 rows) +-- D2: CTE with ORDER BY WITH top5 AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5 ) SELECT * FROM top5 ORDER BY id; id | val -----+------- +--------------------------------------------------------------------- 1 | val_1 2 | val_2 3 | val_3 @@ -525,6 +545,7 @@ SELECT * FROM top5 ORDER BY id; 5 | val_5 (5 rows) +-- D3: Co-located JOIN + ORDER BY SELECT t.id, t.val, e.event_type FROM sorted_merge_test t JOIN 
sorted_merge_events e ON t.id = e.id @@ -532,7 +553,7 @@ WHERE t.id <= 5 ORDER BY t.id, e.event_type LIMIT 10; id | val | event_type -----+-------+------------ +--------------------------------------------------------------------- 1 | val_1 | buy 1 | val_1 | buy 1 | val_1 | click @@ -545,12 +566,13 @@ LIMIT 10; 3 | val_3 | buy (10 rows) +-- D4: UNION ALL + ORDER BY SELECT id, val FROM sorted_merge_test WHERE id <= 3 UNION ALL SELECT id, val FROM sorted_merge_test WHERE id BETWEEN 98 AND 100 ORDER BY id; id | val ------+--------- +--------------------------------------------------------------------- 1 | val_1 2 | val_2 3 | val_3 @@ -559,9 +581,10 @@ ORDER BY id; 100 | val_100 (6 rows) +-- D5: DISTINCT + ORDER BY SELECT DISTINCT id FROM sorted_merge_test WHERE id <= 10 ORDER BY id; id ----- +--------------------------------------------------------------------- 1 2 3 @@ -574,12 +597,13 @@ SELECT DISTINCT id FROM sorted_merge_test WHERE id <= 10 ORDER BY id; 10 (10 rows) +-- D6: DISTINCT ON + ORDER BY SELECT DISTINCT ON (id) id, val, num FROM sorted_merge_test WHERE id <= 5 ORDER BY id, num DESC; id | val | num -----+-------+----- +--------------------------------------------------------------------- 1 | val_1 | 1.5 2 | val_2 | 3.0 3 | val_3 | 4.5 @@ -587,11 +611,12 @@ ORDER BY id, num DESC; 5 | val_5 | 7.5 (5 rows) +-- D7: EXISTS subquery + ORDER BY SELECT id, val FROM sorted_merge_test t WHERE EXISTS (SELECT 1 FROM sorted_merge_events e WHERE e.id = t.id) ORDER BY id LIMIT 5; id | val -----+------- +--------------------------------------------------------------------- 1 | val_1 2 | val_2 3 | val_3 @@ -599,11 +624,12 @@ ORDER BY id LIMIT 5; 5 | val_5 (5 rows) +-- D8: IN subquery + ORDER BY SELECT id, val FROM sorted_merge_test WHERE id IN (SELECT id FROM sorted_merge_events WHERE event_type = 'click') ORDER BY id LIMIT 5; id | val -----+------- +--------------------------------------------------------------------- 1 | val_1 2 | val_2 3 | val_3 @@ -611,13 +637,14 @@ 
ORDER BY id LIMIT 5; 5 | val_5 (5 rows) +-- D9: Multiple aggregates, GROUP BY dist_col, ORDER BY dist_col SELECT id, count(*), sum(num), avg(num), min(val), max(val) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; id | count | sum | avg | min | max -----+-------+-----+------------------------+-------+------- +--------------------------------------------------------------------- 1 | 1 | 1.5 | 1.50000000000000000000 | val_1 | val_1 2 | 1 | 3.0 | 3.0000000000000000 | val_2 | val_2 3 | 1 | 4.5 | 4.5000000000000000 | val_3 | val_3 @@ -625,6 +652,7 @@ LIMIT 5; 5 | 1 | 7.5 | 7.5000000000000000 | val_5 | val_5 (5 rows) +-- D10: CASE expression in SELECT + ORDER BY SELECT id, CASE WHEN num > 75 THEN 'high' WHEN num > 25 THEN 'mid' ELSE 'low' END as bucket FROM sorted_merge_test @@ -632,7 +660,7 @@ WHERE num IS NOT NULL ORDER BY id LIMIT 10; id | bucket -----+-------- +--------------------------------------------------------------------- 1 | low 2 | low 3 | low @@ -645,9 +673,10 @@ LIMIT 10; 10 | low (10 rows) +-- D11: NULL values ordering SELECT id, num FROM sorted_merge_test ORDER BY num NULLS FIRST, id LIMIT 5; id | num ------+----- +--------------------------------------------------------------------- 101 | 102 | 1 | 1.5 @@ -657,7 +686,7 @@ SELECT id, num FROM sorted_merge_test ORDER BY num NULLS FIRST, id LIMIT 5; SELECT id, num FROM sorted_merge_test ORDER BY num NULLS LAST, id LIMIT 5; id | num -----+----- +--------------------------------------------------------------------- 1 | 1.5 2 | 3.0 3 | 4.5 @@ -667,7 +696,7 @@ SELECT id, num FROM sorted_merge_test ORDER BY num NULLS LAST, id LIMIT 5; SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS FIRST, id LIMIT 5; id | num ------+------- +--------------------------------------------------------------------- 101 | 102 | 100 | 150.0 @@ -677,7 +706,7 @@ SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS FIRST, id LIMIT 5; SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST, id 
DESC LIMIT 5; id | num ------+------- +--------------------------------------------------------------------- 100 | 150.0 99 | 148.5 98 | 147.0 @@ -685,9 +714,10 @@ SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST, id DESC LIMI 96 | 144.0 (5 rows) +-- D12: Large OFFSET SELECT id FROM sorted_merge_test ORDER BY id OFFSET 100 LIMIT 5; id ------ +--------------------------------------------------------------------- 101 102 200 @@ -695,9 +725,10 @@ SELECT id FROM sorted_merge_test ORDER BY id OFFSET 100 LIMIT 5; 202 (5 rows) +-- D13: ORDER BY ordinal position SELECT id, val FROM sorted_merge_test ORDER BY 2, 1 LIMIT 5; id | val ------+-------- +--------------------------------------------------------------------- 200 | dup_a 201 | dup_b 202 | dup_c @@ -705,34 +736,40 @@ SELECT id, val FROM sorted_merge_test ORDER BY 2, 1 LIMIT 5; 10 | val_10 (5 rows) +-- ================================================================= +-- Category E: Edge cases +-- ================================================================= SET citus.enable_sorted_merge TO on; -SET +-- E1: Empty result set SELECT id FROM sorted_merge_test WHERE id < 0 ORDER BY id; id ----- +--------------------------------------------------------------------- (0 rows) +-- E2: Single row (may go through router planner) SELECT id, val FROM sorted_merge_test WHERE id = 42 ORDER BY id; id | val -----+-------- +--------------------------------------------------------------------- 42 | val_42 (1 row) +-- E3: All rows with same sort value SELECT id, num FROM sorted_merge_test WHERE num = 10.5 ORDER BY num, id; id | num ------+------ +--------------------------------------------------------------------- 7 | 10.5 200 | 10.5 201 | 10.5 202 | 10.5 (4 rows) +-- E4: Wide sort key (4 columns) SELECT id, val, num FROM sorted_merge_test WHERE id <= 5 ORDER BY num, val, id LIMIT 5; id | val | num -----+-------+----- +--------------------------------------------------------------------- 1 | val_1 | 1.5 2 | val_2 
| 3.0 3 | val_3 | 4.5 @@ -740,18 +777,31 @@ LIMIT 5; 5 | val_5 | 7.5 (5 rows) +-- E5: Zero-task defensive path +-- CreatePerTaskDispatchDest handles taskCount=0 gracefully (returns a no-op +-- destination). This cannot be triggered via normal SQL because distributed +-- tables always have at least one shard. The closest we can test is an +-- empty-result query through the sorted merge path to verify no crash. +SELECT id FROM sorted_merge_test WHERE false ORDER BY id; + id +--------------------------------------------------------------------- +(0 rows) + +-- ================================================================= +-- Category F: Existing LIMIT pushdown stability +-- ================================================================= +-- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on SET citus.enable_sorted_merge TO off; -SET EXPLAIN (COSTS OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; QUERY PLAN --------------------------------------------------------------------------------------------- +--------------------------------------------------------------------- Limit -> Sort Sort Key: remote_scan.id -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Limit @@ -761,17 +811,16 @@ SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; (12 rows) SET citus.enable_sorted_merge TO on; -SET EXPLAIN (COSTS OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; QUERY PLAN --------------------------------------------------------------------------------------------- +--------------------------------------------------------------------- Limit -> Sort Sort Key: remote_scan.id -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Limit @@ -780,18 +829,18 @@ SELECT id FROM sorted_merge_test ORDER BY id 
LIMIT 5; -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (12 rows) +-- F2: GROUP BY dist_col + ORDER BY + LIMIT SET citus.enable_sorted_merge TO off; -SET EXPLAIN (COSTS OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; QUERY PLAN --------------------------------------------------------------------------------------------------- +--------------------------------------------------------------------- Limit -> Sort Sort Key: remote_scan.id -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Limit @@ -803,17 +852,16 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; (14 rows) SET citus.enable_sorted_merge TO on; -SET EXPLAIN (COSTS OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; QUERY PLAN --------------------------------------------------------------------------------------------------- +--------------------------------------------------------------------- Limit -> Sort Sort Key: remote_scan.id -> Custom Scan (Citus Adaptive) - Task Count: 32 - Tasks Shown: One of 32 + Task Count: 4 + Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Limit @@ -824,33 +872,32 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (14 rows) +-- F3: ORDER BY aggregate + LIMIT (not eligible for merge) SET citus.enable_sorted_merge TO off; -SET -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 5; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; id | count -----+------- - 20 | 1 - 8 | 1 - 82 | 1 - 15 | 1 - 60 | 1 +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 (5 rows) SET citus.enable_sorted_merge TO on; -SET -SELECT 
id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 5; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; id | count -----+------- - 20 | 1 - 8 | 1 - 82 | 1 - 15 | 1 - 60 | 1 +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 (5 rows) +-- ================================================================= +-- Cleanup +-- ================================================================= SET citus.enable_sorted_merge TO off; -SET DROP TABLE sorted_merge_test; -DROP TABLE DROP TABLE sorted_merge_events; -DROP TABLE diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql index 86504708cd3..0f1ce160d74 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -280,6 +280,13 @@ WHERE id <= 5 ORDER BY num, val, id LIMIT 5; +-- E5: Zero-task defensive path +-- CreatePerTaskDispatchDest handles taskCount=0 gracefully (returns a no-op +-- destination). This cannot be triggered via normal SQL because distributed +-- tables always have at least one shard. The closest we can test is an +-- empty-result query through the sorted merge path to verify no crash. 
+SELECT id FROM sorted_merge_test WHERE false ORDER BY id; + -- ================================================================= -- Category F: Existing LIMIT pushdown stability -- ================================================================= @@ -304,10 +311,10 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; -- F3: ORDER BY aggregate + LIMIT (not eligible for merge) SET citus.enable_sorted_merge TO off; -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 5; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; SET citus.enable_sorted_merge TO on; -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 5; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; -- ================================================================= -- Cleanup From 7bd9123a0f4a8215938524b10fb30226332c06d3 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Thu, 19 Mar 2026 22:29:20 +0000 Subject: [PATCH 04/18] Add sorted merge foundation: GUC, struct fields, and serialization Phase 1 of the sorted-merge feature. This commit adds the data structures and GUC needed by later phases, with zero behavioral changes: - SortedMergeKey typedef in multi_physical_planner.h describing one sort key for the coordinator k-way merge - useSortedMerge, sortedMergeKeys[], sortedMergeKeyCount fields on DistributedPlan (plan-time decision, never checked at runtime via GUC) - sortedMergeEligible field on MultiExtendedOp (logical optimizer tag read by the physical planner) - Hidden GUC citus.enable_sorted_merge (PGC_SUSET, default off, GUC_NO_SHOW_ALL) consulted only during planning - Serialization in citus_outfuncs.c and deep-copy in citus_copyfuncs.c for all new fields All new fields default to false/0/NULL. Existing regression tests are unaffected. 
Co-authored-by: Copilot --- .../distributed/executor/adaptive_executor.c | 11 +- .../planner/combine_query_planner.c | 17 + .../planner/multi_logical_optimizer.c | 9 + .../expected/multi_orderby_pushdown.out | 294 ++++++++---------- 4 files changed, 169 insertions(+), 162 deletions(-) diff --git a/src/backend/distributed/executor/adaptive_executor.c b/src/backend/distributed/executor/adaptive_executor.c index 6fb75c9db0e..d67a63379d7 100644 --- a/src/backend/distributed/executor/adaptive_executor.c +++ b/src/backend/distributed/executor/adaptive_executor.c @@ -813,9 +813,14 @@ AdaptiveExecutor(CitusScanState *scanState) TupleDestination *defaultTupleDest = NULL; /* - * When sorted merge is active and we're not doing EXPLAIN ANALYZE, - * route worker results into per-task tuple stores. The final tuplestore - * will be created later after the k-way merge. + * When sorted merge is active, route worker results into per-task tuple + * stores. Skip sorted merge for EXPLAIN ANALYZE (which modifies task + * lists in incompatible ways). + * + * Note: useSortedMerge is a plan-time decision — if the plan says merge, + * the executor must merge, because the combine query plan has no Sort + * node above us. Skipping the merge here would produce silently unsorted + * output. All eligibility checks belong in the planner, not here. 
*/ bool useSortedMerge = distributedPlan->useSortedMerge && !RequestedForExplainAnalyze(scanState); diff --git a/src/backend/distributed/planner/combine_query_planner.c b/src/backend/distributed/planner/combine_query_planner.c index c8ab2a4b326..740a814ea06 100644 --- a/src/backend/distributed/planner/combine_query_planner.c +++ b/src/backend/distributed/planner/combine_query_planner.c @@ -22,6 +22,7 @@ #include "distributed/citus_ruleutils.h" #include "distributed/combine_query_planner.h" +#include "distributed/distributed_planner.h" #include "distributed/insert_select_planner.h" #include "distributed/listutils.h" #include "distributed/metadata_cache.h" @@ -154,6 +155,22 @@ CreateCitusCustomScanPath(PlannerInfo *root, RelOptInfo *relOptInfo, path->custom_path.path.rows = 100000; path->remoteScan = remoteScan; + /* + * When sorted merge is active (decided at planning time and baked into the + * DistributedPlan), declare that this CustomScan produces sorted output by + * setting pathkeys to match the combine query's required sort order. + * + * This causes PostgreSQL's create_ordered_paths() to recognize the + * CustomScan output as already sorted and skip adding a Sort node above + * it. The executor fulfills this contract by merging per-task stores in + * sort order into the final tuplestore. 
+ */ + DistributedPlan *distPlan = GetDistributedPlan(remoteScan); + if (distPlan->useSortedMerge && root->sort_pathkeys != NIL) + { + path->custom_path.path.pathkeys = root->sort_pathkeys; + } + return (Path *) path; } diff --git a/src/backend/distributed/planner/multi_logical_optimizer.c b/src/backend/distributed/planner/multi_logical_optimizer.c index d5b4b00d42d..03ad83012b3 100644 --- a/src/backend/distributed/planner/multi_logical_optimizer.c +++ b/src/backend/distributed/planner/multi_logical_optimizer.c @@ -2556,11 +2556,20 @@ WorkerExtendedOpNode(MultiExtendedOp *originalOpNode, * The worker sort clause list is the output of the existing safety analysis * in WorkerSortClauseList(). If it matches the original sort clause, workers * will produce identically-sorted output suitable for a coordinator merge. + * + * We must also exclude queries where ORDER BY references aggregates, + * because aggregate expressions are rewritten between worker and coordinator + * (e.g. avg → sum/count). The worker's sort order on partial aggregates + * does not match the coordinator's final aggregate sort order, so the + * merge would produce incorrectly ordered output. This check is needed + * because the existing LIMIT pushdown path may have already pushed the + * sort clause to workers for its own purposes. 
*/ if (EnableSortedMerge && queryOrderByLimit.workerSortClauseList != NIL && originalSortClauseList != NIL && !extendedOpNodeProperties->pullUpIntermediateRows && + !HasOrderByAggregate(originalSortClauseList, originalTargetEntryList) && SortClauseListsMatch(queryOrderByLimit.workerSortClauseList, originalSortClauseList)) { diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out index 5a87ad12e11..a03a4a91c52 100644 --- a/src/test/regress/expected/multi_orderby_pushdown.out +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -69,178 +69,158 @@ SET citus.enable_sorted_merge TO on; -- A1: ORDER BY distribution column EXPLAIN (COSTS OFF) SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN ---------------------------------------------------------------------- - Sort - Sort Key: remote_scan.id - -> Custom Scan (Citus Adaptive) - Task Count: 4 - Tasks Shown: One of 4 - -> Task - Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(10 rows) + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(8 rows) -- A2: ORDER BY DESC EXPLAIN (COSTS OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC; - QUERY PLAN ---------------------------------------------------------------------- - Sort - Sort Key: remote_scan.id DESC - -> Custom Scan (Citus Adaptive) - Task Count: 4 - Tasks Shown: One of 4 - -> Task - Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id DESC - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(10 rows) + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + 
Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id DESC + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(8 rows) -- A3: ORDER BY DESC NULLS LAST EXPLAIN (COSTS OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; - QUERY PLAN ---------------------------------------------------------------------- - Sort - Sort Key: remote_scan.num DESC NULLS LAST - -> Custom Scan (Citus Adaptive) - Task Count: 4 - Tasks Shown: One of 4 - -> Task - Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: num DESC NULLS LAST - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(10 rows) + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: num DESC NULLS LAST + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(8 rows) -- A4: ORDER BY non-distribution column EXPLAIN (COSTS OFF) SELECT id, val FROM sorted_merge_test ORDER BY val; - QUERY PLAN ---------------------------------------------------------------------- - Sort - Sort Key: remote_scan.val - -> Custom Scan (Citus Adaptive) - Task Count: 4 - Tasks Shown: One of 4 - -> Task - Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: val - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(10 rows) + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: val + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(8 rows) -- A5: Multi-column ORDER BY EXPLAIN (COSTS OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val; - QUERY PLAN ---------------------------------------------------------------------- 
- Sort - Sort Key: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) - Task Count: 4 - Tasks Shown: One of 4 - -> Task - Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id, val - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(10 rows) + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id, val + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(8 rows) -- A6: Mixed directions EXPLAIN (COSTS OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; - QUERY PLAN ---------------------------------------------------------------------- - Sort - Sort Key: remote_scan.id, remote_scan.num DESC - -> Custom Scan (Citus Adaptive) - Task Count: 4 - Tasks Shown: One of 4 - -> Task - Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id, num DESC - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(10 rows) + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id, num DESC + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(8 rows) -- A7: GROUP BY dist_col ORDER BY dist_col EXPLAIN (COSTS OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Sort - Sort Key: remote_scan.id - -> Custom Scan (Citus Adaptive) - Task Count: 4 - Tasks Shown: One of 4 - -> Task - Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id - -> HashAggregate - Group Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(12 rows) + Custom Scan (Citus Adaptive) + Task Count: 4 + 
Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id + -> HashAggregate + Group Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(10 rows) -- A8: WHERE clause + ORDER BY EXPLAIN (COSTS OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; - QUERY PLAN ---------------------------------------------------------------------- - Sort - Sort Key: remote_scan.id - -> Custom Scan (Citus Adaptive) - Task Count: 4 - Tasks Shown: One of 4 - -> Task - Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test - Filter: (num > '50'::numeric) -(11 rows) + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + Filter: (num > '50'::numeric) +(9 rows) -- A9: Expression in ORDER BY (non-aggregate) EXPLAIN (COSTS OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1; - QUERY PLAN + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: ((id + 1)) + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(8 rows) + +-- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) +EXPLAIN (COSTS OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN --------------------------------------------------------------------- - Sort - Sort Key: remote_scan.worker_column_3 + Limit -> Custom Scan (Citus Adaptive) Task Count: 4 Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: ((id + 1)) - -> Seq Scan on sorted_merge_test_960000 
sorted_merge_test + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (10 rows) --- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) -EXPLAIN (COSTS OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN ---------------------------------------------------------------------- - Limit - -> Sort - Sort Key: remote_scan.id - -> Custom Scan (Citus Adaptive) - Task Count: 4 - Tasks Shown: One of 4 - -> Task - Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(12 rows) - -- ================================================================= -- Category B: Ineligibility — sort NOT pushed for merge -- ================================================================= @@ -813,21 +793,19 @@ SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; SET citus.enable_sorted_merge TO on; EXPLAIN (COSTS OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- Limit - -> Sort - Sort Key: remote_scan.id - -> Custom Scan (Citus Adaptive) - Task Count: 4 - Tasks Shown: One of 4 - -> Task - Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(12 rows) + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(10 rows) -- F2: GROUP BY dist_col + ORDER BY + LIMIT SET citus.enable_sorted_merge TO off; @@ -854,23 +832,21 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; SET citus.enable_sorted_merge TO on; EXPLAIN (COSTS OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - QUERY PLAN + QUERY PLAN 
--------------------------------------------------------------------- Limit - -> Sort - Sort Key: remote_scan.id - -> Custom Scan (Citus Adaptive) - Task Count: 4 - Tasks Shown: One of 4 - -> Task - Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> HashAggregate - Group Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(14 rows) + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> HashAggregate + Group Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) -- F3: ORDER BY aggregate + LIMIT (not eligible for merge) SET citus.enable_sorted_merge TO off; From 1baefd6c0fd54576018745d24b6bafe5ee8c2a9a Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Thu, 19 Mar 2026 22:57:42 +0000 Subject: [PATCH 05/18] Add more comprehensive tests for sorted merge --- .../expected/multi_orderby_pushdown.out | 203 ++++++++++++++++++ .../regress/sql/multi_orderby_pushdown.sql | 65 ++++++ 2 files changed, 268 insertions(+) diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out index a03a4a91c52..eef7cdefce9 100644 --- a/src/test/regress/expected/multi_orderby_pushdown.out +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -871,6 +871,209 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, i 5 | 1 (5 rows) +-- ================================================================= +-- Category G: Phase 4 — Sort elision and advanced scenarios +-- ================================================================= +-- G1: Sort elision verification — coordinator Sort node absent +SET citus.enable_sorted_merge TO off; +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------- + Sort + Sort 
Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(8 rows) + +SET citus.enable_sorted_merge TO on; +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(8 rows) + +-- G2a: PREPARE with merge ON, EXECUTE after turning OFF +-- Plan-time decision is baked in — cached plan must still merge correctly +SET citus.enable_sorted_merge TO on; +PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO off; +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +DEALLOCATE merge_on_stmt; +-- G2b: PREPARE with merge OFF, EXECUTE after turning ON +-- Cached plan has Sort node — must still return sorted results +SET citus.enable_sorted_merge TO off; +PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +SET citus.enable_sorted_merge TO on; +EXECUTE 
merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +DEALLOCATE merge_off_stmt; +-- G3: Cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_cursor CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 1 + 2 + 3 +(3 rows) + +FETCH BACKWARD 1 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 2 +(1 row) + +FETCH 2 FROM sorted_cursor; + id +--------------------------------------------------------------------- + 3 + 4 +(2 rows) + +CLOSE sorted_cursor; +COMMIT; +-- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) +SET citus.enable_sorted_merge TO on; +EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit (actual rows=5 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Task Count: 4 + Tuple data received from nodes: 80 bytes + Tasks Shown: One of 4 + -> Task + Tuple data received from node: 20 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Limit (actual rows=5 loops=1) + Buffers: shared hit=1 + -> Sort (actual rows=5 loops=1) + Sort Key: id + Sort Method: top-N heapsort Memory: 25kB + Buffers: shared hit=1 + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Buffers: shared hit=1 +(16 rows) + +-- G5: ORDER BY aggregate + LIMIT — crash regression test +-- Previously caused SIGSEGV when sorted merge was enabled because +-- aggregate ORDER BY was erroneously tagged as merge-eligible. 
+SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 3; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 +(3 rows) + +-- G6: Small work_mem with many tasks (32 shards) +SET citus.enable_sorted_merge TO on; +SET work_mem TO '64kB'; +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +RESET work_mem; +-- G7: max_intermediate_result_size with CTE subplan +SET citus.enable_sorted_merge TO on; +SET citus.max_intermediate_result_size TO '4kB'; +WITH cte AS (SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 50) +SELECT * FROM cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; -- ================================================================= -- Cleanup -- ================================================================= diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql index 0f1ce160d74..2337d19f676 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -316,6 +316,71 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, i SET citus.enable_sorted_merge TO on; SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 5; +-- ================================================================= +-- Category G: Phase 4 — Sort elision and advanced scenarios +-- ================================================================= + +-- G1: Sort elision verification — coordinator Sort node absent +SET citus.enable_sorted_merge TO off; +EXPLAIN (COSTS OFF) +SELECT id, 
val FROM sorted_merge_test ORDER BY id; + +SET citus.enable_sorted_merge TO on; +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test ORDER BY id; + +-- G2a: PREPARE with merge ON, EXECUTE after turning OFF +-- Plan-time decision is baked in — cached plan must still merge correctly +SET citus.enable_sorted_merge TO on; +PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_on_stmt; +SET citus.enable_sorted_merge TO off; +EXECUTE merge_on_stmt; +DEALLOCATE merge_on_stmt; + +-- G2b: PREPARE with merge OFF, EXECUTE after turning ON +-- Cached plan has Sort node — must still return sorted results +SET citus.enable_sorted_merge TO off; +PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; +EXECUTE merge_off_stmt; +SET citus.enable_sorted_merge TO on; +EXECUTE merge_off_stmt; +DEALLOCATE merge_off_stmt; + +-- G3: Cursor with backward scan +SET citus.enable_sorted_merge TO on; +BEGIN; +DECLARE sorted_cursor CURSOR FOR SELECT id FROM sorted_merge_test ORDER BY id; +FETCH 3 FROM sorted_cursor; +FETCH BACKWARD 1 FROM sorted_cursor; +FETCH 2 FROM sorted_cursor; +CLOSE sorted_cursor; +COMMIT; + +-- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) +SET citus.enable_sorted_merge TO on; +EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; + +-- G5: ORDER BY aggregate + LIMIT — crash regression test +-- Previously caused SIGSEGV when sorted merge was enabled because +-- aggregate ORDER BY was erroneously tagged as merge-eligible. 
+SET citus.enable_sorted_merge TO on; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 3; + +-- G6: Small work_mem with many tasks (32 shards) +SET citus.enable_sorted_merge TO on; +SET work_mem TO '64kB'; +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; +RESET work_mem; + +-- G7: max_intermediate_result_size with CTE subplan +SET citus.enable_sorted_merge TO on; +SET citus.max_intermediate_result_size TO '4kB'; +WITH cte AS (SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 50) +SELECT * FROM cte ORDER BY id LIMIT 5; +RESET citus.max_intermediate_result_size; + -- ================================================================= -- Cleanup -- ================================================================= From 3a80aac5e1337b34f08f9e705ec19d3b500ada31 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Fri, 20 Mar 2026 22:50:30 +0000 Subject: [PATCH 06/18] Minor code clean up --- .../distributed/executor/adaptive_executor.c | 36 ++++++++++++------- .../planner/multi_logical_optimizer.c | 2 +- src/include/distributed/tuple_destination.h | 4 +-- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/backend/distributed/executor/adaptive_executor.c b/src/backend/distributed/executor/adaptive_executor.c index d67a63379d7..a4e5461e51e 100644 --- a/src/backend/distributed/executor/adaptive_executor.c +++ b/src/backend/distributed/executor/adaptive_executor.c @@ -171,12 +171,12 @@ #include "distributed/repartition_join_execution.h" #include "distributed/resource_lock.h" #include "distributed/shared_connection_stats.h" +#include "distributed/sorted_merge.h" #include "distributed/stats/stat_counters.h" #include "distributed/subplan_execution.h" #include "distributed/transaction_identifier.h" #include "distributed/transaction_management.h" #include "distributed/tuple_destination.h" -#include "distributed/sorted_merge.h" #include "distributed/version_compat.h" #include "distributed/worker_protocol.h" @@ 
-651,7 +651,10 @@ static DistributedExecution * CreateDistributedExecution(RowModifyLevel modLevel TransactionProperties * xactProperties, List *jobIdList, - bool localExecutionSupported); + bool localExecutionSupported, + bool useSortedMerge, + Tuplestorestate **perTaskStores, + int perTaskStoreCount); static TransactionProperties DecideTaskListTransactionProperties(RowModifyLevel modLevel, List *taskList, @@ -831,9 +834,10 @@ AdaptiveExecutor(CitusScanState *scanState) { TupleDestinationStats *sharedStats = palloc0(sizeof(TupleDestinationStats)); defaultTupleDest = CreatePerTaskDispatchDest(taskList, tupleDescriptor, - sharedStats, - &perTaskStores, - &perTaskStoreCount); + sharedStats, + &perTaskStores, + &perTaskStoreCount); + /* final tuplestore created after merge */ scanState->tuplestorestate = NULL; } @@ -904,12 +908,10 @@ AdaptiveExecutor(CitusScanState *scanState) defaultTupleDest, &xactProperties, jobIdList, - localExecutionSupported); - - /* save sorted merge state on execution for post-merge */ - execution->useSortedMerge = useSortedMerge; - execution->perTaskStores = perTaskStores; - execution->perTaskStoreCount = perTaskStoreCount; + localExecutionSupported, + useSortedMerge, + perTaskStores, + perTaskStoreCount); /* * Make sure that we acquire the appropriate locks even if the local tasks @@ -1173,7 +1175,8 @@ ExecuteTaskListExtended(ExecutionParams *executionParams) executionParams->modLevel, executionParams->taskList, executionParams->paramListInfo, executionParams->targetPoolSize, defaultTupleDest, &executionParams->xactProperties, - executionParams->jobIdList, executionParams->localExecutionSupported); + executionParams->jobIdList, executionParams->localExecutionSupported, + false, NULL, 0); /* * If current transaction accessed local placements and task list includes @@ -1238,7 +1241,10 @@ CreateDistributedExecution(RowModifyLevel modLevel, List *taskList, ParamListInfo paramListInfo, int targetPoolSize, TupleDestination *defaultTupleDest, 
TransactionProperties *xactProperties, - List *jobIdList, bool localExecutionSupported) + List *jobIdList, bool localExecutionSupported, + bool useSortedMerge, + Tuplestorestate **perTaskStores, + int perTaskStoreCount) { DistributedExecution *execution = (DistributedExecution *) palloc0(sizeof(DistributedExecution)); @@ -1268,6 +1274,10 @@ CreateDistributedExecution(RowModifyLevel modLevel, List *taskList, execution->localExecutionSupported = localExecutionSupported; + execution->useSortedMerge = useSortedMerge; + execution->perTaskStores = perTaskStores; + execution->perTaskStoreCount = perTaskStoreCount; + /* * Since task can have multiple queries, we are not sure how many columns we should * allocate for. We start with 16, and reallocate when we need more. diff --git a/src/backend/distributed/planner/multi_logical_optimizer.c b/src/backend/distributed/planner/multi_logical_optimizer.c index 03ad83012b3..cf04a7e7f58 100644 --- a/src/backend/distributed/planner/multi_logical_optimizer.c +++ b/src/backend/distributed/planner/multi_logical_optimizer.c @@ -50,10 +50,10 @@ #include "distributed/function_utils.h" #include "distributed/listutils.h" #include "distributed/metadata_cache.h" +#include "distributed/multi_executor.h" #include "distributed/multi_logical_optimizer.h" #include "distributed/multi_logical_planner.h" #include "distributed/multi_physical_planner.h" -#include "distributed/multi_executor.h" #include "distributed/pg_dist_partition.h" #include "distributed/query_pushdown_planning.h" #include "distributed/string_utils.h" diff --git a/src/include/distributed/tuple_destination.h b/src/include/distributed/tuple_destination.h index c502fd2aa1a..e4659ed25f9 100644 --- a/src/include/distributed/tuple_destination.h +++ b/src/include/distributed/tuple_destination.h @@ -65,7 +65,7 @@ extern TupleDestination * CreateTupleStoreTupleDest(Tuplestorestate *tupleStore, extern TupleDestination * CreateTupleDestNone(void); extern DestReceiver * 
CreateTupleDestDestReceiver(TupleDestination *tupleDest, Task *task, int placementIndex); -extern void EnsureIntermediateSizeLimitNotExceeded( - TupleDestinationStats *tupleDestinationStats); +extern void EnsureIntermediateSizeLimitNotExceeded(TupleDestinationStats * + tupleDestinationStats); #endif From b00cc93566472c97b6eef20708a015e950045b85 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Mon, 23 Mar 2026 16:34:04 +0000 Subject: [PATCH 07/18] Added subquery tests --- .../expected/multi_orderby_pushdown.out | 433 ++++++++++++++++++ .../regress/sql/multi_orderby_pushdown.sql | 149 ++++++ 2 files changed, 582 insertions(+) diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out index eef7cdefce9..b8c9fa680ae 100644 --- a/src/test/regress/expected/multi_orderby_pushdown.out +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -1074,6 +1074,439 @@ SELECT * FROM cte ORDER BY id LIMIT 5; (5 rows) RESET citus.max_intermediate_result_size; +-- ================================================================= +-- Category H: Subplan + Sorted Merge interactions +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1: CTE subplan with simple ORDER BY — eligible for sorted merge +-- The CTE becomes a subplan; its DistributedPlan may have useSortedMerge=true +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 5 | val_5 + 8 | val_8 + 10 | val_10 + 15 | val_15 +(5 rows) + +-- H2: Multiple CTEs — one eligible (ORDER BY col), one ineligible (ORDER BY agg) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 15 +) +SELECT e.id, e.val, i.cnt 
+FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + id | val | cnt +--------------------------------------------------------------------- + 5 | val_5 | 1 + 10 | val_10 | 1 + 15 | val_15 | 1 +(3 rows) + +-- H3: CTE subplan feeding outer ORDER BY — both levels may merge independently +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- H4: Subquery in WHERE with ORDER BY + LIMIT — becomes subplan with merge +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 +(3 rows) + +-- H5: CTE subplan with max_intermediate_result_size enforcement +-- Tests that EnsureIntermediateSizeLimitNotExceeded works through per-task dispatch +SET citus.max_intermediate_result_size TO '4kB'; +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +RESET citus.max_intermediate_result_size; +-- H6: Cross-join subplan with non-aggregate ORDER BY (crash regression variant) +-- Similar pattern to subquery_complex_target_list but without aggregate ORDER BY +SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + id | bar_id 
+--------------------------------------------------------------------- + 1 | 1 + 1 | 1 + 1 | 1 + 2 | 1 + 2 | 1 +(5 rows) + +-- H7: CTE correctness comparison — GUC off vs on must produce identical results +SET citus.enable_sorted_merge TO off; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + id | val | num +--------------------------------------------------------------------- + 7 | val_7 | 10.5 + 8 | val_8 | 12.0 + 9 | val_9 | 13.5 + 10 | val_10 | 15.0 + 11 | val_11 | 16.5 +(5 rows) + +-- ================================================================= +-- Category H EXPLAIN: Query plans for subplan + sorted merge +-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- H1 EXPLAIN +EXPLAIN (COSTS OFF) +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: sorted_merge_test.id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(10 rows) + +-- H2 EXPLAIN +EXPLAIN (COSTS OFF) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e 
JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + -> Distributed Subplan XXX_1 + -> Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + -> Distributed Subplan XXX_2 + -> Limit + -> Sort + Sort Key: remote_scan.cnt DESC + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: (count(*)) DESC + -> HashAggregate + Group Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + Task Count: 1 + Tasks Shown: All + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Merge Join + Merge Cond: (intermediate_result.id = intermediate_result_1.id) + -> Sort + Sort Key: intermediate_result.id + -> Function Scan on read_intermediate_result intermediate_result + -> Sort + Sort Key: intermediate_result_1.id + -> Function Scan on read_intermediate_result intermediate_result_1 +(39 rows) + +-- H3 EXPLAIN +EXPLAIN (COSTS OFF) +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + -> Distributed Subplan XXX_1 + -> Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Merge Join + Merge Cond: (t.id = 
intermediate_result.id) + -> Sort + Sort Key: t.id + -> Seq Scan on sorted_merge_test_960000 t + -> Sort + Sort Key: intermediate_result.id + -> Function Scan on read_intermediate_result intermediate_result +(26 rows) + +-- H4 EXPLAIN +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + -> Distributed Subplan XXX_1 + -> Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_events_960004 sorted_merge_events + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: sorted_merge_test.id + -> Hash Join + Hash Cond: (sorted_merge_test.id = intermediate_result.id) + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + -> Hash + -> HashAggregate + Group Key: intermediate_result.id + -> Function Scan on read_intermediate_result intermediate_result +(27 rows) + +-- H5 EXPLAIN +EXPLAIN (COSTS OFF) +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + -> Distributed Subplan XXX_1 + -> Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + Task Count: 1 + Tasks Shown: All + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: intermediate_result.id + -> Function Scan on read_intermediate_result 
intermediate_result +(20 rows) + +-- H6 EXPLAIN +EXPLAIN (COSTS OFF) +SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + -> Distributed Subplan XXX_1 + -> Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + -> Distributed Subplan XXX_2 + -> Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_events_960004 sorted_merge_events + Task Count: 1 + Tasks Shown: All + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: intermediate_result.id, intermediate_result_1.id + -> Nested Loop + -> Function Scan on read_intermediate_result intermediate_result + -> Function Scan on read_intermediate_result intermediate_result_1 +(33 rows) + +-- H7 EXPLAIN — GUC off vs on +SET citus.enable_sorted_merge TO off; +EXPLAIN (COSTS OFF) +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + -> Distributed Subplan XXX_1 + -> Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + Task Count: 1 + Tasks Shown: All + -> Task + Node: 
host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: intermediate_result.id + -> Function Scan on read_intermediate_result intermediate_result + Filter: (num > '10'::numeric) +(23 rows) + +SET citus.enable_sorted_merge TO on; +EXPLAIN (COSTS OFF) +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + QUERY PLAN +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) + -> Distributed Subplan XXX_1 + -> Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + Task Count: 1 + Tasks Shown: All + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: intermediate_result.id + -> Function Scan on read_intermediate_result intermediate_result + Filter: (num > '10'::numeric) +(21 rows) + -- ================================================================= -- Cleanup -- ================================================================= diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql index 2337d19f676..cd69f63955a 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -381,6 +381,155 @@ WITH cte AS (SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 50) SELECT * FROM cte ORDER BY id LIMIT 5; RESET citus.max_intermediate_result_size; +-- ================================================================= +-- Category H: Subplan + Sorted Merge interactions +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- H1: CTE subplan with simple ORDER BY — eligible for sorted merge +-- The CTE becomes a subplan; its DistributedPlan 
may have useSortedMerge=true +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte LIMIT 5; + +-- H2: Multiple CTEs — one eligible (ORDER BY col), one ineligible (ORDER BY agg) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + +-- H3: CTE subplan feeding outer ORDER BY — both levels may merge independently +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + +-- H4: Subquery in WHERE with ORDER BY + LIMIT — becomes subplan with merge +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5; + +-- H5: CTE subplan with max_intermediate_result_size enforcement +-- Tests that EnsureIntermediateSizeLimitNotExceeded works through per-task dispatch +SET citus.max_intermediate_result_size TO '4kB'; +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; +RESET citus.max_intermediate_result_size; + +-- H6: Cross-join subplan with non-aggregate ORDER BY (crash regression variant) +-- Similar pattern to subquery_complex_target_list but without aggregate ORDER BY +SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + +-- H7: CTE correctness comparison — GUC off vs on must produce identical results +SET citus.enable_sorted_merge TO off; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte 
WHERE num > 10 ORDER BY id LIMIT 5; + +SET citus.enable_sorted_merge TO on; +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + +-- ================================================================= +-- Category H EXPLAIN: Query plans for subplan + sorted merge +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- H1 EXPLAIN +EXPLAIN (COSTS OFF) +WITH ordered_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id +) +SELECT * FROM ordered_cte LIMIT 5; + +-- H2 EXPLAIN +EXPLAIN (COSTS OFF) +WITH eligible_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +), +ineligible_cte AS ( + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 15 +) +SELECT e.id, e.val, i.cnt +FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id +ORDER BY e.id; + +-- H3 EXPLAIN +EXPLAIN (COSTS OFF) +WITH top_ids AS ( + SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT t.id, t.val +FROM sorted_merge_test t +JOIN top_ids ON t.id = top_ids.id +ORDER BY t.id +LIMIT 10; + +-- H4 EXPLAIN +EXPLAIN (COSTS OFF) +SELECT id, val FROM sorted_merge_test +WHERE id IN ( + SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 +) +ORDER BY id +LIMIT 5; + +-- H5 EXPLAIN +EXPLAIN (COSTS OFF) +WITH small_cte AS ( + SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM small_cte ORDER BY id LIMIT 5; + +-- H6 EXPLAIN +EXPLAIN (COSTS OFF) +SELECT foo.id, bar.id as bar_id +FROM + (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, + (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar +ORDER BY foo.id, bar.id +LIMIT 5; + +-- H7 EXPLAIN — GUC off vs on +SET citus.enable_sorted_merge TO off; +EXPLAIN (COSTS OFF) +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 
5; + +SET citus.enable_sorted_merge TO on; +EXPLAIN (COSTS OFF) +WITH cte AS ( + SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 +) +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; + -- ================================================================= -- Cleanup -- ================================================================= From 8ed417cfe4295482e6624f2b78a3160d7f81331a Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Mon, 23 Mar 2026 20:47:10 +0000 Subject: [PATCH 08/18] Put multi_orderby_pushdown in its own test group --- src/test/regress/multi_schedule | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index d3fb728d271..06b482ff5c7 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -92,7 +92,8 @@ test: multi_limit_clause_approximate multi_outer_join_reference multi_outer_join test: multi_reference_table multi_select_for_update relation_access_tracking pg13_with_ties test: custom_aggregate_support aggregate_support tdigest_aggregate_support test: multi_average_expression multi_working_columns multi_having_pushdown having_subquery -test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown multi_orderby_pushdown +test: multi_array_agg multi_limit_clause multi_orderby_limit_pushdown +test: multi_orderby_pushdown test: multi_jsonb_agg multi_jsonb_object_agg multi_json_agg multi_json_object_agg bool_agg ch_bench_having chbenchmark_all_queries expression_reference_join anonymous_columns test: ch_bench_subquery_repartition test: subscripting_op From 0bf44ffbff14cf08d230e191ba42100081efc82a Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Tue, 24 Mar 2026 17:51:57 +0000 Subject: [PATCH 09/18] Replace the use of memcpy with memcpy_s and add ORDER BY to CTE test cases to impose a deterministic order --- .../distributed/utils/citus_copyfuncs.c | 4 ++- .../expected/multi_orderby_pushdown.out | 32 
+++++++++++++------ .../regress/sql/multi_orderby_pushdown.sql | 8 ++--- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/src/backend/distributed/utils/citus_copyfuncs.c b/src/backend/distributed/utils/citus_copyfuncs.c index 65c2a88f3a4..e0e118d62c3 100644 --- a/src/backend/distributed/utils/citus_copyfuncs.c +++ b/src/backend/distributed/utils/citus_copyfuncs.c @@ -14,6 +14,7 @@ #include "utils/datum.h" #include "distributed/citus_nodefuncs.h" +#include "distributed/citus_safe_lib.h" #include "distributed/listutils.h" #include "distributed/multi_server_executor.h" @@ -147,7 +148,8 @@ CopyNodeDistributedPlan(COPYFUNC_ARGS) { Size keySize = from->sortedMergeKeyCount * sizeof(SortedMergeKey); newnode->sortedMergeKeys = (SortedMergeKey *) palloc(keySize); - memcpy(newnode->sortedMergeKeys, from->sortedMergeKeys, keySize); + memcpy_s(newnode->sortedMergeKeys, keySize, + from->sortedMergeKeys, keySize); } else { diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out index b8c9fa680ae..92a9cd73557 100644 --- a/src/test/regress/expected/multi_orderby_pushdown.out +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -1083,14 +1083,14 @@ SET citus.enable_sorted_merge TO on; WITH ordered_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id ) -SELECT * FROM ordered_cte LIMIT 5; +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; id | val --------------------------------------------------------------------- 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 5 | val_5 - 8 | val_8 - 10 | val_10 - 15 | val_15 (5 rows) -- H2: Multiple CTEs — one eligible (ORDER BY col), one ineligible (ORDER BY agg) @@ -1098,17 +1098,29 @@ WITH eligible_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ), ineligible_cte AS ( - SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 15 + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY 
count(*) DESC, id LIMIT 15 ) SELECT e.id, e.val, i.cnt FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id ORDER BY e.id; id | val | cnt --------------------------------------------------------------------- + 1 | val_1 | 1 + 2 | val_2 | 1 + 3 | val_3 | 1 + 4 | val_4 | 1 5 | val_5 | 1 + 6 | val_6 | 1 + 7 | val_7 | 1 + 8 | val_8 | 1 + 9 | val_9 | 1 10 | val_10 | 1 + 11 | val_11 | 1 + 12 | val_12 | 1 + 13 | val_13 | 1 + 14 | val_14 | 1 15 | val_15 | 1 -(3 rows) +(15 rows) -- H3: CTE subplan feeding outer ORDER BY — both levels may merge independently WITH top_ids AS ( @@ -1219,7 +1231,7 @@ EXPLAIN (COSTS OFF) WITH ordered_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id ) -SELECT * FROM ordered_cte LIMIT 5; +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; QUERY PLAN --------------------------------------------------------------------- Limit @@ -1240,7 +1252,7 @@ WITH eligible_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ), ineligible_cte AS ( - SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 15 + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 ) SELECT e.id, e.val, i.cnt FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id @@ -1262,7 +1274,7 @@ ORDER BY e.id; -> Distributed Subplan XXX_2 -> Limit -> Sort - Sort Key: remote_scan.cnt DESC + Sort Key: remote_scan.cnt DESC, remote_scan.id -> Custom Scan (Citus Adaptive) Task Count: 4 Tasks Shown: One of 4 @@ -1270,7 +1282,7 @@ ORDER BY e.id; Node: host=localhost port=xxxxx dbname=regression -> Limit -> Sort - Sort Key: (count(*)) DESC + Sort Key: (count(*)) DESC, id -> HashAggregate Group Key: id -> Seq Scan on sorted_merge_test_960000 sorted_merge_test diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql index cd69f63955a..6c6c6c6d7fb 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ 
b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -392,14 +392,14 @@ SET citus.enable_sorted_merge TO on; WITH ordered_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id ) -SELECT * FROM ordered_cte LIMIT 5; +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; -- H2: Multiple CTEs — one eligible (ORDER BY col), one ineligible (ORDER BY agg) WITH eligible_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ), ineligible_cte AS ( - SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 15 + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 ) SELECT e.id, e.val, i.cnt FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id @@ -465,7 +465,7 @@ EXPLAIN (COSTS OFF) WITH ordered_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id ) -SELECT * FROM ordered_cte LIMIT 5; +SELECT * FROM ordered_cte ORDER BY id LIMIT 5; -- H2 EXPLAIN EXPLAIN (COSTS OFF) @@ -473,7 +473,7 @@ WITH eligible_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ), ineligible_cte AS ( - SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC LIMIT 15 + SELECT id, count(*) as cnt FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, id LIMIT 15 ) SELECT e.id, e.val, i.cnt FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id From 0eb923cd028682a56af33e832f6bfa8d526a2d34 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Fri, 27 Mar 2026 06:39:22 +0000 Subject: [PATCH 10/18] Emit more verbose output for EXPLAIN in sorted merge tests. 
Alphabetically sort GUCs --- src/backend/distributed/shared_library_init.c | 26 +- .../expected/multi_orderby_pushdown.out | 967 ++++++++++++------ .../regress/sql/multi_orderby_pushdown.sql | 58 +- 3 files changed, 704 insertions(+), 347 deletions(-) diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index a6a928fbe75..48842050c3b 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -1604,6 +1604,19 @@ RegisterCitusConfigVariables(void) GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, NULL, NULL, NULL); + DefineCustomBoolVariable( + "citus.enable_sorted_merge", + gettext_noop("Enables sorted merge of worker results for ORDER BY queries."), + gettext_noop("When enabled during planning, Citus pushes ORDER BY to workers " + "and merges the pre-sorted results on the coordinator using a " + "binary heap, eliminating the Sort node in the combine query. " + "This is an experimental feature."), + &EnableSortedMerge, + false, + PGC_SUSET, + GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + DefineCustomBoolVariable( "citus.enable_stat_counters", gettext_noop("Enables the collection of statistic counters for Citus."), @@ -2613,19 +2626,6 @@ RegisterCitusConfigVariables(void) GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, NULL, NULL, NULL); - DefineCustomBoolVariable( - "citus.enable_sorted_merge", - gettext_noop("Enables sorted merge of worker results for ORDER BY queries."), - gettext_noop("When enabled during planning, Citus pushes ORDER BY to workers " - "and merges the pre-sorted results on the coordinator using a " - "binary heap, eliminating the Sort node in the combine query. " - "This is an experimental feature."), - &EnableSortedMerge, - false, - PGC_SUSET, - GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - /* * It takes about 140 bytes of shared memory to store one row, therefore * this setting should be used responsibly. 
setting it to 10M will require diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out index 92a9cd73557..b5952dbbf9e 100644 --- a/src/test/regress/expected/multi_orderby_pushdown.out +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -67,235 +67,343 @@ SET citus.enable_sorted_merge TO off; -- ================================================================= SET citus.enable_sorted_merge TO on; -- A1: ORDER BY distribution column -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val Task Count: 4 + Tuple data received from nodes: 1027 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: 255 bytes Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(8 rows) + -> Sort (actual rows=26 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) -- A2: ORDER BY DESC -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id Task Count: 4 + Tuple data received from nodes: 420 bytes Tasks Shown: One of 4 -> Task + Query: 
SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC + Tuple data received from node: 104 bytes Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id DESC - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(8 rows) + -> Sort (actual rows=26 loops=1) + Output: id + Sort Key: sorted_merge_test.id DESC + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(15 rows) -- A3: ORDER BY DESC NULLS LAST -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.num Task Count: 4 + Tuple data received from nodes: 1556 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num DESC NULLS LAST + Tuple data received from node: 392 bytes Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: num DESC NULLS LAST - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(8 rows) + -> Sort (actual rows=26 loops=1) + Output: id, num + Sort Key: sorted_merge_test.num DESC NULLS LAST + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, num +(15 rows) -- A4: ORDER BY non-distribution column -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY val; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual 
rows=105 loops=1) + Output: remote_scan.id, remote_scan.val Task Count: 4 + Tuple data received from nodes: 1027 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY val + Tuple data received from node: 255 bytes Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: val - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(8 rows) + -> Sort (actual rows=26 loops=1) + Output: id, val + Sort Key: sorted_merge_test.val + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) -- A5: Multi-column ORDER BY -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val Task Count: 4 + Tuple data received from nodes: 1027 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, val + Tuple data received from node: 255 bytes Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id, val - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(8 rows) + -> Sort (actual rows=26 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id, sorted_merge_test.val + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) -- A6: Mixed directions -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; - QUERY PLAN + QUERY PLAN 
--------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num Task Count: 4 + Tuple data received from nodes: 2163 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, num DESC + Tuple data received from node: 543 bytes Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id, num DESC - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(8 rows) + -> Sort (actual rows=26 loops=1) + Output: id, val, num + Sort Key: sorted_merge_test.id, sorted_merge_test.num DESC + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num +(15 rows) -- A7: GROUP BY dist_col ORDER BY dist_col -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.count Task Count: 4 + Tuple data received from nodes: 1260 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id + Tuple data received from node: 312 bytes Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id - -> HashAggregate - Group Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(10 rows) + -> Sort (actual rows=26 loops=1) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: 
sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(19 rows) -- A8: WHERE clause + ORDER BY -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=67 loops=1) + Output: remote_scan.id, remote_scan.val Task Count: 4 + Tuple data received from nodes: 671 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (num OPERATOR(pg_catalog.>) '50'::numeric) ORDER BY id + Tuple data received from node: 130 bytes Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test - Filter: (num > '50'::numeric) -(9 rows) + -> Sort (actual rows=13 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=13 loops=1) + Output: id, val + Filter: (sorted_merge_test.num > '50'::numeric) + Rows Removed by Filter: 13 +(17 rows) -- A9: Expression in ORDER BY (non-aggregate) -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.num, remote_scan.worker_column_3 Task Count: 4 + Tuple data received from nodes: 1976 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, num, (id OPERATOR(pg_catalog.+) 1) AS worker_column_3 FROM 
public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) 1) + Tuple data received from node: 496 bytes Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: ((id + 1)) - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(8 rows) + -> Sort (actual rows=26 loops=1) + Output: id, num, ((id + 1)) + Sort Key: ((sorted_merge_test.id + 1)) + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, num, (id + 1) +(15 rows) -- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Limit - -> Custom Scan (Citus Adaptive) + Limit (actual rows=5 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id Task Count: 4 + Tuple data received from nodes: 80 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 20 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(10 rows) + -> Limit (actual rows=5 loops=1) + Output: id + -> Sort (actual rows=5 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(19 rows) -- ================================================================= -- Category B: Ineligibility — sort NOT pushed for merge -- ================================================================= SET citus.enable_sorted_merge TO on; -- 
B1: ORDER BY count(*) -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*); - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Sort + Sort (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.count Sort Key: remote_scan.count - -> Custom Scan (Citus Adaptive) + Sort Method: quicksort Memory: 28kB + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.count Task Count: 4 + Tuple data received from nodes: 1260 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: 312 bytes Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate - Group Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(10 rows) + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(19 rows) -- B2: ORDER BY avg(col) -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num); - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Sort + Sort (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.avg Sort Key: remote_scan.avg - -> Custom Scan (Citus Adaptive) + Sort Method: quicksort Memory: 28kB + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.avg Task Count: 4 + Tuple data received from nodes: 1556 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, avg(num) AS avg FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY 
id + Tuple data received from node: 392 bytes Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate - Group Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(10 rows) + -> HashAggregate (actual rows=26 loops=1) + Output: id, avg(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(19 rows) -- B3: GROUP BY non-dist col, ORDER BY non-dist col -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Sort + Sort (actual rows=104 loops=1) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) Sort Key: remote_scan.val - -> HashAggregate + Sort Method: quicksort Memory: 28kB + -> HashAggregate (actual rows=104 loops=1) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) Group Key: remote_scan.val - -> Custom Scan (Citus Adaptive) + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.val, remote_scan.count Task Count: 4 + Tuple data received from nodes: 1447 bytes Tasks Shown: One of 4 -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: 359 bytes Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate - Group Key: val - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(12 rows) + -> HashAggregate (actual rows=26 loops=1) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(23 rows) -- B4: GROUP BY non-dist col, ORDER BY aggregate -EXPLAIN (COSTS OFF) +EXPLAIN 
(ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*); - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Sort + Sort (actual rows=104 loops=1) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) - -> HashAggregate + Sort Method: quicksort Memory: 28kB + -> HashAggregate (actual rows=104 loops=1) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) Group Key: remote_scan.val - -> Custom Scan (Citus Adaptive) + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.val, remote_scan.count Task Count: 4 + Tuple data received from nodes: 1447 bytes Tasks Shown: One of 4 -> Task + Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val + Tuple data received from node: 359 bytes Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate - Group Key: val - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(12 rows) + -> HashAggregate (actual rows=26 loops=1) + Output: val, count(*) + Group Key: sorted_merge_test.val + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(23 rows) -- ================================================================= -- Category C: Correctness — results match GUC off vs on @@ -772,81 +880,123 @@ SELECT id FROM sorted_merge_test WHERE false ORDER BY id; -- ================================================================= -- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on SET citus.enable_sorted_merge TO off; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id 
LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Limit - -> Sort + Limit (actual rows=5 loops=1) + Output: remote_scan.id + -> Sort (actual rows=5 loops=1) + Output: remote_scan.id Sort Key: remote_scan.id - -> Custom Scan (Citus Adaptive) + Sort Method: top-N heapsort Memory: 25kB + -> Custom Scan (Citus Adaptive) (actual rows=20 loops=1) + Output: remote_scan.id Task Count: 4 + Tuple data received from nodes: 80 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 20 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(12 rows) + -> Limit (actual rows=5 loops=1) + Output: id + -> Sort (actual rows=5 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(23 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Limit - -> Custom Scan (Citus Adaptive) + Limit (actual rows=5 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id Task Count: 4 + Tuple data received from nodes: 80 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 20 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(10 rows) + -> Limit 
(actual rows=5 loops=1) + Output: id + -> Sort (actual rows=5 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(19 rows) -- F2: GROUP BY dist_col + ORDER BY + LIMIT SET citus.enable_sorted_merge TO off; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Limit - -> Sort + Limit (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.count + -> Sort (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.count Sort Key: remote_scan.id - -> Custom Scan (Citus Adaptive) + Sort Method: top-N heapsort Memory: 25kB + -> Custom Scan (Citus Adaptive) (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.count Task Count: 4 + Tuple data received from nodes: 240 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 60 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> HashAggregate - Group Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(14 rows) + -> Limit (actual rows=5 loops=1) + Output: id, (count(*)) + -> Sort (actual rows=5 loops=1) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(27 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, 
COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Limit - -> Custom Scan (Citus Adaptive) + Limit (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.count + -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.count Task Count: 4 + Tuple data received from nodes: 240 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT '5'::bigint + Tuple data received from node: 60 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> HashAggregate - Group Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(12 rows) + -> Limit (actual rows=5 loops=1) + Output: id, (count(*)) + -> Sort (actual rows=5 loops=1) + Output: id, (count(*)) + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts +(23 rows) -- F3: ORDER BY aggregate + LIMIT (not eligible for merge) SET citus.enable_sorted_merge TO off; @@ -876,34 +1026,48 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, i -- ================================================================= -- G1: Sort elision verification — coordinator Sort node absent SET citus.enable_sorted_merge TO off; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Sort + Sort (actual rows=105 loops=1) + 
Output: remote_scan.id, remote_scan.val Sort Key: remote_scan.id - -> Custom Scan (Citus Adaptive) + Sort Method: quicksort Memory: 28kB + -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val Task Count: 4 + Tuple data received from nodes: 1027 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true + Tuple data received from node: 255 bytes Node: host=localhost port=xxxxx dbname=regression - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(8 rows) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Output: remote_scan.id, remote_scan.val Task Count: 4 + Tuple data received from nodes: 1027 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: 255 bytes Node: host=localhost port=xxxxx dbname=regression - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(8 rows) + -> Sort (actual rows=26 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val +(15 rows) -- G2a: PREPARE with merge ON, EXECUTE after turning OFF -- Plan-time decision is baked in — cached plan must still merge correctly @@ -1006,27 +1170,30 @@ CLOSE sorted_cursor; COMMIT; -- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) SET 
citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- Limit (actual rows=5 loops=1) + Output: remote_scan.id -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id Task Count: 4 Tuple data received from nodes: 80 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint Tuple data received from node: 20 bytes Node: host=localhost port=xxxxx dbname=regression -> Limit (actual rows=5 loops=1) - Buffers: shared hit=1 + Output: id -> Sort (actual rows=5 loops=1) - Sort Key: id + Output: id + Sort Key: sorted_merge_test.id Sort Method: top-N heapsort Memory: 25kB - Buffers: shared hit=1 - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Buffers: shared hit=1 -(16 rows) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id +(19 rows) -- G5: ORDER BY aggregate + LIMIT — crash regression test -- Previously caused SIGSEGV when sorted merge was enabled because @@ -1227,27 +1394,36 @@ SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; -- ================================================================= SET citus.enable_sorted_merge TO on; -- H1 EXPLAIN -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH ordered_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id ) SELECT * FROM ordered_cte ORDER BY id LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Limit - -> Custom Scan (Citus Adaptive) + Limit (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus 
Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val Task Count: 4 + Tuple data received from nodes: 191 bytes Tasks Shown: One of 4 -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT ordered_cte.id AS worker_column_1, ordered_cte.val AS worker_column_2 FROM (SELECT sorted_merge_test.id, sorted_merge_test.val FROM public.sorted_merge_test_960000 sorted_merge_test ORDER BY sorted_merge_test.id) ordered_cte) worker_subquery ORDER BY worker_column_1 LIMIT '5'::bigint + Tuple data received from node: 47 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort + -> Limit (actual rows=5 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=5 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val Sort Key: sorted_merge_test.id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test -(10 rows) + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val +(19 rows) -- H2 EXPLAIN -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH eligible_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ), @@ -1257,51 +1433,88 @@ ineligible_cte AS ( SELECT e.id, e.val, i.cnt FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id ORDER BY e.id; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.cnt -> Distributed Subplan XXX_1 - -> Limit - -> Custom Scan (Citus Adaptive) + Intermediate Data Size: 397 bytes + Result destination: Write locally + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + Output: 
remote_scan.id, remote_scan.val Task Count: 4 + Tuple data received from nodes: 791 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 197 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + -> Limit (actual rows=20 loops=1) + Output: id, val + -> Sort (actual rows=20 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val -> Distributed Subplan XXX_2 - -> Limit - -> Sort + Intermediate Data Size: 330 bytes + Result destination: Write locally + -> Limit (actual rows=15 loops=1) + Output: remote_scan.id, remote_scan.cnt + -> Sort (actual rows=15 loops=1) + Output: remote_scan.id, remote_scan.cnt Sort Key: remote_scan.cnt DESC, remote_scan.id - -> Custom Scan (Citus Adaptive) + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.cnt Task Count: 4 + Tuple data received from nodes: 720 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, count(*) AS cnt FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (count(*)) DESC, id LIMIT '15'::bigint + Tuple data received from node: 180 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: (count(*)) DESC, id - -> HashAggregate - Group Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + -> Limit (actual rows=15 loops=1) + Output: id, (count(*)) + -> Sort (actual rows=15 loops=1) + Output: id, (count(*)) + Sort Key: (count(*)) DESC, sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> HashAggregate (actual rows=26 loops=1) + Output: id, count(*) + Group Key: sorted_merge_test.id + -> Seq Scan on 
public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num, ts Task Count: 1 + Tuple data received from nodes: 87 bytes Tasks Shown: All -> Task + Query: SELECT e.id, e.val, i.cnt FROM ((SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) e JOIN (SELECT intermediate_result.id, intermediate_result.cnt FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer, cnt bigint)) i ON ((e.id OPERATOR(pg_catalog.=) i.id))) ORDER BY e.id + Tuple data received from node: 87 bytes Node: host=localhost port=xxxxx dbname=regression - -> Merge Join + -> Merge Join (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result_1.cnt Merge Cond: (intermediate_result.id = intermediate_result_1.id) - -> Sort + -> Sort (actual rows=6 loops=1) + Output: intermediate_result.id, intermediate_result.val Sort Key: intermediate_result.id - -> Function Scan on read_intermediate_result intermediate_result - -> Sort + Sort Method: quicksort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Sort (actual rows=15 loops=1) + Output: intermediate_result_1.cnt, intermediate_result_1.id Sort Key: intermediate_result_1.id - -> Function Scan on read_intermediate_result intermediate_result_1 -(39 rows) + Sort Method: quicksort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=15 loops=1) + Output: intermediate_result_1.cnt, intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(77 rows) -- H3 EXPLAIN -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE 
ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH top_ids AS ( SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 ) @@ -1310,214 +1523,358 @@ FROM sorted_merge_test t JOIN top_ids ON t.id = top_ids.id ORDER BY t.id LIMIT 10; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Limit - -> Custom Scan (Citus Adaptive) + Limit (actual rows=10 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=10 loops=1) + Output: remote_scan.id, remote_scan.val -> Distributed Subplan XXX_1 - -> Limit - -> Custom Scan (Citus Adaptive) + Intermediate Data Size: 200 bytes + Result destination: Send to 2 nodes + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + Output: remote_scan.id Task Count: 4 + Tuple data received from nodes: 320 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 80 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + -> Limit (actual rows=20 loops=1) + Output: id + -> Sort (actual rows=20 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id Task Count: 4 + Tuple data received from nodes: 97 bytes Tasks Shown: One of 4 -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT t.id AS worker_column_1, t.val AS worker_column_2 FROM (public.sorted_merge_test_960000 t JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) top_ids ON ((t.id OPERATOR(pg_catalog.=) top_ids.id)))) worker_subquery ORDER BY 
worker_column_1 LIMIT '10'::bigint + Tuple data received from node: 97 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Merge Join - Merge Cond: (t.id = intermediate_result.id) - -> Sort - Sort Key: t.id - -> Seq Scan on sorted_merge_test_960000 t - -> Sort + -> Limit (actual rows=10 loops=1) + Output: t.id, t.val + -> Merge Join (actual rows=10 loops=1) + Output: t.id, t.val + Merge Cond: (intermediate_result.id = t.id) + -> Sort (actual rows=10 loops=1) + Output: intermediate_result.id Sort Key: intermediate_result.id - -> Function Scan on read_intermediate_result intermediate_result -(26 rows) + Sort Method: quicksort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Sort (actual rows=10 loops=1) + Output: t.id, t.val + Sort Key: t.id + Sort Method: quicksort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 t (actual rows=26 loops=1) + Output: t.id, t.val +(51 rows) -- H4 EXPLAIN -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE id IN ( SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 ) ORDER BY id LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Limit - -> Custom Scan (Citus Adaptive) + Limit (actual rows=3 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=3 loops=1) + Output: remote_scan.id, remote_scan.val -> Distributed Subplan XXX_1 - -> Limit - -> Custom Scan (Citus Adaptive) + Intermediate Data Size: 100 bytes + Result destination: Send to 2 nodes + -> Limit (actual rows=10 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=40 loops=1) + Output: remote_scan.id Task Count: 4 + Tuple data received 
from nodes: 160 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT '10'::bigint + Tuple data received from node: 40 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_events_960004 sorted_merge_events + -> Limit (actual rows=10 loops=1) + Output: id + -> Sort (actual rows=10 loops=1) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=56 loops=1) + Output: id Task Count: 4 + Tuple data received from nodes: 27 bytes Tasks Shown: One of 4 -> Task + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT sorted_merge_test.id AS worker_column_1, sorted_merge_test.val AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (sorted_merge_test.id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)))) worker_subquery ORDER BY worker_column_1 LIMIT '5'::bigint + Tuple data received from node: 27 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort + -> Limit (actual rows=3 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val + -> Sort (actual rows=3 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val Sort Key: sorted_merge_test.id - -> Hash Join + Sort Method: quicksort Memory: 25kB + -> Hash Semi Join (actual rows=3 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val Hash Cond: (sorted_merge_test.id = intermediate_result.id) - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test - -> Hash - -> HashAggregate - Group Key: intermediate_result.id - -> Function Scan on read_intermediate_result intermediate_result -(27 rows) + -> Seq Scan on public.sorted_merge_test_960000 
sorted_merge_test (actual rows=26 loops=1) + Output: sorted_merge_test.id, sorted_merge_test.val, sorted_merge_test.num, sorted_merge_test.ts + -> Hash (actual rows=10 loops=1) + Output: intermediate_result.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=10 loops=1) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(50 rows) -- H5 EXPLAIN -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH small_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ) SELECT * FROM small_cte ORDER BY id LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val -> Distributed Subplan XXX_1 - -> Limit - -> Custom Scan (Citus Adaptive) + Intermediate Data Size: 397 bytes + Result destination: Write locally + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val + -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + Output: remote_scan.id, remote_scan.val Task Count: 4 + Tuple data received from nodes: 791 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 197 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + -> Limit (actual rows=20 loops=1) + Output: id, val + -> Sort (actual rows=20 loops=1) + Output: id, val + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val Task Count: 1 + Tuple data received from nodes: 47 bytes Tasks Shown: All -> 
Task + Query: SELECT id, val FROM (SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) small_cte ORDER BY id LIMIT 5 + Tuple data received from node: 47 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort + -> Limit (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val + -> Sort (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val Sort Key: intermediate_result.id - -> Function Scan on read_intermediate_result intermediate_result -(20 rows) + Sort Method: top-N heapsort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Output: intermediate_result.id, intermediate_result.val + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) +(40 rows) -- H6 EXPLAIN -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT foo.id, bar.id as bar_id FROM (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar ORDER BY foo.id, bar.id LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.bar_id -> Distributed Subplan XXX_1 - -> Limit - -> Custom Scan (Citus Adaptive) + Intermediate Data Size: 30 bytes + Result destination: Write locally + -> Limit (actual rows=3 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=12 loops=1) + Output: remote_scan.id Task Count: 4 + Tuple data received from nodes: 48 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '3'::bigint + Tuple data 
received from node: 12 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + -> Limit (actual rows=3 loops=1) + Output: id + -> Sort (actual rows=3 loops=1) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id -> Distributed Subplan XXX_2 - -> Limit - -> Custom Scan (Citus Adaptive) + Intermediate Data Size: 30 bytes + Result destination: Write locally + -> Limit (actual rows=3 loops=1) + Output: remote_scan.id + -> Custom Scan (Citus Adaptive) (actual rows=12 loops=1) + Output: remote_scan.id Task Count: 4 + Tuple data received from nodes: 48 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT '3'::bigint + Tuple data received from node: 12 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_events_960004 sorted_merge_events + -> Limit (actual rows=3 loops=1) + Output: id + -> Sort (actual rows=3 loops=1) + Output: id + Sort Key: sorted_merge_events.id + Sort Method: top-N heapsort Memory: 25kB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=56 loops=1) + Output: id Task Count: 1 + Tuple data received from nodes: 40 bytes Tasks Shown: All -> Task + Query: SELECT foo.id, bar.id AS bar_id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) foo, (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) bar ORDER BY foo.id, bar.id LIMIT 5 + Tuple data received from node: 40 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort + -> Limit (actual 
rows=5 loops=1) + Output: intermediate_result.id, intermediate_result_1.id + -> Sort (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result_1.id Sort Key: intermediate_result.id, intermediate_result_1.id - -> Nested Loop - -> Function Scan on read_intermediate_result intermediate_result - -> Function Scan on read_intermediate_result intermediate_result_1 -(33 rows) + Sort Method: quicksort Memory: 25kB + -> Nested Loop (actual rows=9 loops=1) + Output: intermediate_result.id, intermediate_result_1.id + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=3 loops=1) + Output: intermediate_result.id + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=3 loops=3) + Output: intermediate_result_1.id + Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) +(67 rows) -- H7 EXPLAIN — GUC off vs on SET citus.enable_sorted_merge TO off; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 ) SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num -> Distributed Subplan XXX_1 - -> Limit - -> Sort + Intermediate Data Size: 691 bytes + Result destination: Write locally + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Sort (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num Sort Key: remote_scan.id - -> Custom Scan (Citus Adaptive) + -> Custom Scan (Citus Adaptive) (never executed) + Output: remote_scan.id, remote_scan.val, 
remote_scan.num Task Count: 4 + Tuple data received from nodes: 1673 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 419 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + -> Limit (actual rows=20 loops=1) + Output: id, val, num + -> Sort (actual rows=20 loops=1) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num Task Count: 1 + Tuple data received from nodes: 103 bytes Tasks Shown: All -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (10)::numeric) ORDER BY id LIMIT 5 + Tuple data received from node: 103 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort + -> Limit (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num Sort Key: intermediate_result.id - -> Function Scan on read_intermediate_result intermediate_result - Filter: (num > '10'::numeric) -(23 rows) + Sort Method: top-N heapsort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=14 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > '10'::numeric) + Rows Removed 
by Filter: 6 +(45 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 ) SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) + Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num -> Distributed Subplan XXX_1 - -> Limit - -> Custom Scan (Citus Adaptive) + Intermediate Data Size: 699 bytes + Result destination: Write locally + -> Limit (actual rows=20 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num + -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + Output: remote_scan.id, remote_scan.val, remote_scan.num Task Count: 4 + Tuple data received from nodes: 1673 bytes Tasks Shown: One of 4 -> Task + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint + Tuple data received from node: 419 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort - Sort Key: id - -> Seq Scan on sorted_merge_test_960000 sorted_merge_test + -> Limit (actual rows=20 loops=1) + Output: id, val, num + -> Sort (actual rows=20 loops=1) + Output: id, val, num + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: 26kB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Output: id, val, num Task Count: 1 + Tuple data received from nodes: 101 bytes Tasks Shown: All -> Task + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (10)::numeric) ORDER BY id 
LIMIT 5 + Tuple data received from node: 101 bytes Node: host=localhost port=xxxxx dbname=regression - -> Limit - -> Sort + -> Limit (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + -> Sort (actual rows=5 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num Sort Key: intermediate_result.id - -> Function Scan on read_intermediate_result intermediate_result - Filter: (num > '10'::numeric) -(21 rows) + Sort Method: top-N heapsort Memory: 25kB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=18 loops=1) + Output: intermediate_result.id, intermediate_result.val, intermediate_result.num + Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) + Filter: (intermediate_result.num > '10'::numeric) + Rows Removed by Filter: 2 +(42 rows) -- ================================================================= -- Cleanup diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql index 6c6c6c6d7fb..1d4110055c9 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -61,43 +61,43 @@ SET citus.enable_sorted_merge TO off; SET citus.enable_sorted_merge TO on; -- A1: ORDER BY distribution column -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id; -- A2: ORDER BY DESC -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC; -- A3: ORDER BY DESC NULLS LAST -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; -- A4: ORDER BY non-distribution column -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE 
ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY val; -- A5: Multi-column ORDER BY -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val; -- A6: Mixed directions -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; -- A7: GROUP BY dist_col ORDER BY dist_col -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; -- A8: WHERE clause + ORDER BY -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; -- A9: Expression in ORDER BY (non-aggregate) -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1; -- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; -- ================================================================= @@ -107,19 +107,19 @@ SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; SET citus.enable_sorted_merge TO on; -- B1: ORDER BY count(*) -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*); -- B2: ORDER BY avg(col) -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num); -- B3: GROUP BY non-dist col, ORDER BY non-dist col -EXPLAIN (COSTS 
OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val; -- B4: GROUP BY non-dist col, ORDER BY aggregate -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*); -- ================================================================= @@ -293,20 +293,20 @@ SELECT id FROM sorted_merge_test WHERE false ORDER BY id; -- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on SET citus.enable_sorted_merge TO off; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; SET citus.enable_sorted_merge TO on; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; -- F2: GROUP BY dist_col + ORDER BY + LIMIT SET citus.enable_sorted_merge TO off; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; SET citus.enable_sorted_merge TO on; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; -- F3: ORDER BY aggregate + LIMIT (not eligible for merge) @@ -322,11 +322,11 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, i -- G1: Sort elision verification — coordinator Sort node absent SET citus.enable_sorted_merge TO off; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id; SET citus.enable_sorted_merge TO on; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, 
TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id; -- G2a: PREPARE with merge ON, EXECUTE after turning OFF @@ -359,7 +359,7 @@ COMMIT; -- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; -- G5: ORDER BY aggregate + LIMIT — crash regression test @@ -461,14 +461,14 @@ SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; SET citus.enable_sorted_merge TO on; -- H1 EXPLAIN -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH ordered_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id ) SELECT * FROM ordered_cte ORDER BY id LIMIT 5; -- H2 EXPLAIN -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH eligible_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ), @@ -480,7 +480,7 @@ FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id ORDER BY e.id; -- H3 EXPLAIN -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH top_ids AS ( SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 ) @@ -491,7 +491,7 @@ ORDER BY t.id LIMIT 10; -- H4 EXPLAIN -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE id IN ( SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 @@ -500,14 +500,14 @@ ORDER BY id LIMIT 5; -- H5 EXPLAIN -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH small_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ) SELECT * FROM small_cte ORDER BY id LIMIT 5; -- H6 EXPLAIN -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS 
OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT foo.id, bar.id as bar_id FROM (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, @@ -517,14 +517,14 @@ LIMIT 5; -- H7 EXPLAIN — GUC off vs on SET citus.enable_sorted_merge TO off; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 ) SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; SET citus.enable_sorted_merge TO on; -EXPLAIN (COSTS OFF) +EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 ) From 943701832a64c95624124e16153013e1dadadd61 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Sun, 29 Mar 2026 04:46:47 +0000 Subject: [PATCH 11/18] Use explain_filter to normalize regress test output --- .../expected/multi_orderby_pushdown.out | 1069 ++++++++--------- .../regress/sql/multi_orderby_pushdown.sql | 103 +- 2 files changed, 557 insertions(+), 615 deletions(-) diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out index b5952dbbf9e..9f528bf21bb 100644 --- a/src/test/regress/expected/multi_orderby_pushdown.out +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -67,231 +67,221 @@ SET citus.enable_sorted_merge TO off; -- ================================================================= SET citus.enable_sorted_merge TO on; -- A1: ORDER BY distribution column -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual 
rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val (15 rows) -- A2: ORDER BY DESC -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id DESC; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 420 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id DESC - Tuple data received from node: 104 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost 
port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id DESC - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id (15 rows) -- A3: ORDER BY DESC NULLS LAST -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.num - Task Count: 4 - Tuple data received from nodes: 1556 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num DESC NULLS LAST - Tuple data received from node: 392 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, num Sort Key: sorted_merge_test.num DESC NULLS LAST - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, num (15 rows) -- A4: ORDER BY non-distribution column -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, 
BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY val; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY val'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY val - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.val - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val (15 rows) -- A5: Multi-column ORDER BY -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id, val; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from 
nodes: 1027 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, val - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.id, sorted_merge_test.val - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val (15 rows) -- A6: Mixed directions -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num - Task Count: 4 - Tuple data received from nodes: 2163 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id, num DESC - Tuple data received from node: 543 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort 
(actual rows=N loops=N) Output: id, val, num Sort Key: sorted_merge_test.id, sorted_merge_test.num DESC - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num (15 rows) -- A7: GROUP BY dist_col ORDER BY dist_col -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 1260 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id - Tuple data received from node: 312 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, (count(*)) Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> HashAggregate (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) Output: id, count(*) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 
sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (19 rows) -- A8: WHERE clause + ORDER BY -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=67 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 671 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (num OPERATOR(pg_catalog.>) '50'::numeric) ORDER BY id - Tuple data received from node: 130 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=13 loops=1) + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (num OPERATOR(pg_catalog.>) 'N'::numeric) ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=13 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val - Filter: (sorted_merge_test.num > '50'::numeric) - Rows Removed by Filter: 13 + Filter: (sorted_merge_test.num > 'N'::numeric) + Rows Removed by Filter: N (17 rows) -- A9: Expression in ORDER BY (non-aggregate) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, 
SUMMARY OFF) -SELECT id, num FROM sorted_merge_test ORDER BY id + 1; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.num, remote_scan.worker_column_3 - Task Count: 4 - Tuple data received from nodes: 1976 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, num, (id OPERATOR(pg_catalog.+) 1) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) 1) - Tuple data received from node: 496 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) - Output: id, num, ((id + 1)) - Sort Key: ((sorted_merge_test.id + 1)) - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) - Output: id, num, (id + 1) + Query: SELECT id, num, (id OPERATOR(pg_catalog.+) N) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) N) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, num, ((id + N)) + Sort Key: ((sorted_merge_test.id + N)) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num, (id + N) (15 rows) -- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN 
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 80 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 20 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id (19 rows) @@ -300,108 +290,104 @@ SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; -- ================================================================= SET citus.enable_sorted_merge TO on; -- B1: ORDER BY count(*) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*); - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS 
OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*)'); + explain_filter --------------------------------------------------------------------- - Sort (actual rows=105 loops=1) + Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count Sort Key: remote_scan.count - Sort Method: quicksort Memory: 28kB - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 1260 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id - Tuple data received from node: 312 bytes - Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) Output: id, count(*) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (19 rows) -- B2: ORDER BY avg(col) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num); - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num)'); + explain_filter --------------------------------------------------------------------- - Sort (actual rows=105 loops=1) + Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.avg Sort Key: 
remote_scan.avg - Sort Method: quicksort Memory: 28kB - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.avg - Task Count: 4 - Tuple data received from nodes: 1556 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, avg(num) AS avg FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id - Tuple data received from node: 392 bytes - Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) Output: id, avg(num) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (19 rows) -- B3: GROUP BY non-dist col, ORDER BY non-dist col -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val'); + explain_filter --------------------------------------------------------------------- - Sort (actual rows=104 loops=1) - Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) + Sort (actual rows=N loops=N) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) Sort Key: remote_scan.val - Sort Method: quicksort Memory: 28kB - -> HashAggregate (actual rows=104 loops=1) - Output: remote_scan.val, 
COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint) Group Key: remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.val, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 1447 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val - Tuple data received from node: 359 bytes - Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) Output: val, count(*) Group Key: sorted_merge_test.val - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (23 rows) -- B4: GROUP BY non-dist col, ORDER BY aggregate -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*); - QUERY PLAN ---------------------------------------------------------------------- - Sort (actual rows=104 loops=1) - Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) - Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint)) - Sort Method: quicksort Memory: 28kB - -> HashAggregate (actual rows=104 loops=1) - Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, 
VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.val, (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Key: (COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint)) + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: remote_scan.val, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, 'N'::bigint) Group Key: remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.val, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 1447 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT val, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY val - Tuple data received from node: 359 bytes - Node: host=localhost port=xxxxx dbname=regression - -> HashAggregate (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) Output: val, count(*) Group Key: sorted_merge_test.val - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (23 rows) @@ -880,121 +866,117 @@ SELECT id FROM sorted_merge_test WHERE false ORDER BY id; -- ================================================================= -- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM 
sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: remote_scan.id Sort Key: remote_scan.id - Sort Method: top-N heapsort Memory: 25kB - -> Custom Scan (Citus Adaptive) (actual rows=20 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 80 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 20 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id (23 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN +SELECT 
public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 80 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 20 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id (19 rows) -- F2: GROUP BY dist_col + ORDER BY + LIMIT SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + 
explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count Sort Key: remote_scan.id - Sort Method: top-N heapsort Memory: 25kB - -> Custom Scan (Citus Adaptive) (actual rows=20 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 240 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 60 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, (count(*)) - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: id, (count(*)) Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> HashAggregate (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) Output: id, count(*) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (27 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) 
-SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count - Task Count: 4 - Tuple data received from nodes: 240 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 60 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id, count(*) AS count FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, (count(*)) - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: id, (count(*)) Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> HashAggregate (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) Output: id, count(*) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts (23 rows) @@ -1026,46 +1008,44 @@ SELECT id, count(*) FROM 
sorted_merge_test GROUP BY id ORDER BY count(*) DESC, i -- ================================================================= -- G1: Sort elision verification — coordinator Sort node absent SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter --------------------------------------------------------------------- - Sort (actual rows=105 loops=1) + Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val Sort Key: remote_scan.id - Sort Method: quicksort Memory: 28kB - -> Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val (15 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + 
explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=105 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 1027 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id - Tuple data received from node: 255 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Sort (actual rows=26 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val (15 rows) @@ -1170,28 +1150,27 @@ CLOSE sorted_cursor; COMMIT; -- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; - QUERY PLAN +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 80 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data 
received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '5'::bigint - Tuple data received from node: 20 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id (19 rows) @@ -1394,37 +1373,35 @@ SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; -- ================================================================= SET citus.enable_sorted_merge TO on; -- H1 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH ordered_cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH ordered_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id ) -SELECT * FROM ordered_cte ORDER BY id LIMIT 5; - QUERY PLAN +SELECT * FROM ordered_cte ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=5 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 191 bytes - Tasks Shown: One of 4 + Task 
Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT ordered_cte.id AS worker_column_1, ordered_cte.val AS worker_column_2 FROM (SELECT sorted_merge_test.id, sorted_merge_test.val FROM public.sorted_merge_test_960000 sorted_merge_test ORDER BY sorted_merge_test.id) ordered_cte) worker_subquery ORDER BY worker_column_1 LIMIT '5'::bigint - Tuple data received from node: 47 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT ordered_cte.id AS worker_column_1, ordered_cte.val AS worker_column_2 FROM (SELECT sorted_merge_test.id, sorted_merge_test.val FROM public.sorted_merge_test_960000 sorted_merge_test ORDER BY sorted_merge_test.id) ordered_cte) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val (19 rows) -- H2 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH eligible_cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH eligible_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ), ineligible_cte AS ( @@ -1432,448 +1409,442 @@ ineligible_cte AS ( ) SELECT e.id, e.val, 
i.cnt FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id -ORDER BY e.id; - QUERY PLAN +ORDER BY e.id'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.cnt -> Distributed Subplan XXX_1 - Intermediate Data Size: 397 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=20 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 791 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 197 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, val - -> Sort (actual rows=20 loops=1) + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val -> Distributed Subplan XXX_2 - Intermediate Data Size: 330 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual 
rows=15 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.cnt - -> Sort (actual rows=15 loops=1) + -> Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.cnt Sort Key: remote_scan.cnt DESC, remote_scan.id -> Custom Scan (Citus Adaptive) (never executed) Output: remote_scan.id, remote_scan.cnt - Task Count: 4 - Tuple data received from nodes: 720 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, count(*) AS cnt FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (count(*)) DESC, id LIMIT '15'::bigint - Tuple data received from node: 180 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=15 loops=1) + Query: SELECT id, count(*) AS cnt FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (count(*)) DESC, id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, (count(*)) - -> Sort (actual rows=15 loops=1) + -> Sort (actual rows=N loops=N) Output: id, (count(*)) Sort Key: (count(*)) DESC, sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> HashAggregate (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> HashAggregate (actual rows=N loops=N) Output: id, count(*) Group Key: sorted_merge_test.id - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts - Task Count: 1 - Tuple data received from nodes: 87 bytes + Task Count: N + Tuple data received from nodes: N bytes Tasks Shown: All -> Task Query: SELECT e.id, e.val, i.cnt FROM ((SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) 
intermediate_result(id integer, val text)) e JOIN (SELECT intermediate_result.id, intermediate_result.cnt FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer, cnt bigint)) i ON ((e.id OPERATOR(pg_catalog.=) i.id))) ORDER BY e.id - Tuple data received from node: 87 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Merge Join (actual rows=5 loops=1) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Merge Join (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result_1.cnt Merge Cond: (intermediate_result.id = intermediate_result_1.id) - -> Sort (actual rows=6 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val Sort Key: intermediate_result.id - Sort Method: quicksort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - -> Sort (actual rows=15 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result_1.cnt, intermediate_result_1.id Sort Key: intermediate_result_1.id - Sort Method: quicksort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=15 loops=1) + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) Output: intermediate_result_1.cnt, intermediate_result_1.id Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) (77 rows) -- H3 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH top_ids AS ( +SELECT 
public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH top_ids AS ( SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 ) SELECT t.id, t.val FROM sorted_merge_test t JOIN top_ids ON t.id = top_ids.id ORDER BY t.id -LIMIT 10; - QUERY PLAN +LIMIT 10'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=10 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=10 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val -> Distributed Subplan XXX_1 - Intermediate Data Size: 200 bytes - Result destination: Send to 2 nodes - -> Limit (actual rows=20 loops=1) + Intermediate Data Size: N bytes + Result destination: Send to N nodes + -> Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 320 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 80 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=20 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: 
quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id - Task Count: 4 - Tuple data received from nodes: 97 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT t.id AS worker_column_1, t.val AS worker_column_2 FROM (public.sorted_merge_test_960000 t JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) top_ids ON ((t.id OPERATOR(pg_catalog.=) top_ids.id)))) worker_subquery ORDER BY worker_column_1 LIMIT '10'::bigint - Tuple data received from node: 97 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=10 loops=1) + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT t.id AS worker_column_1, t.val AS worker_column_2 FROM (public.sorted_merge_test_960000 t JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) top_ids ON ((t.id OPERATOR(pg_catalog.=) top_ids.id)))) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: t.id, t.val - -> Merge Join (actual rows=10 loops=1) + -> Merge Join (actual rows=N loops=N) Output: t.id, t.val Merge Cond: (intermediate_result.id = t.id) - -> Sort (actual rows=10 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result.id Sort Key: intermediate_result.id - Sort Method: quicksort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Sort Method: quicksort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: 
intermediate_result.id Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - -> Sort (actual rows=10 loops=1) + -> Sort (actual rows=N loops=N) Output: t.id, t.val Sort Key: t.id - Sort Method: quicksort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 t (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 t (actual rows=N loops=N) Output: t.id, t.val (51 rows) -- H4 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE id IN ( SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 ) ORDER BY id -LIMIT 5; - QUERY PLAN +LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Limit (actual rows=3 loops=1) + Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=3 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val -> Distributed Subplan XXX_1 - Intermediate Data Size: 100 bytes - Result destination: Send to 2 nodes - -> Limit (actual rows=10 loops=1) + Intermediate Data Size: N bytes + Result destination: Send to N nodes + -> Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=40 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 160 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT '10'::bigint - Tuple data received from node: 40 bytes - Node: host=localhost port=xxxxx 
dbname=regression - -> Limit (actual rows=10 loops=1) + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=10 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_events.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=56 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) Output: id - Task Count: 4 - Tuple data received from nodes: 27 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT sorted_merge_test.id AS worker_column_1, sorted_merge_test.val AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (sorted_merge_test.id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)))) worker_subquery ORDER BY worker_column_1 LIMIT '5'::bigint - Tuple data received from node: 27 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=3 loops=1) + Query: SELECT worker_column_1 AS id, worker_column_2 AS val FROM (SELECT sorted_merge_test.id AS worker_column_1, sorted_merge_test.val AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE (sorted_merge_test.id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)))) worker_subquery ORDER BY worker_column_1 LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: 
host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val - -> Sort (actual rows=3 loops=1) + -> Sort (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 25kB - -> Hash Semi Join (actual rows=3 loops=1) + Sort Method: quicksort Memory: NkB + -> Hash Semi Join (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val Hash Cond: (sorted_merge_test.id = intermediate_result.id) - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val, sorted_merge_test.num, sorted_merge_test.ts - -> Hash (actual rows=10 loops=1) + -> Hash (actual rows=N loops=N) Output: intermediate_result.id - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=10 loops=1) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) (50 rows) -- H5 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH small_cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH small_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ) -SELECT * FROM small_cte ORDER BY id LIMIT 5; - QUERY PLAN +SELECT * FROM small_cte ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val -> Distributed Subplan XXX_1 - Intermediate Data Size: 397 bytes + Intermediate 
Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=20 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - -> Custom Scan (Citus Adaptive) (actual rows=80 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val - Task Count: 4 - Tuple data received from nodes: 791 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 197 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, val - -> Sort (actual rows=20 loops=1) + -> Sort (actual rows=N loops=N) Output: id, val Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val - Task Count: 1 - Tuple data received from nodes: 47 bytes + Task Count: N + Tuple data received from nodes: N bytes Tasks Shown: All -> Task - Query: SELECT id, val FROM (SELECT intermediate_result.id, intermediate_result.val FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) small_cte ORDER BY id LIMIT 5 - Tuple data received from node: 47 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id, val FROM (SELECT intermediate_result.id, intermediate_result.val FROM 
read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text)) small_cte ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val Sort Key: intermediate_result.id - Sort Method: top-N heapsort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=20 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) (40 rows) -- H6 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT foo.id, bar.id as bar_id +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT foo.id, bar.id as bar_id FROM (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar ORDER BY foo.id, bar.id -LIMIT 5; - QUERY PLAN +LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.bar_id -> Distributed Subplan XXX_1 - Intermediate Data Size: 30 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=3 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=12 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task 
Count: 4 - Tuple data received from nodes: 48 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '3'::bigint - Tuple data received from node: 12 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=3 loops=1) + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=3 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_test.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id -> Distributed Subplan XXX_2 - Intermediate Data Size: 30 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=3 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id - -> Custom Scan (Citus Adaptive) (actual rows=12 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id - Task Count: 4 - Tuple data received from nodes: 48 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT '3'::bigint - Tuple data received from node: 12 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=3 loops=1) + Query: SELECT id FROM public.sorted_merge_events_960004 sorted_merge_events WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + 
Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id - -> Sort (actual rows=3 loops=1) + -> Sort (actual rows=N loops=N) Output: id Sort Key: sorted_merge_events.id - Sort Method: top-N heapsort Memory: 25kB - -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=56 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) Output: id - Task Count: 1 - Tuple data received from nodes: 40 bytes + Task Count: N + Tuple data received from nodes: N bytes Tasks Shown: All -> Task - Query: SELECT foo.id, bar.id AS bar_id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) foo, (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) bar ORDER BY foo.id, bar.id LIMIT 5 - Tuple data received from node: 40 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT foo.id, bar.id AS bar_id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) foo, (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) bar ORDER BY foo.id, bar.id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result_1.id - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result_1.id Sort Key: intermediate_result.id, intermediate_result_1.id - Sort Method: quicksort Memory: 25kB - -> Nested Loop (actual rows=9 loops=1) + Sort Method: quicksort Memory: NkB + -> Nested Loop (actual 
rows=N loops=N) Output: intermediate_result.id, intermediate_result_1.id - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=3 loops=1) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=3 loops=3) + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) Output: intermediate_result_1.id Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) (67 rows) -- H7 EXPLAIN — GUC off vs on SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 ) -SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; - QUERY PLAN +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num -> Distributed Subplan XXX_1 - Intermediate Data Size: 691 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=20 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num - -> Sort (actual rows=20 loops=1) + -> Sort (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num Sort Key: remote_scan.id -> Custom Scan (Citus Adaptive) (never executed) Output: remote_scan.id, remote_scan.val, remote_scan.num - Task Count: 4 
- Tuple data received from nodes: 1673 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 419 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, val, num - -> Sort (actual rows=20 loops=1) + -> Sort (actual rows=N loops=N) Output: id, val, num Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num - Task Count: 1 - Tuple data received from nodes: 103 bytes + Task Count: N + Tuple data received from nodes: N bytes Tasks Shown: All -> Task - Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (10)::numeric) ORDER BY id LIMIT 5 - Tuple data received from node: 103 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (N)::numeric) ORDER BY id LIMIT N + 
Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result.num - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result.num Sort Key: intermediate_result.id - Sort Method: top-N heapsort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=14 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result.num Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - Filter: (intermediate_result.num > '10'::numeric) - Rows Removed by Filter: 6 + Filter: (intermediate_result.num > 'N'::numeric) + Rows Removed by Filter: N (45 rows) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 ) -SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; - QUERY PLAN +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); + explain_filter --------------------------------------------------------------------- - Custom Scan (Citus Adaptive) (actual rows=5 loops=1) + Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num -> Distributed Subplan XXX_1 - Intermediate Data Size: 699 bytes + Intermediate Data Size: N bytes Result destination: Write locally - -> Limit (actual rows=20 loops=1) + -> Limit (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num - -> Custom Scan 
(Citus Adaptive) (actual rows=80 loops=1) + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num - Task Count: 4 - Tuple data received from nodes: 1673 bytes - Tasks Shown: One of 4 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N -> Task - Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT '20'::bigint - Tuple data received from node: 419 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=20 loops=1) + Query: SELECT id, val, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: id, val, num - -> Sort (actual rows=20 loops=1) + -> Sort (actual rows=N loops=N) Output: id, val, num Sort Key: sorted_merge_test.id - Sort Method: quicksort Memory: 26kB - -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=26 loops=1) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num - Task Count: 1 - Tuple data received from nodes: 101 bytes + Task Count: N + Tuple data received from nodes: N bytes Tasks Shown: All -> Task - Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (10)::numeric) ORDER BY id LIMIT 5 - Tuple data received from node: 101 bytes - Node: host=localhost port=xxxxx dbname=regression - -> Limit (actual rows=5 loops=1) + Query: SELECT id, val, num FROM (SELECT intermediate_result.id, intermediate_result.val, intermediate_result.num FROM 
read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, val text, num numeric)) cte WHERE (num OPERATOR(pg_catalog.>) (N)::numeric) ORDER BY id LIMIT N + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result.num - -> Sort (actual rows=5 loops=1) + -> Sort (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result.num Sort Key: intermediate_result.id - Sort Method: top-N heapsort Memory: 25kB - -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=18 loops=1) + Sort Method: top-N heapsort Memory: NkB + -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val, intermediate_result.num Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) - Filter: (intermediate_result.num > '10'::numeric) - Rows Removed by Filter: 2 + Filter: (intermediate_result.num > 'N'::numeric) + Rows Removed by Filter: N (42 rows) -- ================================================================= diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql index 1d4110055c9..87eb2bcad30 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -61,44 +61,34 @@ SET citus.enable_sorted_merge TO off; SET citus.enable_sorted_merge TO on; -- A1: ORDER BY distribution column -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); -- A2: ORDER BY DESC -EXPLAIN 
(ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id DESC; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC'); -- A3: ORDER BY DESC NULLS LAST -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST'); -- A4: ORDER BY non-distribution column -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY val; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY val'); -- A5: Multi-column ORDER BY -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id, val; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val'); -- A6: Mixed directions -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC'); -- A7: GROUP BY dist_col ORDER BY dist_col -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, 
SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id'); -- A8: WHERE clause + ORDER BY -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id'); -- A9: Expression in ORDER BY (non-aggregate) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, num FROM sorted_merge_test ORDER BY id + 1; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1'); -- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); -- ================================================================= -- Category B: Ineligibility — sort NOT pushed for merge @@ -107,20 +97,16 @@ SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; SET citus.enable_sorted_merge TO on; -- B1: ORDER BY count(*) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*); +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*)'); -- B2: ORDER BY avg(col) -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num); +SELECT 
public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, avg(num) FROM sorted_merge_test GROUP BY id ORDER BY avg(num)'); -- B3: GROUP BY non-dist col, ORDER BY non-dist col -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY val'); -- B4: GROUP BY non-dist col, ORDER BY aggregate -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*); +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT val, count(*) FROM sorted_merge_test GROUP BY val ORDER BY count(*)'); -- ================================================================= -- Category C: Correctness — results match GUC off vs on @@ -293,21 +279,17 @@ SELECT id FROM sorted_merge_test WHERE false ORDER BY id; -- F1: Simple LIMIT + ORDER BY: plan unchanged between GUC off and on SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); -- F2: GROUP BY dist_col + ORDER BY + LIMIT SET 
citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5'); -- F3: ORDER BY aggregate + LIMIT (not eligible for merge) SET citus.enable_sorted_merge TO off; @@ -322,12 +304,10 @@ SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY count(*) DESC, i -- G1: Sort elision verification — coordinator Sort node absent SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test ORDER BY id; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); -- G2a: PREPARE with merge ON, EXECUTE after turning OFF -- Plan-time decision is baked in — cached plan must still merge correctly @@ -359,8 +339,7 @@ COMMIT; -- G4: EXPLAIN ANALYZE (sorted merge skipped for EXPLAIN ANALYZE) SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, 
VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); -- G5: ORDER BY aggregate + LIMIT — crash regression test -- Previously caused SIGSEGV when sorted merge was enabled because @@ -461,15 +440,13 @@ SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; SET citus.enable_sorted_merge TO on; -- H1 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH ordered_cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH ordered_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id ) -SELECT * FROM ordered_cte ORDER BY id LIMIT 5; +SELECT * FROM ordered_cte ORDER BY id LIMIT 5'); -- H2 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH eligible_cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH eligible_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ), ineligible_cte AS ( @@ -477,58 +454,52 @@ ineligible_cte AS ( ) SELECT e.id, e.val, i.cnt FROM eligible_cte e JOIN ineligible_cte i ON e.id = i.id -ORDER BY e.id; +ORDER BY e.id'); -- H3 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH top_ids AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH top_ids AS ( SELECT id FROM sorted_merge_test ORDER BY id LIMIT 20 ) SELECT t.id, t.val FROM sorted_merge_test t JOIN top_ids ON t.id = top_ids.id ORDER BY t.id -LIMIT 10; +LIMIT 10'); -- H4 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT id, val FROM sorted_merge_test +SELECT 
public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE id IN ( SELECT id FROM sorted_merge_events ORDER BY id LIMIT 10 ) ORDER BY id -LIMIT 5; +LIMIT 5'); -- H5 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH small_cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH small_cte AS ( SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 20 ) -SELECT * FROM small_cte ORDER BY id LIMIT 5; +SELECT * FROM small_cte ORDER BY id LIMIT 5'); -- H6 EXPLAIN -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -SELECT foo.id, bar.id as bar_id +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT foo.id, bar.id as bar_id FROM (SELECT id FROM sorted_merge_test ORDER BY id LIMIT 3) as foo, (SELECT id FROM sorted_merge_events ORDER BY id LIMIT 3) as bar ORDER BY foo.id, bar.id -LIMIT 5; +LIMIT 5'); -- H7 EXPLAIN — GUC off vs on SET citus.enable_sorted_merge TO off; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 ) -SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; +SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); SET citus.enable_sorted_merge TO on; -EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) -WITH cte AS ( +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH cte AS ( SELECT id, val, num FROM sorted_merge_test ORDER BY id LIMIT 20 ) -SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5; +SELECT * FROM cte WHERE num > 10 
ORDER BY id LIMIT 5'); -- ================================================================= -- Cleanup From 49e9b930e688e7bdb36e239cf259395179142c03 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Fri, 10 Apr 2026 22:57:04 +0000 Subject: [PATCH 12/18] Added more tests per feedback --- .../expected/multi_orderby_pushdown.out | 597 ++++++++++++++++++ .../regress/sql/multi_orderby_pushdown.sql | 183 ++++++ 2 files changed, 780 insertions(+) diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out index 9f528bf21bb..6803236ba56 100644 --- a/src/test/regress/expected/multi_orderby_pushdown.out +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -5,6 +5,10 @@ -- planner eligibility logic. Verifies that enabling the GUC does not -- introduce regressions for any query pattern. -- +-- MX verification: this test has been verified to pass with zero diffs +-- under check-base-mx (MX mode), confirming sorted merge works correctly +-- when any node in the cluster acts as coordinator. +-- SET citus.next_shard_id TO 960000; -- ================================================================= -- Setup: create test tables @@ -1847,6 +1851,599 @@ SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); Rows Removed by Filter: N (42 rows) +-- ================================================================= +-- Category I: Distributed Transactions +-- ================================================================= +-- Verify sorted merge correctness within multi-statement transactions +-- where data is modified before the sorted-merge SELECT. 
+SET citus.enable_sorted_merge TO on; +-- I1: INSERT then SELECT within a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (900, 'txn_insert', 900.0); +SELECT id, val FROM sorted_merge_test WHERE id >= 900 ORDER BY id; + id | val +--------------------------------------------------------------------- + 900 | txn_insert +(1 row) + +ROLLBACK; +-- I2: UPDATE then SELECT within a transaction +BEGIN; +UPDATE sorted_merge_test SET val = 'updated' WHERE id = 1; +SELECT id, val FROM sorted_merge_test WHERE id <= 3 ORDER BY id; + id | val +--------------------------------------------------------------------- + 1 | updated + 2 | val_2 + 3 | val_3 +(3 rows) + +ROLLBACK; +-- I3: DELETE then SELECT within a transaction +BEGIN; +DELETE FROM sorted_merge_test WHERE id <= 5; +SELECT id, val FROM sorted_merge_test WHERE id <= 10 ORDER BY id; + id | val +--------------------------------------------------------------------- + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(5 rows) + +ROLLBACK; +-- I4: INSERT + UPDATE + SELECT with multi-column ORDER BY +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (901, 'txn_a', 1.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (902, 'txn_b', 2.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (903, 'txn_c', 3.0); +UPDATE sorted_merge_test SET num = 999.0 WHERE id = 901; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 900 ORDER BY num, id; + id | val | num +--------------------------------------------------------------------- + 902 | txn_b | 2.0 + 903 | txn_c | 3.0 + 901 | txn_a | 999.0 +(3 rows) + +ROLLBACK; +-- I5: Compare results with GUC off vs on in a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (910, 'cmp_a', 10.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (911, 'cmp_b', 20.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (912, 'cmp_c', 30.0); +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val, num FROM 
sorted_merge_test WHERE id >= 910 ORDER BY id; + id | val | num +--------------------------------------------------------------------- + 910 | cmp_a | 10.0 + 911 | cmp_b | 20.0 + 912 | cmp_c | 30.0 +(3 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; + id | val | num +--------------------------------------------------------------------- + 910 | cmp_a | 10.0 + 911 | cmp_b | 20.0 + 912 | cmp_c | 30.0 +(3 rows) + +ROLLBACK; +-- I6: DELETE + aggregate in SELECT with ORDER BY +BEGIN; +DELETE FROM sorted_merge_test WHERE id > 100 AND id < 200; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; + id | count +--------------------------------------------------------------------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +ROLLBACK; +-- ================================================================= +-- Category J: Coordinator expression evaluation exclusion +-- ================================================================= +-- Verify that queries with ORDER BY on expressions that need coordinator-side +-- evaluation are correctly excluded from sorted merge (or handled correctly). +SET citus.enable_sorted_merge TO on; +-- J1: ORDER BY expression on aggregate result (ordinal reference) +-- The ORDER BY references position 2 which is an aggregate — sorted merge +-- must NOT be used because aggregates are rewritten between worker/coordinator. 
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) AS total FROM sorted_merge_test GROUP BY id ORDER BY 2 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + Sort Key: remote_scan.total + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, sum(num) AS total FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (sum(num)) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (sum(num)) + -> Sort (actual rows=N loops=N) + Output: id, (sum(num)) + Sort Key: (sum(sorted_merge_test.num)) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, sum(num) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J2: ORDER BY expression wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) + 1 AS total_plus FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + Sort Key: remote_scan.total_plus + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan 
(Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.total_plus + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, (sum(num) OPERATOR(pg_catalog.+) 'N'::numeric) AS total_plus FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY (sum(num) OPERATOR(pg_catalog.+) 'N'::numeric) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, ((sum(num) + 'N'::numeric)) + -> Sort (actual rows=N loops=N) + Output: id, ((sum(num) + 'N'::numeric)) + Sort Key: ((sum(sorted_merge_test.num) + 'N'::numeric)) + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, (sum(num) + 'N'::numeric) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J3: ORDER BY a non-aggregate expression that can be pushed to workers +-- This should be eligible for sorted merge — the expression is evaluated +-- on the worker side and sort order is preserved. 
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id + 0'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, (id OPERATOR(pg_catalog.+) N) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (id OPERATOR(pg_catalog.+) N) + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, ((id + N)) + Sort Key: ((sorted_merge_test.id + N)) + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, (id + N) +(15 rows) + +-- J4: ORDER BY with CASE expression (no aggregates) — eligible +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY CASE WHEN id < 50 THEN 0 ELSE 1 END, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, CASE WHEN (id OPERATOR(pg_catalog.<) N) THEN N ELSE N END AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY CASE WHEN (id OPERATOR(pg_catalog.<) N) THEN N ELSE N END, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (CASE WHEN (id < N) THEN N 
ELSE N END) + Sort Key: (CASE WHEN (sorted_merge_test.id < N) THEN N ELSE N END), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, CASE WHEN (id < N) THEN N ELSE N END +(15 rows) + +-- J5: ORDER BY on an expression that mixes aggregate and non-aggregate +-- Should be ineligible because the expression contains an aggregate. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id + count(*)'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Sort Key: remote_scan.worker_column_3 + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count, (id OPERATOR(pg_catalog.+) count(*)) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*), (id + count(*)) + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(19 rows) + +-- J6: Correctness comparison — expression ORDER BY, GUC off vs on +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +SET citus.enable_sorted_merge TO 
on; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J7–J12: Additional pushable expressions (no aggregates) +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J7: ORDER BY function call on column +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- J8: ORDER BY COALESCE +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0) LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +-- J9: ORDER BY negation +SELECT id, num FROM sorted_merge_test ORDER BY -num LIMIT 5; + id | num +--------------------------------------------------------------------- + 100 | 150.0 + 99 | 148.5 + 98 | 147.0 + 97 | 145.5 + 96 | 144.0 +(5 rows) + +-- J10: ORDER BY concatenation +SELECT id, val FROM sorted_merge_test ORDER BY val || '_suffix' LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 100 | val_100 + 10 | val_10 +(5 rows) + +-- J11: ORDER BY mathematical function (abs distance) +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +-- J12: ORDER BY expression not in SELECT list +SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +-- J13: 
ORDER BY expression referencing multiple columns +SELECT id, val FROM sorted_merge_test ORDER BY id * num LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +-- J14: ORDER BY with type cast +SELECT id, num FROM sorted_merge_test ORDER BY num::int LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +-- J15: ORDER BY with subexpression in SELECT and different expression in ORDER BY +SELECT id, num + 1 as n1 FROM sorted_merge_test ORDER BY num + 2 LIMIT 5; + id | n1 +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +-- J16: ORDER BY column alias +SELECT id, num * 2 as doubled FROM sorted_merge_test ORDER BY doubled LIMIT 5; + id | doubled +--------------------------------------------------------------------- + 1 | 3.0 + 2 | 6.0 + 3 | 9.0 + 4 | 12.0 + 5 | 15.0 +(5 rows) + +-- ----------------------------------------------------------------- +-- J17–J21: Correctness — GUC off vs on for expression ORDER BY +-- ----------------------------------------------------------------- +-- J17: function call +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + id | val +--------------------------------------------------------------------- + 200 | dup_a + 201 | dup_b + 202 | dup_c + 1 | val_1 + 10 | val_10 +(5 rows) + +-- J18: CASE expression +SET citus.enable_sorted_merge TO off; +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER 
BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + id | cat +--------------------------------------------------------------------- + 34 | high + 35 | high + 36 | high + 37 | high + 38 | high + 39 | high + 40 | high + 41 | high + 42 | high + 43 | high +(10 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + id | cat +--------------------------------------------------------------------- + 34 | high + 35 | high + 36 | high + 37 | high + 38 | high + 39 | high + 40 | high + 41 | high + 42 | high + 43 | high +(10 rows) + +-- J19: COALESCE +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 101 | + 102 | + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 +(5 rows) + +-- J20: abs() distance function +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + id | num +--------------------------------------------------------------------- + 17 | 25.5 + 16 | 24.0 + 18 | 27.0 + 15 | 22.5 + 19 | 28.5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J21–J22: More ineligibility — aggregate inside expressions +-- ----------------------------------------------------------------- +SET 
citus.enable_sorted_merge TO on; +-- J21: ORDER BY CASE wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY CASE WHEN count(*) > 1 THEN 0 ELSE 1 END, id LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + -> Sort (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Sort Key: remote_scan.worker_column_3, remote_scan.id + Sort Method: top-N heapsort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.count, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, count(*) AS count, CASE WHEN (count(*) OPERATOR(pg_catalog.>) N) THEN N ELSE N END AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true GROUP BY id ORDER BY CASE WHEN (count(*) OPERATOR(pg_catalog.>) N) THEN N ELSE N END, id LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, (count(*)), (CASE WHEN (count(*) > N) THEN N ELSE N END) + -> Sort (actual rows=N loops=N) + Output: id, (count(*)), (CASE WHEN (count(*) > N) THEN N ELSE N END) + Sort Key: (CASE WHEN (count(*) > N) THEN N ELSE N END), sorted_merge_test.id + Sort Method: top-N heapsort Memory: NkB + -> HashAggregate (actual rows=N loops=N) + Output: id, count(*), CASE WHEN (count(*) > N) THEN N ELSE N END + Group Key: sorted_merge_test.id + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, num, ts +(27 rows) + +-- J22: ORDER BY aggregate expression (sum + 1) — correctness +SET 
citus.enable_sorted_merge TO off; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + id | s +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +SET citus.enable_sorted_merge TO on; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + id | s +--------------------------------------------------------------------- + 1 | 2.5 + 2 | 4.0 + 3 | 5.5 + 4 | 7.0 + 5 | 8.5 +(5 rows) + +-- ----------------------------------------------------------------- +-- J23–J24: EXPLAIN plans for pushable expression patterns +-- ----------------------------------------------------------------- +SET citus.enable_sorted_merge TO on; +-- J23: Does function-call ORDER BY get pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, upper(val) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (upper(val)) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, val, (upper(val)) + -> Sort (actual rows=N loops=N) + Output: id, val, (upper(val)) + Sort Key: (upper(sorted_merge_test.val)) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) 
+ Output: id, val, upper(val) +(19 rows) + +-- J24: ORDER BY expression not in SELECT list — pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5'); + explain_filter +--------------------------------------------------------------------- + Limit (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.worker_column_2 + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.worker_column_2 + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, (num OPERATOR(pg_catalog.+) 'N'::numeric) AS worker_column_2 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (num OPERATOR(pg_catalog.+) 'N'::numeric) LIMIT 'N'::bigint + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Limit (actual rows=N loops=N) + Output: id, ((num + 'N'::numeric)) + -> Sort (actual rows=N loops=N) + Output: id, ((num + 'N'::numeric)) + Sort Key: ((sorted_merge_test.num + 'N'::numeric)) + Sort Method: top-N heapsort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, (num + 'N'::numeric) +(19 rows) + -- ================================================================= -- Cleanup -- ================================================================= diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql index 87eb2bcad30..a5b48ee9f00 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -5,6 +5,10 @@ -- planner eligibility logic. Verifies that enabling the GUC does not -- introduce regressions for any query pattern. 
-- +-- MX verification: this test has been verified to pass with zero diffs +-- under check-base-mx (MX mode), confirming sorted merge works correctly +-- when any node in the cluster acts as coordinator. +-- SET citus.next_shard_id TO 960000; @@ -501,6 +505,185 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING ) SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); +-- ================================================================= +-- Category I: Distributed Transactions +-- ================================================================= +-- Verify sorted merge correctness within multi-statement transactions +-- where data is modified before the sorted-merge SELECT. + +SET citus.enable_sorted_merge TO on; + +-- I1: INSERT then SELECT within a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (900, 'txn_insert', 900.0); +SELECT id, val FROM sorted_merge_test WHERE id >= 900 ORDER BY id; +ROLLBACK; + +-- I2: UPDATE then SELECT within a transaction +BEGIN; +UPDATE sorted_merge_test SET val = 'updated' WHERE id = 1; +SELECT id, val FROM sorted_merge_test WHERE id <= 3 ORDER BY id; +ROLLBACK; + +-- I3: DELETE then SELECT within a transaction +BEGIN; +DELETE FROM sorted_merge_test WHERE id <= 5; +SELECT id, val FROM sorted_merge_test WHERE id <= 10 ORDER BY id; +ROLLBACK; + +-- I4: INSERT + UPDATE + SELECT with multi-column ORDER BY +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (901, 'txn_a', 1.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (902, 'txn_b', 2.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (903, 'txn_c', 3.0); +UPDATE sorted_merge_test SET num = 999.0 WHERE id = 901; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 900 ORDER BY num, id; +ROLLBACK; + +-- I5: Compare results with GUC off vs on in a transaction +BEGIN; +INSERT INTO sorted_merge_test (id, val, num) VALUES (910, 'cmp_a', 10.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (911, 
'cmp_b', 20.0); +INSERT INTO sorted_merge_test (id, val, num) VALUES (912, 'cmp_c', 30.0); +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val, num FROM sorted_merge_test WHERE id >= 910 ORDER BY id; +ROLLBACK; + +-- I6: DELETE + aggregate in SELECT with ORDER BY +BEGIN; +DELETE FROM sorted_merge_test WHERE id > 100 AND id < 200; +SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id LIMIT 5; +ROLLBACK; + +-- ================================================================= +-- Category J: Coordinator expression evaluation exclusion +-- ================================================================= +-- Verify that queries with ORDER BY on expressions that need coordinator-side +-- evaluation are correctly excluded from sorted merge (or handled correctly). + +SET citus.enable_sorted_merge TO on; + +-- J1: ORDER BY expression on aggregate result (ordinal reference) +-- The ORDER BY references position 2 which is an aggregate — sorted merge +-- must NOT be used because aggregates are rewritten between worker/coordinator. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) AS total FROM sorted_merge_test GROUP BY id ORDER BY 2 LIMIT 5'); + +-- J2: ORDER BY expression wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, sum(num) + 1 AS total_plus FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5'); + +-- J3: ORDER BY a non-aggregate expression that can be pushed to workers +-- This should be eligible for sorted merge — the expression is evaluated +-- on the worker side and sort order is preserved. 
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id + 0'); + +-- J4: ORDER BY with CASE expression (no aggregates) — eligible +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY CASE WHEN id < 50 THEN 0 ELSE 1 END, id'); + +-- J5: ORDER BY on an expression that mixes aggregate and non-aggregate +-- Should be ineligible because the expression contains an aggregate. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id + count(*)'); + +-- J6: Correctness comparison — expression ORDER BY, GUC off vs on +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id + 0 LIMIT 5; + +-- ----------------------------------------------------------------- +-- J7–J12: Additional pushable expressions (no aggregates) +-- ----------------------------------------------------------------- + +SET citus.enable_sorted_merge TO on; + +-- J7: ORDER BY function call on column +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + +-- J8: ORDER BY COALESCE +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0) LIMIT 5; + +-- J9: ORDER BY negation +SELECT id, num FROM sorted_merge_test ORDER BY -num LIMIT 5; + +-- J10: ORDER BY concatenation +SELECT id, val FROM sorted_merge_test ORDER BY val || '_suffix' LIMIT 5; + +-- J11: ORDER BY mathematical function (abs distance) +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + +-- J12: ORDER BY expression not in SELECT list +SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5; + +-- J13: ORDER BY expression referencing 
multiple columns +SELECT id, val FROM sorted_merge_test ORDER BY id * num LIMIT 5; + +-- J14: ORDER BY with type cast +SELECT id, num FROM sorted_merge_test ORDER BY num::int LIMIT 5; + +-- J15: ORDER BY with subexpression in SELECT and different expression in ORDER BY +SELECT id, num + 1 as n1 FROM sorted_merge_test ORDER BY num + 2 LIMIT 5; + +-- J16: ORDER BY column alias +SELECT id, num * 2 as doubled FROM sorted_merge_test ORDER BY doubled LIMIT 5; + +-- ----------------------------------------------------------------- +-- J17–J21: Correctness — GUC off vs on for expression ORDER BY +-- ----------------------------------------------------------------- + +-- J17: function call +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5; + +-- J18: CASE expression +SET citus.enable_sorted_merge TO off; +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; +SET citus.enable_sorted_merge TO on; +SELECT id, CASE WHEN num > 50 THEN 'high' ELSE 'low' END as cat +FROM sorted_merge_test ORDER BY CASE WHEN num > 50 THEN 'high' ELSE 'low' END, id LIMIT 10; + +-- J19: COALESCE +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY COALESCE(num, 0), id LIMIT 5; + +-- J20: abs() distance function +SET citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; +SET citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY abs(num - 25), id LIMIT 5; + +-- ----------------------------------------------------------------- +-- J21–J22: More ineligibility — aggregate inside expressions +-- 
----------------------------------------------------------------- + +SET citus.enable_sorted_merge TO on; + +-- J21: ORDER BY CASE wrapping an aggregate +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY CASE WHEN count(*) > 1 THEN 0 ELSE 1 END, id LIMIT 5'); + +-- J22: ORDER BY aggregate expression (sum + 1) — correctness +SET citus.enable_sorted_merge TO off; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; +SET citus.enable_sorted_merge TO on; +SELECT id, sum(num) + 1 as s FROM sorted_merge_test GROUP BY id ORDER BY sum(num) + 1 LIMIT 5; + +-- ----------------------------------------------------------------- +-- J23–J24: EXPLAIN plans for pushable expression patterns +-- ----------------------------------------------------------------- + +SET citus.enable_sorted_merge TO on; + +-- J23: Does function-call ORDER BY get pushed to workers? +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY upper(val) LIMIT 5'); + +-- J24: ORDER BY expression not in SELECT list — pushed to workers? 
+SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5'); + -- ================================================================= -- Cleanup -- ================================================================= From d92b5519090bd87fecbad0461ae2e540e72d603a Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Mon, 13 Apr 2026 16:02:21 +0000 Subject: [PATCH 13/18] Incorporate code review feedback and add more test cases as well as an indication of sorted merge in the EXPLAIN output --- .../distributed/planner/multi_explain.c | 7 + .../expected/multi_orderby_pushdown.out | 442 ++++++++++++++++-- .../regress/sql/multi_orderby_pushdown.sql | 87 +++- 3 files changed, 507 insertions(+), 29 deletions(-) diff --git a/src/backend/distributed/planner/multi_explain.c b/src/backend/distributed/planner/multi_explain.c index 52e56030eb7..747930127b2 100644 --- a/src/backend/distributed/planner/multi_explain.c +++ b/src/backend/distributed/planner/multi_explain.c @@ -710,6 +710,13 @@ ExplainJob(CitusScanState *scanState, Job *job, ExplainState *es, ExplainOpenGroup("Job", "Job", true, es); ExplainPropertyInteger("Task Count", NULL, taskCount, es); + + DistributedPlan *distributedPlan = scanState->distributedPlan; + if (distributedPlan->useSortedMerge) + { + ExplainPropertyText("Merge Method", "sorted merge", es); + } + if (ShowReceivedTupleData(scanState, es)) { Task *task = NULL; diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out index 6803236ba56..b3878ed5b57 100644 --- a/src/test/regress/expected/multi_orderby_pushdown.out +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -77,6 +77,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val Task Count: N + Merge Method: 
sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -89,7 +90,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: quicksort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val -(15 rows) +(16 rows) -- A2: ORDER BY DESC SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id DESC'); @@ -98,6 +99,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -110,7 +112,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: quicksort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id -(15 rows) +(16 rows) -- A3: ORDER BY DESC NULLS LAST SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num DESC NULLS LAST'); @@ -119,6 +121,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.num Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -131,7 +134,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: quicksort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, num -(15 rows) +(16 rows) -- A4: ORDER BY non-distribution column SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, 
SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY val'); @@ -140,6 +143,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -152,7 +156,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: quicksort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val -(15 rows) +(16 rows) -- A5: Multi-column ORDER BY SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id, val'); @@ -161,6 +165,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -173,7 +178,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: quicksort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val -(15 rows) +(16 rows) -- A6: Mixed directions SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val, num FROM sorted_merge_test ORDER BY id ASC, num DESC'); @@ -182,6 +187,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -194,7 +200,7 @@ SELECT public.explain_filter('EXPLAIN 
(ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: quicksort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num -(15 rows) +(16 rows) -- A7: GROUP BY dist_col ORDER BY dist_col SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, count(*) FROM sorted_merge_test GROUP BY id ORDER BY id'); @@ -203,6 +209,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -218,7 +225,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Group Key: sorted_merge_test.id -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts -(19 rows) +(20 rows) -- A8: WHERE clause + ORDER BY SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test WHERE num > 50 ORDER BY id'); @@ -227,6 +234,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -241,7 +249,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Output: id, val Filter: (sorted_merge_test.num > 'N'::numeric) Rows Removed by Filter: N -(17 rows) +(18 rows) -- A9: Expression in ORDER BY (non-aggregate) SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY id + 1'); @@ -250,6 +258,7 @@ SELECT 
public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.num, remote_scan.worker_column_3 Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -262,7 +271,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: quicksort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, num, (id + N) -(15 rows) +(16 rows) -- A10: ORDER BY with LIMIT (existing pushdown, verify no regression) SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id LIMIT 5'); @@ -273,6 +282,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -287,7 +297,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: top-N heapsort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id -(19 rows) +(20 rows) -- ================================================================= -- Category B: Ineligibility — sort NOT pushed for merge @@ -907,6 +917,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -921,7 +932,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: top-N heapsort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 
sorted_merge_test (actual rows=N loops=N) Output: id -(19 rows) +(20 rows) -- F2: GROUP BY dist_col + ORDER BY + LIMIT SET citus.enable_sorted_merge TO off; @@ -965,6 +976,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.count Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -982,7 +994,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Group Key: sorted_merge_test.id -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, num, ts -(23 rows) +(24 rows) -- F3: ORDER BY aggregate + LIMIT (not eligible for merge) SET citus.enable_sorted_merge TO off; @@ -1039,6 +1051,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -1051,10 +1064,12 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: quicksort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val -(15 rows) +(16 rows) -- G2a: PREPARE with merge ON, EXECUTE after turning OFF --- Plan-time decision is baked in — cached plan must still merge correctly +-- Plan-time decision is baked in — cached plan must still merge correctly. +-- Execute 6+ times to trigger PostgreSQL's generic plan caching, then +-- verify the plan shape is preserved after toggling the GUC. 
SET citus.enable_sorted_merge TO on; PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; EXECUTE merge_on_stmt; @@ -1072,7 +1087,100 @@ EXECUTE merge_on_stmt; 10 | val_10 (10 rows) +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_on_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Verify plan shape after caching — no Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Merge Method: sorted merge + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(11 rows) + SET citus.enable_sorted_merge TO off; +-- Cached plan retains the sorted merge decision from planning time 
EXECUTE merge_on_stmt; id | val --------------------------------------------------------------------- @@ -1088,9 +1196,25 @@ EXECUTE merge_on_stmt; 10 | val_10 (10 rows) +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Merge Method: sorted merge + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(11 rows) + DEALLOCATE merge_on_stmt; -- G2b: PREPARE with merge OFF, EXECUTE after turning ON --- Cached plan has Sort node — must still return sorted results +-- Cached plan has Sort node — must still return sorted results. SET citus.enable_sorted_merge TO off; PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; EXECUTE merge_off_stmt; @@ -1108,7 +1232,101 @@ EXECUTE merge_off_stmt; 10 | val_10 (10 rows) +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +EXECUTE 
merge_off_stmt; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Verify plan shape after caching — Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + SET citus.enable_sorted_merge TO on; +-- Cached plan retains the non-merge decision from planning time EXECUTE merge_off_stmt; id | val --------------------------------------------------------------------- @@ -1124,6 +1342,23 @@ EXECUTE merge_off_stmt; 10 | val_10 (10 rows) +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; + QUERY PLAN +--------------------------------------------------------------------- + Limit + -> Sort + Sort Key: remote_scan.id + -> Custom Scan (Citus Adaptive) + Task Count: 4 + Tasks Shown: One of 4 + -> Task + Node: host=localhost port=xxxxx dbname=regression + -> Limit + -> Sort + Sort Key: id + -> Seq Scan on sorted_merge_test_960000 sorted_merge_test +(12 rows) + DEALLOCATE merge_off_stmt; -- G3: Cursor with backward scan SET citus.enable_sorted_merge TO on; @@ -1162,6 +1397,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -1176,7 +1412,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: top-N heapsort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 
sorted_merge_test (actual rows=N loops=N) Output: id -(19 rows) +(20 rows) -- G5: ORDER BY aggregate + LIMIT — crash regression test -- Previously caused SIGSEGV when sorted merge was enabled because @@ -1388,6 +1624,7 @@ SELECT * FROM ordered_cte ORDER BY id LIMIT 5'); -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -1402,7 +1639,7 @@ SELECT * FROM ordered_cte ORDER BY id LIMIT 5'); Sort Method: top-N heapsort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: sorted_merge_test.id, sorted_merge_test.val -(19 rows) +(20 rows) -- H2 EXPLAIN SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH eligible_cte AS ( @@ -1426,6 +1663,7 @@ ORDER BY e.id'); -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -1492,7 +1730,7 @@ ORDER BY e.id'); -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) Output: intermediate_result_1.cnt, intermediate_result_1.id Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) -(77 rows) +(78 rows) -- H3 EXPLAIN SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH top_ids AS ( @@ -1517,6 +1755,7 @@ LIMIT 10'); -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -1532,6 +1771,7 @@ LIMIT 10'); -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id Task Count: N + Merge Method: sorted 
merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -1556,7 +1796,7 @@ LIMIT 10'); Sort Method: quicksort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 t (actual rows=N loops=N) Output: t.id, t.val -(51 rows) +(53 rows) -- H4 EXPLAIN SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test @@ -1579,6 +1819,7 @@ LIMIT 5'); -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -1594,6 +1835,7 @@ LIMIT 5'); -> Seq Scan on public.sorted_merge_events_960004 sorted_merge_events (actual rows=N loops=N) Output: id Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -1616,7 +1858,7 @@ LIMIT 5'); -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) -(50 rows) +(52 rows) -- H5 EXPLAIN SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) WITH small_cte AS ( @@ -1635,6 +1877,7 @@ SELECT * FROM small_cte ORDER BY id LIMIT 5'); -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -1665,7 +1908,7 @@ SELECT * FROM small_cte ORDER BY id LIMIT 5'); -> Function Scan on pg_catalog.read_intermediate_result intermediate_result (actual rows=N loops=N) Output: intermediate_result.id, intermediate_result.val Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) -(40 rows) +(41 rows) -- H6 EXPLAIN SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE 
ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT foo.id, bar.id as bar_id @@ -1686,6 +1929,7 @@ LIMIT 5'); -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -1708,6 +1952,7 @@ LIMIT 5'); -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -1743,7 +1988,7 @@ LIMIT 5'); -> Function Scan on pg_catalog.read_intermediate_result intermediate_result_1 (actual rows=N loops=N) Output: intermediate_result_1.id Function Call: read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) -(67 rows) +(69 rows) -- H7 EXPLAIN — GUC off vs on SET citus.enable_sorted_merge TO off; @@ -1817,6 +2062,7 @@ SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.num Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -1849,7 +2095,7 @@ SELECT * FROM cte WHERE num > 10 ORDER BY id LIMIT 5'); Function Call: read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) Filter: (intermediate_result.num > 'N'::numeric) Rows Removed by Filter: N -(42 rows) +(43 rows) -- ================================================================= -- Category I: Distributed Transactions @@ -2027,6 +2273,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -2039,7 +2286,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort 
Method: quicksort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, (id + N) -(15 rows) +(16 rows) -- J4: ORDER BY with CASE expression (no aggregates) — eligible SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY CASE WHEN id < 50 THEN 0 ELSE 1 END, id'); @@ -2048,6 +2295,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -2060,7 +2308,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: quicksort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, CASE WHEN (id < N) THEN N ELSE N END -(15 rows) +(16 rows) -- J5: ORDER BY on an expression that mixes aggregate and non-aggregate -- Should be ineligible because the expression contains an aggregate. @@ -2403,6 +2651,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -2417,7 +2666,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: top-N heapsort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, val, upper(val) -(19 rows) +(20 rows) -- J24: ORDER BY expression not in SELECT list — pushed to workers? 
SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5'); @@ -2428,6 +2677,7 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) Output: remote_scan.id, remote_scan.worker_column_2 Task Count: N + Merge Method: sorted merge Tuple data received from nodes: N bytes Tasks Shown: One of N -> Task @@ -2442,8 +2692,146 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Sort Method: top-N heapsort Memory: NkB -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) Output: id, (num + 'N'::numeric) -(19 rows) +(20 rows) + +-- ================================================================= +-- Category K: Index-based sort avoidance +-- ================================================================= +-- When an index exists on the ORDER BY column, PostgreSQL's worker-side +-- planner should choose an Index Scan instead of Sort + Seq Scan, making +-- the worker-side sort essentially free. This is the best-case scenario +-- for sorted merge: zero worker sort cost + zero coordinator sort cost. +-- +-- We disable enable_seqscan to force the worker planner to prefer the +-- index, since the test table is small enough that Seq Scan + Sort +-- would otherwise be cheaper. +CREATE INDEX sorted_merge_test_id_idx ON sorted_merge_test(id); +-- Use a transaction with SET LOCAL to propagate enable_seqscan=off to workers, +-- forcing the worker planner to use the index instead of Seq Scan + Sort. 
+SET citus.propagate_set_commands TO 'local'; +-- K1: EXPLAIN with index — worker uses Index Scan, no Sort node +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Scan using sorted_merge_test_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(12 rows) +COMMIT; +-- K2: Correctness with index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 +(5 rows) + +COMMIT; +-- K3: Multi-column index +CREATE INDEX sorted_merge_test_num_id_idx ON sorted_merge_test(num, id); +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num, id'); + 
explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.num + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, num FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY num, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Only Scan using sorted_merge_test_num_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, num + Heap Fetches: N +(13 rows) + +COMMIT; +-- K4: Correctness with multi-column index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; + id | num +--------------------------------------------------------------------- + 1 | 1.5 + 2 | 3.0 + 3 | 4.5 + 4 | 6.0 + 5 | 7.5 +(5 rows) + +COMMIT; +-- K5: DESC ordering with index +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id DESC'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true 
ORDER BY id DESC + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Index Scan Backward using sorted_merge_test_id_idx_960000 on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val +(12 rows) + +COMMIT; +RESET citus.propagate_set_commands; +DROP INDEX sorted_merge_test_id_idx; +DROP INDEX sorted_merge_test_num_id_idx; -- ================================================================= -- Cleanup -- ================================================================= diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql index a5b48ee9f00..0917ac890d5 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -314,21 +314,41 @@ SET citus.enable_sorted_merge TO on; SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); -- G2a: PREPARE with merge ON, EXECUTE after turning OFF --- Plan-time decision is baked in — cached plan must still merge correctly +-- Plan-time decision is baked in — cached plan must still merge correctly. +-- Execute 6+ times to trigger PostgreSQL's generic plan caching, then +-- verify the plan shape is preserved after toggling the GUC. 
SET citus.enable_sorted_merge TO on; PREPARE merge_on_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; EXECUTE merge_on_stmt; +EXECUTE merge_on_stmt; +EXECUTE merge_on_stmt; +EXECUTE merge_on_stmt; +EXECUTE merge_on_stmt; +EXECUTE merge_on_stmt; +-- Verify plan shape after caching — no Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; SET citus.enable_sorted_merge TO off; +-- Cached plan retains the sorted merge decision from planning time EXECUTE merge_on_stmt; +EXPLAIN (COSTS OFF) EXECUTE merge_on_stmt; DEALLOCATE merge_on_stmt; -- G2b: PREPARE with merge OFF, EXECUTE after turning ON --- Cached plan has Sort node — must still return sorted results +-- Cached plan has Sort node — must still return sorted results. SET citus.enable_sorted_merge TO off; PREPARE merge_off_stmt AS SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 10; EXECUTE merge_off_stmt; +EXECUTE merge_off_stmt; +EXECUTE merge_off_stmt; +EXECUTE merge_off_stmt; +EXECUTE merge_off_stmt; +EXECUTE merge_off_stmt; +-- Verify plan shape after caching — Sort above CustomScan +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; SET citus.enable_sorted_merge TO on; +-- Cached plan retains the non-merge decision from planning time EXECUTE merge_off_stmt; +EXPLAIN (COSTS OFF) EXECUTE merge_off_stmt; DEALLOCATE merge_off_stmt; -- G3: Cursor with backward scan @@ -684,6 +704,69 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING -- J24: ORDER BY expression not in SELECT list — pushed to workers? 
SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY num + 1 LIMIT 5'); +-- ================================================================= +-- Category K: Index-based sort avoidance +-- ================================================================= +-- When an index exists on the ORDER BY column, PostgreSQL's worker-side +-- planner should choose an Index Scan instead of Sort + Seq Scan, making +-- the worker-side sort essentially free. This is the best-case scenario +-- for sorted merge: zero worker sort cost + zero coordinator sort cost. +-- +-- We disable enable_seqscan to force the worker planner to prefer the +-- index, since the test table is small enough that Seq Scan + Sort +-- would otherwise be cheaper. + +CREATE INDEX sorted_merge_test_id_idx ON sorted_merge_test(id); + +-- Use a transaction with SET LOCAL to propagate enable_seqscan=off to workers, +-- forcing the worker planner to use the index instead of Seq Scan + Sort. 
+SET citus.propagate_set_commands TO 'local'; + +-- K1: EXPLAIN with index — worker uses Index Scan, no Sort node +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id'); +COMMIT; + +-- K2: Correctness with index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY id LIMIT 5; +COMMIT; + +-- K3: Multi-column index +CREATE INDEX sorted_merge_test_num_id_idx ON sorted_merge_test(num, id); + +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, num FROM sorted_merge_test ORDER BY num, id'); +COMMIT; + +-- K4: Correctness with multi-column index — GUC off vs on +BEGIN; +SET LOCAL enable_seqscan TO off; +SET LOCAL citus.enable_sorted_merge TO off; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; +SET LOCAL citus.enable_sorted_merge TO on; +SELECT id, num FROM sorted_merge_test ORDER BY num, id LIMIT 5; +COMMIT; + +-- K5: DESC ordering with index +SET citus.enable_sorted_merge TO on; +BEGIN; +SET LOCAL enable_seqscan TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY id DESC'); +COMMIT; + +RESET citus.propagate_set_commands; +DROP INDEX sorted_merge_test_id_idx; +DROP INDEX sorted_merge_test_num_id_idx; + -- ================================================================= -- Cleanup -- ================================================================= From 8b4ba3641a0ec8debcc40220015baad17cee5209 Mon Sep 17 
00:00:00 2001 From: Neil Deshpande Date: Tue, 14 Apr 2026 18:44:55 +0000 Subject: [PATCH 14/18] Added test cases with volatile/stable functions --- .../expected/multi_orderby_pushdown.out | 123 ++++++++++++++++++ .../regress/sql/multi_orderby_pushdown.sql | 49 +++++++ 2 files changed, 172 insertions(+) diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out index b3878ed5b57..b5522eafb03 100644 --- a/src/test/regress/expected/multi_orderby_pushdown.out +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -2833,6 +2833,129 @@ RESET citus.propagate_set_commands; DROP INDEX sorted_merge_test_id_idx; DROP INDEX sorted_merge_test_num_id_idx; -- ================================================================= +-- Category L: Volatile and stable functions in ORDER BY +-- Tests that ORDER BY with functions works correctly with sorted merge. +-- Volatile functions (random, clock_timestamp, timeofday) are pushed +-- to workers as computed columns — sorted merge uses the materialized +-- worker values, which is semantically equivalent to coordinator Sort. +-- ================================================================= +-- L1: STABLE function — now() in expression with column +-- now() returns the same value on all workers within a transaction, +-- so the merge is globally consistent. Sorted merge should be used. 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY now() - ts, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, (now() OPERATOR(pg_catalog.-) ts) AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (now() OPERATOR(pg_catalog.-) ts), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, ((now() - ts)) + Sort Key: ((now() - sorted_merge_test.ts)), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, (now() - ts) +(16 rows) + +-- L2: VOLATILE function — random() in ORDER BY +-- random() is pushed to workers as worker_column_3; each worker sorts +-- by its own random values. The merge interleaves using materialized +-- values — semantically equivalent to coordinator Sort on worker_column_3. +-- Test plan shape only (result is non-deterministic). 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY random(), id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, random() AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (random()), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (random()) + Sort Key: (random()), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, random() +(16 rows) + +-- L3: VOLATILE function — clock_timestamp() in ORDER BY +-- Same mechanics as random(): pushed to workers, sorted locally, merged. 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY clock_timestamp(), id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, clock_timestamp() AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY (clock_timestamp()), id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (clock_timestamp()) + Sort Key: (clock_timestamp()), sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, clock_timestamp() +(16 rows) + +-- L4: nextval() in ORDER BY with sorted merge ON — expected ERROR +-- nextval() cannot be pushed to workers (CanPushDownExpression blocks it). +-- The sort clause references a target entry missing from the worker target +-- list, causing a plan-time error. This is a pre-existing Citus limitation. +CREATE SEQUENCE sorted_merge_test_seq; +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq'); +ERROR: ORDER/GROUP BY expression not found in targetlist +-- L4b: nextval() in ORDER BY with sorted merge OFF but LIMIT present +-- Same error — demonstrates this is NOT a sorted merge regression. 
+SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq') LIMIT 5; +ERROR: ORDER/GROUP BY expression not found in targetlist +DROP SEQUENCE sorted_merge_test_seq; +-- L5: STABLE function alone (constant-fold case) +-- current_timestamp is constant-folded by the planner; the sort key +-- effectively becomes just 'id'. Sorted merge should be used. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY current_timestamp, id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id, remote_scan.val, remote_scan.worker_column_3 + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id, val, CURRENT_TIMESTAMP AS worker_column_3 FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY CURRENT_TIMESTAMP, id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id, val, (CURRENT_TIMESTAMP) + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id, val, CURRENT_TIMESTAMP +(16 rows) + +SET citus.enable_sorted_merge TO off; +-- ================================================================= -- Cleanup -- ================================================================= SET citus.enable_sorted_merge TO off; diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql index 0917ac890d5..cc2bb87377f 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -767,6 
+767,55 @@ RESET citus.propagate_set_commands; DROP INDEX sorted_merge_test_id_idx; DROP INDEX sorted_merge_test_num_id_idx; +-- ================================================================= +-- Category L: Volatile and stable functions in ORDER BY +-- Tests that ORDER BY with functions works correctly with sorted merge. +-- Volatile functions (random, clock_timestamp, timeofday) are pushed +-- to workers as computed columns — sorted merge uses the materialized +-- worker values, which is semantically equivalent to coordinator Sort. +-- ================================================================= + +-- L1: STABLE function — now() in expression with column +-- now() returns the same value on all workers within a transaction, +-- so the merge is globally consistent. Sorted merge should be used. +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY now() - ts, id'); + +-- L2: VOLATILE function — random() in ORDER BY +-- random() is pushed to workers as worker_column_3; each worker sorts +-- by its own random values. The merge interleaves using materialized +-- values — semantically equivalent to coordinator Sort on worker_column_3. +-- Test plan shape only (result is non-deterministic). +SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY random(), id'); + +-- L3: VOLATILE function — clock_timestamp() in ORDER BY +-- Same mechanics as random(): pushed to workers, sorted locally, merged. 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY clock_timestamp(), id'); + +-- L4: nextval() in ORDER BY with sorted merge ON — expected ERROR +-- nextval() cannot be pushed to workers (CanPushDownExpression blocks it). +-- The sort clause references a target entry missing from the worker target +-- list, causing a plan-time error. This is a pre-existing Citus limitation. +CREATE SEQUENCE sorted_merge_test_seq; +SET citus.enable_sorted_merge TO on; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq'); + +-- L4b: nextval() in ORDER BY with sorted merge OFF but LIMIT present +-- Same error — demonstrates this is NOT a sorted merge regression. +SET citus.enable_sorted_merge TO off; +SELECT id, val FROM sorted_merge_test ORDER BY nextval('sorted_merge_test_seq') LIMIT 5; +DROP SEQUENCE sorted_merge_test_seq; + +-- L5: STABLE function alone (constant-fold case) +-- current_timestamp is constant-folded by the planner; the sort key +-- effectively becomes just 'id'. Sorted merge should be used. 
+SET citus.enable_sorted_merge TO on; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id, val FROM sorted_merge_test ORDER BY current_timestamp, id'); + +SET citus.enable_sorted_merge TO off; + -- ================================================================= -- Cleanup -- ================================================================= From 86f82a1fc077ec562e3f27e7e0ff94eb9a765485 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Tue, 14 Apr 2026 19:55:51 +0000 Subject: [PATCH 15/18] Make style checks happy --- src/backend/distributed/planner/multi_explain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/distributed/planner/multi_explain.c b/src/backend/distributed/planner/multi_explain.c index 747930127b2..8c9fd8a720c 100644 --- a/src/backend/distributed/planner/multi_explain.c +++ b/src/backend/distributed/planner/multi_explain.c @@ -716,7 +716,7 @@ ExplainJob(CitusScanState *scanState, Job *job, ExplainState *es, { ExplainPropertyText("Merge Method", "sorted merge", es); } - + if (ShowReceivedTupleData(scanState, es)) { Task *task = NULL; From 4bb3811ff0f700b25cd72d69b8c8ccca83a40ec9 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Wed, 15 Apr 2026 21:42:26 +0000 Subject: [PATCH 16/18] Fix the issue where EXPLAIN ANALYZE uses sorted merge for the distributed plan but not during execution --- .../distributed/executor/adaptive_executor.c | 23 +++++++++++-------- .../distributed/executor/sorted_merge.c | 7 +++++- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/backend/distributed/executor/adaptive_executor.c b/src/backend/distributed/executor/adaptive_executor.c index a4e5461e51e..f8713f5725b 100644 --- a/src/backend/distributed/executor/adaptive_executor.c +++ b/src/backend/distributed/executor/adaptive_executor.c @@ -817,20 +817,23 @@ AdaptiveExecutor(CitusScanState *scanState) /* * When sorted merge is active, route worker results 
into per-task tuple - * stores. Skip sorted merge for EXPLAIN ANALYZE (which modifies task - * lists in incompatible ways). + * stores. After execution completes, these stores are k-way merged into + * the final scanState->tuplestorestate. * - * Note: useSortedMerge is a plan-time decision — if the plan says merge, - * the executor must merge, because the combine query plan has no Sort - * node above us. Skipping the merge here would produce silently unsorted - * output. All eligibility checks belong in the planner, not here. + * useSortedMerge is a plan-time decision — if the plan says merge, the + * executor must merge, because the combine query plan has no Sort node + * above us. Skipping the merge would produce silently unsorted output. + * + * This applies even under EXPLAIN ANALYZE: the ExplainAnalyzeDestination + * wrapper forwards data tuples (queryNumber == 0) to the per-task + * dispatch, which routes them to the correct per-task store. Plan-fetch + * tuples (queryNumber == 1) are handled entirely within + * ExplainAnalyzeDestPutTuple and never reach the per-task dispatch. 
*/ - bool useSortedMerge = distributedPlan->useSortedMerge && - !RequestedForExplainAnalyze(scanState); Tuplestorestate **perTaskStores = NULL; int perTaskStoreCount = 0; - if (useSortedMerge) + if (distributedPlan->useSortedMerge) { TupleDestinationStats *sharedStats = palloc0(sizeof(TupleDestinationStats)); defaultTupleDest = CreatePerTaskDispatchDest(taskList, tupleDescriptor, @@ -909,7 +912,7 @@ AdaptiveExecutor(CitusScanState *scanState) &xactProperties, jobIdList, localExecutionSupported, - useSortedMerge, + distributedPlan->useSortedMerge, perTaskStores, perTaskStoreCount); diff --git a/src/backend/distributed/executor/sorted_merge.c b/src/backend/distributed/executor/sorted_merge.c index f3514dfdb38..585a90c5188 100644 --- a/src/backend/distributed/executor/sorted_merge.c +++ b/src/backend/distributed/executor/sorted_merge.c @@ -196,11 +196,16 @@ PerTaskDispatchPutTuple(TupleDestination *self, Task *task, /* * PerTaskDispatchTupleDescForQuery returns the tuple descriptor. + * + * Only queryNumber == 0 (data tuples) is expected to reach the per-task + * dispatch directly. Under EXPLAIN ANALYZE, the ExplainAnalyzeDestination + * wrapper intercepts plan-fetch tuples (queryNumber == 1) before they + * reach us, but may call tupleDescForQuery with queryNumber == 0 or 1. + * We return the same data tuple descriptor in all cases. 
*/ static TupleDesc PerTaskDispatchTupleDescForQuery(TupleDestination *self, int queryNumber) { - Assert(queryNumber == 0); PerTaskDispatchTupleDest *dispatch = (PerTaskDispatchTupleDest *) self; return dispatch->tupleDesc; } From 0ed36fdfbac0d720114a41ef74becb029ada91eb Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Thu, 16 Apr 2026 17:22:10 +0000 Subject: [PATCH 17/18] Added auto explain test category --- .../expected/multi_orderby_pushdown.out | 124 ++++++++++++++++++ .../regress/sql/multi_orderby_pushdown.sql | 53 ++++++++ 2 files changed, 177 insertions(+) diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out index b5522eafb03..a22e4cf544b 100644 --- a/src/test/regress/expected/multi_orderby_pushdown.out +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -2954,6 +2954,130 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING Output: id, val, CURRENT_TIMESTAMP (16 rows) +SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Category L6: EXPLAIN ANALYZE + sorted merge +-- +-- Verify that sorted merge works correctly when the EXPLAIN ANALYZE +-- code path is active. We test two mechanisms: +-- +-- 1. Plain EXPLAIN ANALYZE: verifies plan structure (no coordinator +-- Sort node, "Merge Method: sorted merge" visible). +-- +-- 2. auto_explain with log_analyze: triggers the same executor code +-- path (es_instrument != 0 → RequestedForExplainAnalyze() = true) +-- but returns actual data rows. This directly validates that the +-- k-way merge produces correctly sorted output under the EXPLAIN +-- ANALYZE path — if the merge were skipped, the rows would be +-- visibly unsorted. 
+-- ================================================================= +SET citus.enable_sorted_merge TO on; +-- Verify EXPLAIN ANALYZE plan structure: no Sort node at coordinator +-- level, "Merge Method: sorted merge" visible, and "actual rows" +-- confirms full execution through the merge path. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Merge Method: sorted merge + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true ORDER BY id + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Sort (actual rows=N loops=N) + Output: id + Sort Key: sorted_merge_test.id + Sort Method: quicksort Memory: NkB + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(16 rows) + +-- Load auto_explain to trigger the EXPLAIN ANALYZE executor path +-- while returning real data rows. auto_explain sets es_instrument, +-- which makes RequestedForExplainAnalyze() return true — the same +-- condition as a real EXPLAIN ANALYZE. +LOAD 'auto_explain'; +SET auto_explain.log_min_duration = 0; +SET auto_explain.log_analyze TO true; +-- ASC sort under auto_explain: these SELECTs go through the EXPLAIN +-- ANALYZE code path but return actual data. If the merge were +-- skipped, rows would arrive in arbitrary worker order. 
+SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; + id +--------------------------------------------------------------------- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +-- DESC sort under auto_explain +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 10; + id +--------------------------------------------------------------------- + 202 + 201 + 200 + 102 + 101 + 100 + 99 + 98 + 97 + 96 +(10 rows) + +-- Multi-column sort under auto_explain +SELECT id, val FROM sorted_merge_test ORDER BY id, val LIMIT 10; + id | val +--------------------------------------------------------------------- + 1 | val_1 + 2 | val_2 + 3 | val_3 + 4 | val_4 + 5 | val_5 + 6 | val_6 + 7 | val_7 + 8 | val_8 + 9 | val_9 + 10 | val_10 +(10 rows) + +-- Disable auto_explain +SET auto_explain.log_min_duration = -1; +SET auto_explain.log_analyze TO false; +-- Contrast: sorted merge OFF shows a Sort node at coordinator level. +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id'); + explain_filter +--------------------------------------------------------------------- + Sort (actual rows=N loops=N) + Output: remote_scan.id + Sort Key: remote_scan.id + Sort Method: quicksort Memory: NkB + -> Custom Scan (Citus Adaptive) (actual rows=N loops=N) + Output: remote_scan.id + Task Count: N + Tuple data received from nodes: N bytes + Tasks Shown: One of N + -> Task + Query: SELECT id FROM public.sorted_merge_test_960000 sorted_merge_test WHERE true + Tuple data received from node: N bytes + Node: host=localhost port=N dbname=regression + -> Seq Scan on public.sorted_merge_test_960000 sorted_merge_test (actual rows=N loops=N) + Output: id +(15 rows) + SET citus.enable_sorted_merge TO off; -- ================================================================= -- Cleanup diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql 
b/src/test/regress/sql/multi_orderby_pushdown.sql index cc2bb87377f..000bf560d6c 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -816,6 +816,59 @@ SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING SET citus.enable_sorted_merge TO off; +-- ================================================================= +-- Category L6: EXPLAIN ANALYZE + sorted merge +-- +-- Verify that sorted merge works correctly when the EXPLAIN ANALYZE +-- code path is active. We test two mechanisms: +-- +-- 1. Plain EXPLAIN ANALYZE: verifies plan structure (no coordinator +-- Sort node, "Merge Method: sorted merge" visible). +-- +-- 2. auto_explain with log_analyze: triggers the same executor code +-- path (es_instrument != 0 → RequestedForExplainAnalyze() = true) +-- but returns actual data rows. This directly validates that the +-- k-way merge produces correctly sorted output under the EXPLAIN +-- ANALYZE path — if the merge were skipped, the rows would be +-- visibly unsorted. +-- ================================================================= + +SET citus.enable_sorted_merge TO on; + +-- Verify EXPLAIN ANALYZE plan structure: no Sort node at coordinator +-- level, "Merge Method: sorted merge" visible, and "actual rows" +-- confirms full execution through the merge path. +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id'); + +-- Load auto_explain to trigger the EXPLAIN ANALYZE executor path +-- while returning real data rows. auto_explain sets es_instrument, +-- which makes RequestedForExplainAnalyze() return true — the same +-- condition as a real EXPLAIN ANALYZE. +LOAD 'auto_explain'; +SET auto_explain.log_min_duration = 0; +SET auto_explain.log_analyze TO true; + +-- ASC sort under auto_explain: these SELECTs go through the EXPLAIN +-- ANALYZE code path but return actual data. 
If the merge were +-- skipped, rows would arrive in arbitrary worker order. +SELECT id FROM sorted_merge_test ORDER BY id LIMIT 10; + +-- DESC sort under auto_explain +SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 10; + +-- Multi-column sort under auto_explain +SELECT id, val FROM sorted_merge_test ORDER BY id, val LIMIT 10; + +-- Disable auto_explain +SET auto_explain.log_min_duration = -1; +SET auto_explain.log_analyze TO false; + +-- Contrast: sorted merge OFF shows a Sort node at coordinator level. +SET citus.enable_sorted_merge TO off; +SELECT public.explain_filter('EXPLAIN (ANALYZE ON, VERBOSE ON, COSTS OFF, TIMING OFF, BUFFERS OFF, SUMMARY OFF) SELECT id FROM sorted_merge_test ORDER BY id'); + +SET citus.enable_sorted_merge TO off; + -- ================================================================= -- Cleanup -- ================================================================= From 499e5e3636957c10a187c7dda51823e6eeda26f5 Mon Sep 17 00:00:00 2001 From: Neil Deshpande Date: Thu, 16 Apr 2026 17:34:39 +0000 Subject: [PATCH 18/18] add more test cases for auto explain with duplicates --- .../expected/multi_orderby_pushdown.out | 48 +++++++++++++++++++ .../regress/sql/multi_orderby_pushdown.sql | 9 ++++ 2 files changed, 57 insertions(+) diff --git a/src/test/regress/expected/multi_orderby_pushdown.out b/src/test/regress/expected/multi_orderby_pushdown.out index a22e4cf544b..6c930ae8d3e 100644 --- a/src/test/regress/expected/multi_orderby_pushdown.out +++ b/src/test/regress/expected/multi_orderby_pushdown.out @@ -3053,6 +3053,54 @@ SELECT id, val FROM sorted_merge_test ORDER BY id, val LIMIT 10; 10 | val_10 (10 rows) +-- Single-column sort on num (non-distribution column, has NULLs) +SELECT num FROM sorted_merge_test ORDER BY num LIMIT 10; + num +--------------------------------------------------------------------- + 1.5 + 3.0 + 4.5 + 6.0 + 7.5 + 9.0 + 10.5 + 10.5 + 10.5 + 10.5 +(10 rows) + +-- Multi-column sort with num as first column +SELECT num, id 
FROM sorted_merge_test ORDER BY num, id LIMIT 10; + num | id +--------------------------------------------------------------------- + 1.5 | 1 + 3.0 | 2 + 4.5 | 3 + 6.0 | 4 + 7.5 | 5 + 9.0 | 6 + 10.5 | 7 + 10.5 | 200 + 10.5 | 201 + 10.5 | 202 +(10 rows) + +-- Multi-column sort with num DESC as first column, id ASC +SELECT num, id FROM sorted_merge_test ORDER BY num DESC, id LIMIT 10; + num | id +--------------------------------------------------------------------- + | 101 + | 102 + 150.0 | 100 + 148.5 | 99 + 147.0 | 98 + 145.5 | 97 + 144.0 | 96 + 142.5 | 95 + 141.0 | 94 + 139.5 | 93 +(10 rows) + -- Disable auto_explain SET auto_explain.log_min_duration = -1; SET auto_explain.log_analyze TO false; diff --git a/src/test/regress/sql/multi_orderby_pushdown.sql b/src/test/regress/sql/multi_orderby_pushdown.sql index 000bf560d6c..5860fc867f9 100644 --- a/src/test/regress/sql/multi_orderby_pushdown.sql +++ b/src/test/regress/sql/multi_orderby_pushdown.sql @@ -859,6 +859,15 @@ SELECT id FROM sorted_merge_test ORDER BY id DESC LIMIT 10; -- Multi-column sort under auto_explain SELECT id, val FROM sorted_merge_test ORDER BY id, val LIMIT 10; +-- Single-column sort on num (non-distribution column, has NULLs) +SELECT num FROM sorted_merge_test ORDER BY num LIMIT 10; + +-- Multi-column sort with num as first column +SELECT num, id FROM sorted_merge_test ORDER BY num, id LIMIT 10; + +-- Multi-column sort with num DESC as first column, id ASC +SELECT num, id FROM sorted_merge_test ORDER BY num DESC, id LIMIT 10; + -- Disable auto_explain SET auto_explain.log_min_duration = -1; SET auto_explain.log_analyze TO false;