Skip to content

A SQL query that runs quickly in PostgreSQL but very slowly in DuckDB #5

@l1t1

Description

@l1t1

This is Mr. Cheng Ning's SQL code for solving Sudoku problems.
For a data table with only one row, executing a recursive CTE query that iterates 654 times takes over 100 milliseconds in PostgreSQL, while in DuckDB, it takes 11 seconds even with multithreading, and up to 7 seconds when set to single-threaded mode.

I examined the execution plans of both databases. The statement SELECT pos, r, c, bx, MIN(d) FROM cd GROUP BY pos, r, c, bx HAVING COUNT(1) = 1 runs significantly slower in DuckDB compared to PostgreSQL.

Could you tell me how to improve it in DuckDB?

the orignal PostgreSQL version

WITH RECURSIVE
-- 生成1-9的有效数字(数独允许的数字范围)
d(d) AS MATERIALIZED(SELECT d from generate_series(1, 9)t(d)),
-- 生成81个单元格的位置信息:pos(1-81)全局位置,r(1-9)行,c(1-9)列,bx(1-9)宫
pi(pos, r, c, bx) AS MATERIALIZED(
    SELECT 
        pos,
        ((pos - 1) / 9) + 1 AS r,
        ((pos - 1) % 9) + 1 AS c,
        ((pos - 1) / 9) / 3 * 3 + ((pos - 1) % 9) / 3 + 1 AS bx
    FROM generate_series(1, 81) AS t(pos)
),
-- 预处理数独数据:清洗空白字符、替换?为0,转为整数数组
cp(id, pz, bs) AS (
    SELECT 
        id,
        puzzle,
        regexp_split_to_array(
            regexp_replace(regexp_replace(puzzle, '[\r\n\s]', '', 'g'), '\?', '0', 'g'), 
            ''
        )::integer[] AS bs
    FROM (SELECT 3 AS id, E'800000000003600000070090200050007000000045700000100030001000068008500010090000400' AS puzzle)sudoku9_9
),
-- 递归求解核心:初始填充→确定填充/回溯/猜测填充循环
s1(id,flag, bs,bse,i ) AS (
    -- 递归初始态:加载预处理后的初始盘面
    SELECT id,'初始填充', bs,ARRAY[]::integer[][], 0 AS i  FROM cp
    UNION ALL
    -- 递归体:计算下一步填充逻辑
    SELECT s1.id,n.flag, n.bs,n.bse, s1.i + 1 AS i
    FROM s1
    CROSS JOIN LATERAL (
        WITH 
        -- 关联位置信息,展开当前盘面的单元格详情
        eb(pos, r, c, bx, v) AS (SELECT pi.pos, pi.r, pi.c, pi.bx, s1.bs[pi.pos] FROM pi),
        -- 计算空白单元格的合法候选数(行/列/宫无重复)
        cd(pos, r, c, bx, d) AS (

            SELECT e.pos, e.r, e.c, e.bx, d.d
            FROM eb e 
            CROSS JOIN d
            LEFT JOIN eb e2 ON e2.r = e.r AND e2.v = d.d
            LEFT JOIN eb e3 ON e3.c = e.c AND e3.v = d.d
            LEFT JOIN eb e4 ON e4.bx = e.bx AND e4.v = d.d
            WHERE e.v = 0
              AND e2.r IS NULL
              AND e3.r IS NULL 
              AND e4.r IS NULL 
        ),
        -- 筛选唯一可填的单元格(位置/行/列/宫维度)
        af(pos,r, c, bx, d) AS (
            SELECT pos,r, c, bx, MIN(d) FROM cd GROUP BY pos,r, c, bx HAVING COUNT(1) = 1
            UNION
            SELECT MIN(pos),r,min(c),min(bx), d FROM cd GROUP BY r, d HAVING COUNT(1) = 1
            UNION
            SELECT MIN(pos),min(r),c,min(bx), d FROM cd GROUP BY c, d HAVING COUNT(1) = 1
            UNION
            SELECT MIN(pos),min(r),min(c),bx, d FROM cd GROUP BY bx, d HAVING COUNT(1) = 1
        ),
        -- 检查填充是否冲突(无效解)
        error_check(is_invalid) AS (
            SELECT EXISTS (
                SELECT 1 FROM af GROUP BY r, d HAVING COUNT(*) > 1
                UNION ALL SELECT 1 FROM af GROUP BY c, d HAVING COUNT(*) > 1
                UNION ALL SELECT 1 FROM af GROUP BY bx, d HAVING COUNT(*) > 1
                UNION ALL SELECT 1 FROM af GROUP BY pos,d HAVING COUNT(*) > 1
            ) or array_length(bs,1) > 81 or not exists (select 1 from cd) AS is_invalid
        ),
        -- 选择候选数最少的位置作为猜测位置
        bg (pos) as (SELECT pos FROM (SELECT pos FROM cd GROUP BY pos ORDER BY COUNT(1), pos asc LIMIT 1) AS p), 
        -- 获取猜测位置的所有候选数
        gv (d) as (SELECT cd.d FROM cd,bg WHERE cd.pos = bg.pos ),
        -- 生成第一次猜测的盘面(取最小候选数)
        gas1 (bs) as (
        	SELECT s1.bs[1 : (bg.pos - 1)] || ARRAY[gv.d] || s1.bs[(bg.pos + 1) : 81] AS bs 
        	from bg,gv order by d limit 1 ),
        -- 剩余猜测盘面存入回溯栈
        gas2(bse) as (select ARRAY_AGG(bs) from (	
        	SELECT s1.bs[1 : (bg.pos - 1)] || ARRAY[gv.d] || s1.bs[(bg.pos + 1) : 81] AS bs
        	from bg,gv order by d offset 1) tmpbs),
        -- 从回溯栈取出上一个备选盘面
        cbse(bs) as (SELECT array_agg(x) from (select unnest(s1.bse[1:1]) as x )),
        -- 应用确定填充生成新盘面
        adf(bs, hf) AS (
            SELECT 
                (SELECT array_agg(coalesce(af.d, s1.bs[pi.pos]) order by pi.pos) FROM pi 
                LEFT JOIN af ON pi.pos = af.pos) AS bs,
                EXISTS (SELECT 1 FROM af) AS hf
        )
        -- 分支1:执行确定填充(有可填项且无冲突)
        SELECT '确定填充' flag,bs, s1.bse bse
        FROM adf,error_check WHERE hf AND NOT is_invalid
        UNION ALL
        -- 分支2:执行猜测填充(无可确定项且无冲突)
        SELECT '猜测填充',gas1.bs, gas2.bse||s1.bse
        FROM adf a,error_check,gas1,gas2
		WHERE NOT a.hf AND NOT is_invalid
		UNION ALL
        -- 分支3:执行回溯(填充冲突时)
        select '回溯',cbse.bs,s1.bse[2:array_length(s1.bse, 1)]
        from error_check,cbse where is_invalid 
    ) n
    WHERE s1.i < 10000 AND s1.bs @> ARRAY[0]
)
-- 输出最终解:将数组转为9x9格式的数独字符串
SELECT s.id, cp.pz, array_to_string(
    array(SELECT CASE WHEN pos % 9 = 1 AND pos > 1 THEN E'\n' ELSE '' END || value::text 
          FROM unnest(s.bs) WITH ORDINALITY AS elem(value, pos)),
    ''
) AS result
FROM s1 s 
JOIN cp ON s.id = cp.id
WHERE NOT s.bs @> ARRAY[0]
;

translated duckdb version

WITH RECURSIVE
-- 生成1-9的有效数字(数独允许的数字范围)
d(d) AS MATERIALIZED(SELECT d from generate_series(1, 9)t(d)),
-- 生成81个单元格的位置信息:pos(1-81)全局位置,r(1-9)行,c(1-9)列,bx(1-9)宫
pi(pos, r, c1, bx) AS MATERIALIZED(
    SELECT 
        pos,
        ((pos - 1) // 9) + 1 AS r,
        ((pos - 1) % 9) + 1 AS c,
        ((pos - 1) // 9) // 3 * 3 + ((pos - 1) % 9) // 3 + 1 AS bx
    FROM generate_series(1, 81) AS t(pos)
),
-- 预处理数独数据:清洗空白字符、替换?为0,转为整数数组
cp(id, pz, bs) AS (
    SELECT 
        id,
        puzzle,
        regexp_split_to_array(
            regexp_replace(regexp_replace(puzzle, '[\r\n\s]', '', 'g'), '\?', '0', 'g'), 
            ''
        )::integer[] AS bs
FROM (SELECT 3 AS id, E'800000000003600000070090200050007000000045700000100030001000068008500010090000400' AS puzzle)sudoku9_9
),
-- 递归求解核心:初始填充→确定填充/回溯/猜测填充循环
s1(id,flag, bs,bse,i ) AS MATERIALIZED(
    -- 递归初始态:加载预处理后的初始盘面
    SELECT id,'初始填充', bs,ARRAY[]::integer[][], 0 AS i  FROM cp
    UNION ALL
    -- 递归体:计算下一步填充逻辑
    SELECT s1.id,n.flag, n.bs,n.bse, s1.i + 1 AS i
    FROM s1
    CROSS JOIN LATERAL (
        WITH 
        -- 关联位置信息,展开当前盘面的单元格详情
        eb(pos, r, c, bx, v) AS (SELECT pi.pos, pi.r, pi.c1, pi.bx, s1.bs[pi.pos] FROM pi),
        -- 计算空白单元格的合法候选数(行/列/宫无重复)
        cd(pos, r, c, bx, d) AS (

            SELECT e.pos, e.r, e.c, e.bx, d.d
            FROM eb e 
            CROSS JOIN d
            LEFT JOIN eb e2 ON e2.r = e.r AND e2.v = d.d
            LEFT JOIN eb e3 ON e3.c = e.c AND e3.v = d.d
            LEFT JOIN eb e4 ON e4.bx = e.bx AND e4.v = d.d
            WHERE e.v = 0
              AND e2.r IS NULL
              AND e3.r IS NULL 
              AND e4.r IS NULL 

        ),

        -- 筛选唯一可填的单元格(位置/行/列/宫维度)
        af(pos,r, c, bx, d) AS (
            SELECT pos,r, c, bx, MIN(d) FROM cd GROUP BY pos,r, c, bx HAVING COUNT(1) = 1
            UNION
            SELECT MIN(pos),r,min(c),min(bx), d FROM cd GROUP BY r, d HAVING COUNT(1) = 1
            UNION
            SELECT MIN(pos),min(r),c,min(bx), d FROM cd GROUP BY c, d HAVING COUNT(1) = 1
            UNION
            SELECT MIN(pos),min(r),min(c),bx, d FROM cd GROUP BY bx, d HAVING COUNT(1) = 1
        ),

       -- 检查填充是否冲突(无效解)
        error_check(is_invalid) AS (
            SELECT EXISTS (
                SELECT 1 FROM af GROUP BY r, d HAVING COUNT(*) > 1
                UNION ALL SELECT 1 FROM af GROUP BY c, d HAVING COUNT(*) > 1
                UNION ALL SELECT 1 FROM af GROUP BY bx, d HAVING COUNT(*) > 1
                UNION ALL SELECT 1 FROM af GROUP BY pos,d HAVING COUNT(*) > 1
            ) or array_length(bs,1) > 81 or not exists (select 1 from cd) AS is_invalid
        ),

        -- 选择候选数最少的位置作为猜测位置 -- pos
        bg (pos) as (SELECT pos FROM (SELECT pos FROM cd GROUP BY pos ORDER BY COUNT(1),pos asc LIMIT 1) AS p), 
        -- 获取猜测位置的所有候选数
        gv (d) as (SELECT cd.d FROM cd,bg WHERE cd.pos = bg.pos ),
        -- 生成第一次猜测的盘面(取最小候选数)
        gas1 (bs) as (
        	SELECT s1.bs[1 : (bg.pos - 1)] || ARRAY[gv.d] || s1.bs[(bg.pos + 1) : 81] AS bs 
        	from bg,gv order by d limit 1 ),
        -- 剩余猜测盘面存入回溯栈

        gas2(bse) as (select ARRAY_AGG(bs) from (	
        	SELECT s1.bs[1 : (bg.pos - 1)] || ARRAY[gv.d] || s1.bs[(bg.pos + 1) : 81] AS bs
        	from bg,gv order by d offset 1) tmpbs),
 
        -- 从回溯栈取出上一个备选盘面
/*        cbse(bs) as --(SELECT array_agg(x) from (select unnest(s1.bse[1:1]) as x )),
        (SELECT s1.bse[1] as x ),
*/ 
       -- 应用确定填充生成新盘面
        adf(bs, hf) AS (
            SELECT 
                (SELECT array_agg(coalesce(af.d, s1.bs[pi.pos]) order by pi.pos) FROM pi 
                LEFT JOIN af ON pi.pos = af.pos) AS bs,
                EXISTS (SELECT 1 FROM af) AS hf
        )
        -- 分支1:执行确定填充(有可填项且无冲突)
        SELECT '确定填充' flag,bs, s1.bse bse
        FROM adf ,error_check WHERE hf AND NOT is_invalid
        UNION ALL
        -- 分支2:执行猜测填充(无可确定项且无冲突)
        SELECT '猜测填充',gas1.bs, gas2.bse||s1.bse
        FROM adf a,error_check,gas1,gas2
		WHERE NOT a.hf AND NOT is_invalid
 	 	UNION ALL
      -- 分支3:执行回溯(填充冲突时)
        select '回溯',s1.bse[1],s1.bse[2:]
        from error_check where is_invalid 

    ) n
    WHERE s1.i < 10000 AND s1.bs @> ARRAY[0]
)
-- 输出最终解:将数组转为9x9格式的数独字符串
SELECT s.id, cp.pz, array_to_string(
    array(SELECT CASE WHEN pos % 9 = 1 AND pos > 1 THEN E'\n' ELSE '' END || value::text 
          FROM unnest(s.bs) WITH ORDINALITY AS elem(value, pos)),
    ''
) AS result
FROM s1 s 
JOIN cp ON s.id = cp.id
WHERE NOT s.bs @> ARRAY[0]
;

the output

postgres=# \i 1230/0112en.txt
 id |                                        pz                                         |  result
----+-----------------------------------------------------------------------------------+-----------
  3 | 800000000003600000070090200050007000000045700000100030001000068008500010090000400 | 812753649+
    |                                                                                   | 943682175+
    |                                                                                   | 675491283+
    |                                                                                   | 154237896+
    |                                                                                   | 369845721+
    |                                                                                   | 287169534+
    |                                                                                   | 521974368+
    |                                                                                   | 438526917+
    |                                                                                   | 796318452
(1 row)

Time: 135.542 ms
D .read 1230/0112ducken.txt
┌───────┬───────────────────────────────────────────────────────┬──────────────────────────────────────────────────────┐
│  id   │                          pz                           │                        result                        │
│ int32 │                        varchar                        │                       varchar                        │
├───────┼───────────────────────────────────────────────────────┼──────────────────────────────────────────────────────┤
│     3 │ 80000000000360000007009020005000700000004570000010003 │ 812753649\n943682175\n675491283\n154237896\n36984572 │
│       │ 0001000068008500010090000400                          │ 1\n287169534\n521974368\n438526917\n796318452        │
└───────┴───────────────────────────────────────────────────────┴──────────────────────────────────────────────────────┘
Run Time (s): real 11.169 user 24.808297 sys 10.038441
D set threads=1;
Run Time (s): real 0.007 user 0.011898 sys 0.000619
D .read 1230/0112ducken.txt
┌───────┬───────────────────────────────────────────────────────┬──────────────────────────────────────────────────────┐
│  id   │                          pz                           │                        result                        │
│ int32 │                        varchar                        │                       varchar                        │
├───────┼───────────────────────────────────────────────────────┼──────────────────────────────────────────────────────┤
│     3 │ 80000000000360000007009020005000700000004570000010003 │ 812753649\n943682175\n675491283\n154237896\n36984572 │
│       │ 0001000068008500010090000400                          │ 1\n287169534\n521974368\n438526917\n796318452        │
└───────┴───────────────────────────────────────────────────────┴──────────────────────────────────────────────────────┘
Run Time (s): real 7.429 user 6.501462 sys 0.397789

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions