From 99d5596e77c0e850a66f991ac9cf520afa9fe7e5 Mon Sep 17 00:00:00 2001 From: Mario Zimmermann Date: Sun, 19 Apr 2026 19:18:45 +0200 Subject: [PATCH 1/4] prepare halide and exposure module --- .ci/Brewfile | 1 + DefineOptions.cmake | 1 + data/halide/CMakeLists.txt | 49 +++++++ data/halide/generators/exposure_generator.cpp | 109 ++++++++++++++ src/CMakeLists.txt | 9 ++ src/common/darktable.c | 6 + src/develop/pixelpipe_hb.c | 27 ++++ src/develop/pixelpipe_hb.h | 10 ++ src/iop/CMakeLists.txt | 9 ++ src/iop/exposure.c | 136 +++++++++++++++++- 10 files changed, 356 insertions(+), 1 deletion(-) create mode 100644 data/halide/CMakeLists.txt create mode 100644 data/halide/generators/exposure_generator.cpp diff --git a/.ci/Brewfile b/.ci/Brewfile index 55d6941cac48..07accba836bc 100644 --- a/.ci/Brewfile +++ b/.ci/Brewfile @@ -29,6 +29,7 @@ brew 'gtk+3' brew 'gtk-mac-integration' brew 'icu4c' brew 'graphicsmagick' +brew 'halide' brew 'imagemagick' brew 'intltool' brew 'iso-codes' diff --git a/DefineOptions.cmake b/DefineOptions.cmake index 7d2ea4a354a2..92b6c14f2cfb 100644 --- a/DefineOptions.cmake +++ b/DefineOptions.cmake @@ -8,6 +8,7 @@ option(USE_LIBSECRET "Build libsecret password storage back-end" ON) option(USE_UNITY "Use libunity to report progress in the launcher" OFF) option(USE_OPENMP "Use OpenMP threading support." ON) option(USE_OPENCL "Use OpenCL support." ON) +option(USE_HALIDE "Use Halide for optimized image processing." OFF) option(USE_GRAPHICSMAGICK "Use GraphicsMagick library for image import." ON) option(USE_IMAGEMAGICK "Use ImageMagick library for image import." 
OFF) option(USE_DARKTABLE_PROFILING OFF) diff --git a/data/halide/CMakeLists.txt b/data/halide/CMakeLists.txt new file mode 100644 index 000000000000..f8e667afc6db --- /dev/null +++ b/data/halide/CMakeLists.txt @@ -0,0 +1,49 @@ +# Halide-based image processing generators for darktable +# +# This builds Halide generator executables at compile time, then runs them +# to produce optimized static libraries (.h + .a) for each target platform. +# The generated code is linked into individual IOP plugin modules. + +find_package(Halide REQUIRED) + +# Build the generator executable (runs on the host at build time) +add_halide_generator(dt_halide_generators + SOURCES + generators/exposure_generator.cpp +) + +# Generate CPU-optimized code for the host platform +# "host" auto-detects arch + OS + SIMD features (NEON on ARM64, AVX on x86, etc.) +add_halide_library(halide_exposure + FROM dt_halide_generators + GENERATOR exposure + TARGETS host + FUNCTION_NAME dt_halide_exposure +) + +# Generate Metal GPU code for Apple Silicon (macOS only) +# Uses the "flat" 2D generator to avoid stride issues with Metal's buffer handling. +# The flat generator treats each row as width*4 contiguous floats. +if(APPLE) + add_halide_library(halide_exposure_metal + FROM dt_halide_generators + GENERATOR exposure_flat + TARGETS host-metal + FUNCTION_NAME dt_halide_exposure_metal + ) +endif() + +# Generate Vulkan GPU code (cross-platform: Linux, Windows, macOS via MoltenVK) +# Uses the same flat 2D generator as Metal. Vulkan loader is dlopen'd at runtime, +# so no link-time dependency — if no Vulkan driver is present, the call simply fails +# and we fall through to CPU Halide. 
+find_package(Vulkan QUIET) +if(Vulkan_FOUND) + add_halide_library(halide_exposure_vulkan + FROM dt_halide_generators + GENERATOR exposure_flat + TARGETS host-vulkan + FUNCTION_NAME dt_halide_exposure_vulkan + ) + set(HALIDE_HAS_VULKAN TRUE PARENT_SCOPE) +endif() diff --git a/data/halide/generators/exposure_generator.cpp b/data/halide/generators/exposure_generator.cpp new file mode 100644 index 000000000000..9939749aac84 --- /dev/null +++ b/data/halide/generators/exposure_generator.cpp @@ -0,0 +1,109 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. + + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with darktable. If not, see . +*/ + +/* + * Halide generator for darktable's exposure module. + * + * Algorithm: out[c] = (in[c] - black) * scale for c = R,G,B + * out[alpha] = in[alpha] (passthrough) + * + * darktable pixel layout: interleaved float RGBA, 4 channels per pixel. + * Memory layout: [R0,G0,B0,A0, R1,G1,B1,A1, ...] + * stride: c=1, x=4, y=4*width + * + * Two generators: + * "exposure" — 3D (c,x,y) for CPU with SIMD vectorization + * "exposure_flat" — 2D (xi,y) for Metal GPU, where xi = x*4+c + * This avoids stride issues with Metal's buffer handling. 
+ */ + +#include "Halide.h" + +using namespace Halide; + +// CPU generator: 3D (c, x, y) with interleaved layout +class ExposureGenerator : public Generator { +public: + Input> input{"input"}; + Input black{"black"}; + Input scale{"scale"}; + Output> output{"output"}; + + void generate() { + Var c("c"), x("x"), y("y"); + + Expr val = input(c, x, y); + Expr adjusted = (val - black) * scale; + output(c, x, y) = select(c < 3, adjusted, val); + } + + void schedule() { + Var c = output.args()[0]; + Var x = output.args()[1]; + Var y = output.args()[2]; + + output.bound(c, 0, 4); + input.dim(0).set_bounds(0, 4); + + // CPU schedule: parallel over rows, vectorize over x + output.reorder(c, x, y); + output.vectorize(c, 4); + + Var xo("xo"), xi("xi"); + output.split(x, xo, xi, 8); + output.vectorize(xi); + output.parallel(y); + } +}; + +// GPU generator: 2D flat buffer (xi, y) where xi covers width*4 floats per row. +// Every 4th element (xi % 4 == 3) is alpha and passed through unchanged. +class ExposureFlatGenerator : public Generator { +public: + Input> input{"input"}; + Input black{"black"}; + Input scale{"scale"}; + Output> output{"output"}; + + void generate() { + Var xi("xi"), y("y"); + + Expr val = input(xi, y); + Expr adjusted = (val - black) * scale; + // Alpha passthrough: every 4th element (index 3, 7, 11, ...) 
is alpha + output(xi, y) = select((xi % 4) != 3, adjusted, val); + } + + void schedule() { + Var xi = output.args()[0]; + Var y = output.args()[1]; + + if(get_target().has_gpu_feature()) { + Var xio("xio"), xii("xii"), yo("yo"), yi("yi"); + // Tile: 64 floats wide (16 pixels * 4 channels) x 16 rows + output.gpu_tile(xi, y, xio, yo, 64, 16); + } else { + // Fallback CPU schedule (shouldn't normally be used) + output.vectorize(xi, 16); + output.parallel(y); + } + } +}; + +HALIDE_REGISTER_GENERATOR(ExposureGenerator, exposure) +HALIDE_REGISTER_GENERATOR(ExposureFlatGenerator, exposure_flat) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e2d15b5b16cc..37b35a0fc652 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1082,6 +1082,15 @@ add_subdirectory(views ${DARKTABLE_LIBDIR}/views) # Compile lighttable modules add_subdirectory(libs ${DARKTABLE_LIBDIR}/plugins/lighttable) +# Compile Halide generators (must be before iop so targets are available) +if(USE_HALIDE) + add_subdirectory(${CMAKE_SOURCE_DIR}/data/halide ${CMAKE_BINARY_DIR}/halide) + add_definitions("-DHAVE_HALIDE") + if(HALIDE_HAS_VULKAN) + add_definitions("-DHAVE_HALIDE_VULKAN") + endif() +endif() + # Compile darkroom modules add_subdirectory(iop ${DARKTABLE_LIBDIR}/plugins) diff --git a/src/common/darktable.c b/src/common/darktable.c index 12fba89b069b..1133ef5057ac 100644 --- a/src/common/darktable.c +++ b/src/common/darktable.c @@ -835,6 +835,12 @@ char *version = g_strdup_printf( " OpenMP -> DISABLED\n" #endif +#ifdef HAVE_HALIDE + " Halide -> ENABLED\n" +#else + " Halide -> DISABLED\n" +#endif + #ifdef HAVE_OPENCL " OpenCL -> ENABLED\n" #else diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c index a8ebe3df1b02..b4d9e149b829 100644 --- a/src/develop/pixelpipe_hb.c +++ b/src/develop/pixelpipe_hb.c @@ -457,6 +457,13 @@ void dt_dev_pixelpipe_create_nodes(dt_dev_pixelpipe_t *pipe, g_direct_equal, NULL, dt_free_align_ptr); memset(&piece->processed_roi_in, 0, 
sizeof(piece->processed_roi_in)); memset(&piece->processed_roi_out, 0, sizeof(piece->processed_roi_out)); +#ifdef HAVE_HALIDE + // Cache the Halide process function pointer (if the module exports one) + piece->process_halide = NULL; + if(module->so->module) + g_module_symbol(module->so->module, "process_halide", + (gpointer *)&piece->process_halide); +#endif dt_iop_init_pipe(piece->module, pipe, piece); pipe->nodes = g_list_append(pipe->nodes, piece); } @@ -1444,6 +1451,19 @@ static gboolean _pixelpipe_process_on_CPU(dt_dev_pixelpipe_t *pipe, } else { +#ifdef HAVE_HALIDE + gboolean halide_ok = FALSE; + if(piece->process_halide) + { + halide_ok = (piece->process_halide(module, piece, input, *output, roi_in, roi_out) == 0); + if(halide_ok) + { + dt_print_pipe(DT_DEBUG_PIPE, "processed via Halide", + pipe, module, DT_DEVICE_CPU, roi_in, roi_out, ""); + } + } + if(!halide_ok) +#endif module->process(module, piece, input, *output, roi_in, roi_out); if(relevant) { @@ -2083,6 +2103,13 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe, } } +#ifdef HAVE_HALIDE + // If this module has a Halide CPU implementation, prefer it over OpenCL. + // This will be replaced by Halide Metal/GPU dispatch in the future. 
+ if(possible_cl && piece->process_halide) + possible_cl = FALSE; +#endif + if(possible_cl) { const int cst_from = input_cst_cl; diff --git a/src/develop/pixelpipe_hb.h b/src/develop/pixelpipe_hb.h index fcb8157c62e2..7eabe590a40e 100644 --- a/src/develop/pixelpipe_hb.h +++ b/src/develop/pixelpipe_hb.h @@ -63,6 +63,16 @@ typedef struct dt_dev_pixelpipe_iop_t uint8_t xtrans[6][6]; uint32_t filters; GHashTable *raster_masks; + +#ifdef HAVE_HALIDE + // Cached Halide process function pointer (looked up once via g_module_symbol) + int (*process_halide)(struct dt_iop_module_t *self, + struct dt_dev_pixelpipe_iop_t *piece, + const void *const i, + void *const o, + const struct dt_iop_roi_t *const roi_in, + const struct dt_iop_roi_t *const roi_out); +#endif } dt_dev_pixelpipe_iop_t; typedef enum dt_dev_pixelpipe_change_t diff --git a/src/iop/CMakeLists.txt b/src/iop/CMakeLists.txt index 92a69c1037f9..6777052a8a26 100644 --- a/src/iop/CMakeLists.txt +++ b/src/iop/CMakeLists.txt @@ -78,6 +78,15 @@ add_iop(temperature "temperature.c" DEFAULT_VISIBLE) add_iop(colorcorrection "colorcorrection.c") add_iop(basicadj "basicadj.c") add_iop(exposure "exposure.c" DEFAULT_VISIBLE) +if(USE_HALIDE) + target_link_libraries(exposure halide_exposure) + if(APPLE) + target_link_libraries(exposure halide_exposure_metal) + endif() + if(HALIDE_HAS_VULKAN) + target_link_libraries(exposure halide_exposure_vulkan) + endif() +endif() add_iop(equalizer "equalizer.c") add_iop(rgbcurve "rgbcurve.c") add_iop(colorbalance "colorbalance.c" DEFAULT_VISIBLE) diff --git a/src/iop/exposure.c b/src/iop/exposure.c index 5276cb91228f..00f78ac385b7 100644 --- a/src/iop/exposure.c +++ b/src/iop/exposure.c @@ -1,6 +1,6 @@ /* This file is part of darktable, - Copyright (C) 2009-2024 darktable developers. + Copyright (C) 2009-2026 darktable developers. 
darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -508,6 +508,140 @@ static void _process_common_setup(dt_iop_module_t *self, d->scale = 1.0 / (white - d->black); } +#ifdef HAVE_HALIDE +#include "HalideRuntime.h" +#include "halide_exposure.h" +#ifdef HAVE_HALIDE_VULKAN +#include "halide_exposure_vulkan.h" +#endif +#ifdef __APPLE__ +#include "halide_exposure_metal.h" +#endif + +// Try a Halide GPU backend using flat 2D buffers. +// Returns 0 on success, non-zero on failure. +static int _halide_try_gpu_flat(const void *const i, void *const o, + int width, int height, float black, float scale, + int (*gpu_func)(struct halide_buffer_t *, + float, float, + struct halide_buffer_t *)) +{ + const int row_floats = width * 4; + halide_dimension_t flat_in_dims[2] = { + {0, row_floats, 1}, + {0, height, row_floats}, + }; + halide_dimension_t flat_out_dims[2] = { + {0, row_floats, 1}, + {0, height, row_floats}, + }; + + halide_buffer_t flat_in = {0}; + flat_in.host = (uint8_t *)i; + flat_in.type = (struct halide_type_t){halide_type_float, 32, 1}; + flat_in.dimensions = 2; + flat_in.dim = flat_in_dims; + flat_in.flags = halide_buffer_flag_host_dirty; + + halide_buffer_t flat_out = {0}; + flat_out.host = (uint8_t *)o; + flat_out.type = (struct halide_type_t){halide_type_float, 32, 1}; + flat_out.dimensions = 2; + flat_out.dim = flat_out_dims; + + int err = gpu_func(&flat_in, black, scale, &flat_out); + if(!err) + err = halide_copy_to_host(NULL, &flat_out); + + halide_device_free(NULL, &flat_in); + halide_device_free(NULL, &flat_out); + return err; +} + +__attribute__((visibility("default"))) +int process_halide(dt_iop_module_t *self, + dt_dev_pixelpipe_iop_t *piece, + const void *const i, + void *const o, + const dt_iop_roi_t *const roi_in, + const dt_iop_roi_t *const roi_out) +{ + const dt_iop_exposure_data_t *const d = piece->data; + + _process_common_setup(self, piece); + + const int width 
= roi_out->width; + const int height = roi_out->height; + const float black = d->black; + const float scale = d->scale; + + int err = -1; + + // GPU backends: try Metal (macOS) then Vulkan (cross-platform) +#ifdef __APPLE__ + err = _halide_try_gpu_flat(i, o, width, height, black, scale, dt_halide_exposure_metal); + if(!err) + dt_print(DT_DEBUG_PIPE, "[exposure] Halide backend: Metal GPU"); + else + dt_print(DT_DEBUG_PIPE, "[exposure] Halide Metal failed (%d), trying Vulkan", err); +#endif + +#ifdef HAVE_HALIDE_VULKAN + if(err) + { + err = _halide_try_gpu_flat(i, o, width, height, black, scale, dt_halide_exposure_vulkan); + if(!err) + dt_print(DT_DEBUG_PIPE, "[exposure] Halide backend: Vulkan GPU"); + else + dt_print(DT_DEBUG_PIPE, "[exposure] Halide Vulkan failed (%d), trying CPU", err); + } +#endif + + // CPU fallback: uses 3D interleaved buffer layout + if(err) + { + halide_dimension_t in_dims[3] = { + {0, 4, 1}, + {0, width, 4}, + {0, height, 4 * width}, + }; + halide_dimension_t out_dims[3] = { + {0, 4, 1}, + {0, width, 4}, + {0, height, 4 * width}, + }; + + halide_buffer_t in_buf = {0}; + in_buf.host = (uint8_t *)i; + in_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + in_buf.dimensions = 3; + in_buf.dim = in_dims; + + halide_buffer_t out_buf = {0}; + out_buf.host = (uint8_t *)o; + out_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + out_buf.dimensions = 3; + out_buf.dim = out_dims; + + err = dt_halide_exposure(&in_buf, black, scale, &out_buf); + if(!err) + dt_print(DT_DEBUG_PIPE, "[exposure] Halide backend: CPU"); + } + + if(err) + { + dt_print(DT_DEBUG_ALWAYS, + "[exposure] Halide processing failed with error %d, falling back", err); + return err; + } + + for(int k = 0; k < 3; k++) + piece->pipe->dsc.processed_maximum[k] *= d->scale; + + return 0; +} +#endif /* HAVE_HALIDE */ + #ifdef HAVE_OPENCL int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, From f87e68850db1bf7c227218d74c2b8887ecdf4a44 Mon Sep 17 00:00:00 
2001 From: Mario Zimmermann Date: Mon, 20 Apr 2026 19:32:08 +0200 Subject: [PATCH 2/4] convert diffuse/sharpen module to halide --- data/halide/CMakeLists.txt | 61 +- data/halide/generators/bspline_generator.cpp | 131 +++++ .../generators/diffuse_pde_generator.cpp | 297 ++++++++++ src/iop/CMakeLists.txt | 9 + src/iop/diffuse.c | 540 ++++++++++++++++++ 5 files changed, 1037 insertions(+), 1 deletion(-) create mode 100644 data/halide/generators/bspline_generator.cpp create mode 100644 data/halide/generators/diffuse_pde_generator.cpp diff --git a/data/halide/CMakeLists.txt b/data/halide/CMakeLists.txt index f8e667afc6db..c4470ad1ac85 100644 --- a/data/halide/CMakeLists.txt +++ b/data/halide/CMakeLists.txt @@ -6,12 +6,22 @@ find_package(Halide REQUIRED) -# Build the generator executable (runs on the host at build time) +# Build the generator executables (run on the host at build time) add_halide_generator(dt_halide_generators SOURCES generators/exposure_generator.cpp ) +add_halide_generator(dt_halide_bspline_generators + SOURCES + generators/bspline_generator.cpp +) + +add_halide_generator(dt_halide_diffuse_generators + SOURCES + generators/diffuse_pde_generator.cpp +) + # Generate CPU-optimized code for the host platform # "host" auto-detects arch + OS + SIMD features (NEON on ARM64, AVX on x86, etc.) add_halide_library(halide_exposure @@ -37,6 +47,43 @@ endif() # Uses the same flat 2D generator as Metal. Vulkan loader is dlopen'd at runtime, # so no link-time dependency — if no Vulkan driver is present, the call simply fails # and we fall through to CPU Halide. 
+# B-spline wavelet decomposition for diffuse/sharpen module +# CPU target: separable 5-tap blur + detail extraction in one pass +add_halide_library(halide_bspline_decompose + FROM dt_halide_bspline_generators + GENERATOR bspline_decompose + TARGETS host + FUNCTION_NAME dt_halide_bspline_decompose +) + +# Metal GPU target for B-spline decomposition (macOS only) +if(APPLE) + add_halide_library(halide_bspline_decompose_metal + FROM dt_halide_bspline_generators + GENERATOR bspline_decompose + TARGETS host-metal + FUNCTION_NAME dt_halide_bspline_decompose_metal + ) +endif() + +# Heat PDE diffusion kernel for diffuse/sharpen module +add_halide_library(halide_diffuse_pde + FROM dt_halide_diffuse_generators + GENERATOR diffuse_pde + TARGETS host + FUNCTION_NAME dt_halide_diffuse_pde +) + +# Metal GPU target for PDE diffusion (macOS only) +if(APPLE) + add_halide_library(halide_diffuse_pde_metal + FROM dt_halide_diffuse_generators + GENERATOR diffuse_pde + TARGETS host-metal + FUNCTION_NAME dt_halide_diffuse_pde_metal + ) +endif() + find_package(Vulkan QUIET) if(Vulkan_FOUND) add_halide_library(halide_exposure_vulkan @@ -45,5 +92,17 @@ if(Vulkan_FOUND) TARGETS host-vulkan FUNCTION_NAME dt_halide_exposure_vulkan ) + add_halide_library(halide_bspline_decompose_vulkan + FROM dt_halide_bspline_generators + GENERATOR bspline_decompose + TARGETS host-vulkan + FUNCTION_NAME dt_halide_bspline_decompose_vulkan + ) + add_halide_library(halide_diffuse_pde_vulkan + FROM dt_halide_diffuse_generators + GENERATOR diffuse_pde + TARGETS host-vulkan + FUNCTION_NAME dt_halide_diffuse_pde_vulkan + ) set(HALIDE_HAS_VULKAN TRUE PARENT_SCOPE) endif() diff --git a/data/halide/generators/bspline_generator.cpp b/data/halide/generators/bspline_generator.cpp new file mode 100644 index 000000000000..d629dd9a3977 --- /dev/null +++ b/data/halide/generators/bspline_generator.cpp @@ -0,0 +1,131 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. 
+ + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with darktable. If not, see . +*/ + +/* + * Halide generator for B-spline wavelet decomposition used by + * darktable's diffuse/sharpen module. + * + * Algorithm: separable 5-tap [1,4,6,4,1]/16 blur with strided access + * (à-trous wavelet transform). Produces both the low-frequency (blurred) + * and high-frequency (detail = input - blurred) components. + * + * darktable pixel layout: interleaved float RGBA, 4 channels per pixel. + * We use the "flat" 2D approach (xi = x*4+c, y) that works on both + * CPU and GPU, matching the exposure_flat pattern. + * + * The stride parameter "mult" controls the spacing between filter taps: + * mult = 1 << scale (scale 0 -> mult=1, scale 1 -> mult=2, etc.) + * + * Boundary handling: clamp to edge (matching darktable's MIN/MAX clamping). + * + * Negative values in the blur output are clipped to 0 (matching darktable's + * clip_negatives=TRUE in decompose_2D_Bspline). + */ + +#include "Halide.h" + +using namespace Halide; + +// Flat 2D layout (xi, y), xi = width * 4. +// Separable filter: vertical pass then horizontal pass. +// mult is the tap spacing in pixels (= 1 << scale). +// width_pixels is the image width in pixels (not floats). 
+class BsplineDecomposeGenerator : public Generator { +public: + Input> input{"input"}; // (xi, y) where xi = width*4 + Input width_pixels{"width_pixels"}; // image width in pixels + Input height{"height"}; // image height in pixels + Input mult{"mult"}; // tap spacing = 1 << scale + Output> lf{"lf"}; // low-frequency (blurred) + Output> hf{"hf"}; // high-frequency (detail) + + // Intermediate stage accessible to schedule() + Func vert{"vert"}; + + void generate() { + Var xi("xi"), y("y"); + + // B-spline filter weights + const float w0 = 1.0f / 16.0f; + const float w1 = 4.0f / 16.0f; + const float w2 = 6.0f / 16.0f; + + // Extract pixel x and channel from flat index + Expr px = xi / 4; + Expr ch = xi % 4; + + // Vertical pass: blur along y with spacing mult + // Clamp row indices to [0, height-1] + Expr r0 = clamp(y - 2 * mult, 0, height - 1); + Expr r1 = clamp(y - mult, 0, height - 1); + Expr r2 = y; + Expr r3 = clamp(y + mult, 0, height - 1); + Expr r4 = clamp(y + 2 * mult, 0, height - 1); + + vert(xi, y) = w0 * input(xi, r0) + + w1 * input(xi, r1) + + w2 * input(xi, r2) + + w1 * input(xi, r3) + + w0 * input(xi, r4); + + // Horizontal pass: blur along x with spacing mult on vertical result + // Clamp at pixel boundaries, preserving channel offset + Expr p0 = clamp(px - 2 * mult, 0, width_pixels - 1) * 4 + ch; + Expr p1 = clamp(px - mult, 0, width_pixels - 1) * 4 + ch; + Expr p2 = xi; + Expr p3 = clamp(px + mult, 0, width_pixels - 1) * 4 + ch; + Expr p4 = clamp(px + 2 * mult, 0, width_pixels - 1) * 4 + ch; + + Expr blur = w0 * vert(p0, y) + + w1 * vert(p1, y) + + w2 * vert(p2, y) + + w1 * vert(p3, y) + + w0 * vert(p4, y); + + // Clip negatives (matching darktable behavior) + lf(xi, y) = max(blur, 0.0f); + hf(xi, y) = input(xi, y) - lf(xi, y); + } + + void schedule() { + Var xi = lf.args()[0]; + Var y = lf.args()[1]; + + if(get_target().has_gpu_feature()) { + // GPU schedule: compute vert inline (fused into the horizontal pass) + // then tile the outputs + Var 
xio("xio"), yo("yo"); + vert.compute_at(lf, yo); + lf.gpu_tile(xi, y, xio, yo, 64, 16); + hf.gpu_tile(xi, y, xio, yo, 64, 16); + } else { + // CPU schedule: compute vert per row strip for cache locality, + // vectorize, parallelize over rows + Var xio("xio"), xii("xii"); + + // Compute vertical pass per output row (store full row) + vert.compute_at(lf, y).vectorize(xi, 32); + + lf.split(xi, xio, xii, 32).vectorize(xii).parallel(y); + // hf reads lf, so compute hf after lf at the same row + hf.split(xi, xio, xii, 32).vectorize(xii).parallel(y); + } + } +}; + +HALIDE_REGISTER_GENERATOR(BsplineDecomposeGenerator, bspline_decompose) diff --git a/data/halide/generators/diffuse_pde_generator.cpp b/data/halide/generators/diffuse_pde_generator.cpp new file mode 100644 index 000000000000..b55488d85c72 --- /dev/null +++ b/data/halide/generators/diffuse_pde_generator.cpp @@ -0,0 +1,297 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. + + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with darktable. If not, see . +*/ + +/* + * Halide generator for the heat PDE diffusion kernel used by + * darktable's diffuse/sharpen module. + * + * This implements the anisotropic PDE solver from: + * https://www.researchgate.net/publication/220663968 + * modified for multi-scale wavelet framework. + * + * Per pixel, the algorithm: + * 1. Fetches 3x3 stencil from HF and LF buffers (strided by mult) + * 2. 
Computes gradients on LF and HF (centered finite differences) + * 3. For 4 derivative orders, computes anisotropic diffusion kernels: + * - c2 = exp(-magnitude * anisotropy) + * - rotation matrix based on isotropy mode (isotrope/isophote/gradient) + * - builds 3x3 convolution kernel from rotation matrix + * 4. Convolves kernels with LF (orders 0,1) and HF (orders 2,3) + * 5. Regularizes variance, accumulates, produces output + * + * Uses flat 2D buffer layout (xi = width*4, y) for both CPU and GPU. + * Channel index c = xi % 4, pixel x = xi / 4. + * + * The isotropy_type for each of the 4 orders is passed as int parameters: + * 0 = ISOTROPE, 1 = ISOPHOTE, 2 = GRADIENT + */ + +#include "Halide.h" + +using namespace Halide; + +class DiffusePDEGenerator : public Generator { +public: + // Input buffers: flat 2D (xi, y) where xi = width * 4 + Input> HF{"HF"}; // high-frequency (wavelet detail) + Input> LF{"LF"}; // low-frequency (wavelet blur) + Input> mask_buf{"mask_buf"}; // per-pixel mask (width, height) + + // Scalar parameters + Input width_pixels{"width_pixels"}; + Input height{"height"}; + Input mult{"mult"}; // stencil stride = 1 << scale + Input has_mask{"has_mask"}; // boolean: use mask? 
+ + // Anisotropy factors (4 orders): already squared user params + Input anisotropy_0{"anisotropy_0"}; + Input anisotropy_1{"anisotropy_1"}; + Input anisotropy_2{"anisotropy_2"}; + Input anisotropy_3{"anisotropy_3"}; + + // Isotropy types (4 orders): 0=ISOTROPE, 1=ISOPHOTE, 2=GRADIENT + Input isotropy_0{"isotropy_0"}; + Input isotropy_1{"isotropy_1"}; + Input isotropy_2{"isotropy_2"}; + Input isotropy_3{"isotropy_3"}; + + // ABCD coefficients (diffusion weights per order) + Input A{"A"}; + Input B{"B"}; + Input C{"C"}; + Input D{"D"}; + + // Other parameters + Input strength{"strength"}; + Input regularization_factor{"regularization_factor"}; + Input variance_threshold{"variance_threshold"}; + + Output> output{"output"}; + + // Intermediate Funcs for schedule access + Func neighbour_HF{"neighbour_HF"}, neighbour_LF{"neighbour_LF"}; + + void generate() { + Var xi("xi"), y("y"); + + // Channel and pixel coordinates + Expr ch = xi % 4; + Expr px = xi / 4; + + // For 2D buffer access we need (xi_coord, y_coord) + // Helper: clamped flat xi for a given pixel offset + auto clamped_xi = [&](Expr pixel_x) -> Expr { + return clamp(pixel_x, 0, width_pixels - 1) * 4 + ch; + }; + auto clamped_y = [&](Expr pixel_y) -> Expr { + return clamp(pixel_y, 0, height - 1); + }; + + // Fetch 3x3 stencil neighbors + // Stencil offsets: -mult, 0, +mult in pixel space + + // Actually, let's just inline the 9 neighbor accesses directly. 
+ // For each of the 9 positions (ii,jj) where offset = (ii-1)*mult, (jj-1)*mult: + + // Fetch all 9 neighbors for HF and LF + // Layout: [0]=(-1,-1), [1]=(0,-1), [2]=(+1,-1), + // [3]=(-1, 0), [4]=(0, 0), [5]=(+1, 0), + // [6]=(-1,+1), [7]=(0,+1), [8]=(+1,+1) + // But in darktable convention: first index is row (x=vertical), second is col (y=horizontal) + // i_neighbours maps: 0 -> row-mult, 1 -> row, 2 -> row+mult + // j_neighbours maps: 0 -> col-mult, 1 -> col, 2 -> col+mult + // pixel[3*ii+jj] where ii is row index, jj is col index + // So pixel[0] = (row-mult, col-mult), pixel[1] = (row-mult, col), etc. + // NOTE: in darktable, "x is vertical, y is horizontal" + // row = y in our coords, col = px in our coords + // i_neighbours = row offsets, j_neighbours = col offsets + + // darktable: i = row (our y), j = col (our px) + // i_neighbours[0] = row - mult, [1] = row, [2] = row + mult + // j_neighbours[0] = col - mult, [1] = col, [2] = col + mult + // neighbor[3*ii+jj] accesses i_neighbours[ii] + j_neighbours[jj] + // So ii indexes row, jj indexes col + + // fetch_rc(row_offset, col_offset) in pixel-space offsets + auto fetch_rc = [&](const Input> &buf, + int row_off, int col_off) -> Expr { + Expr nx = clamped_xi(px + col_off * mult); + Expr ny = clamped_y(y + row_off * mult); + return buf(nx, ny); + }; + + // n[k] for k=0..8, k = 3*ii+jj, ii=row_idx(0,1,2), jj=col_idx(0,1,2) + // row_off = ii-1, col_off = jj-1 + Expr hf[9], lf[9]; + for(int ii = 0; ii < 3; ii++) { + for(int jj = 0; jj < 3; jj++) { + hf[3*ii+jj] = fetch_rc(HF, ii-1, jj-1); + lf[3*ii+jj] = fetch_rc(LF, ii-1, jj-1); + } + } + + // Gradients (centered finite differences) + // find_gradients: xy[0] = (pixels[7] - pixels[1]) / 2 (x = vertical = row direction) + // xy[1] = (pixels[5] - pixels[3]) / 2 (y = horizontal = col direction) + Expr grad_lf_x = (lf[7] - lf[1]) / 2.0f; + Expr grad_lf_y = (lf[5] - lf[3]) / 2.0f; + Expr grad_hf_x = (hf[7] - hf[1]) / 2.0f; + Expr grad_hf_y = (hf[5] - hf[3]) / 2.0f; 
+ + // Magnitude and normalized direction for LF gradient + Expr mag_grad = sqrt(grad_lf_x * grad_lf_x + grad_lf_y * grad_lf_y); + Expr cos_grad = select(mag_grad != 0.0f, grad_lf_x / mag_grad, 1.0f); + Expr sin_grad = select(mag_grad != 0.0f, grad_lf_y / mag_grad, 0.0f); + Expr cos2_grad = cos_grad * cos_grad; + Expr sin2_grad = sin_grad * sin_grad; + Expr cossin_grad = cos_grad * sin_grad; + + // Magnitude and normalized direction for HF gradient (called "laplacian" in dt) + Expr mag_lapl = sqrt(grad_hf_x * grad_hf_x + grad_hf_y * grad_hf_y); + Expr cos_lapl = select(mag_lapl != 0.0f, grad_hf_x / mag_lapl, 1.0f); + Expr sin_lapl = select(mag_lapl != 0.0f, grad_hf_y / mag_lapl, 0.0f); + Expr cos2_lapl = cos_lapl * cos_lapl; + Expr sin2_lapl = sin_lapl * sin_lapl; + Expr cossin_lapl = cos_lapl * sin_lapl; + + // c2 = exp(-magnitude * anisotropy) for each of 4 orders + // darktable uses a fast approximate exp via integer bit tricks. + // We use Halide's fast_exp which is similarly approximate. + // Orders 0,2 use grad magnitude; orders 1,3 use lapl magnitude. + Expr c2_0 = fast_exp(-mag_grad * anisotropy_0); + Expr c2_1 = fast_exp(-mag_lapl * anisotropy_1); + Expr c2_2 = fast_exp(-mag_grad * anisotropy_2); + Expr c2_3 = fast_exp(-mag_lapl * anisotropy_3); + + // Build 3x3 convolution kernels for each order based on isotropy type. + // kernel[k] for k=0..8 is the 3x3 stencil weight. 
+ // + // For ISOTROPE: kernel = [0.25, 0.5, 0.25, 0.5, -3, 0.5, 0.25, 0.5, 0.25] + // + // For ISOPHOTE with c2, cos2, sin2, cossin: + // a[0][0] = cos2 + c2*sin2, a[1][1] = c2*cos2 + sin2 + // a[0][1] = a[1][0] = (c2 - 1) * cossin + // Then build_matrix: + // b11 = a01/2, b13 = -b11, b22 = -2*(a00+a11) + // kernel = [b11, a11, b13, a00, b22, a00, b13, a11, b11] + // + // For GRADIENT: same as isophote but swapped: + // a[0][0] = c2*cos2 + sin2, a[1][1] = cos2 + c2*sin2 + // a[0][1] = a[1][0] = (1 - c2) * cossin + + // Helper: build kernel from isotropy_type, c2, cos2, sin2, cossin + // Returns 9 Exprs for the kernel weights + auto build_kern = [](Expr iso_type, Expr c2_val, + Expr cos2, Expr sin2, Expr cossin, + Expr kern[9]) { + // Isophote rotation matrix elements + Expr iso_a00 = cos2 + c2_val * sin2; + Expr iso_a11 = c2_val * cos2 + sin2; + Expr iso_a01 = (c2_val - 1.0f) * cossin; + + // Gradient rotation matrix elements (swapped) + Expr grd_a00 = c2_val * cos2 + sin2; + Expr grd_a11 = cos2 + c2_val * sin2; + Expr grd_a01 = (1.0f - c2_val) * cossin; + + // Select based on isotropy type + Expr a00 = select(iso_type == 0, 0.0f, // isotrope: doesn't use matrix + iso_type == 1, iso_a00, + grd_a00); + Expr a11 = select(iso_type == 0, 0.0f, + iso_type == 1, iso_a11, + grd_a11); + Expr a01 = select(iso_type == 0, 0.0f, + iso_type == 1, iso_a01, + grd_a01); + + // build_matrix + Expr b11 = a01 / 2.0f; + Expr b13 = -b11; + Expr b22 = -2.0f * (a00 + a11); + + // Anisotropic kernel + Expr aniso_kern[9] = {b11, a11, b13, a00, b22, a00, b13, a11, b11}; + + // Isotrope kernel (constant) + float iso_kern[9] = {0.25f, 0.5f, 0.25f, 0.5f, -3.0f, 0.5f, 0.25f, 0.5f, 0.25f}; + + for(int k = 0; k < 9; k++) { + kern[k] = select(iso_type == 0, iso_kern[k], aniso_kern[k]); + } + }; + + // Build kernels for each order + // Orders 0, 2 use grad angles; orders 1, 3 use lapl angles + Expr kern0[9], kern1[9], kern2[9], kern3[9]; + build_kern(isotropy_0, c2_0, cos2_grad, sin2_grad, 
cossin_grad, kern0);
+    build_kern(isotropy_1, c2_1, cos2_lapl, sin2_lapl, cossin_lapl, kern1);
+    build_kern(isotropy_2, c2_2, cos2_grad, sin2_grad, cossin_grad, kern2);
+    build_kern(isotropy_3, c2_3, cos2_lapl, sin2_lapl, cossin_lapl, kern3);
+
+    // Convolve: derivatives[k] = sum_j(kern_k[j] * buffer[j])
+    // Orders 0,1 convolve with LF; orders 2,3 convolve with HF
+    Expr deriv0 = cast<float>(0), deriv1 = cast<float>(0);
+    Expr deriv2 = cast<float>(0), deriv3 = cast<float>(0);
+    Expr variance = cast<float>(0);
+
+    for(int k = 0; k < 9; k++) {
+      deriv0 += kern0[k] * lf[k];
+      deriv1 += kern1[k] * lf[k];
+      deriv2 += kern2[k] * hf[k];
+      deriv3 += kern3[k] * hf[k];
+      variance += hf[k] * hf[k];
+    }
+
+    // Regularize variance
+    Expr var_reg = variance_threshold + variance * regularization_factor;
+
+    // Accumulate
+    Expr acc = deriv0 * A + deriv1 * B + deriv2 * C + deriv3 * D;
+
+    // Final output: max(HF*strength + acc/var_reg + LF, 0)
+    Expr center_hf = hf[4]; // center pixel HF
+    Expr center_lf = lf[4]; // center pixel LF
+    Expr result = center_hf * strength + acc / var_reg + center_lf;
+
+    // Mask handling: if mask is 0, just output HF + LF (passthrough)
+    // mask is per-pixel (width, height), not per-channel
+    Expr mask_val = select(has_mask != 0,
+                           cast<int>(mask_buf(px, y)),
+                           1);
+    output(xi, y) = select(mask_val != 0,
+                           max(result, 0.0f),
+                           center_hf + center_lf);
+  }
+
+  void schedule() {
+    Var xi = output.args()[0];
+    Var y = output.args()[1];
+
+    if(get_target().has_gpu_feature()) {
+      Var xio("xio"), yo("yo");
+      output.gpu_tile(xi, y, xio, yo, 64, 16);
+    } else {
+      // CPU: vectorize across channels/pixels, parallelize rows
+      Var xio("xio"), xii("xii");
+      output.split(xi, xio, xii, 32).vectorize(xii).parallel(y);
+    }
+  }
+};
+
+HALIDE_REGISTER_GENERATOR(DiffusePDEGenerator, diffuse_pde)
diff --git a/src/iop/CMakeLists.txt b/src/iop/CMakeLists.txt
index 6777052a8a26..4bc8d8b001f0 100644
--- a/src/iop/CMakeLists.txt
+++ b/src/iop/CMakeLists.txt
@@ -160,6 +160,15 @@ add_iop(censorize "censorize.c")
add_iop(colorbalancergb "colorbalancergb.c") add_iop(cacorrectrgb "cacorrectrgb.c") add_iop(diffuse "diffuse.c") +if(USE_HALIDE) + target_link_libraries(diffuse halide_bspline_decompose halide_diffuse_pde) + if(APPLE) + target_link_libraries(diffuse halide_bspline_decompose_metal halide_diffuse_pde_metal) + endif() + if(HALIDE_HAS_VULKAN) + target_link_libraries(diffuse halide_bspline_decompose_vulkan halide_diffuse_pde_vulkan) + endif() +endif() add_iop(blurs "blurs.c") add_iop(sigmoid "sigmoid.c") add_iop(agx "agx.c") diff --git a/src/iop/diffuse.c b/src/iop/diffuse.c index 27885e7b966a..69f0cd3d97c7 100644 --- a/src/iop/diffuse.c +++ b/src/iop/diffuse.c @@ -1450,6 +1450,546 @@ void process(dt_iop_module_t *self, if(HF[s]) dt_free_align(HF[s]); } +#ifdef HAVE_HALIDE +#include "HalideRuntime.h" +#include "halide_bspline_decompose.h" +#include "halide_diffuse_pde.h" +#ifdef __APPLE__ +#include "halide_bspline_decompose_metal.h" +#include "halide_diffuse_pde_metal.h" +#endif +#ifdef HAVE_HALIDE_VULKAN +#include "halide_bspline_decompose_vulkan.h" +#include "halide_diffuse_pde_vulkan.h" +#endif + +// Halide-accelerated B-spline decomposition: replaces decompose_2D_Bspline() +// Uses flat 2D buffer layout (xi = width*4, y). +// Returns 0 on success, non-zero on failure. 
+static int _halide_decompose_2D_Bspline(const float *const restrict in, + float *const restrict HF, + float *const restrict LF, + const size_t width, + const size_t height, + const int mult, + const char **backend_out) +{ + const int row_floats = (int)width * 4; + const int w = (int)width; + const int h = (int)height; + + halide_dimension_t in_dims[2] = { + {0, row_floats, 1}, + {0, h, row_floats}, + }; + halide_dimension_t lf_dims[2] = { + {0, row_floats, 1}, + {0, h, row_floats}, + }; + halide_dimension_t hf_dims[2] = { + {0, row_floats, 1}, + {0, h, row_floats}, + }; + + halide_buffer_t in_buf = {0}; + in_buf.host = (uint8_t *)in; + in_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + in_buf.dimensions = 2; + in_buf.dim = in_dims; + + halide_buffer_t lf_buf = {0}; + lf_buf.host = (uint8_t *)LF; + lf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + lf_buf.dimensions = 2; + lf_buf.dim = lf_dims; + + halide_buffer_t hf_buf = {0}; + hf_buf.host = (uint8_t *)HF; + hf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + hf_buf.dimensions = 2; + hf_buf.dim = hf_dims; + + int err = -1; + const char *backend = NULL; + + // Try Metal GPU +#ifdef __APPLE__ + { + halide_buffer_t gpu_in = in_buf; + halide_buffer_t gpu_lf = lf_buf; + halide_buffer_t gpu_hf = hf_buf; + gpu_in.flags = halide_buffer_flag_host_dirty; + + err = dt_halide_bspline_decompose_metal(&gpu_in, w, h, mult, &gpu_lf, &gpu_hf); + if(!err) + { + halide_copy_to_host(NULL, &gpu_lf); + halide_copy_to_host(NULL, &gpu_hf); + backend = "Metal GPU"; + } + else + { + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Metal bspline failed (%d), trying next backend", err); + } + halide_device_free(NULL, &gpu_in); + halide_device_free(NULL, &gpu_lf); + halide_device_free(NULL, &gpu_hf); + } +#endif + + // Try Vulkan GPU +#ifdef HAVE_HALIDE_VULKAN + if(err) + { + halide_buffer_t gpu_in = in_buf; + halide_buffer_t gpu_lf = lf_buf; + halide_buffer_t gpu_hf = hf_buf; + gpu_in.flags = 
halide_buffer_flag_host_dirty; + + err = dt_halide_bspline_decompose_vulkan(&gpu_in, w, h, mult, &gpu_lf, &gpu_hf); + if(!err) + { + halide_copy_to_host(NULL, &gpu_lf); + halide_copy_to_host(NULL, &gpu_hf); + backend = "Vulkan GPU"; + } + else + { + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Vulkan bspline failed (%d), trying CPU", err); + } + halide_device_free(NULL, &gpu_in); + halide_device_free(NULL, &gpu_lf); + halide_device_free(NULL, &gpu_hf); + } +#endif + + // CPU Halide fallback + if(err) + { + err = dt_halide_bspline_decompose(&in_buf, w, h, mult, &lf_buf, &hf_buf); + if(!err) backend = "CPU"; + } + + if(backend_out) *backend_out = backend; + return err; +} + +// Halide-accelerated heat PDE diffusion: replaces heat_PDE_diffusion() +// Returns 0 on success, non-zero on failure. +static int _halide_heat_PDE_diffusion(const float *const restrict high_freq, + const float *const restrict low_freq, + const uint8_t *const restrict mask, + const gboolean has_mask, + float *const restrict output, + const size_t width, + const size_t height, + const dt_aligned_pixel_t anisotropy, + const dt_isotropy_t isotropy_type[4], + const float regularization, + const float variance_threshold, + const float current_radius_square, + const int mult, + const dt_aligned_pixel_t ABCD, + const float strength, + const char **backend_out) +{ + const int row_floats = (int)width * 4; + const int w = (int)width; + const int h = (int)height; + const float regularization_factor = regularization * current_radius_square / 9.f; + + // Setup flat 2D buffers for HF, LF, output + halide_dimension_t flat_dims[2] = { + {0, row_floats, 1}, + {0, h, row_floats}, + }; + + halide_buffer_t hf_buf = {0}; + hf_buf.host = (uint8_t *)high_freq; + hf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + hf_buf.dimensions = 2; + hf_buf.dim = flat_dims; + + halide_dimension_t lf_dims[2] = { + {0, row_floats, 1}, + {0, h, row_floats}, + }; + halide_buffer_t lf_buf = {0}; + lf_buf.host = (uint8_t 
*)low_freq; + lf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + lf_buf.dimensions = 2; + lf_buf.dim = lf_dims; + + // Mask buffer: per-pixel uint8 (width, height) + halide_dimension_t mask_dims[2] = { + {0, w, 1}, + {0, h, w}, + }; + halide_buffer_t mask_b = {0}; + mask_b.host = (uint8_t *)mask; + mask_b.type = (struct halide_type_t){halide_type_uint, 8, 1}; + mask_b.dimensions = 2; + mask_b.dim = mask_dims; + + halide_dimension_t out_dims[2] = { + {0, row_floats, 1}, + {0, h, row_floats}, + }; + halide_buffer_t out_buf = {0}; + out_buf.host = (uint8_t *)output; + out_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + out_buf.dimensions = 2; + out_buf.dim = out_dims; + + int err = -1; + const char *backend = NULL; + + // Common arguments (after buffer args): + // width_pixels, height, mult, has_mask, + // anisotropy_0..3, isotropy_0..3, A, B, C, D, + // strength, regularization_factor, variance_threshold + +#ifdef __APPLE__ + { + halide_buffer_t gpu_hf = hf_buf; + halide_buffer_t gpu_lf = lf_buf; + halide_buffer_t gpu_mask = mask_b; + halide_buffer_t gpu_out = out_buf; + gpu_hf.flags = halide_buffer_flag_host_dirty; + gpu_lf.flags = halide_buffer_flag_host_dirty; + if(has_mask) gpu_mask.flags = halide_buffer_flag_host_dirty; + + err = dt_halide_diffuse_pde_metal( + &gpu_hf, &gpu_lf, &gpu_mask, + w, h, mult, has_mask ? 
1 : 0, + anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], + (int)isotropy_type[0], (int)isotropy_type[1], + (int)isotropy_type[2], (int)isotropy_type[3], + ABCD[0], ABCD[1], ABCD[2], ABCD[3], + strength, regularization_factor, variance_threshold, + &gpu_out); + if(!err) + { + halide_copy_to_host(NULL, &gpu_out); + backend = "Metal GPU"; + } + else + { + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Metal PDE failed (%d), trying next backend", err); + } + halide_device_free(NULL, &gpu_hf); + halide_device_free(NULL, &gpu_lf); + halide_device_free(NULL, &gpu_mask); + halide_device_free(NULL, &gpu_out); + } +#endif + +#ifdef HAVE_HALIDE_VULKAN + if(err) + { + halide_buffer_t gpu_hf = hf_buf; + halide_buffer_t gpu_lf = lf_buf; + halide_buffer_t gpu_mask = mask_b; + halide_buffer_t gpu_out = out_buf; + gpu_hf.flags = halide_buffer_flag_host_dirty; + gpu_lf.flags = halide_buffer_flag_host_dirty; + if(has_mask) gpu_mask.flags = halide_buffer_flag_host_dirty; + + err = dt_halide_diffuse_pde_vulkan( + &gpu_hf, &gpu_lf, &gpu_mask, + w, h, mult, has_mask ? 1 : 0, + anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], + (int)isotropy_type[0], (int)isotropy_type[1], + (int)isotropy_type[2], (int)isotropy_type[3], + ABCD[0], ABCD[1], ABCD[2], ABCD[3], + strength, regularization_factor, variance_threshold, + &gpu_out); + if(!err) + { + halide_copy_to_host(NULL, &gpu_out); + backend = "Vulkan GPU"; + } + else + { + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Vulkan PDE failed (%d), trying CPU", err); + } + halide_device_free(NULL, &gpu_hf); + halide_device_free(NULL, &gpu_lf); + halide_device_free(NULL, &gpu_mask); + halide_device_free(NULL, &gpu_out); + } +#endif + + if(err) + { + err = dt_halide_diffuse_pde( + &hf_buf, &lf_buf, &mask_b, + w, h, mult, has_mask ? 
1 : 0, + anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], + (int)isotropy_type[0], (int)isotropy_type[1], + (int)isotropy_type[2], (int)isotropy_type[3], + ABCD[0], ABCD[1], ABCD[2], ABCD[3], + strength, regularization_factor, variance_threshold, + &out_buf); + if(!err) backend = "CPU"; + } + + if(backend_out) *backend_out = backend; + return err; +} + +// Halide version of wavelets_process: uses Halide for B-spline decomposition, +// keeps the PDE diffusion kernel as C code (Phase 2 will port that too). +static inline gboolean _wavelets_process_halide(const float *const restrict in, + float *const restrict reconstructed, + const uint8_t *const restrict mask, + const size_t width, + const size_t height, + const dt_iop_diffuse_data_t *const data, + const float final_radius, + const float zoom, + const int scales, + const gboolean has_mask, + float *const restrict HF[MAX_NUM_SCALES], + float *const restrict LF_odd, + float *const restrict LF_even) +{ + gboolean success = TRUE; + + const dt_aligned_pixel_t anisotropy + = { compute_anisotropy_factor(data->anisotropy_first), + compute_anisotropy_factor(data->anisotropy_second), + compute_anisotropy_factor(data->anisotropy_third), + compute_anisotropy_factor(data->anisotropy_fourth) }; + + const dt_isotropy_t DT_ALIGNED_PIXEL isotropy_type[4] + = { check_isotropy_mode(data->anisotropy_first), + check_isotropy_mode(data->anisotropy_second), + check_isotropy_mode(data->anisotropy_third), + check_isotropy_mode(data->anisotropy_fourth) }; + + const float regularization = powf(10.f, data->regularization) - 1.f; + const float variance_threshold = powf(10.f, data->variance_threshold); + + // À trous decimated wavelet decompose using Halide-accelerated B-spline blur + float *restrict residual; + const char *backend = NULL; + for(int s = 0; s < scales; ++s) + { + const int mult = 1 << s; + + const float *restrict buffer_in; + float *restrict buffer_out; + + if(s == 0) + { + buffer_in = in; + buffer_out = LF_odd; + } + 
else if(s % 2 != 0) + { + buffer_in = LF_odd; + buffer_out = LF_even; + } + else + { + buffer_in = LF_even; + buffer_out = LF_odd; + } + + const char *scale_backend = NULL; + int err = _halide_decompose_2D_Bspline(buffer_in, HF[s], buffer_out, width, height, mult, &scale_backend); + if(err) + { + dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide bspline decompose failed at scale %d (err=%d)", s, err); + success = FALSE; + return success; + } + if(s == 0) backend = scale_backend; + + residual = buffer_out; + } + + if(backend) + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide bspline decompose backend: %s", backend); + + // PDE diffusion reconstruction (unchanged from C version) + float *restrict temp = (residual == LF_even) ? LF_odd : LF_even; + + int count = 0; + for(int s = scales - 1; s > -1; --s) + { + const int mult = 1 << s; + const float current_radius = equivalent_sigma_at_step(B_SPLINE_SIGMA, s); + const float real_radius = current_radius * zoom; + + const float norm = + expf(-sqf(real_radius - (float)data->radius_center) / sqf(data->radius)); + const dt_aligned_pixel_t ABCD = { data->first * KAPPA * norm, + data->second * KAPPA * norm, + data->third * KAPPA * norm, + data->fourth * KAPPA * norm }; + const float strength = data->sharpness * norm + 1.f; + + const float *restrict buffer_in; + float *restrict buffer_out; + + if(count == 0) + { + buffer_in = residual; + buffer_out = temp; + } + else if(count % 2 != 0) + { + buffer_in = temp; + buffer_out = residual; + } + else + { + buffer_in = residual; + buffer_out = temp; + } + + if(s == 0) buffer_out = reconstructed; + + const char *pde_backend = NULL; + int pde_err = _halide_heat_PDE_diffusion(HF[s], buffer_in, mask, has_mask, buffer_out, width, height, + anisotropy, isotropy_type, regularization, + variance_threshold, sqf(current_radius), mult, ABCD, strength, + &pde_backend); + if(pde_err) + { + dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide PDE diffusion failed at scale %d (err=%d), falling back to C", s, pde_err); + 
// Fall back to C implementation for this scale + heat_PDE_diffusion(HF[s], buffer_in, mask, has_mask, buffer_out, width, height, + anisotropy, isotropy_type, regularization, + variance_threshold, sqf(current_radius), mult, ABCD, strength); + } + else if(s == scales - 1 && pde_backend) + { + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide PDE diffusion backend: %s", pde_backend); + } + count++; + } + + return success; +} + +__attribute__((visibility("default"))) +int process_halide(dt_iop_module_t *self, + dt_dev_pixelpipe_iop_t *piece, + const void *const restrict ivoid, + void *const restrict ovoid, + const dt_iop_roi_t *const roi_in, + const dt_iop_roi_t *const roi_out) +{ + const gboolean fastmode = piece->pipe->type & DT_DEV_PIXELPIPE_FAST; + const dt_iop_diffuse_data_t *const data = piece->data; + + const size_t width = roi_out->width; + const size_t height = roi_out->height; + + if(fastmode) + { + const size_t ch = piece->colors; + dt_iop_copy_image_roi(ovoid, ivoid, ch, roi_in, roi_out); + return 0; + } + + uint8_t *const restrict mask = dt_alloc_align_uint8(width * height); + + float *restrict in = DT_IS_ALIGNED((float *const restrict)ivoid); + float *const restrict out = DT_IS_ALIGNED((float *const restrict)ovoid); + + float *restrict temp1, *restrict temp2; + float *restrict LF_odd, *restrict LF_even; + float *restrict temp_in = NULL; + float *restrict temp_out = NULL; + + gboolean out_of_memory = !mask + || !dt_iop_alloc_image_buffers(self, roi_in, roi_out, + 4 | DT_IMGSZ_OUTPUT, &temp1, + 4 | DT_IMGSZ_OUTPUT, &temp2, + 4 | DT_IMGSZ_OUTPUT, &LF_odd, + 4 | DT_IMGSZ_OUTPUT, &LF_even, + 0, NULL); + + const float scale = fmaxf(piece->iscale / roi_in->scale, 1.f); + const float final_radius = (data->radius + data->radius_center) * 2.f / scale; + + const int iterations = MAX(ceilf((float)data->iterations), 1); + const int diffusion_scales = num_steps_to_reach_equivalent_sigma(B_SPLINE_SIGMA, final_radius); + const int scales = CLAMP(diffusion_scales, 1, 
MAX_NUM_SCALES); + + float *restrict HF[MAX_NUM_SCALES]; + for(int s = 0; s < scales; s++) + { + HF[s] = out_of_memory ? NULL : dt_alloc_align_float(width * height * 4); + if(!HF[s]) out_of_memory = TRUE; + } + + int ret = 0; + + if(out_of_memory) + { + dt_iop_copy_image_roi(ovoid, ivoid, piece->colors, roi_in, roi_out); + dt_control_log(_("diffuse/sharpen failed to allocate memory, check your RAM settings")); + ret = -1; + goto finish; + } + + const gboolean has_mask = (data->threshold > 0.f); + if(has_mask) + { + build_mask(in, mask, data->threshold, roi_out->width, roi_out->height); + inpaint_mask(temp1, in, mask, roi_out->width, roi_out->height); + in = temp1; + } + + for(int it = 0; it < iterations; it++) + { + if(it == 0) + { + temp_in = in; + temp_out = temp2; + } + else if(it % 2 == 0) + { + temp_in = temp1; + temp_out = temp2; + } + else + { + temp_in = temp2; + temp_out = temp1; + } + + if(it == iterations - 1) + temp_out = out; + + if(!_wavelets_process_halide(temp_in, temp_out, mask, + roi_out->width, roi_out->height, + data, final_radius, scale, scales, has_mask, HF, LF_odd, LF_even)) + { + // Halide failed, signal fallback to regular process() + ret = -1; + goto finish; + } + } + +finish: + dt_free_align(mask); + dt_free_align(temp1); + dt_free_align(temp2); + dt_free_align(LF_even); + dt_free_align(LF_odd); + for(int s = 0; s < scales; s++) + if(HF[s]) dt_free_align(HF[s]); + + return ret; +} +#endif /* HAVE_HALIDE */ + #if HAVE_OPENCL static inline cl_int wavelets_process_cl(const int devid, cl_mem in, From a342c3b40e0f9c9ee10dc3361fd1b6296092f86a Mon Sep 17 00:00:00 2001 From: Mario Zimmermann Date: Mon, 20 Apr 2026 19:45:28 +0200 Subject: [PATCH 3/4] use GPU persistent buffers --- src/iop/diffuse.c | 603 ++++++++++++++++++++-------------------------- 1 file changed, 264 insertions(+), 339 deletions(-) diff --git a/src/iop/diffuse.c b/src/iop/diffuse.c index 69f0cd3d97c7..28565a399d59 100644 --- a/src/iop/diffuse.c +++ b/src/iop/diffuse.c @@ 
-1463,283 +1463,166 @@ void process(dt_iop_module_t *self, #include "halide_diffuse_pde_vulkan.h" #endif -// Halide-accelerated B-spline decomposition: replaces decompose_2D_Bspline() -// Uses flat 2D buffer layout (xi = width*4, y). -// Returns 0 on success, non-zero on failure. -static int _halide_decompose_2D_Bspline(const float *const restrict in, - float *const restrict HF, - float *const restrict LF, - const size_t width, - const size_t height, - const int mult, - const char **backend_out) +// Backend selection for GPU-persistent buffer management +typedef enum _halide_backend_t { - const int row_floats = (int)width * 4; - const int w = (int)width; - const int h = (int)height; + HALIDE_BACKEND_NONE = 0, + HALIDE_BACKEND_METAL, + HALIDE_BACKEND_VULKAN, + HALIDE_BACKEND_CPU +} _halide_backend_t; + +// Helper: initialize a flat 2D halide_buffer_t for float RGBA data +static inline void _halide_init_flat_f32(halide_buffer_t *buf, halide_dimension_t dims[2], + float *host, const int row_floats, const int h) +{ + dims[0] = (halide_dimension_t){0, row_floats, 1}; + dims[1] = (halide_dimension_t){0, h, row_floats}; + memset(buf, 0, sizeof(*buf)); + buf->host = (uint8_t *)host; + buf->type = (struct halide_type_t){halide_type_float, 32, 1}; + buf->dimensions = 2; + buf->dim = dims; +} - halide_dimension_t in_dims[2] = { - {0, row_floats, 1}, - {0, h, row_floats}, - }; - halide_dimension_t lf_dims[2] = { - {0, row_floats, 1}, - {0, h, row_floats}, - }; - halide_dimension_t hf_dims[2] = { - {0, row_floats, 1}, - {0, h, row_floats}, - }; - - halide_buffer_t in_buf = {0}; - in_buf.host = (uint8_t *)in; - in_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; - in_buf.dimensions = 2; - in_buf.dim = in_dims; - - halide_buffer_t lf_buf = {0}; - lf_buf.host = (uint8_t *)LF; - lf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; - lf_buf.dimensions = 2; - lf_buf.dim = lf_dims; - - halide_buffer_t hf_buf = {0}; - hf_buf.host = (uint8_t *)HF; - hf_buf.type = 
(struct halide_type_t){halide_type_float, 32, 1}; - hf_buf.dimensions = 2; - hf_buf.dim = hf_dims; - - int err = -1; - const char *backend = NULL; - - // Try Metal GPU -#ifdef __APPLE__ - { - halide_buffer_t gpu_in = in_buf; - halide_buffer_t gpu_lf = lf_buf; - halide_buffer_t gpu_hf = hf_buf; - gpu_in.flags = halide_buffer_flag_host_dirty; +// Helper: initialize a 2D halide_buffer_t for uint8 mask data +static inline void _halide_init_mask_u8(halide_buffer_t *buf, halide_dimension_t dims[2], + const uint8_t *host, const int w, const int h) +{ + dims[0] = (halide_dimension_t){0, w, 1}; + dims[1] = (halide_dimension_t){0, h, w}; + memset(buf, 0, sizeof(*buf)); + buf->host = (uint8_t *)host; + buf->type = (struct halide_type_t){halide_type_uint, 8, 1}; + buf->dimensions = 2; + buf->dim = dims; +} - err = dt_halide_bspline_decompose_metal(&gpu_in, w, h, mult, &gpu_lf, &gpu_hf); - if(!err) - { - halide_copy_to_host(NULL, &gpu_lf); - halide_copy_to_host(NULL, &gpu_hf); - backend = "Metal GPU"; - } - else - { - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Metal bspline failed (%d), trying next backend", err); - } - halide_device_free(NULL, &gpu_in); - halide_device_free(NULL, &gpu_lf); - halide_device_free(NULL, &gpu_hf); - } +// Try to run bspline decompose on the given backend. +// Returns 0 on success. Does NOT copy to host or free device memory. 
+static inline int _halide_run_bspline(_halide_backend_t backend, + halide_buffer_t *in_buf, + halide_buffer_t *lf_buf, + halide_buffer_t *hf_buf, + const int w, const int h, const int mult) +{ + switch(backend) + { +#ifdef __APPLE__ + case HALIDE_BACKEND_METAL: + return dt_halide_bspline_decompose_metal(in_buf, w, h, mult, lf_buf, hf_buf); #endif - - // Try Vulkan GPU #ifdef HAVE_HALIDE_VULKAN - if(err) - { - halide_buffer_t gpu_in = in_buf; - halide_buffer_t gpu_lf = lf_buf; - halide_buffer_t gpu_hf = hf_buf; - gpu_in.flags = halide_buffer_flag_host_dirty; - - err = dt_halide_bspline_decompose_vulkan(&gpu_in, w, h, mult, &gpu_lf, &gpu_hf); - if(!err) - { - halide_copy_to_host(NULL, &gpu_lf); - halide_copy_to_host(NULL, &gpu_hf); - backend = "Vulkan GPU"; - } - else - { - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Vulkan bspline failed (%d), trying CPU", err); - } - halide_device_free(NULL, &gpu_in); - halide_device_free(NULL, &gpu_lf); - halide_device_free(NULL, &gpu_hf); - } + case HALIDE_BACKEND_VULKAN: + return dt_halide_bspline_decompose_vulkan(in_buf, w, h, mult, lf_buf, hf_buf); #endif + case HALIDE_BACKEND_CPU: + return dt_halide_bspline_decompose(in_buf, w, h, mult, lf_buf, hf_buf); + default: + return -1; + } +} - // CPU Halide fallback - if(err) +// Try to run PDE diffusion on the given backend. +// Returns 0 on success. Does NOT copy to host or free device memory. 
+static inline int _halide_run_pde(_halide_backend_t backend, + halide_buffer_t *hf_buf, + halide_buffer_t *lf_buf, + halide_buffer_t *mask_buf, + halide_buffer_t *out_buf, + const int w, const int h, const int mult, + const int has_mask_i, + const dt_aligned_pixel_t anisotropy, + const dt_isotropy_t isotropy_type[4], + const dt_aligned_pixel_t ABCD, + const float strength, + const float regularization_factor, + const float variance_threshold) +{ + switch(backend) { - err = dt_halide_bspline_decompose(&in_buf, w, h, mult, &lf_buf, &hf_buf); - if(!err) backend = "CPU"; +#ifdef __APPLE__ + case HALIDE_BACKEND_METAL: + return dt_halide_diffuse_pde_metal( + hf_buf, lf_buf, mask_buf, + w, h, mult, has_mask_i, + anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], + (int)isotropy_type[0], (int)isotropy_type[1], + (int)isotropy_type[2], (int)isotropy_type[3], + ABCD[0], ABCD[1], ABCD[2], ABCD[3], + strength, regularization_factor, variance_threshold, + out_buf); +#endif +#ifdef HAVE_HALIDE_VULKAN + case HALIDE_BACKEND_VULKAN: + return dt_halide_diffuse_pde_vulkan( + hf_buf, lf_buf, mask_buf, + w, h, mult, has_mask_i, + anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], + (int)isotropy_type[0], (int)isotropy_type[1], + (int)isotropy_type[2], (int)isotropy_type[3], + ABCD[0], ABCD[1], ABCD[2], ABCD[3], + strength, regularization_factor, variance_threshold, + out_buf); +#endif + case HALIDE_BACKEND_CPU: + return dt_halide_diffuse_pde( + hf_buf, lf_buf, mask_buf, + w, h, mult, has_mask_i, + anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], + (int)isotropy_type[0], (int)isotropy_type[1], + (int)isotropy_type[2], (int)isotropy_type[3], + ABCD[0], ABCD[1], ABCD[2], ABCD[3], + strength, regularization_factor, variance_threshold, + out_buf); + default: + return -1; } - - if(backend_out) *backend_out = backend; - return err; } -// Halide-accelerated heat PDE diffusion: replaces heat_PDE_diffusion() -// Returns 0 on success, non-zero on failure. 
-static int _halide_heat_PDE_diffusion(const float *const restrict high_freq, - const float *const restrict low_freq, - const uint8_t *const restrict mask, - const gboolean has_mask, - float *const restrict output, - const size_t width, - const size_t height, - const dt_aligned_pixel_t anisotropy, - const dt_isotropy_t isotropy_type[4], - const float regularization, - const float variance_threshold, - const float current_radius_square, - const int mult, - const dt_aligned_pixel_t ABCD, - const float strength, - const char **backend_out) +// Detect best available Halide GPU backend with a probe bspline call. +// On success, the buffers will have device allocations ready for reuse. +static _halide_backend_t _halide_probe_backend(halide_buffer_t *in_buf, + halide_buffer_t *lf_buf, + halide_buffer_t *hf_buf, + const int w, const int h, const int mult) { - const int row_floats = (int)width * 4; - const int w = (int)width; - const int h = (int)height; - const float regularization_factor = regularization * current_radius_square / 9.f; - - // Setup flat 2D buffers for HF, LF, output - halide_dimension_t flat_dims[2] = { - {0, row_floats, 1}, - {0, h, row_floats}, - }; - - halide_buffer_t hf_buf = {0}; - hf_buf.host = (uint8_t *)high_freq; - hf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; - hf_buf.dimensions = 2; - hf_buf.dim = flat_dims; - - halide_dimension_t lf_dims[2] = { - {0, row_floats, 1}, - {0, h, row_floats}, - }; - halide_buffer_t lf_buf = {0}; - lf_buf.host = (uint8_t *)low_freq; - lf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; - lf_buf.dimensions = 2; - lf_buf.dim = lf_dims; - - // Mask buffer: per-pixel uint8 (width, height) - halide_dimension_t mask_dims[2] = { - {0, w, 1}, - {0, h, w}, - }; - halide_buffer_t mask_b = {0}; - mask_b.host = (uint8_t *)mask; - mask_b.type = (struct halide_type_t){halide_type_uint, 8, 1}; - mask_b.dimensions = 2; - mask_b.dim = mask_dims; - - halide_dimension_t out_dims[2] = { - {0, row_floats, 
1}, - {0, h, row_floats}, - }; - halide_buffer_t out_buf = {0}; - out_buf.host = (uint8_t *)output; - out_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; - out_buf.dimensions = 2; - out_buf.dim = out_dims; - - int err = -1; - const char *backend = NULL; - - // Common arguments (after buffer args): - // width_pixels, height, mult, has_mask, - // anisotropy_0..3, isotropy_0..3, A, B, C, D, - // strength, regularization_factor, variance_threshold - #ifdef __APPLE__ - { - halide_buffer_t gpu_hf = hf_buf; - halide_buffer_t gpu_lf = lf_buf; - halide_buffer_t gpu_mask = mask_b; - halide_buffer_t gpu_out = out_buf; - gpu_hf.flags = halide_buffer_flag_host_dirty; - gpu_lf.flags = halide_buffer_flag_host_dirty; - if(has_mask) gpu_mask.flags = halide_buffer_flag_host_dirty; - - err = dt_halide_diffuse_pde_metal( - &gpu_hf, &gpu_lf, &gpu_mask, - w, h, mult, has_mask ? 1 : 0, - anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], - (int)isotropy_type[0], (int)isotropy_type[1], - (int)isotropy_type[2], (int)isotropy_type[3], - ABCD[0], ABCD[1], ABCD[2], ABCD[3], - strength, regularization_factor, variance_threshold, - &gpu_out); - if(!err) - { - halide_copy_to_host(NULL, &gpu_out); - backend = "Metal GPU"; - } - else - { - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Metal PDE failed (%d), trying next backend", err); - } - halide_device_free(NULL, &gpu_hf); - halide_device_free(NULL, &gpu_lf); - halide_device_free(NULL, &gpu_mask); - halide_device_free(NULL, &gpu_out); - } + if(_halide_run_bspline(HALIDE_BACKEND_METAL, in_buf, lf_buf, hf_buf, w, h, mult) == 0) + return HALIDE_BACKEND_METAL; + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Metal probe failed, trying next backend"); + // Clean up failed device state before trying next backend + halide_device_free(NULL, in_buf); + halide_device_free(NULL, lf_buf); + halide_device_free(NULL, hf_buf); #endif - #ifdef HAVE_HALIDE_VULKAN - if(err) - { - halide_buffer_t gpu_hf = hf_buf; - halide_buffer_t gpu_lf = lf_buf; - 
halide_buffer_t gpu_mask = mask_b; - halide_buffer_t gpu_out = out_buf; - gpu_hf.flags = halide_buffer_flag_host_dirty; - gpu_lf.flags = halide_buffer_flag_host_dirty; - if(has_mask) gpu_mask.flags = halide_buffer_flag_host_dirty; - - err = dt_halide_diffuse_pde_vulkan( - &gpu_hf, &gpu_lf, &gpu_mask, - w, h, mult, has_mask ? 1 : 0, - anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], - (int)isotropy_type[0], (int)isotropy_type[1], - (int)isotropy_type[2], (int)isotropy_type[3], - ABCD[0], ABCD[1], ABCD[2], ABCD[3], - strength, regularization_factor, variance_threshold, - &gpu_out); - if(!err) - { - halide_copy_to_host(NULL, &gpu_out); - backend = "Vulkan GPU"; - } - else - { - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Vulkan PDE failed (%d), trying CPU", err); - } - halide_device_free(NULL, &gpu_hf); - halide_device_free(NULL, &gpu_lf); - halide_device_free(NULL, &gpu_mask); - halide_device_free(NULL, &gpu_out); - } + if(_halide_run_bspline(HALIDE_BACKEND_VULKAN, in_buf, lf_buf, hf_buf, w, h, mult) == 0) + return HALIDE_BACKEND_VULKAN; + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Vulkan probe failed, trying CPU"); + halide_device_free(NULL, in_buf); + halide_device_free(NULL, lf_buf); + halide_device_free(NULL, hf_buf); #endif + if(_halide_run_bspline(HALIDE_BACKEND_CPU, in_buf, lf_buf, hf_buf, w, h, mult) == 0) + return HALIDE_BACKEND_CPU; + return HALIDE_BACKEND_NONE; +} - if(err) +static inline const char *_halide_backend_name(_halide_backend_t b) +{ + switch(b) { - err = dt_halide_diffuse_pde( - &hf_buf, &lf_buf, &mask_b, - w, h, mult, has_mask ? 
1 : 0, - anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], - (int)isotropy_type[0], (int)isotropy_type[1], - (int)isotropy_type[2], (int)isotropy_type[3], - ABCD[0], ABCD[1], ABCD[2], ABCD[3], - strength, regularization_factor, variance_threshold, - &out_buf); - if(!err) backend = "CPU"; + case HALIDE_BACKEND_METAL: return "Metal GPU"; + case HALIDE_BACKEND_VULKAN: return "Vulkan GPU"; + case HALIDE_BACKEND_CPU: return "CPU"; + default: return "none"; } - - if(backend_out) *backend_out = backend; - return err; } -// Halide version of wavelets_process: uses Halide for B-spline decomposition, -// keeps the PDE diffusion kernel as C code (Phase 2 will port that too). +// Halide-accelerated wavelets_process with GPU buffer persistence. +// All intermediate buffers stay on the GPU device across the entire +// decompose + reconstruct loop. Only the final output is copied to host. static inline gboolean _wavelets_process_halide(const float *const restrict in, float *const restrict reconstructed, const uint8_t *const restrict mask, @@ -1754,7 +1637,9 @@ static inline gboolean _wavelets_process_halide(const float *const restrict in, float *const restrict LF_odd, float *const restrict LF_even) { - gboolean success = TRUE; + const int row_floats = (int)width * 4; + const int w = (int)width; + const int h = (int)height; const dt_aligned_pixel_t anisotropy = { compute_anisotropy_factor(data->anisotropy_first), @@ -1771,108 +1656,148 @@ static inline gboolean _wavelets_process_halide(const float *const restrict in, const float regularization = powf(10.f, data->regularization) - 1.f; const float variance_threshold = powf(10.f, data->variance_threshold); - // À trous decimated wavelet decompose using Halide-accelerated B-spline blur - float *restrict residual; - const char *backend = NULL; - for(int s = 0; s < scales; ++s) - { - const int mult = 1 << s; + // --- Create persistent halide_buffer_t wrappers for all buffers --- + // Each buffer gets its own dims array 
(Halide may write to these). + halide_dimension_t dims_in[2], dims_lf_odd[2], dims_lf_even[2], dims_out[2]; + halide_dimension_t dims_hf[MAX_NUM_SCALES][2]; + halide_dimension_t dims_mask[2]; - const float *restrict buffer_in; - float *restrict buffer_out; + halide_buffer_t buf_in, buf_lf_odd, buf_lf_even, buf_out; + halide_buffer_t buf_hf[MAX_NUM_SCALES]; + halide_buffer_t buf_mask; - if(s == 0) - { - buffer_in = in; - buffer_out = LF_odd; - } - else if(s % 2 != 0) - { - buffer_in = LF_odd; - buffer_out = LF_even; - } - else - { - buffer_in = LF_even; - buffer_out = LF_odd; - } + _halide_init_flat_f32(&buf_in, dims_in, (float *)in, row_floats, h); + _halide_init_flat_f32(&buf_lf_odd, dims_lf_odd, LF_odd, row_floats, h); + _halide_init_flat_f32(&buf_lf_even, dims_lf_even, LF_even, row_floats, h); + _halide_init_flat_f32(&buf_out, dims_out, reconstructed, row_floats, h); + for(int s = 0; s < scales; s++) + _halide_init_flat_f32(&buf_hf[s], dims_hf[s], HF[s], row_floats, h); + _halide_init_mask_u8(&buf_mask, dims_mask, mask, w, h); - const char *scale_backend = NULL; - int err = _halide_decompose_2D_Bspline(buffer_in, HF[s], buffer_out, width, height, mult, &scale_backend); - if(err) - { - dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide bspline decompose failed at scale %d (err=%d)", s, err); - success = FALSE; - return success; - } - if(s == 0) backend = scale_backend; + // Mark input as host-dirty so the first kernel uploads it to device + buf_in.flags = halide_buffer_flag_host_dirty; + if(has_mask) buf_mask.flags = halide_buffer_flag_host_dirty; - residual = buffer_out; + // --- Probe backend using the first bspline decompose call --- + // This also performs scale 0 decompose: in -> LF_odd + HF[0] + _halide_backend_t backend = _halide_probe_backend( + &buf_in, &buf_lf_odd, &buf_hf[0], w, h, 1 /* mult for scale 0 */); + + if(backend == HALIDE_BACKEND_NONE) + { + dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide: all backends failed for bspline probe"); + return FALSE; } - 
if(backend) - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide bspline decompose backend: %s", backend); + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide backend: %s (GPU-persistent buffers)", + _halide_backend_name(backend)); - // PDE diffusion reconstruction (unchanged from C version) - float *restrict temp = (residual == LF_even) ? LF_odd : LF_even; + // Scale 0 is already done by the probe. Continue with remaining scales. + float *restrict residual_ptr = LF_odd; - int count = 0; - for(int s = scales - 1; s > -1; --s) + for(int s = 1; s < scales; ++s) { const int mult = 1 << s; - const float current_radius = equivalent_sigma_at_step(B_SPLINE_SIGMA, s); - const float real_radius = current_radius * zoom; - - const float norm = - expf(-sqf(real_radius - (float)data->radius_center) / sqf(data->radius)); - const dt_aligned_pixel_t ABCD = { data->first * KAPPA * norm, - data->second * KAPPA * norm, - data->third * KAPPA * norm, - data->fourth * KAPPA * norm }; - const float strength = data->sharpness * norm + 1.f; - const float *restrict buffer_in; - float *restrict buffer_out; + halide_buffer_t *b_in = (s % 2 != 0) ? &buf_lf_odd : &buf_lf_even; + halide_buffer_t *b_out = (s % 2 != 0) ? &buf_lf_even : &buf_lf_odd; - if(count == 0) - { - buffer_in = residual; - buffer_out = temp; - } - else if(count % 2 != 0) - { - buffer_in = temp; - buffer_out = residual; - } - else + int err = _halide_run_bspline(backend, b_in, b_out, &buf_hf[s], w, h, mult); + if(err) { - buffer_in = residual; - buffer_out = temp; + dt_print(DT_DEBUG_ALWAYS, + "[diffuse] Halide bspline failed at scale %d (err=%d)", s, err); + goto cleanup_fail; } - if(s == 0) buffer_out = reconstructed; + residual_ptr = (s % 2 != 0) ? 
LF_even : LF_odd; + } - const char *pde_backend = NULL; - int pde_err = _halide_heat_PDE_diffusion(HF[s], buffer_in, mask, has_mask, buffer_out, width, height, - anisotropy, isotropy_type, regularization, - variance_threshold, sqf(current_radius), mult, ABCD, strength, - &pde_backend); - if(pde_err) - { - dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide PDE diffusion failed at scale %d (err=%d), falling back to C", s, pde_err); - // Fall back to C implementation for this scale - heat_PDE_diffusion(HF[s], buffer_in, mask, has_mask, buffer_out, width, height, - anisotropy, isotropy_type, regularization, - variance_threshold, sqf(current_radius), mult, ABCD, strength); - } - else if(s == scales - 1 && pde_backend) + // --- PDE diffusion reconstruction (reverse scale order) --- + { + halide_buffer_t *residual_buf = (residual_ptr == LF_even) ? &buf_lf_even : &buf_lf_odd; + halide_buffer_t *temp_buf = (residual_ptr == LF_even) ? &buf_lf_odd : &buf_lf_even; + + int count = 0; + for(int s = scales - 1; s > -1; --s) { - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide PDE diffusion backend: %s", pde_backend); + const int mult = 1 << s; + const float current_radius = equivalent_sigma_at_step(B_SPLINE_SIGMA, s); + const float real_radius = current_radius * zoom; + const float norm = + expf(-sqf(real_radius - (float)data->radius_center) / sqf(data->radius)); + const dt_aligned_pixel_t ABCD = { data->first * KAPPA * norm, + data->second * KAPPA * norm, + data->third * KAPPA * norm, + data->fourth * KAPPA * norm }; + const float strength = data->sharpness * norm + 1.f; + const float regularization_factor = regularization * sqf(current_radius) / 9.f; + + halide_buffer_t *b_in, *b_out; + + if(count == 0) + { + b_in = residual_buf; + b_out = temp_buf; + } + else if(count % 2 != 0) + { + b_in = temp_buf; + b_out = residual_buf; + } + else + { + b_in = residual_buf; + b_out = temp_buf; + } + + // Final scale writes to output buffer + if(s == 0) b_out = &buf_out; + + int pde_err = 
_halide_run_pde(backend, &buf_hf[s], b_in, &buf_mask, b_out, + w, h, mult, has_mask ? 1 : 0, + anisotropy, isotropy_type, ABCD, + strength, regularization_factor, variance_threshold); + if(pde_err) + { + dt_print(DT_DEBUG_ALWAYS, + "[diffuse] Halide PDE failed at scale %d (err=%d)", s, pde_err); + goto cleanup_fail; + } + count++; } - count++; } - return success; + // --- Copy only the final output to host --- + if(backend != HALIDE_BACKEND_CPU) + halide_copy_to_host(NULL, &buf_out); + + // --- Free all device memory --- + if(backend != HALIDE_BACKEND_CPU) + { + halide_device_free(NULL, &buf_in); + halide_device_free(NULL, &buf_lf_odd); + halide_device_free(NULL, &buf_lf_even); + halide_device_free(NULL, &buf_out); + halide_device_free(NULL, &buf_mask); + for(int s = 0; s < scales; s++) + halide_device_free(NULL, &buf_hf[s]); + } + + return TRUE; + +cleanup_fail: + if(backend != HALIDE_BACKEND_CPU) + { + halide_device_free(NULL, &buf_in); + halide_device_free(NULL, &buf_lf_odd); + halide_device_free(NULL, &buf_lf_even); + halide_device_free(NULL, &buf_out); + halide_device_free(NULL, &buf_mask); + for(int s = 0; s < scales; s++) + halide_device_free(NULL, &buf_hf[s]); + } + return FALSE; } __attribute__((visibility("default"))) From 582b1487e559261a6f4e933f29d8317851959365 Mon Sep 17 00:00:00 2001 From: Mario Zimmermann Date: Wed, 22 Apr 2026 17:37:23 +0200 Subject: [PATCH 4/4] bug fixes to avoid artifacts --- src/iop/diffuse.c | 77 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 16 deletions(-) diff --git a/src/iop/diffuse.c b/src/iop/diffuse.c index 28565a399d59..c99842fc3c2d 100644 --- a/src/iop/diffuse.c +++ b/src/iop/diffuse.c @@ -1390,7 +1390,7 @@ void process(dt_iop_module_t *self, float *restrict HF[MAX_NUM_SCALES]; for(int s = 0; s < scales; s++) { - HF[s] = out_of_memory ? NULL : dt_alloc_align_float(width * height * 4); + HF[s] = out_of_memory ? 
NULL : dt_calloc_align_float(width * height * 4); if(!HF[s]) out_of_memory = TRUE; } @@ -1635,7 +1635,8 @@ static inline gboolean _wavelets_process_halide(const float *const restrict in, const gboolean has_mask, float *const restrict HF[MAX_NUM_SCALES], float *const restrict LF_odd, - float *const restrict LF_even) + float *const restrict LF_even, + _halide_backend_t *const backend_ptr) { const int row_floats = (int)width * 4; const int w = (int)width; @@ -1678,22 +1679,50 @@ static inline gboolean _wavelets_process_halide(const float *const restrict in, buf_in.flags = halide_buffer_flag_host_dirty; if(has_mask) buf_mask.flags = halide_buffer_flag_host_dirty; - // --- Probe backend using the first bspline decompose call --- - // This also performs scale 0 decompose: in -> LF_odd + HF[0] - _halide_backend_t backend = _halide_probe_backend( - &buf_in, &buf_lf_odd, &buf_hf[0], w, h, 1 /* mult for scale 0 */); - - if(backend == HALIDE_BACKEND_NONE) + // --- Use cached backend or probe on first call --- + _halide_backend_t backend; + gboolean probed = FALSE; + + if(backend_ptr && *backend_ptr != HALIDE_BACKEND_NONE) { - dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide: all backends failed for bspline probe"); - return FALSE; + // Backend was cached from a previous call, reuse it to avoid re-probing + backend = *backend_ptr; + probed = FALSE; // We're not probing, just using cached backend } + else + { + // First call: probe backend using the first bspline decompose call + // This also performs scale 0 decompose: in -> LF_odd + HF[0] + backend = _halide_probe_backend( + &buf_in, &buf_lf_odd, &buf_hf[0], w, h, 1 /* mult for scale 0 */); + + if(backend == HALIDE_BACKEND_NONE) + { + dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide: all backends failed for bspline probe"); + return FALSE; + } - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide backend: %s (GPU-persistent buffers)", - _halide_backend_name(backend)); + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide backend: %s", + 
_halide_backend_name(backend)); - // Scale 0 is already done by the probe. Continue with remaining scales. + // Cache backend for subsequent calls to avoid re-probing + if(backend_ptr) *backend_ptr = backend; + probed = TRUE; // Probe already did scale 0 + } + + // Scale 0: only run if we didn't probe (probe already did it) float *restrict residual_ptr = LF_odd; + + if(!probed) + { + // Cached backend iteration: need to run scale 0 bspline explicitly + int err = _halide_run_bspline(backend, &buf_in, &buf_lf_odd, &buf_hf[0], w, h, 1); + if(err) + { + dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide bspline failed at scale 0 (err=%d)", err); + goto cleanup_fail; + } + } for(int s = 1; s < scales; ++s) { @@ -1770,7 +1799,13 @@ static inline gboolean _wavelets_process_halide(const float *const restrict in, // --- Copy only the final output to host --- if(backend != HALIDE_BACKEND_CPU) + { halide_copy_to_host(NULL, &buf_out); + // NOTE(review): halide_copy_to_host() already blocks until the device-to-host copy completes; + // this volatile read adds no extra GPU synchronization — confirm it is needed before keeping it + volatile float sync_check = ((float *)reconstructed)[0]; + (void)sync_check; + } // --- Free all device memory --- if(backend != HALIDE_BACKEND_CPU) @@ -1849,7 +1884,13 @@ int process_halide(dt_iop_module_t *self, float *restrict HF[MAX_NUM_SCALES]; for(int s = 0; s < scales; s++) { - HF[s] = out_of_memory ? 
NULL : dt_calloc_align_float(width * height * 4); if(!HF[s]) out_of_memory = TRUE; } @@ -1863,7 +1898,7 @@ int process_halide(dt_iop_module_t *self, goto finish; } - const gboolean has_mask = (data->threshold > 0.f); + const gboolean has_mask = (data->threshold > 0.f); if(has_mask) { build_mask(in, mask, data->threshold, roi_out->width, roi_out->height); @@ -1871,8 +1906,13 @@ int process_halide(dt_iop_module_t *self, in = temp1; } + // Cache backend across iterations to skip re-probing + _halide_backend_t cached_backend = HALIDE_BACKEND_NONE; + for(int it = 0; it < iterations; it++) { + double iter_start = dt_get_wtime(); + if(it == 0) { temp_in = in; @@ -1894,12 +1934,17 @@ int process_halide(dt_iop_module_t *self, if(!_wavelets_process_halide(temp_in, temp_out, mask, roi_out->width, roi_out->height, - data, final_radius, scale, scales, has_mask, HF, LF_odd, LF_even)) + data, final_radius, scale, scales, has_mask, HF, LF_odd, LF_even, + &cached_backend)) { // Halide failed, signal fallback to regular process() ret = -1; goto finish; } + + double iter_time = dt_get_wtime() - iter_start; + dt_print(DT_DEBUG_PIPE, "[diffuse] Iteration %d/%d took %.3f secs", + it + 1, iterations, iter_time); } finish: