diff --git a/.ci/Brewfile b/.ci/Brewfile
index 55d6941cac48..07accba836bc 100644
--- a/.ci/Brewfile
+++ b/.ci/Brewfile
@@ -29,6 +29,7 @@ brew 'gtk+3'
brew 'gtk-mac-integration'
brew 'icu4c'
brew 'graphicsmagick'
+brew 'halide'
brew 'imagemagick'
brew 'intltool'
brew 'iso-codes'
diff --git a/DefineOptions.cmake b/DefineOptions.cmake
index 7d2ea4a354a2..92b6c14f2cfb 100644
--- a/DefineOptions.cmake
+++ b/DefineOptions.cmake
@@ -8,6 +8,7 @@ option(USE_LIBSECRET "Build libsecret password storage back-end" ON)
option(USE_UNITY "Use libunity to report progress in the launcher" OFF)
option(USE_OPENMP "Use OpenMP threading support." ON)
option(USE_OPENCL "Use OpenCL support." ON)
+option(USE_HALIDE "Use Halide for optimized image processing." OFF)
option(USE_GRAPHICSMAGICK "Use GraphicsMagick library for image import." ON)
option(USE_IMAGEMAGICK "Use ImageMagick library for image import." OFF)
option(USE_DARKTABLE_PROFILING OFF)
diff --git a/data/halide/CMakeLists.txt b/data/halide/CMakeLists.txt
new file mode 100644
index 000000000000..c4470ad1ac85
--- /dev/null
+++ b/data/halide/CMakeLists.txt
@@ -0,0 +1,108 @@
+# Halide-based image processing generators for darktable
+#
+# This builds Halide generator executables at compile time, then runs them
+# to produce optimized static libraries (.h + .a) for each target platform.
+# The generated code is linked into individual IOP plugin modules.
+
+find_package(Halide REQUIRED)
+
+# Build the generator executables (run on the host at build time)
+add_halide_generator(dt_halide_generators
+ SOURCES
+ generators/exposure_generator.cpp
+)
+
+add_halide_generator(dt_halide_bspline_generators
+ SOURCES
+ generators/bspline_generator.cpp
+)
+
+add_halide_generator(dt_halide_diffuse_generators
+ SOURCES
+ generators/diffuse_pde_generator.cpp
+)
+
+# Generate CPU-optimized code for the host platform
+# "host" auto-detects arch + OS + SIMD features (NEON on ARM64, AVX on x86, etc.)
+add_halide_library(halide_exposure
+ FROM dt_halide_generators
+ GENERATOR exposure
+ TARGETS host
+ FUNCTION_NAME dt_halide_exposure
+)
+
+# Generate Metal GPU code for Apple Silicon (macOS only)
+# Uses the "flat" 2D generator to avoid stride issues with Metal's buffer handling.
+# The flat generator treats each row as width*4 contiguous floats.
+if(APPLE)
+ add_halide_library(halide_exposure_metal
+ FROM dt_halide_generators
+ GENERATOR exposure_flat
+ TARGETS host-metal
+ FUNCTION_NAME dt_halide_exposure_metal
+ )
+endif()
+
+# B-spline wavelet decomposition for diffuse/sharpen module
+# CPU target: separable 5-tap blur + detail extraction in one pass
+add_halide_library(halide_bspline_decompose
+ FROM dt_halide_bspline_generators
+ GENERATOR bspline_decompose
+ TARGETS host
+ FUNCTION_NAME dt_halide_bspline_decompose
+)
+
+# Metal GPU target for B-spline decomposition (macOS only)
+if(APPLE)
+ add_halide_library(halide_bspline_decompose_metal
+ FROM dt_halide_bspline_generators
+ GENERATOR bspline_decompose
+ TARGETS host-metal
+ FUNCTION_NAME dt_halide_bspline_decompose_metal
+ )
+endif()
+
+# Heat PDE diffusion kernel for diffuse/sharpen module
+add_halide_library(halide_diffuse_pde
+ FROM dt_halide_diffuse_generators
+ GENERATOR diffuse_pde
+ TARGETS host
+ FUNCTION_NAME dt_halide_diffuse_pde
+)
+
+# Metal GPU target for PDE diffusion (macOS only)
+if(APPLE)
+ add_halide_library(halide_diffuse_pde_metal
+ FROM dt_halide_diffuse_generators
+ GENERATOR diffuse_pde
+ TARGETS host-metal
+ FUNCTION_NAME dt_halide_diffuse_pde_metal
+ )
+endif()
+
+# Generate Vulkan GPU code (cross-platform: Linux, Windows, macOS via MoltenVK)
+# Uses the same flat 2D generator as Metal. Vulkan loader is dlopen'd at runtime,
+# so no link-time dependency — if no Vulkan driver is present, the call simply fails
+# and we fall through to CPU Halide.
+find_package(Vulkan QUIET)
+if(Vulkan_FOUND)
+ add_halide_library(halide_exposure_vulkan
+ FROM dt_halide_generators
+ GENERATOR exposure_flat
+ TARGETS host-vulkan
+ FUNCTION_NAME dt_halide_exposure_vulkan
+ )
+ add_halide_library(halide_bspline_decompose_vulkan
+ FROM dt_halide_bspline_generators
+ GENERATOR bspline_decompose
+ TARGETS host-vulkan
+ FUNCTION_NAME dt_halide_bspline_decompose_vulkan
+ )
+ add_halide_library(halide_diffuse_pde_vulkan
+ FROM dt_halide_diffuse_generators
+ GENERATOR diffuse_pde
+ TARGETS host-vulkan
+ FUNCTION_NAME dt_halide_diffuse_pde_vulkan
+ )
+ set(HALIDE_HAS_VULKAN TRUE PARENT_SCOPE)
+endif()
diff --git a/data/halide/generators/bspline_generator.cpp b/data/halide/generators/bspline_generator.cpp
new file mode 100644
index 000000000000..d629dd9a3977
--- /dev/null
+++ b/data/halide/generators/bspline_generator.cpp
@@ -0,0 +1,131 @@
+/*
+ This file is part of darktable,
+ Copyright (C) 2026 darktable developers.
+
+ darktable is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ darktable is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with darktable. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+/*
+ * Halide generator for B-spline wavelet decomposition used by
+ * darktable's diffuse/sharpen module.
+ *
+ * Algorithm: separable 5-tap [1,4,6,4,1]/16 blur with strided access
+ * (à-trous wavelet transform). Produces both the low-frequency (blurred)
+ * and high-frequency (detail = input - blurred) components.
+ *
+ * darktable pixel layout: interleaved float RGBA, 4 channels per pixel.
+ * We use the "flat" 2D approach (xi = x*4+c, y) that works on both
+ * CPU and GPU, matching the exposure_flat pattern.
+ *
+ * The stride parameter "mult" controls the spacing between filter taps:
+ * mult = 1 << scale (scale 0 -> mult=1, scale 1 -> mult=2, etc.)
+ *
+ * Boundary handling: clamp to edge (matching darktable's MIN/MAX clamping).
+ *
+ * Negative values in the blur output are clipped to 0 (matching darktable's
+ * clip_negatives=TRUE in decompose_2D_Bspline).
+ */
+
+#include "Halide.h"
+
+using namespace Halide;
+
+// Flat 2D layout (xi, y), xi = width * 4.
+// Separable filter: vertical pass then horizontal pass.
+// mult is the tap spacing in pixels (= 1 << scale).
+// width_pixels is the image width in pixels (not floats).
+class BsplineDecomposeGenerator : public Generator<BsplineDecomposeGenerator> {
+public:
+ Input<Buffer<float, 2>> input{"input"}; // (xi, y) where xi = width*4
+ Input<int> width_pixels{"width_pixels"}; // image width in pixels
+ Input<int> height{"height"}; // image height in pixels
+ Input<int> mult{"mult"}; // tap spacing = 1 << scale
+ Output<Buffer<float, 2>> lf{"lf"}; // low-frequency (blurred)
+ Output<Buffer<float, 2>> hf{"hf"}; // high-frequency (detail)
+
+ // Intermediate stage accessible to schedule()
+ Func vert{"vert"};
+
+ void generate() {
+ Var xi("xi"), y("y");
+
+ // B-spline filter weights
+ const float w0 = 1.0f / 16.0f;
+ const float w1 = 4.0f / 16.0f;
+ const float w2 = 6.0f / 16.0f;
+
+ // Extract pixel x and channel from flat index
+ Expr px = xi / 4;
+ Expr ch = xi % 4;
+
+ // Vertical pass: blur along y with spacing mult
+ // Clamp row indices to [0, height-1]
+ Expr r0 = clamp(y - 2 * mult, 0, height - 1);
+ Expr r1 = clamp(y - mult, 0, height - 1);
+ Expr r2 = y;
+ Expr r3 = clamp(y + mult, 0, height - 1);
+ Expr r4 = clamp(y + 2 * mult, 0, height - 1);
+
+ vert(xi, y) = w0 * input(xi, r0)
+ + w1 * input(xi, r1)
+ + w2 * input(xi, r2)
+ + w1 * input(xi, r3)
+ + w0 * input(xi, r4);
+
+ // Horizontal pass: blur along x with spacing mult on vertical result
+ // Clamp at pixel boundaries, preserving channel offset
+ Expr p0 = clamp(px - 2 * mult, 0, width_pixels - 1) * 4 + ch;
+ Expr p1 = clamp(px - mult, 0, width_pixels - 1) * 4 + ch;
+ Expr p2 = xi;
+ Expr p3 = clamp(px + mult, 0, width_pixels - 1) * 4 + ch;
+ Expr p4 = clamp(px + 2 * mult, 0, width_pixels - 1) * 4 + ch;
+
+ Expr blur = w0 * vert(p0, y)
+ + w1 * vert(p1, y)
+ + w2 * vert(p2, y)
+ + w1 * vert(p3, y)
+ + w0 * vert(p4, y);
+
+ // Clip negatives (matching darktable behavior)
+ lf(xi, y) = max(blur, 0.0f);
+ hf(xi, y) = input(xi, y) - lf(xi, y);
+ }
+
+ void schedule() {
+ Var xi = lf.args()[0];
+ Var y = lf.args()[1];
+
+ if(get_target().has_gpu_feature()) {
+ // GPU schedule: realize vert once per output tile (compute_at the
+ // tile's block loop), then tile both outputs
+ Var xio("xio"), yo("yo");
+ vert.compute_at(lf, yo);
+ lf.gpu_tile(xi, y, xio, yo, 64, 16);
+ hf.gpu_tile(xi, y, xio, yo, 64, 16);
+ } else {
+ // CPU schedule: compute vert per row strip for cache locality,
+ // vectorize, parallelize over rows
+ Var xio("xio"), xii("xii");
+
+ // Compute vertical pass per output row (store full row)
+ vert.compute_at(lf, y).vectorize(xi, 32);
+
+ lf.split(xi, xio, xii, 32).vectorize(xii).parallel(y);
+ // hf reads lf, so compute hf after lf at the same row
+ hf.split(xi, xio, xii, 32).vectorize(xii).parallel(y);
+ }
+ }
+};
+
+HALIDE_REGISTER_GENERATOR(BsplineDecomposeGenerator, bspline_decompose)
diff --git a/data/halide/generators/diffuse_pde_generator.cpp b/data/halide/generators/diffuse_pde_generator.cpp
new file mode 100644
index 000000000000..b55488d85c72
--- /dev/null
+++ b/data/halide/generators/diffuse_pde_generator.cpp
@@ -0,0 +1,297 @@
+/*
+ This file is part of darktable,
+ Copyright (C) 2026 darktable developers.
+
+ darktable is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ darktable is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with darktable. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+/*
+ * Halide generator for the heat PDE diffusion kernel used by
+ * darktable's diffuse/sharpen module.
+ *
+ * This implements the anisotropic PDE solver from:
+ * https://www.researchgate.net/publication/220663968
+ * modified for multi-scale wavelet framework.
+ *
+ * Per pixel, the algorithm:
+ * 1. Fetches 3x3 stencil from HF and LF buffers (strided by mult)
+ * 2. Computes gradients on LF and HF (centered finite differences)
+ * 3. For 4 derivative orders, computes anisotropic diffusion kernels:
+ * - c2 = exp(-magnitude * anisotropy)
+ * - rotation matrix based on isotropy mode (isotrope/isophote/gradient)
+ * - builds 3x3 convolution kernel from rotation matrix
+ * 4. Convolves kernels with LF (orders 0,1) and HF (orders 2,3)
+ * 5. Regularizes variance, accumulates, produces output
+ *
+ * Uses flat 2D buffer layout (xi = width*4, y) for both CPU and GPU.
+ * Channel index c = xi % 4, pixel x = xi / 4.
+ *
+ * The isotropy_type for each of the 4 orders is passed as int parameters:
+ * 0 = ISOTROPE, 1 = ISOPHOTE, 2 = GRADIENT
+ */
+
+#include "Halide.h"
+
+using namespace Halide;
+
+class DiffusePDEGenerator : public Generator<DiffusePDEGenerator> {
+public:
+ // Input buffers: flat 2D (xi, y) where xi = width * 4
+ Input<Buffer<float, 2>> HF{"HF"}; // high-frequency (wavelet detail)
+ Input<Buffer<float, 2>> LF{"LF"}; // low-frequency (wavelet blur)
+ Input<Buffer<uint8_t, 2>> mask_buf{"mask_buf"}; // per-pixel mask (width, height)
+
+ // Scalar parameters
+ Input<int> width_pixels{"width_pixels"};
+ Input<int> height{"height"};
+ Input<int> mult{"mult"}; // stencil stride = 1 << scale
+ Input<int> has_mask{"has_mask"}; // boolean: use mask?
+
+ // Anisotropy factors (4 orders): already squared user params
+ Input<float> anisotropy_0{"anisotropy_0"};
+ Input<float> anisotropy_1{"anisotropy_1"};
+ Input<float> anisotropy_2{"anisotropy_2"};
+ Input<float> anisotropy_3{"anisotropy_3"};
+
+ // Isotropy types (4 orders): 0=ISOTROPE, 1=ISOPHOTE, 2=GRADIENT
+ Input<int> isotropy_0{"isotropy_0"};
+ Input<int> isotropy_1{"isotropy_1"};
+ Input<int> isotropy_2{"isotropy_2"};
+ Input<int> isotropy_3{"isotropy_3"};
+
+ // ABCD coefficients (diffusion weights per order)
+ Input<float> A{"A"};
+ Input<float> B{"B"};
+ Input<float> C{"C"};
+ Input<float> D{"D"};
+
+ // Other parameters
+ Input<float> strength{"strength"};
+ Input<float> regularization_factor{"regularization_factor"};
+ Input<float> variance_threshold{"variance_threshold"};
+
+ Output<Buffer<float, 2>> output{"output"};
+
+ // No intermediate member Funcs: the whole stencil pipeline is inlined
+ // into output, so schedule() only needs to touch output itself.
+
+ void generate() {
+ Var xi("xi"), y("y");
+
+ // Channel and pixel coordinates
+ Expr ch = xi % 4;
+ Expr px = xi / 4;
+
+ // For 2D buffer access we need (xi_coord, y_coord)
+ // Helper: clamped flat xi for a given pixel offset
+ auto clamped_xi = [&](Expr pixel_x) -> Expr {
+ return clamp(pixel_x, 0, width_pixels - 1) * 4 + ch;
+ };
+ auto clamped_y = [&](Expr pixel_y) -> Expr {
+ return clamp(pixel_y, 0, height - 1);
+ };
+
+ // Fetch 3x3 stencil neighbors
+ // Stencil offsets: -mult, 0, +mult in pixel space
+
+ // The 9 neighbor accesses are inlined directly.
+ // For each of the 9 positions (ii,jj) the offset is (ii-1)*mult, (jj-1)*mult:
+
+ // Fetch all 9 neighbors for HF and LF
+ // Layout: [0]=(-1,-1), [1]=(0,-1), [2]=(+1,-1),
+ // [3]=(-1, 0), [4]=(0, 0), [5]=(+1, 0),
+ // [6]=(-1,+1), [7]=(0,+1), [8]=(+1,+1)
+ // But in darktable convention: first index is row (x=vertical), second is col (y=horizontal)
+ // i_neighbours maps: 0 -> row-mult, 1 -> row, 2 -> row+mult
+ // j_neighbours maps: 0 -> col-mult, 1 -> col, 2 -> col+mult
+ // pixel[3*ii+jj] where ii is row index, jj is col index
+ // So pixel[0] = (row-mult, col-mult), pixel[1] = (row-mult, col), etc.
+ // NOTE: in darktable, "x is vertical, y is horizontal"
+ // row = y in our coords, col = px in our coords
+ // i_neighbours = row offsets, j_neighbours = col offsets
+
+ // darktable: i = row (our y), j = col (our px)
+ // i_neighbours[0] = row - mult, [1] = row, [2] = row + mult
+ // j_neighbours[0] = col - mult, [1] = col, [2] = col + mult
+ // neighbor[3*ii+jj] accesses i_neighbours[ii] + j_neighbours[jj]
+ // So ii indexes row, jj indexes col
+
+ // fetch_rc(row_offset, col_offset) in pixel-space offsets
+ auto fetch_rc = [&](const Input<Buffer<float, 2>> &buf,
+ int row_off, int col_off) -> Expr {
+ Expr nx = clamped_xi(px + col_off * mult);
+ Expr ny = clamped_y(y + row_off * mult);
+ return buf(nx, ny);
+ };
+
+ // n[k] for k=0..8, k = 3*ii+jj, ii=row_idx(0,1,2), jj=col_idx(0,1,2)
+ // row_off = ii-1, col_off = jj-1
+ Expr hf[9], lf[9];
+ for(int ii = 0; ii < 3; ii++) {
+ for(int jj = 0; jj < 3; jj++) {
+ hf[3*ii+jj] = fetch_rc(HF, ii-1, jj-1);
+ lf[3*ii+jj] = fetch_rc(LF, ii-1, jj-1);
+ }
+ }
+
+ // Gradients (centered finite differences)
+ // find_gradients: xy[0] = (pixels[7] - pixels[1]) / 2 (x = vertical = row direction)
+ // xy[1] = (pixels[5] - pixels[3]) / 2 (y = horizontal = col direction)
+ Expr grad_lf_x = (lf[7] - lf[1]) / 2.0f;
+ Expr grad_lf_y = (lf[5] - lf[3]) / 2.0f;
+ Expr grad_hf_x = (hf[7] - hf[1]) / 2.0f;
+ Expr grad_hf_y = (hf[5] - hf[3]) / 2.0f;
+
+ // Magnitude and normalized direction for LF gradient
+ Expr mag_grad = sqrt(grad_lf_x * grad_lf_x + grad_lf_y * grad_lf_y);
+ Expr cos_grad = select(mag_grad != 0.0f, grad_lf_x / mag_grad, 1.0f);
+ Expr sin_grad = select(mag_grad != 0.0f, grad_lf_y / mag_grad, 0.0f);
+ Expr cos2_grad = cos_grad * cos_grad;
+ Expr sin2_grad = sin_grad * sin_grad;
+ Expr cossin_grad = cos_grad * sin_grad;
+
+ // Magnitude and normalized direction for HF gradient (called "laplacian" in dt)
+ Expr mag_lapl = sqrt(grad_hf_x * grad_hf_x + grad_hf_y * grad_hf_y);
+ Expr cos_lapl = select(mag_lapl != 0.0f, grad_hf_x / mag_lapl, 1.0f);
+ Expr sin_lapl = select(mag_lapl != 0.0f, grad_hf_y / mag_lapl, 0.0f);
+ Expr cos2_lapl = cos_lapl * cos_lapl;
+ Expr sin2_lapl = sin_lapl * sin_lapl;
+ Expr cossin_lapl = cos_lapl * sin_lapl;
+
+ // c2 = exp(-magnitude * anisotropy) for each of 4 orders
+ // darktable uses a fast approximate exp via integer bit tricks.
+ // We use Halide's fast_exp which is similarly approximate.
+ // Orders 0,2 use grad magnitude; orders 1,3 use lapl magnitude.
+ Expr c2_0 = fast_exp(-mag_grad * anisotropy_0);
+ Expr c2_1 = fast_exp(-mag_lapl * anisotropy_1);
+ Expr c2_2 = fast_exp(-mag_grad * anisotropy_2);
+ Expr c2_3 = fast_exp(-mag_lapl * anisotropy_3);
+
+ // Build 3x3 convolution kernels for each order based on isotropy type.
+ // kernel[k] for k=0..8 is the 3x3 stencil weight.
+ //
+ // For ISOTROPE: kernel = [0.25, 0.5, 0.25, 0.5, -3, 0.5, 0.25, 0.5, 0.25]
+ //
+ // For ISOPHOTE with c2, cos2, sin2, cossin:
+ // a[0][0] = cos2 + c2*sin2, a[1][1] = c2*cos2 + sin2
+ // a[0][1] = a[1][0] = (c2 - 1) * cossin
+ // Then build_matrix:
+ // b11 = a01/2, b13 = -b11, b22 = -2*(a00+a11)
+ // kernel = [b11, a11, b13, a00, b22, a00, b13, a11, b11]
+ //
+ // For GRADIENT: same as isophote but swapped:
+ // a[0][0] = c2*cos2 + sin2, a[1][1] = cos2 + c2*sin2
+ // a[0][1] = a[1][0] = (1 - c2) * cossin
+
+ // Helper: build kernel from isotropy_type, c2, cos2, sin2, cossin
+ // Returns 9 Exprs for the kernel weights
+ auto build_kern = [](Expr iso_type, Expr c2_val,
+ Expr cos2, Expr sin2, Expr cossin,
+ Expr kern[9]) {
+ // Isophote rotation matrix elements
+ Expr iso_a00 = cos2 + c2_val * sin2;
+ Expr iso_a11 = c2_val * cos2 + sin2;
+ Expr iso_a01 = (c2_val - 1.0f) * cossin;
+
+ // Gradient rotation matrix elements (swapped)
+ Expr grd_a00 = c2_val * cos2 + sin2;
+ Expr grd_a11 = cos2 + c2_val * sin2;
+ Expr grd_a01 = (1.0f - c2_val) * cossin;
+
+ // Select based on isotropy type
+ Expr a00 = select(iso_type == 0, 0.0f, // isotrope: doesn't use matrix
+ iso_type == 1, iso_a00,
+ grd_a00);
+ Expr a11 = select(iso_type == 0, 0.0f,
+ iso_type == 1, iso_a11,
+ grd_a11);
+ Expr a01 = select(iso_type == 0, 0.0f,
+ iso_type == 1, iso_a01,
+ grd_a01);
+
+ // build_matrix
+ Expr b11 = a01 / 2.0f;
+ Expr b13 = -b11;
+ Expr b22 = -2.0f * (a00 + a11);
+
+ // Anisotropic kernel
+ Expr aniso_kern[9] = {b11, a11, b13, a00, b22, a00, b13, a11, b11};
+
+ // Isotrope kernel (constant)
+ float iso_kern[9] = {0.25f, 0.5f, 0.25f, 0.5f, -3.0f, 0.5f, 0.25f, 0.5f, 0.25f};
+
+ for(int k = 0; k < 9; k++) {
+ kern[k] = select(iso_type == 0, iso_kern[k], aniso_kern[k]);
+ }
+ };
+
+ // Build kernels for each order
+ // Orders 0, 2 use grad angles; orders 1, 3 use lapl angles
+ Expr kern0[9], kern1[9], kern2[9], kern3[9];
+ build_kern(isotropy_0, c2_0, cos2_grad, sin2_grad, cossin_grad, kern0);
+ build_kern(isotropy_1, c2_1, cos2_lapl, sin2_lapl, cossin_lapl, kern1);
+ build_kern(isotropy_2, c2_2, cos2_grad, sin2_grad, cossin_grad, kern2);
+ build_kern(isotropy_3, c2_3, cos2_lapl, sin2_lapl, cossin_lapl, kern3);
+
+ // Convolve: derivatives[k] = sum_j(kern_k[j] * buffer[j])
+ // Orders 0,1 convolve with LF; orders 2,3 convolve with HF
+ Expr deriv0 = cast<float>(0), deriv1 = cast<float>(0);
+ Expr deriv2 = cast<float>(0), deriv3 = cast<float>(0);
+ Expr variance = cast<float>(0);
+
+ for(int k = 0; k < 9; k++) {
+ deriv0 += kern0[k] * lf[k];
+ deriv1 += kern1[k] * lf[k];
+ deriv2 += kern2[k] * hf[k];
+ deriv3 += kern3[k] * hf[k];
+ variance += hf[k] * hf[k];
+ }
+
+ // Regularize variance
+ Expr var_reg = variance_threshold + variance * regularization_factor;
+
+ // Accumulate
+ Expr acc = deriv0 * A + deriv1 * B + deriv2 * C + deriv3 * D;
+
+ // Final output: max(HF*strength + acc/var_reg + LF, 0)
+ Expr center_hf = hf[4]; // center pixel HF
+ Expr center_lf = lf[4]; // center pixel LF
+ Expr result = center_hf * strength + acc / var_reg + center_lf;
+
+ // Mask handling: if mask is 0, just output HF + LF (passthrough)
+ // mask is per-pixel (width, height), not per-channel
+ Expr mask_val = select(has_mask != 0,
+ cast<int>(mask_buf(px, y)),
+ 1);
+ output(xi, y) = select(mask_val != 0,
+ max(result, 0.0f),
+ center_hf + center_lf);
+ }
+
+ void schedule() {
+ Var xi = output.args()[0];
+ Var y = output.args()[1];
+
+ if(get_target().has_gpu_feature()) {
+ Var xio("xio"), yo("yo");
+ output.gpu_tile(xi, y, xio, yo, 64, 16);
+ } else {
+ // CPU: vectorize across channels/pixels, parallelize rows
+ Var xio("xio"), xii("xii");
+ output.split(xi, xio, xii, 32).vectorize(xii).parallel(y);
+ }
+ }
+};
+
+HALIDE_REGISTER_GENERATOR(DiffusePDEGenerator, diffuse_pde)
diff --git a/data/halide/generators/exposure_generator.cpp b/data/halide/generators/exposure_generator.cpp
new file mode 100644
index 000000000000..9939749aac84
--- /dev/null
+++ b/data/halide/generators/exposure_generator.cpp
@@ -0,0 +1,109 @@
+/*
+ This file is part of darktable,
+ Copyright (C) 2026 darktable developers.
+
+ darktable is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ darktable is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with darktable. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+/*
+ * Halide generator for darktable's exposure module.
+ *
+ * Algorithm: out[c] = (in[c] - black) * scale for c = R,G,B
+ * out[alpha] = in[alpha] (passthrough)
+ *
+ * darktable pixel layout: interleaved float RGBA, 4 channels per pixel.
+ * Memory layout: [R0,G0,B0,A0, R1,G1,B1,A1, ...]
+ * stride: c=1, x=4, y=4*width
+ *
+ * Two generators:
+ * "exposure" — 3D (c,x,y) for CPU with SIMD vectorization
+ * "exposure_flat" — 2D (xi,y) for Metal GPU, where xi = x*4+c
+ * This avoids stride issues with Metal's buffer handling.
+ */
+
+#include "Halide.h"
+
+using namespace Halide;
+
+// CPU generator: 3D (c, x, y) with interleaved layout
+class ExposureGenerator : public Generator<ExposureGenerator> {
+public:
+ Input<Buffer<float, 3>> input{"input"}; // (c, x, y), interleaved RGBA
+ Input<float> black{"black"}; // black level offset
+ Input<float> scale{"scale"}; // exposure multiplier
+ Output<Buffer<float, 3>> output{"output"};
+
+ void generate() {
+ Var c("c"), x("x"), y("y");
+
+ Expr val = input(c, x, y);
+ Expr adjusted = (val - black) * scale;
+ output(c, x, y) = select(c < 3, adjusted, val); // alpha (c==3) passthrough
+ }
+
+ void schedule() {
+ Var c = output.args()[0];
+ Var x = output.args()[1];
+ Var y = output.args()[2];
+
+ output.bound(c, 0, 4);
+ input.dim(0).set_bounds(0, 4);
+
+ // CPU schedule: parallel over rows, vectorize over x
+ output.reorder(c, x, y);
+ output.vectorize(c, 4);
+
+ Var xo("xo"), xi("xi");
+ output.split(x, xo, xi, 8);
+ output.vectorize(xi);
+ output.parallel(y);
+ }
+};
+
+// GPU generator: 2D flat buffer (xi, y) where xi covers width*4 floats per row.
+// Every 4th element (xi % 4 == 3) is alpha and passed through unchanged.
+class ExposureFlatGenerator : public Generator<ExposureFlatGenerator> {
+public:
+ Input<Buffer<float, 2>> input{"input"}; // (xi, y), xi = x*4+c flat index
+ Input<float> black{"black"}; // black level offset
+ Input<float> scale{"scale"}; // exposure multiplier
+ Output<Buffer<float, 2>> output{"output"};
+
+ void generate() {
+ Var xi("xi"), y("y");
+
+ Expr val = input(xi, y);
+ Expr adjusted = (val - black) * scale;
+ // Alpha passthrough: every 4th element (index 3, 7, 11, ...) is alpha
+ output(xi, y) = select((xi % 4) != 3, adjusted, val);
+ }
+
+ void schedule() {
+ Var xi = output.args()[0];
+ Var y = output.args()[1];
+
+ if(get_target().has_gpu_feature()) {
+ Var xio("xio"), yo("yo"); // outer tile vars; inner vars are implicit
+ // Tile: 64 floats wide (16 pixels * 4 channels) x 16 rows
+ output.gpu_tile(xi, y, xio, yo, 64, 16);
+ } else {
+ // Fallback CPU schedule (shouldn't normally be used)
+ output.vectorize(xi, 16);
+ output.parallel(y);
+ }
+ }
+};
+
+HALIDE_REGISTER_GENERATOR(ExposureGenerator, exposure)
+HALIDE_REGISTER_GENERATOR(ExposureFlatGenerator, exposure_flat)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e2d15b5b16cc..37b35a0fc652 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1082,6 +1082,15 @@ add_subdirectory(views ${DARKTABLE_LIBDIR}/views)
# Compile lighttable modules
add_subdirectory(libs ${DARKTABLE_LIBDIR}/plugins/lighttable)
+# Compile Halide generators (must be before iop so targets are available)
+if(USE_HALIDE)
+ add_subdirectory(${CMAKE_SOURCE_DIR}/data/halide ${CMAKE_BINARY_DIR}/halide)
+ add_definitions("-DHAVE_HALIDE")
+ if(HALIDE_HAS_VULKAN)
+ add_definitions("-DHAVE_HALIDE_VULKAN")
+ endif()
+endif()
+
# Compile darkroom modules
add_subdirectory(iop ${DARKTABLE_LIBDIR}/plugins)
diff --git a/src/common/darktable.c b/src/common/darktable.c
index 12fba89b069b..1133ef5057ac 100644
--- a/src/common/darktable.c
+++ b/src/common/darktable.c
@@ -835,6 +835,12 @@ char *version = g_strdup_printf(
" OpenMP -> DISABLED\n"
#endif
+#ifdef HAVE_HALIDE
+ " Halide -> ENABLED\n"
+#else
+ " Halide -> DISABLED\n"
+#endif
+
#ifdef HAVE_OPENCL
" OpenCL -> ENABLED\n"
#else
diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c
index a8ebe3df1b02..b4d9e149b829 100644
--- a/src/develop/pixelpipe_hb.c
+++ b/src/develop/pixelpipe_hb.c
@@ -457,6 +457,13 @@ void dt_dev_pixelpipe_create_nodes(dt_dev_pixelpipe_t *pipe,
g_direct_equal, NULL, dt_free_align_ptr);
memset(&piece->processed_roi_in, 0, sizeof(piece->processed_roi_in));
memset(&piece->processed_roi_out, 0, sizeof(piece->processed_roi_out));
+#ifdef HAVE_HALIDE
+ // Cache the Halide process function pointer (if the module exports one)
+ piece->process_halide = NULL;
+ if(module->so->module)
+ g_module_symbol(module->so->module, "process_halide",
+ (gpointer *)&piece->process_halide);
+#endif
dt_iop_init_pipe(piece->module, pipe, piece);
pipe->nodes = g_list_append(pipe->nodes, piece);
}
@@ -1444,6 +1451,19 @@ static gboolean _pixelpipe_process_on_CPU(dt_dev_pixelpipe_t *pipe,
}
else
{
+#ifdef HAVE_HALIDE
+ gboolean halide_ok = FALSE;
+ if(piece->process_halide)
+ {
+ halide_ok = (piece->process_halide(module, piece, input, *output, roi_in, roi_out) == 0);
+ if(halide_ok)
+ {
+ dt_print_pipe(DT_DEBUG_PIPE, "processed via Halide",
+ pipe, module, DT_DEVICE_CPU, roi_in, roi_out, "");
+ }
+ }
+ if(!halide_ok)
+#endif
module->process(module, piece, input, *output, roi_in, roi_out);
if(relevant)
{
@@ -2083,6 +2103,13 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe,
}
}
+#ifdef HAVE_HALIDE
+ // If this module has a Halide CPU implementation, prefer it over OpenCL.
+ // This will be replaced by Halide Metal/GPU dispatch in the future.
+ if(possible_cl && piece->process_halide)
+ possible_cl = FALSE;
+#endif
+
if(possible_cl)
{
const int cst_from = input_cst_cl;
diff --git a/src/develop/pixelpipe_hb.h b/src/develop/pixelpipe_hb.h
index fcb8157c62e2..7eabe590a40e 100644
--- a/src/develop/pixelpipe_hb.h
+++ b/src/develop/pixelpipe_hb.h
@@ -63,6 +63,16 @@ typedef struct dt_dev_pixelpipe_iop_t
uint8_t xtrans[6][6];
uint32_t filters;
GHashTable *raster_masks;
+
+#ifdef HAVE_HALIDE
+ // Cached Halide process function pointer (looked up once via g_module_symbol)
+ int (*process_halide)(struct dt_iop_module_t *self,
+ struct dt_dev_pixelpipe_iop_t *piece,
+ const void *const i,
+ void *const o,
+ const struct dt_iop_roi_t *const roi_in,
+ const struct dt_iop_roi_t *const roi_out);
+#endif
} dt_dev_pixelpipe_iop_t;
typedef enum dt_dev_pixelpipe_change_t
diff --git a/src/iop/CMakeLists.txt b/src/iop/CMakeLists.txt
index 92a69c1037f9..4bc8d8b001f0 100644
--- a/src/iop/CMakeLists.txt
+++ b/src/iop/CMakeLists.txt
@@ -78,6 +78,15 @@ add_iop(temperature "temperature.c" DEFAULT_VISIBLE)
add_iop(colorcorrection "colorcorrection.c")
add_iop(basicadj "basicadj.c")
add_iop(exposure "exposure.c" DEFAULT_VISIBLE)
+if(USE_HALIDE)
+ target_link_libraries(exposure halide_exposure)
+ if(APPLE)
+ target_link_libraries(exposure halide_exposure_metal)
+ endif()
+ if(HALIDE_HAS_VULKAN)
+ target_link_libraries(exposure halide_exposure_vulkan)
+ endif()
+endif()
add_iop(equalizer "equalizer.c")
add_iop(rgbcurve "rgbcurve.c")
add_iop(colorbalance "colorbalance.c" DEFAULT_VISIBLE)
@@ -151,6 +160,15 @@ add_iop(censorize "censorize.c")
add_iop(colorbalancergb "colorbalancergb.c")
add_iop(cacorrectrgb "cacorrectrgb.c")
add_iop(diffuse "diffuse.c")
+if(USE_HALIDE)
+ target_link_libraries(diffuse halide_bspline_decompose halide_diffuse_pde)
+ if(APPLE)
+ target_link_libraries(diffuse halide_bspline_decompose_metal halide_diffuse_pde_metal)
+ endif()
+ if(HALIDE_HAS_VULKAN)
+ target_link_libraries(diffuse halide_bspline_decompose_vulkan halide_diffuse_pde_vulkan)
+ endif()
+endif()
add_iop(blurs "blurs.c")
add_iop(sigmoid "sigmoid.c")
add_iop(agx "agx.c")
diff --git a/src/iop/diffuse.c b/src/iop/diffuse.c
index 27885e7b966a..c99842fc3c2d 100644
--- a/src/iop/diffuse.c
+++ b/src/iop/diffuse.c
@@ -1390,7 +1390,7 @@ void process(dt_iop_module_t *self,
float *restrict HF[MAX_NUM_SCALES];
for(int s = 0; s < scales; s++)
{
- HF[s] = out_of_memory ? NULL : dt_alloc_align_float(width * height * 4);
+ HF[s] = out_of_memory ? NULL : dt_calloc_align_float(width * height * 4);
if(!HF[s]) out_of_memory = TRUE;
}
@@ -1450,6 +1450,516 @@ void process(dt_iop_module_t *self,
if(HF[s]) dt_free_align(HF[s]);
}
+#ifdef HAVE_HALIDE
+#include "HalideRuntime.h"
+#include "halide_bspline_decompose.h"
+#include "halide_diffuse_pde.h"
+#ifdef __APPLE__
+#include "halide_bspline_decompose_metal.h"
+#include "halide_diffuse_pde_metal.h"
+#endif
+#ifdef HAVE_HALIDE_VULKAN
+#include "halide_bspline_decompose_vulkan.h"
+#include "halide_diffuse_pde_vulkan.h"
+#endif
+
+// Halide backend selector; NONE doubles as "not yet probed" in the cached-backend path
+typedef enum _halide_backend_t
+{
+  HALIDE_BACKEND_NONE = 0, // no usable backend found / probe not yet run
+  HALIDE_BACKEND_METAL,    // Apple Metal GPU (only compiled in on __APPLE__)
+  HALIDE_BACKEND_VULKAN,   // Vulkan GPU (only compiled in with HAVE_HALIDE_VULKAN)
+  HALIDE_BACKEND_CPU       // host CPU code path
+} _halide_backend_t;
+
+// Helper: wrap float RGBA host data as a flat (w*4 x h) 2D halide_buffer_t; dims must outlive buf
+static inline void _halide_init_flat_f32(halide_buffer_t *buf, halide_dimension_t dims[2],
+                                         float *host, const int row_floats, const int h)
+{
+  dims[0] = (halide_dimension_t){0, row_floats, 1};  // x: min 0, extent w*4 floats, stride 1
+  dims[1] = (halide_dimension_t){0, h, row_floats};  // y: min 0, extent h rows, stride = one row
+  memset(buf, 0, sizeof(*buf));                      // zeroes device handle, device_interface, flags
+  buf->host = (uint8_t *)host;
+  buf->type = (struct halide_type_t){halide_type_float, 32, 1}; // 32-bit float, 1 lane
+  buf->dimensions = 2;
+  buf->dim = dims;                                   // caller-owned array; not copied by Halide
+}
+
+// Helper: wrap a (w x h) uint8 mask as a 2D halide_buffer_t; dims must outlive buf
+static inline void _halide_init_mask_u8(halide_buffer_t *buf, halide_dimension_t dims[2],
+                                        const uint8_t *host, const int w, const int h)
+{
+  dims[0] = (halide_dimension_t){0, w, 1};           // x: min 0, extent w, stride 1
+  dims[1] = (halide_dimension_t){0, h, w};           // y: min 0, extent h, stride = one row
+  memset(buf, 0, sizeof(*buf));                      // zeroes device handle, device_interface, flags
+  buf->host = (uint8_t *)host;                       // const cast: the API is non-const; used as input only
+  buf->type = (struct halide_type_t){halide_type_uint, 8, 1}; // 8-bit unsigned, 1 lane
+  buf->dimensions = 2;
+  buf->dim = dims;                                   // caller-owned array; not copied by Halide
+}
+
+// Dispatch one bspline decompose (in -> lf + hf at pixel spacing mult) to the chosen backend.
+// Returns 0 on success. Does NOT copy results to host or free device memory.
+static inline int _halide_run_bspline(_halide_backend_t backend,
+                                      halide_buffer_t *in_buf,
+                                      halide_buffer_t *lf_buf,
+                                      halide_buffer_t *hf_buf,
+                                      const int w, const int h, const int mult)
+{
+  switch(backend)
+  {
+#ifdef __APPLE__
+    case HALIDE_BACKEND_METAL:
+      return dt_halide_bspline_decompose_metal(in_buf, w, h, mult, lf_buf, hf_buf);
+#endif
+#ifdef HAVE_HALIDE_VULKAN
+    case HALIDE_BACKEND_VULKAN:
+      return dt_halide_bspline_decompose_vulkan(in_buf, w, h, mult, lf_buf, hf_buf);
+#endif
+    case HALIDE_BACKEND_CPU:
+      return dt_halide_bspline_decompose(in_buf, w, h, mult, lf_buf, hf_buf);
+    default:
+      return -1; // backend not compiled into this build (or NONE)
+  }
+}
+
+// Dispatch one PDE diffusion step (hf + lf + optional mask -> out) to the chosen backend.
+// Returns 0 on success. Does NOT copy results to host or free device memory.
+static inline int _halide_run_pde(_halide_backend_t backend,
+                                  halide_buffer_t *hf_buf,
+                                  halide_buffer_t *lf_buf,
+                                  halide_buffer_t *mask_buf,
+                                  halide_buffer_t *out_buf,
+                                  const int w, const int h, const int mult,
+                                  const int has_mask_i,
+                                  const dt_aligned_pixel_t anisotropy,
+                                  const dt_isotropy_t isotropy_type[4],
+                                  const dt_aligned_pixel_t ABCD,
+                                  const float strength,
+                                  const float regularization_factor,
+                                  const float variance_threshold)
+{
+  switch(backend)
+  {
+#ifdef __APPLE__
+    case HALIDE_BACKEND_METAL:
+      return dt_halide_diffuse_pde_metal(
+          hf_buf, lf_buf, mask_buf,
+          w, h, mult, has_mask_i,
+          anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3],
+          (int)isotropy_type[0], (int)isotropy_type[1],
+          (int)isotropy_type[2], (int)isotropy_type[3],
+          ABCD[0], ABCD[1], ABCD[2], ABCD[3],
+          strength, regularization_factor, variance_threshold,
+          out_buf);
+#endif
+#ifdef HAVE_HALIDE_VULKAN
+    case HALIDE_BACKEND_VULKAN:
+      return dt_halide_diffuse_pde_vulkan(
+          hf_buf, lf_buf, mask_buf,
+          w, h, mult, has_mask_i,
+          anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3],
+          (int)isotropy_type[0], (int)isotropy_type[1],
+          (int)isotropy_type[2], (int)isotropy_type[3],
+          ABCD[0], ABCD[1], ABCD[2], ABCD[3],
+          strength, regularization_factor, variance_threshold,
+          out_buf);
+#endif
+    case HALIDE_BACKEND_CPU:
+      return dt_halide_diffuse_pde(
+          hf_buf, lf_buf, mask_buf,
+          w, h, mult, has_mask_i,
+          anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3],
+          (int)isotropy_type[0], (int)isotropy_type[1],
+          (int)isotropy_type[2], (int)isotropy_type[3],
+          ABCD[0], ABCD[1], ABCD[2], ABCD[3],
+          strength, regularization_factor, variance_threshold,
+          out_buf);
+    default:
+      return -1; // backend not compiled into this build (or NONE)
+  }
+}
+
+// Detect the best available Halide backend with a probe bspline call.
+// On success, the buffers will have device allocations ready for reuse.
+// Fix: after a failed GPU probe, re-mark the input buffer host-dirty. A partial
+// run may have cleared the flag after uploading; once the device allocation is
+// freed, the next backend would otherwise skip the host->device copy and read garbage.
+static _halide_backend_t _halide_probe_backend(halide_buffer_t *in_buf,
+                                               halide_buffer_t *lf_buf,
+                                               halide_buffer_t *hf_buf,
+                                               const int w, const int h, const int mult)
+{
+#ifdef __APPLE__
+  if(_halide_run_bspline(HALIDE_BACKEND_METAL, in_buf, lf_buf, hf_buf, w, h, mult) == 0)
+    return HALIDE_BACKEND_METAL;
+  dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Metal probe failed, trying next backend");
+  // Clean up failed device state before trying the next backend
+  halide_device_free(NULL, in_buf);
+  halide_device_free(NULL, lf_buf);
+  halide_device_free(NULL, hf_buf); in_buf->flags = halide_buffer_flag_host_dirty;
+#endif
+#ifdef HAVE_HALIDE_VULKAN
+  if(_halide_run_bspline(HALIDE_BACKEND_VULKAN, in_buf, lf_buf, hf_buf, w, h, mult) == 0)
+    return HALIDE_BACKEND_VULKAN;
+  dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Vulkan probe failed, trying CPU");
+  halide_device_free(NULL, in_buf);
+  halide_device_free(NULL, lf_buf);
+  halide_device_free(NULL, hf_buf); in_buf->flags = halide_buffer_flag_host_dirty;
+#endif
+  if(_halide_run_bspline(HALIDE_BACKEND_CPU, in_buf, lf_buf, hf_buf, w, h, mult) == 0)
+    return HALIDE_BACKEND_CPU;
+  return HALIDE_BACKEND_NONE;
+}
+
+static inline const char *_halide_backend_name(_halide_backend_t b) // human-readable label for debug logs
+{
+  switch(b)
+  {
+    case HALIDE_BACKEND_METAL: return "Metal GPU";
+    case HALIDE_BACKEND_VULKAN: return "Vulkan GPU";
+    case HALIDE_BACKEND_CPU: return "CPU";
+    default: return "none";
+  }
+}
+
+// Halide-accelerated wavelets_process with GPU buffer persistence: all
+// intermediate buffers stay on the device across the whole decompose +
+// reconstruct loop; only the final output is copied to host. Returns FALSE on failure.
+static inline gboolean _wavelets_process_halide(const float *const restrict in,
+                                                float *const restrict reconstructed,
+                                                const uint8_t *const restrict mask,
+                                                const size_t width,
+                                                const size_t height,
+                                                const dt_iop_diffuse_data_t *const data,
+                                                const float final_radius, // NOTE(review): unused here -- scales arrive precomputed; confirm and drop
+                                                const float zoom,
+                                                const int scales,
+                                                const gboolean has_mask,
+                                                float *const restrict HF[MAX_NUM_SCALES],
+                                                float *const restrict LF_odd,
+                                                float *const restrict LF_even,
+                                                _halide_backend_t *const backend_ptr) // in/out: backend cached across iterations
+{
+  const int row_floats = (int)width * 4; // RGBA interleaved, viewed as flat floats
+  const int w = (int)width;
+  const int h = (int)height;
+
+  const dt_aligned_pixel_t anisotropy
+      = { compute_anisotropy_factor(data->anisotropy_first),
+          compute_anisotropy_factor(data->anisotropy_second),
+          compute_anisotropy_factor(data->anisotropy_third),
+          compute_anisotropy_factor(data->anisotropy_fourth) };
+
+  const dt_isotropy_t DT_ALIGNED_PIXEL isotropy_type[4]
+      = { check_isotropy_mode(data->anisotropy_first),
+          check_isotropy_mode(data->anisotropy_second),
+          check_isotropy_mode(data->anisotropy_third),
+          check_isotropy_mode(data->anisotropy_fourth) };
+
+  const float regularization = powf(10.f, data->regularization) - 1.f;
+  const float variance_threshold = powf(10.f, data->variance_threshold);
+
+  // --- Create persistent halide_buffer_t wrappers for all buffers ---
+  // Each buffer gets its own dims array (Halide may write to these).
+  halide_dimension_t dims_in[2], dims_lf_odd[2], dims_lf_even[2], dims_out[2];
+  halide_dimension_t dims_hf[MAX_NUM_SCALES][2];
+  halide_dimension_t dims_mask[2];
+
+  halide_buffer_t buf_in, buf_lf_odd, buf_lf_even, buf_out;
+  halide_buffer_t buf_hf[MAX_NUM_SCALES];
+  halide_buffer_t buf_mask;
+
+  _halide_init_flat_f32(&buf_in, dims_in, (float *)in, row_floats, h); // const cast: input only
+  _halide_init_flat_f32(&buf_lf_odd, dims_lf_odd, LF_odd, row_floats, h);
+  _halide_init_flat_f32(&buf_lf_even, dims_lf_even, LF_even, row_floats, h);
+  _halide_init_flat_f32(&buf_out, dims_out, reconstructed, row_floats, h);
+  for(int s = 0; s < scales; s++)
+    _halide_init_flat_f32(&buf_hf[s], dims_hf[s], HF[s], row_floats, h);
+  _halide_init_mask_u8(&buf_mask, dims_mask, mask, w, h);
+
+  // Mark input as host-dirty so the first kernel uploads it to the device
+  buf_in.flags = halide_buffer_flag_host_dirty;
+  if(has_mask) buf_mask.flags = halide_buffer_flag_host_dirty;
+
+  // --- Use cached backend or probe on first call ---
+  _halide_backend_t backend;
+  gboolean probed = FALSE;
+
+  if(backend_ptr && *backend_ptr != HALIDE_BACKEND_NONE)
+  {
+    // Reuse the backend cached by a previous iteration to skip the probe run
+    backend = *backend_ptr;
+    probed = FALSE; // not probing: scale 0 still has to be decomposed below
+  }
+  else
+  {
+    // First call: probe backend using the first bspline decompose call.
+    // This also performs the scale 0 decompose: in -> LF_odd + HF[0]
+    backend = _halide_probe_backend(
+        &buf_in, &buf_lf_odd, &buf_hf[0], w, h, 1 /* mult for scale 0 */);
+
+    if(backend == HALIDE_BACKEND_NONE)
+    {
+      dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide: all backends failed for bspline probe");
+      return FALSE;
+    }
+
+    dt_print(DT_DEBUG_PIPE, "[diffuse] Halide backend: %s",
+             _halide_backend_name(backend));
+
+    // Cache backend for subsequent calls to avoid re-probing
+    if(backend_ptr) *backend_ptr = backend;
+    probed = TRUE; // Probe already did scale 0
+  }
+
+  // Scale 0: only run if we didn't probe (probe already did it)
+  float *restrict residual_ptr = LF_odd; // host pointer tracking which LF buffer holds the coarse residual
+
+  if(!probed)
+  {
+    // Cached-backend iteration: need to run the scale 0 bspline explicitly
+    int err = _halide_run_bspline(backend, &buf_in, &buf_lf_odd, &buf_hf[0], w, h, 1);
+    if(err)
+    {
+      dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide bspline failed at scale 0 (err=%d)", err);
+      goto cleanup_fail;
+    }
+  }
+
+  for(int s = 1; s < scales; ++s)
+  {
+    const int mult = 1 << s;
+
+    halide_buffer_t *b_in = (s % 2 != 0) ? &buf_lf_odd : &buf_lf_even; // ping-pong between the two LF buffers
+    halide_buffer_t *b_out = (s % 2 != 0) ? &buf_lf_even : &buf_lf_odd;
+
+    int err = _halide_run_bspline(backend, b_in, b_out, &buf_hf[s], w, h, mult);
+    if(err)
+    {
+      dt_print(DT_DEBUG_ALWAYS,
+               "[diffuse] Halide bspline failed at scale %d (err=%d)", s, err);
+      goto cleanup_fail;
+    }
+
+    residual_ptr = (s % 2 != 0) ? LF_even : LF_odd; // this scale's LF output
+  }
+
+  // --- PDE diffusion reconstruction (reverse scale order) ---
+  {
+    halide_buffer_t *residual_buf = (residual_ptr == LF_even) ? &buf_lf_even : &buf_lf_odd;
+    halide_buffer_t *temp_buf = (residual_ptr == LF_even) ? &buf_lf_odd : &buf_lf_even;
+
+    int count = 0;
+    for(int s = scales - 1; s > -1; --s)
+    {
+      const int mult = 1 << s;
+      const float current_radius = equivalent_sigma_at_step(B_SPLINE_SIGMA, s);
+      const float real_radius = current_radius * zoom;
+      const float norm =
+          expf(-sqf(real_radius - (float)data->radius_center) / sqf(data->radius));
+      const dt_aligned_pixel_t ABCD = { data->first * KAPPA * norm,
+                                        data->second * KAPPA * norm,
+                                        data->third * KAPPA * norm,
+                                        data->fourth * KAPPA * norm };
+      const float strength = data->sharpness * norm + 1.f;
+      const float regularization_factor = regularization * sqf(current_radius) / 9.f;
+
+      halide_buffer_t *b_in, *b_out;
+
+      if(count == 0) // first step reads the coarse residual
+      {
+        b_in = residual_buf;
+        b_out = temp_buf;
+      }
+      else if(count % 2 != 0) // then ping-pong between residual and temp
+      {
+        b_in = temp_buf;
+        b_out = residual_buf;
+      }
+      else
+      {
+        b_in = residual_buf;
+        b_out = temp_buf;
+      }
+
+      // Final scale writes to the output buffer
+      if(s == 0) b_out = &buf_out;
+
+      int pde_err = _halide_run_pde(backend, &buf_hf[s], b_in, &buf_mask, b_out,
+                                    w, h, mult, has_mask ? 1 : 0,
+                                    anisotropy, isotropy_type, ABCD,
+                                    strength, regularization_factor, variance_threshold);
+      if(pde_err)
+      {
+        dt_print(DT_DEBUG_ALWAYS,
+                 "[diffuse] Halide PDE failed at scale %d (err=%d)", s, pde_err);
+        goto cleanup_fail;
+      }
+      count++;
+    }
+  }
+
+  // --- Copy only the final output to host ---
+  if(backend != HALIDE_BACKEND_CPU)
+  {
+    halide_copy_to_host(NULL, &buf_out);
+    // NOTE(review): halide_copy_to_host blocks until the copy completes; this
+    // host read looks like redundant belt-and-braces -- confirm and consider dropping.
+    volatile float sync_check = ((float *)reconstructed)[0];
+    (void)sync_check;
+  }
+
+  // --- Free all device memory ---
+  if(backend != HALIDE_BACKEND_CPU)
+  {
+    halide_device_free(NULL, &buf_in);
+    halide_device_free(NULL, &buf_lf_odd);
+    halide_device_free(NULL, &buf_lf_even);
+    halide_device_free(NULL, &buf_out);
+    halide_device_free(NULL, &buf_mask);
+    for(int s = 0; s < scales; s++)
+      halide_device_free(NULL, &buf_hf[s]);
+  }
+
+  return TRUE;
+
+cleanup_fail:
+  if(backend != HALIDE_BACKEND_CPU)
+  {
+    halide_device_free(NULL, &buf_in);
+    halide_device_free(NULL, &buf_lf_odd);
+    halide_device_free(NULL, &buf_lf_even);
+    halide_device_free(NULL, &buf_out);
+    halide_device_free(NULL, &buf_mask);
+    for(int s = 0; s < scales; s++)
+      halide_device_free(NULL, &buf_hf[s]);
+  }
+  return FALSE;
+}
+
+__attribute__((visibility("default"))) // exported: the pipe resolves this via dt_dev_pixelpipe_iop_t.process_halide
+int process_halide(dt_iop_module_t *self,
+                   dt_dev_pixelpipe_iop_t *piece,
+                   const void *const restrict ivoid,
+                   void *const restrict ovoid,
+                   const dt_iop_roi_t *const roi_in,
+                   const dt_iop_roi_t *const roi_out)
+{
+  const gboolean fastmode = piece->pipe->type & DT_DEV_PIXELPIPE_FAST;
+  const dt_iop_diffuse_data_t *const data = piece->data;
+
+  const size_t width = roi_out->width;
+  const size_t height = roi_out->height;
+
+  if(fastmode) // fast pipe: copy input through unchanged, skip diffusion
+  {
+    const size_t ch = piece->colors;
+    dt_iop_copy_image_roi(ovoid, ivoid, ch, roi_in, roi_out);
+    return 0;
+  }
+
+  uint8_t *const restrict mask = dt_alloc_align_uint8(width * height);
+
+  float *restrict in = DT_IS_ALIGNED((float *const restrict)ivoid);
+  float *const restrict out = DT_IS_ALIGNED((float *const restrict)ovoid);
+
+  float *restrict temp1 = NULL, *restrict temp2 = NULL; // NULL-init: 'finish:' frees these on every exit path
+  float *restrict LF_odd = NULL, *restrict LF_even = NULL;
+  float *restrict temp_in = NULL;
+  float *restrict temp_out = NULL;
+
+  gboolean out_of_memory = !mask // note: '||' short-circuits -- the buffers stay NULL when mask alloc failed
+      || !dt_iop_alloc_image_buffers(self, roi_in, roi_out,
+                                     4 | DT_IMGSZ_OUTPUT, &temp1,
+                                     4 | DT_IMGSZ_OUTPUT, &temp2,
+                                     4 | DT_IMGSZ_OUTPUT, &LF_odd,
+                                     4 | DT_IMGSZ_OUTPUT, &LF_even,
+                                     0, NULL);
+
+  const float scale = fmaxf(piece->iscale / roi_in->scale, 1.f);
+  const float final_radius = (data->radius + data->radius_center) * 2.f / scale;
+
+  const int iterations = MAX(ceilf((float)data->iterations), 1);
+  const int diffusion_scales = num_steps_to_reach_equivalent_sigma(B_SPLINE_SIGMA, final_radius);
+  const int scales = CLAMP(diffusion_scales, 1, MAX_NUM_SCALES);
+
+  float *restrict HF[MAX_NUM_SCALES];
+  for(int s = 0; s < scales; s++)
+  {
+    HF[s] = out_of_memory ? NULL : dt_calloc_align_float(width * height * 4);
+    if(!HF[s]) out_of_memory = TRUE;
+  }
+
+  int ret = 0; // 0 = success; -1 tells the pipe to fall back to the plain process()
+
+  if(out_of_memory)
+  {
+    dt_iop_copy_image_roi(ovoid, ivoid, piece->colors, roi_in, roi_out);
+    dt_control_log(_("diffuse/sharpen failed to allocate memory, check your RAM settings"));
+    ret = -1;
+    goto finish;
+  }
+
+  const gboolean has_mask = (data->threshold > 0.f);
+  if(has_mask)
+  {
+    build_mask(in, mask, data->threshold, roi_out->width, roi_out->height);
+    inpaint_mask(temp1, in, mask, roi_out->width, roi_out->height);
+    in = temp1; // further iterations read the inpainted copy
+  }
+
+  // Cache backend across iterations to skip re-probing
+  _halide_backend_t cached_backend = HALIDE_BACKEND_NONE;
+
+  for(int it = 0; it < iterations; it++)
+  {
+    double iter_start = dt_get_wtime();
+
+    if(it == 0) // first iteration reads the (possibly inpainted) input
+    {
+      temp_in = in;
+      temp_out = temp2;
+    }
+    else if(it % 2 == 0) // then ping-pong between the two temp buffers
+    {
+      temp_in = temp1;
+      temp_out = temp2;
+    }
+    else
+    {
+      temp_in = temp2;
+      temp_out = temp1;
+    }
+
+    if(it == iterations - 1) // last iteration writes straight to the output
+      temp_out = out;
+
+    if(!_wavelets_process_halide(temp_in, temp_out, mask,
+                                 roi_out->width, roi_out->height,
+                                 data, final_radius, scale, scales, has_mask, HF, LF_odd, LF_even,
+                                 &cached_backend))
+    {
+      // Halide failed, signal fallback to regular process()
+      ret = -1;
+      goto finish;
+    }
+
+    double iter_time = dt_get_wtime() - iter_start;
+    dt_print(DT_DEBUG_PIPE, "[diffuse] Iteration %d/%d took %.3f secs",
+             it + 1, iterations, iter_time);
+  }
+
+finish:
+  if(mask) dt_free_align(mask);
+  if(temp1) dt_free_align(temp1);
+  if(temp2) dt_free_align(temp2);
+  if(LF_even) dt_free_align(LF_even);
+  if(LF_odd) dt_free_align(LF_odd);
+  for(int s = 0; s < scales; s++)
+    if(HF[s]) dt_free_align(HF[s]);
+
+  return ret;
+}
+#endif /* HAVE_HALIDE */
+
#if HAVE_OPENCL
static inline cl_int wavelets_process_cl(const int devid,
cl_mem in,
diff --git a/src/iop/exposure.c b/src/iop/exposure.c
index 5276cb91228f..00f78ac385b7 100644
--- a/src/iop/exposure.c
+++ b/src/iop/exposure.c
@@ -1,6 +1,6 @@
/*
This file is part of darktable,
- Copyright (C) 2009-2024 darktable developers.
+ Copyright (C) 2009-2026 darktable developers.
darktable is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -508,6 +508,140 @@ static void _process_common_setup(dt_iop_module_t *self,
d->scale = 1.0 / (white - d->black);
}
+#ifdef HAVE_HALIDE
+#include "HalideRuntime.h"
+#include "halide_exposure.h"
+#ifdef HAVE_HALIDE_VULKAN
+#include "halide_exposure_vulkan.h"
+#endif
+#ifdef __APPLE__
+#include "halide_exposure_metal.h"
+#endif
+
+// Run one exposure invocation through a GPU backend using flat 2D (w*4 x h) buffers.
+// Copies the result back to host and frees device memory. Returns 0 on success.
+static int _halide_try_gpu_flat(const void *const i, void *const o,
+                                int width, int height, float black, float scale,
+                                int (*gpu_func)(struct halide_buffer_t *,
+                                                float, float,
+                                                struct halide_buffer_t *))
+{
+  const int row_floats = width * 4; // RGBA interleaved, treated as flat floats
+  halide_dimension_t flat_in_dims[2] = {
+      {0, row_floats, 1},      // x: extent w*4 floats, stride 1
+      {0, height, row_floats}, // y: extent h rows, stride = one row
+  };
+  halide_dimension_t flat_out_dims[2] = {
+      {0, row_floats, 1},
+      {0, height, row_floats},
+  };
+
+  halide_buffer_t flat_in = {0};
+  flat_in.host = (uint8_t *)i; // const cast: input only
+  flat_in.type = (struct halide_type_t){halide_type_float, 32, 1};
+  flat_in.dimensions = 2;
+  flat_in.dim = flat_in_dims;
+  flat_in.flags = halide_buffer_flag_host_dirty; // first kernel call uploads the host data
+
+  halide_buffer_t flat_out = {0};
+  flat_out.host = (uint8_t *)o;
+  flat_out.type = (struct halide_type_t){halide_type_float, 32, 1};
+  flat_out.dimensions = 2;
+  flat_out.dim = flat_out_dims;
+
+  int err = gpu_func(&flat_in, black, scale, &flat_out);
+  if(!err)
+    err = halide_copy_to_host(NULL, &flat_out); // bring the result back to host memory
+
+  halide_device_free(NULL, &flat_in);  // always release device allocations, even on failure
+  halide_device_free(NULL, &flat_out);
+  return err;
+}
+
+__attribute__((visibility("default"))) // exported: the pipe resolves this via dt_dev_pixelpipe_iop_t.process_halide
+int process_halide(dt_iop_module_t *self,
+                   dt_dev_pixelpipe_iop_t *piece,
+                   const void *const i,
+                   void *const o,
+                   const dt_iop_roi_t *const roi_in,
+                   const dt_iop_roi_t *const roi_out)
+{
+  const dt_iop_exposure_data_t *const d = piece->data;
+
+  _process_common_setup(self, piece); // refreshes d->black and d->scale from the params
+
+  const int width = roi_out->width; // NOTE(review): assumes roi_in covers the same area as roi_out -- confirm
+  const int height = roi_out->height;
+  const float black = d->black;
+  const float scale = d->scale;
+
+  int err = -1; // any non-zero value means "fall through to the next backend"
+
+  // GPU backends: try Metal (macOS) then Vulkan (cross-platform)
+#ifdef __APPLE__
+  err = _halide_try_gpu_flat(i, o, width, height, black, scale, dt_halide_exposure_metal);
+  if(!err)
+    dt_print(DT_DEBUG_PIPE, "[exposure] Halide backend: Metal GPU");
+  else
+    dt_print(DT_DEBUG_PIPE, "[exposure] Halide Metal failed (%d), trying Vulkan", err);
+#endif
+
+#ifdef HAVE_HALIDE_VULKAN
+  if(err)
+  {
+    err = _halide_try_gpu_flat(i, o, width, height, black, scale, dt_halide_exposure_vulkan);
+    if(!err)
+      dt_print(DT_DEBUG_PIPE, "[exposure] Halide backend: Vulkan GPU");
+    else
+      dt_print(DT_DEBUG_PIPE, "[exposure] Halide Vulkan failed (%d), trying CPU", err);
+  }
+#endif
+
+  // CPU fallback: the CPU pipeline was generated for a 3D (c, x, y) interleaved layout
+  if(err)
+  {
+    halide_dimension_t in_dims[3] = {
+        {0, 4, 1},             // c: 4 channels, stride 1 (interleaved RGBA)
+        {0, width, 4},         // x: stride one pixel
+        {0, height, 4 * width},// y: stride one row
+    };
+    halide_dimension_t out_dims[3] = {
+        {0, 4, 1},
+        {0, width, 4},
+        {0, height, 4 * width},
+    };
+
+    halide_buffer_t in_buf = {0};
+    in_buf.host = (uint8_t *)i; // const cast: input only
+    in_buf.type = (struct halide_type_t){halide_type_float, 32, 1};
+    in_buf.dimensions = 3;
+    in_buf.dim = in_dims;
+
+    halide_buffer_t out_buf = {0};
+    out_buf.host = (uint8_t *)o;
+    out_buf.type = (struct halide_type_t){halide_type_float, 32, 1};
+    out_buf.dimensions = 3;
+    out_buf.dim = out_dims;
+
+    err = dt_halide_exposure(&in_buf, black, scale, &out_buf);
+    if(!err)
+      dt_print(DT_DEBUG_PIPE, "[exposure] Halide backend: CPU");
+  }
+
+  if(err) // nothing worked: report and let the caller fall back to process()
+  {
+    dt_print(DT_DEBUG_ALWAYS,
+             "[exposure] Halide processing failed with error %d, falling back", err);
+    return err;
+  }
+
+  for(int k = 0; k < 3; k++) // keep the pipe's tracked channel maxima in sync with the applied gain
+    piece->pipe->dsc.processed_maximum[k] *= d->scale;
+
+  return 0;
+}
+#endif /* HAVE_HALIDE */
+
#ifdef HAVE_OPENCL
int process_cl(dt_iop_module_t *self,
dt_dev_pixelpipe_iop_t *piece,