From 99d5596e77c0e850a66f991ac9cf520afa9fe7e5 Mon Sep 17 00:00:00 2001 From: Mario Zimmermann Date: Sun, 19 Apr 2026 19:18:45 +0200 Subject: [PATCH 1/4] prepare halide and exposure module --- .ci/Brewfile | 1 + DefineOptions.cmake | 1 + data/halide/CMakeLists.txt | 49 +++++++ data/halide/generators/exposure_generator.cpp | 109 ++++++++++++++ src/CMakeLists.txt | 9 ++ src/common/darktable.c | 6 + src/develop/pixelpipe_hb.c | 27 ++++ src/develop/pixelpipe_hb.h | 10 ++ src/iop/CMakeLists.txt | 9 ++ src/iop/exposure.c | 136 +++++++++++++++++- 10 files changed, 356 insertions(+), 1 deletion(-) create mode 100644 data/halide/CMakeLists.txt create mode 100644 data/halide/generators/exposure_generator.cpp diff --git a/.ci/Brewfile b/.ci/Brewfile index 55d6941cac48..07accba836bc 100644 --- a/.ci/Brewfile +++ b/.ci/Brewfile @@ -29,6 +29,7 @@ brew 'gtk+3' brew 'gtk-mac-integration' brew 'icu4c' brew 'graphicsmagick' +brew 'halide' brew 'imagemagick' brew 'intltool' brew 'iso-codes' diff --git a/DefineOptions.cmake b/DefineOptions.cmake index 7d2ea4a354a2..92b6c14f2cfb 100644 --- a/DefineOptions.cmake +++ b/DefineOptions.cmake @@ -8,6 +8,7 @@ option(USE_LIBSECRET "Build libsecret password storage back-end" ON) option(USE_UNITY "Use libunity to report progress in the launcher" OFF) option(USE_OPENMP "Use OpenMP threading support." ON) option(USE_OPENCL "Use OpenCL support." ON) +option(USE_HALIDE "Use Halide for optimized image processing." OFF) option(USE_GRAPHICSMAGICK "Use GraphicsMagick library for image import." ON) option(USE_IMAGEMAGICK "Use ImageMagick library for image import." 
OFF) option(USE_DARKTABLE_PROFILING OFF) diff --git a/data/halide/CMakeLists.txt b/data/halide/CMakeLists.txt new file mode 100644 index 000000000000..f8e667afc6db --- /dev/null +++ b/data/halide/CMakeLists.txt @@ -0,0 +1,49 @@ +# Halide-based image processing generators for darktable +# +# This builds Halide generator executables at compile time, then runs them +# to produce optimized static libraries (.h + .a) for each target platform. +# The generated code is linked into individual IOP plugin modules. + +find_package(Halide REQUIRED) + +# Build the generator executable (runs on the host at build time) +add_halide_generator(dt_halide_generators + SOURCES + generators/exposure_generator.cpp +) + +# Generate CPU-optimized code for the host platform +# "host" auto-detects arch + OS + SIMD features (NEON on ARM64, AVX on x86, etc.) +add_halide_library(halide_exposure + FROM dt_halide_generators + GENERATOR exposure + TARGETS host + FUNCTION_NAME dt_halide_exposure +) + +# Generate Metal GPU code for Apple Silicon (macOS only) +# Uses the "flat" 2D generator to avoid stride issues with Metal's buffer handling. +# The flat generator treats each row as width*4 contiguous floats. +if(APPLE) + add_halide_library(halide_exposure_metal + FROM dt_halide_generators + GENERATOR exposure_flat + TARGETS host-metal + FUNCTION_NAME dt_halide_exposure_metal + ) +endif() + +# Generate Vulkan GPU code (cross-platform: Linux, Windows, macOS via MoltenVK) +# Uses the same flat 2D generator as Metal. Vulkan loader is dlopen'd at runtime, +# so no link-time dependency — if no Vulkan driver is present, the call simply fails +# and we fall through to CPU Halide. 
+find_package(Vulkan QUIET) +if(Vulkan_FOUND) + add_halide_library(halide_exposure_vulkan + FROM dt_halide_generators + GENERATOR exposure_flat + TARGETS host-vulkan + FUNCTION_NAME dt_halide_exposure_vulkan + ) + set(HALIDE_HAS_VULKAN TRUE PARENT_SCOPE) +endif() diff --git a/data/halide/generators/exposure_generator.cpp b/data/halide/generators/exposure_generator.cpp new file mode 100644 index 000000000000..9939749aac84 --- /dev/null +++ b/data/halide/generators/exposure_generator.cpp @@ -0,0 +1,109 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. + + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with darktable. If not, see . +*/ + +/* + * Halide generator for darktable's exposure module. + * + * Algorithm: out[c] = (in[c] - black) * scale for c = R,G,B + * out[alpha] = in[alpha] (passthrough) + * + * darktable pixel layout: interleaved float RGBA, 4 channels per pixel. + * Memory layout: [R0,G0,B0,A0, R1,G1,B1,A1, ...] + * stride: c=1, x=4, y=4*width + * + * Two generators: + * "exposure" — 3D (c,x,y) for CPU with SIMD vectorization + * "exposure_flat" — 2D (xi,y) for Metal GPU, where xi = x*4+c + * This avoids stride issues with Metal's buffer handling. 
+ */ + +#include "Halide.h" + +using namespace Halide; + +// CPU generator: 3D (c, x, y) with interleaved layout +class ExposureGenerator : public Generator { +public: + Input> input{"input"}; + Input black{"black"}; + Input scale{"scale"}; + Output> output{"output"}; + + void generate() { + Var c("c"), x("x"), y("y"); + + Expr val = input(c, x, y); + Expr adjusted = (val - black) * scale; + output(c, x, y) = select(c < 3, adjusted, val); + } + + void schedule() { + Var c = output.args()[0]; + Var x = output.args()[1]; + Var y = output.args()[2]; + + output.bound(c, 0, 4); + input.dim(0).set_bounds(0, 4); + + // CPU schedule: parallel over rows, vectorize over x + output.reorder(c, x, y); + output.vectorize(c, 4); + + Var xo("xo"), xi("xi"); + output.split(x, xo, xi, 8); + output.vectorize(xi); + output.parallel(y); + } +}; + +// GPU generator: 2D flat buffer (xi, y) where xi covers width*4 floats per row. +// Every 4th element (xi % 4 == 3) is alpha and passed through unchanged. +class ExposureFlatGenerator : public Generator { +public: + Input> input{"input"}; + Input black{"black"}; + Input scale{"scale"}; + Output> output{"output"}; + + void generate() { + Var xi("xi"), y("y"); + + Expr val = input(xi, y); + Expr adjusted = (val - black) * scale; + // Alpha passthrough: every 4th element (index 3, 7, 11, ...) 
is alpha + output(xi, y) = select((xi % 4) != 3, adjusted, val); + } + + void schedule() { + Var xi = output.args()[0]; + Var y = output.args()[1]; + + if(get_target().has_gpu_feature()) { + Var xio("xio"), xii("xii"), yo("yo"), yi("yi"); + // Tile: 64 floats wide (16 pixels * 4 channels) x 16 rows + output.gpu_tile(xi, y, xio, yo, 64, 16); + } else { + // Fallback CPU schedule (shouldn't normally be used) + output.vectorize(xi, 16); + output.parallel(y); + } + } +}; + +HALIDE_REGISTER_GENERATOR(ExposureGenerator, exposure) +HALIDE_REGISTER_GENERATOR(ExposureFlatGenerator, exposure_flat) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e2d15b5b16cc..37b35a0fc652 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1082,6 +1082,15 @@ add_subdirectory(views ${DARKTABLE_LIBDIR}/views) # Compile lighttable modules add_subdirectory(libs ${DARKTABLE_LIBDIR}/plugins/lighttable) +# Compile Halide generators (must be before iop so targets are available) +if(USE_HALIDE) + add_subdirectory(${CMAKE_SOURCE_DIR}/data/halide ${CMAKE_BINARY_DIR}/halide) + add_definitions("-DHAVE_HALIDE") + if(HALIDE_HAS_VULKAN) + add_definitions("-DHAVE_HALIDE_VULKAN") + endif() +endif() + # Compile darkroom modules add_subdirectory(iop ${DARKTABLE_LIBDIR}/plugins) diff --git a/src/common/darktable.c b/src/common/darktable.c index 12fba89b069b..1133ef5057ac 100644 --- a/src/common/darktable.c +++ b/src/common/darktable.c @@ -835,6 +835,12 @@ char *version = g_strdup_printf( " OpenMP -> DISABLED\n" #endif +#ifdef HAVE_HALIDE + " Halide -> ENABLED\n" +#else + " Halide -> DISABLED\n" +#endif + #ifdef HAVE_OPENCL " OpenCL -> ENABLED\n" #else diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c index a8ebe3df1b02..b4d9e149b829 100644 --- a/src/develop/pixelpipe_hb.c +++ b/src/develop/pixelpipe_hb.c @@ -457,6 +457,13 @@ void dt_dev_pixelpipe_create_nodes(dt_dev_pixelpipe_t *pipe, g_direct_equal, NULL, dt_free_align_ptr); memset(&piece->processed_roi_in, 0, 
sizeof(piece->processed_roi_in)); memset(&piece->processed_roi_out, 0, sizeof(piece->processed_roi_out)); +#ifdef HAVE_HALIDE + // Cache the Halide process function pointer (if the module exports one) + piece->process_halide = NULL; + if(module->so->module) + g_module_symbol(module->so->module, "process_halide", + (gpointer *)&piece->process_halide); +#endif dt_iop_init_pipe(piece->module, pipe, piece); pipe->nodes = g_list_append(pipe->nodes, piece); } @@ -1444,6 +1451,19 @@ static gboolean _pixelpipe_process_on_CPU(dt_dev_pixelpipe_t *pipe, } else { +#ifdef HAVE_HALIDE + gboolean halide_ok = FALSE; + if(piece->process_halide) + { + halide_ok = (piece->process_halide(module, piece, input, *output, roi_in, roi_out) == 0); + if(halide_ok) + { + dt_print_pipe(DT_DEBUG_PIPE, "processed via Halide", + pipe, module, DT_DEVICE_CPU, roi_in, roi_out, ""); + } + } + if(!halide_ok) +#endif module->process(module, piece, input, *output, roi_in, roi_out); if(relevant) { @@ -2083,6 +2103,13 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe, } } +#ifdef HAVE_HALIDE + // If this module has a Halide CPU implementation, prefer it over OpenCL. + // This will be replaced by Halide Metal/GPU dispatch in the future. 
+ if(possible_cl && piece->process_halide) + possible_cl = FALSE; +#endif + if(possible_cl) { const int cst_from = input_cst_cl; diff --git a/src/develop/pixelpipe_hb.h b/src/develop/pixelpipe_hb.h index fcb8157c62e2..7eabe590a40e 100644 --- a/src/develop/pixelpipe_hb.h +++ b/src/develop/pixelpipe_hb.h @@ -63,6 +63,16 @@ typedef struct dt_dev_pixelpipe_iop_t uint8_t xtrans[6][6]; uint32_t filters; GHashTable *raster_masks; + +#ifdef HAVE_HALIDE + // Cached Halide process function pointer (looked up once via g_module_symbol) + int (*process_halide)(struct dt_iop_module_t *self, + struct dt_dev_pixelpipe_iop_t *piece, + const void *const i, + void *const o, + const struct dt_iop_roi_t *const roi_in, + const struct dt_iop_roi_t *const roi_out); +#endif } dt_dev_pixelpipe_iop_t; typedef enum dt_dev_pixelpipe_change_t diff --git a/src/iop/CMakeLists.txt b/src/iop/CMakeLists.txt index 92a69c1037f9..6777052a8a26 100644 --- a/src/iop/CMakeLists.txt +++ b/src/iop/CMakeLists.txt @@ -78,6 +78,15 @@ add_iop(temperature "temperature.c" DEFAULT_VISIBLE) add_iop(colorcorrection "colorcorrection.c") add_iop(basicadj "basicadj.c") add_iop(exposure "exposure.c" DEFAULT_VISIBLE) +if(USE_HALIDE) + target_link_libraries(exposure halide_exposure) + if(APPLE) + target_link_libraries(exposure halide_exposure_metal) + endif() + if(HALIDE_HAS_VULKAN) + target_link_libraries(exposure halide_exposure_vulkan) + endif() +endif() add_iop(equalizer "equalizer.c") add_iop(rgbcurve "rgbcurve.c") add_iop(colorbalance "colorbalance.c" DEFAULT_VISIBLE) diff --git a/src/iop/exposure.c b/src/iop/exposure.c index 5276cb91228f..00f78ac385b7 100644 --- a/src/iop/exposure.c +++ b/src/iop/exposure.c @@ -1,6 +1,6 @@ /* This file is part of darktable, - Copyright (C) 2009-2024 darktable developers. + Copyright (C) 2009-2026 darktable developers. 
darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -508,6 +508,140 @@ static void _process_common_setup(dt_iop_module_t *self, d->scale = 1.0 / (white - d->black); } +#ifdef HAVE_HALIDE +#include "HalideRuntime.h" +#include "halide_exposure.h" +#ifdef HAVE_HALIDE_VULKAN +#include "halide_exposure_vulkan.h" +#endif +#ifdef __APPLE__ +#include "halide_exposure_metal.h" +#endif + +// Try a Halide GPU backend using flat 2D buffers. +// Returns 0 on success, non-zero on failure. +static int _halide_try_gpu_flat(const void *const i, void *const o, + int width, int height, float black, float scale, + int (*gpu_func)(struct halide_buffer_t *, + float, float, + struct halide_buffer_t *)) +{ + const int row_floats = width * 4; + halide_dimension_t flat_in_dims[2] = { + {0, row_floats, 1}, + {0, height, row_floats}, + }; + halide_dimension_t flat_out_dims[2] = { + {0, row_floats, 1}, + {0, height, row_floats}, + }; + + halide_buffer_t flat_in = {0}; + flat_in.host = (uint8_t *)i; + flat_in.type = (struct halide_type_t){halide_type_float, 32, 1}; + flat_in.dimensions = 2; + flat_in.dim = flat_in_dims; + flat_in.flags = halide_buffer_flag_host_dirty; + + halide_buffer_t flat_out = {0}; + flat_out.host = (uint8_t *)o; + flat_out.type = (struct halide_type_t){halide_type_float, 32, 1}; + flat_out.dimensions = 2; + flat_out.dim = flat_out_dims; + + int err = gpu_func(&flat_in, black, scale, &flat_out); + if(!err) + err = halide_copy_to_host(NULL, &flat_out); + + halide_device_free(NULL, &flat_in); + halide_device_free(NULL, &flat_out); + return err; +} + +__attribute__((visibility("default"))) +int process_halide(dt_iop_module_t *self, + dt_dev_pixelpipe_iop_t *piece, + const void *const i, + void *const o, + const dt_iop_roi_t *const roi_in, + const dt_iop_roi_t *const roi_out) +{ + const dt_iop_exposure_data_t *const d = piece->data; + + _process_common_setup(self, piece); + + const int width 
= roi_out->width; + const int height = roi_out->height; + const float black = d->black; + const float scale = d->scale; + + int err = -1; + + // GPU backends: try Metal (macOS) then Vulkan (cross-platform) +#ifdef __APPLE__ + err = _halide_try_gpu_flat(i, o, width, height, black, scale, dt_halide_exposure_metal); + if(!err) + dt_print(DT_DEBUG_PIPE, "[exposure] Halide backend: Metal GPU"); + else + dt_print(DT_DEBUG_PIPE, "[exposure] Halide Metal failed (%d), trying Vulkan", err); +#endif + +#ifdef HAVE_HALIDE_VULKAN + if(err) + { + err = _halide_try_gpu_flat(i, o, width, height, black, scale, dt_halide_exposure_vulkan); + if(!err) + dt_print(DT_DEBUG_PIPE, "[exposure] Halide backend: Vulkan GPU"); + else + dt_print(DT_DEBUG_PIPE, "[exposure] Halide Vulkan failed (%d), trying CPU", err); + } +#endif + + // CPU fallback: uses 3D interleaved buffer layout + if(err) + { + halide_dimension_t in_dims[3] = { + {0, 4, 1}, + {0, width, 4}, + {0, height, 4 * width}, + }; + halide_dimension_t out_dims[3] = { + {0, 4, 1}, + {0, width, 4}, + {0, height, 4 * width}, + }; + + halide_buffer_t in_buf = {0}; + in_buf.host = (uint8_t *)i; + in_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + in_buf.dimensions = 3; + in_buf.dim = in_dims; + + halide_buffer_t out_buf = {0}; + out_buf.host = (uint8_t *)o; + out_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + out_buf.dimensions = 3; + out_buf.dim = out_dims; + + err = dt_halide_exposure(&in_buf, black, scale, &out_buf); + if(!err) + dt_print(DT_DEBUG_PIPE, "[exposure] Halide backend: CPU"); + } + + if(err) + { + dt_print(DT_DEBUG_ALWAYS, + "[exposure] Halide processing failed with error %d, falling back", err); + return err; + } + + for(int k = 0; k < 3; k++) + piece->pipe->dsc.processed_maximum[k] *= d->scale; + + return 0; +} +#endif /* HAVE_HALIDE */ + #ifdef HAVE_OPENCL int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, From f87e68850db1bf7c227218d74c2b8887ecdf4a44 Mon Sep 17 00:00:00 
2001 From: Mario Zimmermann Date: Mon, 20 Apr 2026 19:32:08 +0200 Subject: [PATCH 2/4] convert diffuse/sharpen module to halide --- data/halide/CMakeLists.txt | 61 +- data/halide/generators/bspline_generator.cpp | 131 +++++ .../generators/diffuse_pde_generator.cpp | 297 ++++++++++ src/iop/CMakeLists.txt | 9 + src/iop/diffuse.c | 540 ++++++++++++++++++ 5 files changed, 1037 insertions(+), 1 deletion(-) create mode 100644 data/halide/generators/bspline_generator.cpp create mode 100644 data/halide/generators/diffuse_pde_generator.cpp diff --git a/data/halide/CMakeLists.txt b/data/halide/CMakeLists.txt index f8e667afc6db..c4470ad1ac85 100644 --- a/data/halide/CMakeLists.txt +++ b/data/halide/CMakeLists.txt @@ -6,12 +6,22 @@ find_package(Halide REQUIRED) -# Build the generator executable (runs on the host at build time) +# Build the generator executables (run on the host at build time) add_halide_generator(dt_halide_generators SOURCES generators/exposure_generator.cpp ) +add_halide_generator(dt_halide_bspline_generators + SOURCES + generators/bspline_generator.cpp +) + +add_halide_generator(dt_halide_diffuse_generators + SOURCES + generators/diffuse_pde_generator.cpp +) + # Generate CPU-optimized code for the host platform # "host" auto-detects arch + OS + SIMD features (NEON on ARM64, AVX on x86, etc.) add_halide_library(halide_exposure @@ -37,6 +47,43 @@ endif() # Uses the same flat 2D generator as Metal. Vulkan loader is dlopen'd at runtime, # so no link-time dependency — if no Vulkan driver is present, the call simply fails # and we fall through to CPU Halide. 
+# B-spline wavelet decomposition for diffuse/sharpen module +# CPU target: separable 5-tap blur + detail extraction in one pass +add_halide_library(halide_bspline_decompose + FROM dt_halide_bspline_generators + GENERATOR bspline_decompose + TARGETS host + FUNCTION_NAME dt_halide_bspline_decompose +) + +# Metal GPU target for B-spline decomposition (macOS only) +if(APPLE) + add_halide_library(halide_bspline_decompose_metal + FROM dt_halide_bspline_generators + GENERATOR bspline_decompose + TARGETS host-metal + FUNCTION_NAME dt_halide_bspline_decompose_metal + ) +endif() + +# Heat PDE diffusion kernel for diffuse/sharpen module +add_halide_library(halide_diffuse_pde + FROM dt_halide_diffuse_generators + GENERATOR diffuse_pde + TARGETS host + FUNCTION_NAME dt_halide_diffuse_pde +) + +# Metal GPU target for PDE diffusion (macOS only) +if(APPLE) + add_halide_library(halide_diffuse_pde_metal + FROM dt_halide_diffuse_generators + GENERATOR diffuse_pde + TARGETS host-metal + FUNCTION_NAME dt_halide_diffuse_pde_metal + ) +endif() + find_package(Vulkan QUIET) if(Vulkan_FOUND) add_halide_library(halide_exposure_vulkan @@ -45,5 +92,17 @@ if(Vulkan_FOUND) TARGETS host-vulkan FUNCTION_NAME dt_halide_exposure_vulkan ) + add_halide_library(halide_bspline_decompose_vulkan + FROM dt_halide_bspline_generators + GENERATOR bspline_decompose + TARGETS host-vulkan + FUNCTION_NAME dt_halide_bspline_decompose_vulkan + ) + add_halide_library(halide_diffuse_pde_vulkan + FROM dt_halide_diffuse_generators + GENERATOR diffuse_pde + TARGETS host-vulkan + FUNCTION_NAME dt_halide_diffuse_pde_vulkan + ) set(HALIDE_HAS_VULKAN TRUE PARENT_SCOPE) endif() diff --git a/data/halide/generators/bspline_generator.cpp b/data/halide/generators/bspline_generator.cpp new file mode 100644 index 000000000000..d629dd9a3977 --- /dev/null +++ b/data/halide/generators/bspline_generator.cpp @@ -0,0 +1,131 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. 
+ + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with darktable. If not, see . +*/ + +/* + * Halide generator for B-spline wavelet decomposition used by + * darktable's diffuse/sharpen module. + * + * Algorithm: separable 5-tap [1,4,6,4,1]/16 blur with strided access + * (à-trous wavelet transform). Produces both the low-frequency (blurred) + * and high-frequency (detail = input - blurred) components. + * + * darktable pixel layout: interleaved float RGBA, 4 channels per pixel. + * We use the "flat" 2D approach (xi = x*4+c, y) that works on both + * CPU and GPU, matching the exposure_flat pattern. + * + * The stride parameter "mult" controls the spacing between filter taps: + * mult = 1 << scale (scale 0 -> mult=1, scale 1 -> mult=2, etc.) + * + * Boundary handling: clamp to edge (matching darktable's MIN/MAX clamping). + * + * Negative values in the blur output are clipped to 0 (matching darktable's + * clip_negatives=TRUE in decompose_2D_Bspline). + */ + +#include "Halide.h" + +using namespace Halide; + +// Flat 2D layout (xi, y), xi = width * 4. +// Separable filter: vertical pass then horizontal pass. +// mult is the tap spacing in pixels (= 1 << scale). +// width_pixels is the image width in pixels (not floats). 
+class BsplineDecomposeGenerator : public Generator { +public: + Input> input{"input"}; // (xi, y) where xi = width*4 + Input width_pixels{"width_pixels"}; // image width in pixels + Input height{"height"}; // image height in pixels + Input mult{"mult"}; // tap spacing = 1 << scale + Output> lf{"lf"}; // low-frequency (blurred) + Output> hf{"hf"}; // high-frequency (detail) + + // Intermediate stage accessible to schedule() + Func vert{"vert"}; + + void generate() { + Var xi("xi"), y("y"); + + // B-spline filter weights + const float w0 = 1.0f / 16.0f; + const float w1 = 4.0f / 16.0f; + const float w2 = 6.0f / 16.0f; + + // Extract pixel x and channel from flat index + Expr px = xi / 4; + Expr ch = xi % 4; + + // Vertical pass: blur along y with spacing mult + // Clamp row indices to [0, height-1] + Expr r0 = clamp(y - 2 * mult, 0, height - 1); + Expr r1 = clamp(y - mult, 0, height - 1); + Expr r2 = y; + Expr r3 = clamp(y + mult, 0, height - 1); + Expr r4 = clamp(y + 2 * mult, 0, height - 1); + + vert(xi, y) = w0 * input(xi, r0) + + w1 * input(xi, r1) + + w2 * input(xi, r2) + + w1 * input(xi, r3) + + w0 * input(xi, r4); + + // Horizontal pass: blur along x with spacing mult on vertical result + // Clamp at pixel boundaries, preserving channel offset + Expr p0 = clamp(px - 2 * mult, 0, width_pixels - 1) * 4 + ch; + Expr p1 = clamp(px - mult, 0, width_pixels - 1) * 4 + ch; + Expr p2 = xi; + Expr p3 = clamp(px + mult, 0, width_pixels - 1) * 4 + ch; + Expr p4 = clamp(px + 2 * mult, 0, width_pixels - 1) * 4 + ch; + + Expr blur = w0 * vert(p0, y) + + w1 * vert(p1, y) + + w2 * vert(p2, y) + + w1 * vert(p3, y) + + w0 * vert(p4, y); + + // Clip negatives (matching darktable behavior) + lf(xi, y) = max(blur, 0.0f); + hf(xi, y) = input(xi, y) - lf(xi, y); + } + + void schedule() { + Var xi = lf.args()[0]; + Var y = lf.args()[1]; + + if(get_target().has_gpu_feature()) { + // GPU schedule: compute vert inline (fused into the horizontal pass) + // then tile the outputs + Var 
xio("xio"), yo("yo"); + vert.compute_at(lf, yo); + lf.gpu_tile(xi, y, xio, yo, 64, 16); + hf.gpu_tile(xi, y, xio, yo, 64, 16); + } else { + // CPU schedule: compute vert per row strip for cache locality, + // vectorize, parallelize over rows + Var xio("xio"), xii("xii"); + + // Compute vertical pass per output row (store full row) + vert.compute_at(lf, y).vectorize(xi, 32); + + lf.split(xi, xio, xii, 32).vectorize(xii).parallel(y); + // hf reads lf, so compute hf after lf at the same row + hf.split(xi, xio, xii, 32).vectorize(xii).parallel(y); + } + } +}; + +HALIDE_REGISTER_GENERATOR(BsplineDecomposeGenerator, bspline_decompose) diff --git a/data/halide/generators/diffuse_pde_generator.cpp b/data/halide/generators/diffuse_pde_generator.cpp new file mode 100644 index 000000000000..b55488d85c72 --- /dev/null +++ b/data/halide/generators/diffuse_pde_generator.cpp @@ -0,0 +1,297 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. + + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with darktable. If not, see . +*/ + +/* + * Halide generator for the heat PDE diffusion kernel used by + * darktable's diffuse/sharpen module. + * + * This implements the anisotropic PDE solver from: + * https://www.researchgate.net/publication/220663968 + * modified for multi-scale wavelet framework. + * + * Per pixel, the algorithm: + * 1. Fetches 3x3 stencil from HF and LF buffers (strided by mult) + * 2. 
Computes gradients on LF and HF (centered finite differences) + * 3. For 4 derivative orders, computes anisotropic diffusion kernels: + * - c2 = exp(-magnitude * anisotropy) + * - rotation matrix based on isotropy mode (isotrope/isophote/gradient) + * - builds 3x3 convolution kernel from rotation matrix + * 4. Convolves kernels with LF (orders 0,1) and HF (orders 2,3) + * 5. Regularizes variance, accumulates, produces output + * + * Uses flat 2D buffer layout (xi = width*4, y) for both CPU and GPU. + * Channel index c = xi % 4, pixel x = xi / 4. + * + * The isotropy_type for each of the 4 orders is passed as int parameters: + * 0 = ISOTROPE, 1 = ISOPHOTE, 2 = GRADIENT + */ + +#include "Halide.h" + +using namespace Halide; + +class DiffusePDEGenerator : public Generator { +public: + // Input buffers: flat 2D (xi, y) where xi = width * 4 + Input> HF{"HF"}; // high-frequency (wavelet detail) + Input> LF{"LF"}; // low-frequency (wavelet blur) + Input> mask_buf{"mask_buf"}; // per-pixel mask (width, height) + + // Scalar parameters + Input width_pixels{"width_pixels"}; + Input height{"height"}; + Input mult{"mult"}; // stencil stride = 1 << scale + Input has_mask{"has_mask"}; // boolean: use mask? 
+ + // Anisotropy factors (4 orders): already squared user params + Input anisotropy_0{"anisotropy_0"}; + Input anisotropy_1{"anisotropy_1"}; + Input anisotropy_2{"anisotropy_2"}; + Input anisotropy_3{"anisotropy_3"}; + + // Isotropy types (4 orders): 0=ISOTROPE, 1=ISOPHOTE, 2=GRADIENT + Input isotropy_0{"isotropy_0"}; + Input isotropy_1{"isotropy_1"}; + Input isotropy_2{"isotropy_2"}; + Input isotropy_3{"isotropy_3"}; + + // ABCD coefficients (diffusion weights per order) + Input A{"A"}; + Input B{"B"}; + Input C{"C"}; + Input D{"D"}; + + // Other parameters + Input strength{"strength"}; + Input regularization_factor{"regularization_factor"}; + Input variance_threshold{"variance_threshold"}; + + Output> output{"output"}; + + // Intermediate Funcs for schedule access + Func neighbour_HF{"neighbour_HF"}, neighbour_LF{"neighbour_LF"}; + + void generate() { + Var xi("xi"), y("y"); + + // Channel and pixel coordinates + Expr ch = xi % 4; + Expr px = xi / 4; + + // For 2D buffer access we need (xi_coord, y_coord) + // Helper: clamped flat xi for a given pixel offset + auto clamped_xi = [&](Expr pixel_x) -> Expr { + return clamp(pixel_x, 0, width_pixels - 1) * 4 + ch; + }; + auto clamped_y = [&](Expr pixel_y) -> Expr { + return clamp(pixel_y, 0, height - 1); + }; + + // Fetch 3x3 stencil neighbors + // Stencil offsets: -mult, 0, +mult in pixel space + + // Actually, let's just inline the 9 neighbor accesses directly. 
+ // For each of the 9 positions (ii,jj) where offset = (ii-1)*mult, (jj-1)*mult: + + // Fetch all 9 neighbors for HF and LF + // Layout: [0]=(-1,-1), [1]=(0,-1), [2]=(+1,-1), + // [3]=(-1, 0), [4]=(0, 0), [5]=(+1, 0), + // [6]=(-1,+1), [7]=(0,+1), [8]=(+1,+1) + // But in darktable convention: first index is row (x=vertical), second is col (y=horizontal) + // i_neighbours maps: 0 -> row-mult, 1 -> row, 2 -> row+mult + // j_neighbours maps: 0 -> col-mult, 1 -> col, 2 -> col+mult + // pixel[3*ii+jj] where ii is row index, jj is col index + // So pixel[0] = (row-mult, col-mult), pixel[1] = (row-mult, col), etc. + // NOTE: in darktable, "x is vertical, y is horizontal" + // row = y in our coords, col = px in our coords + // i_neighbours = row offsets, j_neighbours = col offsets + + // darktable: i = row (our y), j = col (our px) + // i_neighbours[0] = row - mult, [1] = row, [2] = row + mult + // j_neighbours[0] = col - mult, [1] = col, [2] = col + mult + // neighbor[3*ii+jj] accesses i_neighbours[ii] + j_neighbours[jj] + // So ii indexes row, jj indexes col + + // fetch_rc(row_offset, col_offset) in pixel-space offsets + auto fetch_rc = [&](const Input> &buf, + int row_off, int col_off) -> Expr { + Expr nx = clamped_xi(px + col_off * mult); + Expr ny = clamped_y(y + row_off * mult); + return buf(nx, ny); + }; + + // n[k] for k=0..8, k = 3*ii+jj, ii=row_idx(0,1,2), jj=col_idx(0,1,2) + // row_off = ii-1, col_off = jj-1 + Expr hf[9], lf[9]; + for(int ii = 0; ii < 3; ii++) { + for(int jj = 0; jj < 3; jj++) { + hf[3*ii+jj] = fetch_rc(HF, ii-1, jj-1); + lf[3*ii+jj] = fetch_rc(LF, ii-1, jj-1); + } + } + + // Gradients (centered finite differences) + // find_gradients: xy[0] = (pixels[7] - pixels[1]) / 2 (x = vertical = row direction) + // xy[1] = (pixels[5] - pixels[3]) / 2 (y = horizontal = col direction) + Expr grad_lf_x = (lf[7] - lf[1]) / 2.0f; + Expr grad_lf_y = (lf[5] - lf[3]) / 2.0f; + Expr grad_hf_x = (hf[7] - hf[1]) / 2.0f; + Expr grad_hf_y = (hf[5] - hf[3]) / 2.0f; 
+ + // Magnitude and normalized direction for LF gradient + Expr mag_grad = sqrt(grad_lf_x * grad_lf_x + grad_lf_y * grad_lf_y); + Expr cos_grad = select(mag_grad != 0.0f, grad_lf_x / mag_grad, 1.0f); + Expr sin_grad = select(mag_grad != 0.0f, grad_lf_y / mag_grad, 0.0f); + Expr cos2_grad = cos_grad * cos_grad; + Expr sin2_grad = sin_grad * sin_grad; + Expr cossin_grad = cos_grad * sin_grad; + + // Magnitude and normalized direction for HF gradient (called "laplacian" in dt) + Expr mag_lapl = sqrt(grad_hf_x * grad_hf_x + grad_hf_y * grad_hf_y); + Expr cos_lapl = select(mag_lapl != 0.0f, grad_hf_x / mag_lapl, 1.0f); + Expr sin_lapl = select(mag_lapl != 0.0f, grad_hf_y / mag_lapl, 0.0f); + Expr cos2_lapl = cos_lapl * cos_lapl; + Expr sin2_lapl = sin_lapl * sin_lapl; + Expr cossin_lapl = cos_lapl * sin_lapl; + + // c2 = exp(-magnitude * anisotropy) for each of 4 orders + // darktable uses a fast approximate exp via integer bit tricks. + // We use Halide's fast_exp which is similarly approximate. + // Orders 0,2 use grad magnitude; orders 1,3 use lapl magnitude. + Expr c2_0 = fast_exp(-mag_grad * anisotropy_0); + Expr c2_1 = fast_exp(-mag_lapl * anisotropy_1); + Expr c2_2 = fast_exp(-mag_grad * anisotropy_2); + Expr c2_3 = fast_exp(-mag_lapl * anisotropy_3); + + // Build 3x3 convolution kernels for each order based on isotropy type. + // kernel[k] for k=0..8 is the 3x3 stencil weight. 
+ // + // For ISOTROPE: kernel = [0.25, 0.5, 0.25, 0.5, -3, 0.5, 0.25, 0.5, 0.25] + // + // For ISOPHOTE with c2, cos2, sin2, cossin: + // a[0][0] = cos2 + c2*sin2, a[1][1] = c2*cos2 + sin2 + // a[0][1] = a[1][0] = (c2 - 1) * cossin + // Then build_matrix: + // b11 = a01/2, b13 = -b11, b22 = -2*(a00+a11) + // kernel = [b11, a11, b13, a00, b22, a00, b13, a11, b11] + // + // For GRADIENT: same as isophote but swapped: + // a[0][0] = c2*cos2 + sin2, a[1][1] = cos2 + c2*sin2 + // a[0][1] = a[1][0] = (1 - c2) * cossin + + // Helper: build kernel from isotropy_type, c2, cos2, sin2, cossin + // Returns 9 Exprs for the kernel weights + auto build_kern = [](Expr iso_type, Expr c2_val, + Expr cos2, Expr sin2, Expr cossin, + Expr kern[9]) { + // Isophote rotation matrix elements + Expr iso_a00 = cos2 + c2_val * sin2; + Expr iso_a11 = c2_val * cos2 + sin2; + Expr iso_a01 = (c2_val - 1.0f) * cossin; + + // Gradient rotation matrix elements (swapped) + Expr grd_a00 = c2_val * cos2 + sin2; + Expr grd_a11 = cos2 + c2_val * sin2; + Expr grd_a01 = (1.0f - c2_val) * cossin; + + // Select based on isotropy type + Expr a00 = select(iso_type == 0, 0.0f, // isotrope: doesn't use matrix + iso_type == 1, iso_a00, + grd_a00); + Expr a11 = select(iso_type == 0, 0.0f, + iso_type == 1, iso_a11, + grd_a11); + Expr a01 = select(iso_type == 0, 0.0f, + iso_type == 1, iso_a01, + grd_a01); + + // build_matrix + Expr b11 = a01 / 2.0f; + Expr b13 = -b11; + Expr b22 = -2.0f * (a00 + a11); + + // Anisotropic kernel + Expr aniso_kern[9] = {b11, a11, b13, a00, b22, a00, b13, a11, b11}; + + // Isotrope kernel (constant) + float iso_kern[9] = {0.25f, 0.5f, 0.25f, 0.5f, -3.0f, 0.5f, 0.25f, 0.5f, 0.25f}; + + for(int k = 0; k < 9; k++) { + kern[k] = select(iso_type == 0, iso_kern[k], aniso_kern[k]); + } + }; + + // Build kernels for each order + // Orders 0, 2 use grad angles; orders 1, 3 use lapl angles + Expr kern0[9], kern1[9], kern2[9], kern3[9]; + build_kern(isotropy_0, c2_0, cos2_grad, sin2_grad, 
cossin_grad, kern0);
+    build_kern(isotropy_1, c2_1, cos2_lapl, sin2_lapl, cossin_lapl, kern1);
+    build_kern(isotropy_2, c2_2, cos2_grad, sin2_grad, cossin_grad, kern2);
+    build_kern(isotropy_3, c2_3, cos2_lapl, sin2_lapl, cossin_lapl, kern3);
+
+    // Convolve: derivatives[k] = sum_j(kern_k[j] * buffer[j])
+    // Orders 0,1 convolve with LF; orders 2,3 convolve with HF
+    Expr deriv0 = cast<float>(0), deriv1 = cast<float>(0);
+    Expr deriv2 = cast<float>(0), deriv3 = cast<float>(0);
+    Expr variance = cast<float>(0);
+
+    for(int k = 0; k < 9; k++) {
+      deriv0 += kern0[k] * lf[k];
+      deriv1 += kern1[k] * lf[k];
+      deriv2 += kern2[k] * hf[k];
+      deriv3 += kern3[k] * hf[k];
+      variance += hf[k] * hf[k];
+    }
+
+    // Regularize variance
+    Expr var_reg = variance_threshold + variance * regularization_factor;
+
+    // Accumulate
+    Expr acc = deriv0 * A + deriv1 * B + deriv2 * C + deriv3 * D;
+
+    // Final output: max(HF*strength + acc/var_reg + LF, 0)
+    Expr center_hf = hf[4]; // center pixel HF
+    Expr center_lf = lf[4]; // center pixel LF
+    Expr result = center_hf * strength + acc / var_reg + center_lf;
+
+    // Mask handling: if mask is 0, just output HF + LF (passthrough)
+    // mask is per-pixel (width, height), not per-channel
+    Expr mask_val = select(has_mask != 0,
+                           cast<int>(mask_buf(px, y)),
+                           1);
+    output(xi, y) = select(mask_val != 0,
+                           max(result, 0.0f),
+                           center_hf + center_lf);
+  }
+
+  void schedule() {
+    Var xi = output.args()[0];
+    Var y = output.args()[1];
+
+    if(get_target().has_gpu_feature()) {
+      Var xio("xio"), yo("yo");
+      output.gpu_tile(xi, y, xio, yo, 64, 16);
+    } else {
+      // CPU: vectorize across channels/pixels, parallelize rows
+      Var xio("xio"), xii("xii");
+      output.split(xi, xio, xii, 32).vectorize(xii).parallel(y);
+    }
+  }
+};
+
+HALIDE_REGISTER_GENERATOR(DiffusePDEGenerator, diffuse_pde)
diff --git a/src/iop/CMakeLists.txt b/src/iop/CMakeLists.txt
index 6777052a8a26..4bc8d8b001f0 100644
--- a/src/iop/CMakeLists.txt
+++ b/src/iop/CMakeLists.txt
@@ -160,6 +160,15 @@ add_iop(censorize "censorize.c")
add_iop(colorbalancergb "colorbalancergb.c") add_iop(cacorrectrgb "cacorrectrgb.c") add_iop(diffuse "diffuse.c") +if(USE_HALIDE) + target_link_libraries(diffuse halide_bspline_decompose halide_diffuse_pde) + if(APPLE) + target_link_libraries(diffuse halide_bspline_decompose_metal halide_diffuse_pde_metal) + endif() + if(HALIDE_HAS_VULKAN) + target_link_libraries(diffuse halide_bspline_decompose_vulkan halide_diffuse_pde_vulkan) + endif() +endif() add_iop(blurs "blurs.c") add_iop(sigmoid "sigmoid.c") add_iop(agx "agx.c") diff --git a/src/iop/diffuse.c b/src/iop/diffuse.c index 27885e7b966a..69f0cd3d97c7 100644 --- a/src/iop/diffuse.c +++ b/src/iop/diffuse.c @@ -1450,6 +1450,546 @@ void process(dt_iop_module_t *self, if(HF[s]) dt_free_align(HF[s]); } +#ifdef HAVE_HALIDE +#include "HalideRuntime.h" +#include "halide_bspline_decompose.h" +#include "halide_diffuse_pde.h" +#ifdef __APPLE__ +#include "halide_bspline_decompose_metal.h" +#include "halide_diffuse_pde_metal.h" +#endif +#ifdef HAVE_HALIDE_VULKAN +#include "halide_bspline_decompose_vulkan.h" +#include "halide_diffuse_pde_vulkan.h" +#endif + +// Halide-accelerated B-spline decomposition: replaces decompose_2D_Bspline() +// Uses flat 2D buffer layout (xi = width*4, y). +// Returns 0 on success, non-zero on failure. 
+static int _halide_decompose_2D_Bspline(const float *const restrict in, + float *const restrict HF, + float *const restrict LF, + const size_t width, + const size_t height, + const int mult, + const char **backend_out) +{ + const int row_floats = (int)width * 4; + const int w = (int)width; + const int h = (int)height; + + halide_dimension_t in_dims[2] = { + {0, row_floats, 1}, + {0, h, row_floats}, + }; + halide_dimension_t lf_dims[2] = { + {0, row_floats, 1}, + {0, h, row_floats}, + }; + halide_dimension_t hf_dims[2] = { + {0, row_floats, 1}, + {0, h, row_floats}, + }; + + halide_buffer_t in_buf = {0}; + in_buf.host = (uint8_t *)in; + in_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + in_buf.dimensions = 2; + in_buf.dim = in_dims; + + halide_buffer_t lf_buf = {0}; + lf_buf.host = (uint8_t *)LF; + lf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + lf_buf.dimensions = 2; + lf_buf.dim = lf_dims; + + halide_buffer_t hf_buf = {0}; + hf_buf.host = (uint8_t *)HF; + hf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + hf_buf.dimensions = 2; + hf_buf.dim = hf_dims; + + int err = -1; + const char *backend = NULL; + + // Try Metal GPU +#ifdef __APPLE__ + { + halide_buffer_t gpu_in = in_buf; + halide_buffer_t gpu_lf = lf_buf; + halide_buffer_t gpu_hf = hf_buf; + gpu_in.flags = halide_buffer_flag_host_dirty; + + err = dt_halide_bspline_decompose_metal(&gpu_in, w, h, mult, &gpu_lf, &gpu_hf); + if(!err) + { + halide_copy_to_host(NULL, &gpu_lf); + halide_copy_to_host(NULL, &gpu_hf); + backend = "Metal GPU"; + } + else + { + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Metal bspline failed (%d), trying next backend", err); + } + halide_device_free(NULL, &gpu_in); + halide_device_free(NULL, &gpu_lf); + halide_device_free(NULL, &gpu_hf); + } +#endif + + // Try Vulkan GPU +#ifdef HAVE_HALIDE_VULKAN + if(err) + { + halide_buffer_t gpu_in = in_buf; + halide_buffer_t gpu_lf = lf_buf; + halide_buffer_t gpu_hf = hf_buf; + gpu_in.flags = 
halide_buffer_flag_host_dirty; + + err = dt_halide_bspline_decompose_vulkan(&gpu_in, w, h, mult, &gpu_lf, &gpu_hf); + if(!err) + { + halide_copy_to_host(NULL, &gpu_lf); + halide_copy_to_host(NULL, &gpu_hf); + backend = "Vulkan GPU"; + } + else + { + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Vulkan bspline failed (%d), trying CPU", err); + } + halide_device_free(NULL, &gpu_in); + halide_device_free(NULL, &gpu_lf); + halide_device_free(NULL, &gpu_hf); + } +#endif + + // CPU Halide fallback + if(err) + { + err = dt_halide_bspline_decompose(&in_buf, w, h, mult, &lf_buf, &hf_buf); + if(!err) backend = "CPU"; + } + + if(backend_out) *backend_out = backend; + return err; +} + +// Halide-accelerated heat PDE diffusion: replaces heat_PDE_diffusion() +// Returns 0 on success, non-zero on failure. +static int _halide_heat_PDE_diffusion(const float *const restrict high_freq, + const float *const restrict low_freq, + const uint8_t *const restrict mask, + const gboolean has_mask, + float *const restrict output, + const size_t width, + const size_t height, + const dt_aligned_pixel_t anisotropy, + const dt_isotropy_t isotropy_type[4], + const float regularization, + const float variance_threshold, + const float current_radius_square, + const int mult, + const dt_aligned_pixel_t ABCD, + const float strength, + const char **backend_out) +{ + const int row_floats = (int)width * 4; + const int w = (int)width; + const int h = (int)height; + const float regularization_factor = regularization * current_radius_square / 9.f; + + // Setup flat 2D buffers for HF, LF, output + halide_dimension_t flat_dims[2] = { + {0, row_floats, 1}, + {0, h, row_floats}, + }; + + halide_buffer_t hf_buf = {0}; + hf_buf.host = (uint8_t *)high_freq; + hf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + hf_buf.dimensions = 2; + hf_buf.dim = flat_dims; + + halide_dimension_t lf_dims[2] = { + {0, row_floats, 1}, + {0, h, row_floats}, + }; + halide_buffer_t lf_buf = {0}; + lf_buf.host = (uint8_t 
*)low_freq; + lf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + lf_buf.dimensions = 2; + lf_buf.dim = lf_dims; + + // Mask buffer: per-pixel uint8 (width, height) + halide_dimension_t mask_dims[2] = { + {0, w, 1}, + {0, h, w}, + }; + halide_buffer_t mask_b = {0}; + mask_b.host = (uint8_t *)mask; + mask_b.type = (struct halide_type_t){halide_type_uint, 8, 1}; + mask_b.dimensions = 2; + mask_b.dim = mask_dims; + + halide_dimension_t out_dims[2] = { + {0, row_floats, 1}, + {0, h, row_floats}, + }; + halide_buffer_t out_buf = {0}; + out_buf.host = (uint8_t *)output; + out_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; + out_buf.dimensions = 2; + out_buf.dim = out_dims; + + int err = -1; + const char *backend = NULL; + + // Common arguments (after buffer args): + // width_pixels, height, mult, has_mask, + // anisotropy_0..3, isotropy_0..3, A, B, C, D, + // strength, regularization_factor, variance_threshold + +#ifdef __APPLE__ + { + halide_buffer_t gpu_hf = hf_buf; + halide_buffer_t gpu_lf = lf_buf; + halide_buffer_t gpu_mask = mask_b; + halide_buffer_t gpu_out = out_buf; + gpu_hf.flags = halide_buffer_flag_host_dirty; + gpu_lf.flags = halide_buffer_flag_host_dirty; + if(has_mask) gpu_mask.flags = halide_buffer_flag_host_dirty; + + err = dt_halide_diffuse_pde_metal( + &gpu_hf, &gpu_lf, &gpu_mask, + w, h, mult, has_mask ? 
1 : 0, + anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], + (int)isotropy_type[0], (int)isotropy_type[1], + (int)isotropy_type[2], (int)isotropy_type[3], + ABCD[0], ABCD[1], ABCD[2], ABCD[3], + strength, regularization_factor, variance_threshold, + &gpu_out); + if(!err) + { + halide_copy_to_host(NULL, &gpu_out); + backend = "Metal GPU"; + } + else + { + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Metal PDE failed (%d), trying next backend", err); + } + halide_device_free(NULL, &gpu_hf); + halide_device_free(NULL, &gpu_lf); + halide_device_free(NULL, &gpu_mask); + halide_device_free(NULL, &gpu_out); + } +#endif + +#ifdef HAVE_HALIDE_VULKAN + if(err) + { + halide_buffer_t gpu_hf = hf_buf; + halide_buffer_t gpu_lf = lf_buf; + halide_buffer_t gpu_mask = mask_b; + halide_buffer_t gpu_out = out_buf; + gpu_hf.flags = halide_buffer_flag_host_dirty; + gpu_lf.flags = halide_buffer_flag_host_dirty; + if(has_mask) gpu_mask.flags = halide_buffer_flag_host_dirty; + + err = dt_halide_diffuse_pde_vulkan( + &gpu_hf, &gpu_lf, &gpu_mask, + w, h, mult, has_mask ? 1 : 0, + anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], + (int)isotropy_type[0], (int)isotropy_type[1], + (int)isotropy_type[2], (int)isotropy_type[3], + ABCD[0], ABCD[1], ABCD[2], ABCD[3], + strength, regularization_factor, variance_threshold, + &gpu_out); + if(!err) + { + halide_copy_to_host(NULL, &gpu_out); + backend = "Vulkan GPU"; + } + else + { + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Vulkan PDE failed (%d), trying CPU", err); + } + halide_device_free(NULL, &gpu_hf); + halide_device_free(NULL, &gpu_lf); + halide_device_free(NULL, &gpu_mask); + halide_device_free(NULL, &gpu_out); + } +#endif + + if(err) + { + err = dt_halide_diffuse_pde( + &hf_buf, &lf_buf, &mask_b, + w, h, mult, has_mask ? 
1 : 0, + anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], + (int)isotropy_type[0], (int)isotropy_type[1], + (int)isotropy_type[2], (int)isotropy_type[3], + ABCD[0], ABCD[1], ABCD[2], ABCD[3], + strength, regularization_factor, variance_threshold, + &out_buf); + if(!err) backend = "CPU"; + } + + if(backend_out) *backend_out = backend; + return err; +} + +// Halide version of wavelets_process: uses Halide for B-spline decomposition, +// keeps the PDE diffusion kernel as C code (Phase 2 will port that too). +static inline gboolean _wavelets_process_halide(const float *const restrict in, + float *const restrict reconstructed, + const uint8_t *const restrict mask, + const size_t width, + const size_t height, + const dt_iop_diffuse_data_t *const data, + const float final_radius, + const float zoom, + const int scales, + const gboolean has_mask, + float *const restrict HF[MAX_NUM_SCALES], + float *const restrict LF_odd, + float *const restrict LF_even) +{ + gboolean success = TRUE; + + const dt_aligned_pixel_t anisotropy + = { compute_anisotropy_factor(data->anisotropy_first), + compute_anisotropy_factor(data->anisotropy_second), + compute_anisotropy_factor(data->anisotropy_third), + compute_anisotropy_factor(data->anisotropy_fourth) }; + + const dt_isotropy_t DT_ALIGNED_PIXEL isotropy_type[4] + = { check_isotropy_mode(data->anisotropy_first), + check_isotropy_mode(data->anisotropy_second), + check_isotropy_mode(data->anisotropy_third), + check_isotropy_mode(data->anisotropy_fourth) }; + + const float regularization = powf(10.f, data->regularization) - 1.f; + const float variance_threshold = powf(10.f, data->variance_threshold); + + // À trous decimated wavelet decompose using Halide-accelerated B-spline blur + float *restrict residual; + const char *backend = NULL; + for(int s = 0; s < scales; ++s) + { + const int mult = 1 << s; + + const float *restrict buffer_in; + float *restrict buffer_out; + + if(s == 0) + { + buffer_in = in; + buffer_out = LF_odd; + } + 
else if(s % 2 != 0) + { + buffer_in = LF_odd; + buffer_out = LF_even; + } + else + { + buffer_in = LF_even; + buffer_out = LF_odd; + } + + const char *scale_backend = NULL; + int err = _halide_decompose_2D_Bspline(buffer_in, HF[s], buffer_out, width, height, mult, &scale_backend); + if(err) + { + dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide bspline decompose failed at scale %d (err=%d)", s, err); + success = FALSE; + return success; + } + if(s == 0) backend = scale_backend; + + residual = buffer_out; + } + + if(backend) + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide bspline decompose backend: %s", backend); + + // PDE diffusion reconstruction (unchanged from C version) + float *restrict temp = (residual == LF_even) ? LF_odd : LF_even; + + int count = 0; + for(int s = scales - 1; s > -1; --s) + { + const int mult = 1 << s; + const float current_radius = equivalent_sigma_at_step(B_SPLINE_SIGMA, s); + const float real_radius = current_radius * zoom; + + const float norm = + expf(-sqf(real_radius - (float)data->radius_center) / sqf(data->radius)); + const dt_aligned_pixel_t ABCD = { data->first * KAPPA * norm, + data->second * KAPPA * norm, + data->third * KAPPA * norm, + data->fourth * KAPPA * norm }; + const float strength = data->sharpness * norm + 1.f; + + const float *restrict buffer_in; + float *restrict buffer_out; + + if(count == 0) + { + buffer_in = residual; + buffer_out = temp; + } + else if(count % 2 != 0) + { + buffer_in = temp; + buffer_out = residual; + } + else + { + buffer_in = residual; + buffer_out = temp; + } + + if(s == 0) buffer_out = reconstructed; + + const char *pde_backend = NULL; + int pde_err = _halide_heat_PDE_diffusion(HF[s], buffer_in, mask, has_mask, buffer_out, width, height, + anisotropy, isotropy_type, regularization, + variance_threshold, sqf(current_radius), mult, ABCD, strength, + &pde_backend); + if(pde_err) + { + dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide PDE diffusion failed at scale %d (err=%d), falling back to C", s, pde_err); + 
// Fall back to C implementation for this scale + heat_PDE_diffusion(HF[s], buffer_in, mask, has_mask, buffer_out, width, height, + anisotropy, isotropy_type, regularization, + variance_threshold, sqf(current_radius), mult, ABCD, strength); + } + else if(s == scales - 1 && pde_backend) + { + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide PDE diffusion backend: %s", pde_backend); + } + count++; + } + + return success; +} + +__attribute__((visibility("default"))) +int process_halide(dt_iop_module_t *self, + dt_dev_pixelpipe_iop_t *piece, + const void *const restrict ivoid, + void *const restrict ovoid, + const dt_iop_roi_t *const roi_in, + const dt_iop_roi_t *const roi_out) +{ + const gboolean fastmode = piece->pipe->type & DT_DEV_PIXELPIPE_FAST; + const dt_iop_diffuse_data_t *const data = piece->data; + + const size_t width = roi_out->width; + const size_t height = roi_out->height; + + if(fastmode) + { + const size_t ch = piece->colors; + dt_iop_copy_image_roi(ovoid, ivoid, ch, roi_in, roi_out); + return 0; + } + + uint8_t *const restrict mask = dt_alloc_align_uint8(width * height); + + float *restrict in = DT_IS_ALIGNED((float *const restrict)ivoid); + float *const restrict out = DT_IS_ALIGNED((float *const restrict)ovoid); + + float *restrict temp1, *restrict temp2; + float *restrict LF_odd, *restrict LF_even; + float *restrict temp_in = NULL; + float *restrict temp_out = NULL; + + gboolean out_of_memory = !mask + || !dt_iop_alloc_image_buffers(self, roi_in, roi_out, + 4 | DT_IMGSZ_OUTPUT, &temp1, + 4 | DT_IMGSZ_OUTPUT, &temp2, + 4 | DT_IMGSZ_OUTPUT, &LF_odd, + 4 | DT_IMGSZ_OUTPUT, &LF_even, + 0, NULL); + + const float scale = fmaxf(piece->iscale / roi_in->scale, 1.f); + const float final_radius = (data->radius + data->radius_center) * 2.f / scale; + + const int iterations = MAX(ceilf((float)data->iterations), 1); + const int diffusion_scales = num_steps_to_reach_equivalent_sigma(B_SPLINE_SIGMA, final_radius); + const int scales = CLAMP(diffusion_scales, 1, 
MAX_NUM_SCALES); + + float *restrict HF[MAX_NUM_SCALES]; + for(int s = 0; s < scales; s++) + { + HF[s] = out_of_memory ? NULL : dt_alloc_align_float(width * height * 4); + if(!HF[s]) out_of_memory = TRUE; + } + + int ret = 0; + + if(out_of_memory) + { + dt_iop_copy_image_roi(ovoid, ivoid, piece->colors, roi_in, roi_out); + dt_control_log(_("diffuse/sharpen failed to allocate memory, check your RAM settings")); + ret = -1; + goto finish; + } + + const gboolean has_mask = (data->threshold > 0.f); + if(has_mask) + { + build_mask(in, mask, data->threshold, roi_out->width, roi_out->height); + inpaint_mask(temp1, in, mask, roi_out->width, roi_out->height); + in = temp1; + } + + for(int it = 0; it < iterations; it++) + { + if(it == 0) + { + temp_in = in; + temp_out = temp2; + } + else if(it % 2 == 0) + { + temp_in = temp1; + temp_out = temp2; + } + else + { + temp_in = temp2; + temp_out = temp1; + } + + if(it == iterations - 1) + temp_out = out; + + if(!_wavelets_process_halide(temp_in, temp_out, mask, + roi_out->width, roi_out->height, + data, final_radius, scale, scales, has_mask, HF, LF_odd, LF_even)) + { + // Halide failed, signal fallback to regular process() + ret = -1; + goto finish; + } + } + +finish: + dt_free_align(mask); + dt_free_align(temp1); + dt_free_align(temp2); + dt_free_align(LF_even); + dt_free_align(LF_odd); + for(int s = 0; s < scales; s++) + if(HF[s]) dt_free_align(HF[s]); + + return ret; +} +#endif /* HAVE_HALIDE */ + #if HAVE_OPENCL static inline cl_int wavelets_process_cl(const int devid, cl_mem in, From a342c3b40e0f9c9ee10dc3361fd1b6296092f86a Mon Sep 17 00:00:00 2001 From: Mario Zimmermann Date: Mon, 20 Apr 2026 19:45:28 +0200 Subject: [PATCH 3/4] use GPU persistent buffers --- src/iop/diffuse.c | 603 ++++++++++++++++++++-------------------------- 1 file changed, 264 insertions(+), 339 deletions(-) diff --git a/src/iop/diffuse.c b/src/iop/diffuse.c index 69f0cd3d97c7..28565a399d59 100644 --- a/src/iop/diffuse.c +++ b/src/iop/diffuse.c @@ 
-1463,283 +1463,166 @@ void process(dt_iop_module_t *self, #include "halide_diffuse_pde_vulkan.h" #endif -// Halide-accelerated B-spline decomposition: replaces decompose_2D_Bspline() -// Uses flat 2D buffer layout (xi = width*4, y). -// Returns 0 on success, non-zero on failure. -static int _halide_decompose_2D_Bspline(const float *const restrict in, - float *const restrict HF, - float *const restrict LF, - const size_t width, - const size_t height, - const int mult, - const char **backend_out) +// Backend selection for GPU-persistent buffer management +typedef enum _halide_backend_t { - const int row_floats = (int)width * 4; - const int w = (int)width; - const int h = (int)height; + HALIDE_BACKEND_NONE = 0, + HALIDE_BACKEND_METAL, + HALIDE_BACKEND_VULKAN, + HALIDE_BACKEND_CPU +} _halide_backend_t; + +// Helper: initialize a flat 2D halide_buffer_t for float RGBA data +static inline void _halide_init_flat_f32(halide_buffer_t *buf, halide_dimension_t dims[2], + float *host, const int row_floats, const int h) +{ + dims[0] = (halide_dimension_t){0, row_floats, 1}; + dims[1] = (halide_dimension_t){0, h, row_floats}; + memset(buf, 0, sizeof(*buf)); + buf->host = (uint8_t *)host; + buf->type = (struct halide_type_t){halide_type_float, 32, 1}; + buf->dimensions = 2; + buf->dim = dims; +} - halide_dimension_t in_dims[2] = { - {0, row_floats, 1}, - {0, h, row_floats}, - }; - halide_dimension_t lf_dims[2] = { - {0, row_floats, 1}, - {0, h, row_floats}, - }; - halide_dimension_t hf_dims[2] = { - {0, row_floats, 1}, - {0, h, row_floats}, - }; - - halide_buffer_t in_buf = {0}; - in_buf.host = (uint8_t *)in; - in_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; - in_buf.dimensions = 2; - in_buf.dim = in_dims; - - halide_buffer_t lf_buf = {0}; - lf_buf.host = (uint8_t *)LF; - lf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; - lf_buf.dimensions = 2; - lf_buf.dim = lf_dims; - - halide_buffer_t hf_buf = {0}; - hf_buf.host = (uint8_t *)HF; - hf_buf.type = 
(struct halide_type_t){halide_type_float, 32, 1}; - hf_buf.dimensions = 2; - hf_buf.dim = hf_dims; - - int err = -1; - const char *backend = NULL; - - // Try Metal GPU -#ifdef __APPLE__ - { - halide_buffer_t gpu_in = in_buf; - halide_buffer_t gpu_lf = lf_buf; - halide_buffer_t gpu_hf = hf_buf; - gpu_in.flags = halide_buffer_flag_host_dirty; +// Helper: initialize a 2D halide_buffer_t for uint8 mask data +static inline void _halide_init_mask_u8(halide_buffer_t *buf, halide_dimension_t dims[2], + const uint8_t *host, const int w, const int h) +{ + dims[0] = (halide_dimension_t){0, w, 1}; + dims[1] = (halide_dimension_t){0, h, w}; + memset(buf, 0, sizeof(*buf)); + buf->host = (uint8_t *)host; + buf->type = (struct halide_type_t){halide_type_uint, 8, 1}; + buf->dimensions = 2; + buf->dim = dims; +} - err = dt_halide_bspline_decompose_metal(&gpu_in, w, h, mult, &gpu_lf, &gpu_hf); - if(!err) - { - halide_copy_to_host(NULL, &gpu_lf); - halide_copy_to_host(NULL, &gpu_hf); - backend = "Metal GPU"; - } - else - { - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Metal bspline failed (%d), trying next backend", err); - } - halide_device_free(NULL, &gpu_in); - halide_device_free(NULL, &gpu_lf); - halide_device_free(NULL, &gpu_hf); - } +// Try to run bspline decompose on the given backend. +// Returns 0 on success. Does NOT copy to host or free device memory. 
+static inline int _halide_run_bspline(_halide_backend_t backend, + halide_buffer_t *in_buf, + halide_buffer_t *lf_buf, + halide_buffer_t *hf_buf, + const int w, const int h, const int mult) +{ + switch(backend) + { +#ifdef __APPLE__ + case HALIDE_BACKEND_METAL: + return dt_halide_bspline_decompose_metal(in_buf, w, h, mult, lf_buf, hf_buf); #endif - - // Try Vulkan GPU #ifdef HAVE_HALIDE_VULKAN - if(err) - { - halide_buffer_t gpu_in = in_buf; - halide_buffer_t gpu_lf = lf_buf; - halide_buffer_t gpu_hf = hf_buf; - gpu_in.flags = halide_buffer_flag_host_dirty; - - err = dt_halide_bspline_decompose_vulkan(&gpu_in, w, h, mult, &gpu_lf, &gpu_hf); - if(!err) - { - halide_copy_to_host(NULL, &gpu_lf); - halide_copy_to_host(NULL, &gpu_hf); - backend = "Vulkan GPU"; - } - else - { - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Vulkan bspline failed (%d), trying CPU", err); - } - halide_device_free(NULL, &gpu_in); - halide_device_free(NULL, &gpu_lf); - halide_device_free(NULL, &gpu_hf); - } + case HALIDE_BACKEND_VULKAN: + return dt_halide_bspline_decompose_vulkan(in_buf, w, h, mult, lf_buf, hf_buf); #endif + case HALIDE_BACKEND_CPU: + return dt_halide_bspline_decompose(in_buf, w, h, mult, lf_buf, hf_buf); + default: + return -1; + } +} - // CPU Halide fallback - if(err) +// Try to run PDE diffusion on the given backend. +// Returns 0 on success. Does NOT copy to host or free device memory. 
+static inline int _halide_run_pde(_halide_backend_t backend, + halide_buffer_t *hf_buf, + halide_buffer_t *lf_buf, + halide_buffer_t *mask_buf, + halide_buffer_t *out_buf, + const int w, const int h, const int mult, + const int has_mask_i, + const dt_aligned_pixel_t anisotropy, + const dt_isotropy_t isotropy_type[4], + const dt_aligned_pixel_t ABCD, + const float strength, + const float regularization_factor, + const float variance_threshold) +{ + switch(backend) { - err = dt_halide_bspline_decompose(&in_buf, w, h, mult, &lf_buf, &hf_buf); - if(!err) backend = "CPU"; +#ifdef __APPLE__ + case HALIDE_BACKEND_METAL: + return dt_halide_diffuse_pde_metal( + hf_buf, lf_buf, mask_buf, + w, h, mult, has_mask_i, + anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], + (int)isotropy_type[0], (int)isotropy_type[1], + (int)isotropy_type[2], (int)isotropy_type[3], + ABCD[0], ABCD[1], ABCD[2], ABCD[3], + strength, regularization_factor, variance_threshold, + out_buf); +#endif +#ifdef HAVE_HALIDE_VULKAN + case HALIDE_BACKEND_VULKAN: + return dt_halide_diffuse_pde_vulkan( + hf_buf, lf_buf, mask_buf, + w, h, mult, has_mask_i, + anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], + (int)isotropy_type[0], (int)isotropy_type[1], + (int)isotropy_type[2], (int)isotropy_type[3], + ABCD[0], ABCD[1], ABCD[2], ABCD[3], + strength, regularization_factor, variance_threshold, + out_buf); +#endif + case HALIDE_BACKEND_CPU: + return dt_halide_diffuse_pde( + hf_buf, lf_buf, mask_buf, + w, h, mult, has_mask_i, + anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], + (int)isotropy_type[0], (int)isotropy_type[1], + (int)isotropy_type[2], (int)isotropy_type[3], + ABCD[0], ABCD[1], ABCD[2], ABCD[3], + strength, regularization_factor, variance_threshold, + out_buf); + default: + return -1; } - - if(backend_out) *backend_out = backend; - return err; } -// Halide-accelerated heat PDE diffusion: replaces heat_PDE_diffusion() -// Returns 0 on success, non-zero on failure. 
-static int _halide_heat_PDE_diffusion(const float *const restrict high_freq, - const float *const restrict low_freq, - const uint8_t *const restrict mask, - const gboolean has_mask, - float *const restrict output, - const size_t width, - const size_t height, - const dt_aligned_pixel_t anisotropy, - const dt_isotropy_t isotropy_type[4], - const float regularization, - const float variance_threshold, - const float current_radius_square, - const int mult, - const dt_aligned_pixel_t ABCD, - const float strength, - const char **backend_out) +// Detect best available Halide GPU backend with a probe bspline call. +// On success, the buffers will have device allocations ready for reuse. +static _halide_backend_t _halide_probe_backend(halide_buffer_t *in_buf, + halide_buffer_t *lf_buf, + halide_buffer_t *hf_buf, + const int w, const int h, const int mult) { - const int row_floats = (int)width * 4; - const int w = (int)width; - const int h = (int)height; - const float regularization_factor = regularization * current_radius_square / 9.f; - - // Setup flat 2D buffers for HF, LF, output - halide_dimension_t flat_dims[2] = { - {0, row_floats, 1}, - {0, h, row_floats}, - }; - - halide_buffer_t hf_buf = {0}; - hf_buf.host = (uint8_t *)high_freq; - hf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; - hf_buf.dimensions = 2; - hf_buf.dim = flat_dims; - - halide_dimension_t lf_dims[2] = { - {0, row_floats, 1}, - {0, h, row_floats}, - }; - halide_buffer_t lf_buf = {0}; - lf_buf.host = (uint8_t *)low_freq; - lf_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; - lf_buf.dimensions = 2; - lf_buf.dim = lf_dims; - - // Mask buffer: per-pixel uint8 (width, height) - halide_dimension_t mask_dims[2] = { - {0, w, 1}, - {0, h, w}, - }; - halide_buffer_t mask_b = {0}; - mask_b.host = (uint8_t *)mask; - mask_b.type = (struct halide_type_t){halide_type_uint, 8, 1}; - mask_b.dimensions = 2; - mask_b.dim = mask_dims; - - halide_dimension_t out_dims[2] = { - {0, row_floats, 
1}, - {0, h, row_floats}, - }; - halide_buffer_t out_buf = {0}; - out_buf.host = (uint8_t *)output; - out_buf.type = (struct halide_type_t){halide_type_float, 32, 1}; - out_buf.dimensions = 2; - out_buf.dim = out_dims; - - int err = -1; - const char *backend = NULL; - - // Common arguments (after buffer args): - // width_pixels, height, mult, has_mask, - // anisotropy_0..3, isotropy_0..3, A, B, C, D, - // strength, regularization_factor, variance_threshold - #ifdef __APPLE__ - { - halide_buffer_t gpu_hf = hf_buf; - halide_buffer_t gpu_lf = lf_buf; - halide_buffer_t gpu_mask = mask_b; - halide_buffer_t gpu_out = out_buf; - gpu_hf.flags = halide_buffer_flag_host_dirty; - gpu_lf.flags = halide_buffer_flag_host_dirty; - if(has_mask) gpu_mask.flags = halide_buffer_flag_host_dirty; - - err = dt_halide_diffuse_pde_metal( - &gpu_hf, &gpu_lf, &gpu_mask, - w, h, mult, has_mask ? 1 : 0, - anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], - (int)isotropy_type[0], (int)isotropy_type[1], - (int)isotropy_type[2], (int)isotropy_type[3], - ABCD[0], ABCD[1], ABCD[2], ABCD[3], - strength, regularization_factor, variance_threshold, - &gpu_out); - if(!err) - { - halide_copy_to_host(NULL, &gpu_out); - backend = "Metal GPU"; - } - else - { - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Metal PDE failed (%d), trying next backend", err); - } - halide_device_free(NULL, &gpu_hf); - halide_device_free(NULL, &gpu_lf); - halide_device_free(NULL, &gpu_mask); - halide_device_free(NULL, &gpu_out); - } + if(_halide_run_bspline(HALIDE_BACKEND_METAL, in_buf, lf_buf, hf_buf, w, h, mult) == 0) + return HALIDE_BACKEND_METAL; + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Metal probe failed, trying next backend"); + // Clean up failed device state before trying next backend + halide_device_free(NULL, in_buf); + halide_device_free(NULL, lf_buf); + halide_device_free(NULL, hf_buf); #endif - #ifdef HAVE_HALIDE_VULKAN - if(err) - { - halide_buffer_t gpu_hf = hf_buf; - halide_buffer_t gpu_lf = lf_buf; - 
halide_buffer_t gpu_mask = mask_b; - halide_buffer_t gpu_out = out_buf; - gpu_hf.flags = halide_buffer_flag_host_dirty; - gpu_lf.flags = halide_buffer_flag_host_dirty; - if(has_mask) gpu_mask.flags = halide_buffer_flag_host_dirty; - - err = dt_halide_diffuse_pde_vulkan( - &gpu_hf, &gpu_lf, &gpu_mask, - w, h, mult, has_mask ? 1 : 0, - anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], - (int)isotropy_type[0], (int)isotropy_type[1], - (int)isotropy_type[2], (int)isotropy_type[3], - ABCD[0], ABCD[1], ABCD[2], ABCD[3], - strength, regularization_factor, variance_threshold, - &gpu_out); - if(!err) - { - halide_copy_to_host(NULL, &gpu_out); - backend = "Vulkan GPU"; - } - else - { - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Vulkan PDE failed (%d), trying CPU", err); - } - halide_device_free(NULL, &gpu_hf); - halide_device_free(NULL, &gpu_lf); - halide_device_free(NULL, &gpu_mask); - halide_device_free(NULL, &gpu_out); - } + if(_halide_run_bspline(HALIDE_BACKEND_VULKAN, in_buf, lf_buf, hf_buf, w, h, mult) == 0) + return HALIDE_BACKEND_VULKAN; + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide Vulkan probe failed, trying CPU"); + halide_device_free(NULL, in_buf); + halide_device_free(NULL, lf_buf); + halide_device_free(NULL, hf_buf); #endif + if(_halide_run_bspline(HALIDE_BACKEND_CPU, in_buf, lf_buf, hf_buf, w, h, mult) == 0) + return HALIDE_BACKEND_CPU; + return HALIDE_BACKEND_NONE; +} - if(err) +static inline const char *_halide_backend_name(_halide_backend_t b) +{ + switch(b) { - err = dt_halide_diffuse_pde( - &hf_buf, &lf_buf, &mask_b, - w, h, mult, has_mask ? 
1 : 0, - anisotropy[0], anisotropy[1], anisotropy[2], anisotropy[3], - (int)isotropy_type[0], (int)isotropy_type[1], - (int)isotropy_type[2], (int)isotropy_type[3], - ABCD[0], ABCD[1], ABCD[2], ABCD[3], - strength, regularization_factor, variance_threshold, - &out_buf); - if(!err) backend = "CPU"; + case HALIDE_BACKEND_METAL: return "Metal GPU"; + case HALIDE_BACKEND_VULKAN: return "Vulkan GPU"; + case HALIDE_BACKEND_CPU: return "CPU"; + default: return "none"; } - - if(backend_out) *backend_out = backend; - return err; } -// Halide version of wavelets_process: uses Halide for B-spline decomposition, -// keeps the PDE diffusion kernel as C code (Phase 2 will port that too). +// Halide-accelerated wavelets_process with GPU buffer persistence. +// All intermediate buffers stay on the GPU device across the entire +// decompose + reconstruct loop. Only the final output is copied to host. static inline gboolean _wavelets_process_halide(const float *const restrict in, float *const restrict reconstructed, const uint8_t *const restrict mask, @@ -1754,7 +1637,9 @@ static inline gboolean _wavelets_process_halide(const float *const restrict in, float *const restrict LF_odd, float *const restrict LF_even) { - gboolean success = TRUE; + const int row_floats = (int)width * 4; + const int w = (int)width; + const int h = (int)height; const dt_aligned_pixel_t anisotropy = { compute_anisotropy_factor(data->anisotropy_first), @@ -1771,108 +1656,148 @@ static inline gboolean _wavelets_process_halide(const float *const restrict in, const float regularization = powf(10.f, data->regularization) - 1.f; const float variance_threshold = powf(10.f, data->variance_threshold); - // À trous decimated wavelet decompose using Halide-accelerated B-spline blur - float *restrict residual; - const char *backend = NULL; - for(int s = 0; s < scales; ++s) - { - const int mult = 1 << s; + // --- Create persistent halide_buffer_t wrappers for all buffers --- + // Each buffer gets its own dims array 
(Halide may write to these). + halide_dimension_t dims_in[2], dims_lf_odd[2], dims_lf_even[2], dims_out[2]; + halide_dimension_t dims_hf[MAX_NUM_SCALES][2]; + halide_dimension_t dims_mask[2]; - const float *restrict buffer_in; - float *restrict buffer_out; + halide_buffer_t buf_in, buf_lf_odd, buf_lf_even, buf_out; + halide_buffer_t buf_hf[MAX_NUM_SCALES]; + halide_buffer_t buf_mask; - if(s == 0) - { - buffer_in = in; - buffer_out = LF_odd; - } - else if(s % 2 != 0) - { - buffer_in = LF_odd; - buffer_out = LF_even; - } - else - { - buffer_in = LF_even; - buffer_out = LF_odd; - } + _halide_init_flat_f32(&buf_in, dims_in, (float *)in, row_floats, h); + _halide_init_flat_f32(&buf_lf_odd, dims_lf_odd, LF_odd, row_floats, h); + _halide_init_flat_f32(&buf_lf_even, dims_lf_even, LF_even, row_floats, h); + _halide_init_flat_f32(&buf_out, dims_out, reconstructed, row_floats, h); + for(int s = 0; s < scales; s++) + _halide_init_flat_f32(&buf_hf[s], dims_hf[s], HF[s], row_floats, h); + _halide_init_mask_u8(&buf_mask, dims_mask, mask, w, h); - const char *scale_backend = NULL; - int err = _halide_decompose_2D_Bspline(buffer_in, HF[s], buffer_out, width, height, mult, &scale_backend); - if(err) - { - dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide bspline decompose failed at scale %d (err=%d)", s, err); - success = FALSE; - return success; - } - if(s == 0) backend = scale_backend; + // Mark input as host-dirty so the first kernel uploads it to device + buf_in.flags = halide_buffer_flag_host_dirty; + if(has_mask) buf_mask.flags = halide_buffer_flag_host_dirty; - residual = buffer_out; + // --- Probe backend using the first bspline decompose call --- + // This also performs scale 0 decompose: in -> LF_odd + HF[0] + _halide_backend_t backend = _halide_probe_backend( + &buf_in, &buf_lf_odd, &buf_hf[0], w, h, 1 /* mult for scale 0 */); + + if(backend == HALIDE_BACKEND_NONE) + { + dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide: all backends failed for bspline probe"); + return FALSE; } - 
if(backend) - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide bspline decompose backend: %s", backend); + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide backend: %s (GPU-persistent buffers)", + _halide_backend_name(backend)); - // PDE diffusion reconstruction (unchanged from C version) - float *restrict temp = (residual == LF_even) ? LF_odd : LF_even; + // Scale 0 is already done by the probe. Continue with remaining scales. + float *restrict residual_ptr = LF_odd; - int count = 0; - for(int s = scales - 1; s > -1; --s) + for(int s = 1; s < scales; ++s) { const int mult = 1 << s; - const float current_radius = equivalent_sigma_at_step(B_SPLINE_SIGMA, s); - const float real_radius = current_radius * zoom; - - const float norm = - expf(-sqf(real_radius - (float)data->radius_center) / sqf(data->radius)); - const dt_aligned_pixel_t ABCD = { data->first * KAPPA * norm, - data->second * KAPPA * norm, - data->third * KAPPA * norm, - data->fourth * KAPPA * norm }; - const float strength = data->sharpness * norm + 1.f; - const float *restrict buffer_in; - float *restrict buffer_out; + halide_buffer_t *b_in = (s % 2 != 0) ? &buf_lf_odd : &buf_lf_even; + halide_buffer_t *b_out = (s % 2 != 0) ? &buf_lf_even : &buf_lf_odd; - if(count == 0) - { - buffer_in = residual; - buffer_out = temp; - } - else if(count % 2 != 0) - { - buffer_in = temp; - buffer_out = residual; - } - else + int err = _halide_run_bspline(backend, b_in, b_out, &buf_hf[s], w, h, mult); + if(err) { - buffer_in = residual; - buffer_out = temp; + dt_print(DT_DEBUG_ALWAYS, + "[diffuse] Halide bspline failed at scale %d (err=%d)", s, err); + goto cleanup_fail; } - if(s == 0) buffer_out = reconstructed; + residual_ptr = (s % 2 != 0) ? 
LF_even : LF_odd; + } - const char *pde_backend = NULL; - int pde_err = _halide_heat_PDE_diffusion(HF[s], buffer_in, mask, has_mask, buffer_out, width, height, - anisotropy, isotropy_type, regularization, - variance_threshold, sqf(current_radius), mult, ABCD, strength, - &pde_backend); - if(pde_err) - { - dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide PDE diffusion failed at scale %d (err=%d), falling back to C", s, pde_err); - // Fall back to C implementation for this scale - heat_PDE_diffusion(HF[s], buffer_in, mask, has_mask, buffer_out, width, height, - anisotropy, isotropy_type, regularization, - variance_threshold, sqf(current_radius), mult, ABCD, strength); - } - else if(s == scales - 1 && pde_backend) + // --- PDE diffusion reconstruction (reverse scale order) --- + { + halide_buffer_t *residual_buf = (residual_ptr == LF_even) ? &buf_lf_even : &buf_lf_odd; + halide_buffer_t *temp_buf = (residual_ptr == LF_even) ? &buf_lf_odd : &buf_lf_even; + + int count = 0; + for(int s = scales - 1; s > -1; --s) { - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide PDE diffusion backend: %s", pde_backend); + const int mult = 1 << s; + const float current_radius = equivalent_sigma_at_step(B_SPLINE_SIGMA, s); + const float real_radius = current_radius * zoom; + const float norm = + expf(-sqf(real_radius - (float)data->radius_center) / sqf(data->radius)); + const dt_aligned_pixel_t ABCD = { data->first * KAPPA * norm, + data->second * KAPPA * norm, + data->third * KAPPA * norm, + data->fourth * KAPPA * norm }; + const float strength = data->sharpness * norm + 1.f; + const float regularization_factor = regularization * sqf(current_radius) / 9.f; + + halide_buffer_t *b_in, *b_out; + + if(count == 0) + { + b_in = residual_buf; + b_out = temp_buf; + } + else if(count % 2 != 0) + { + b_in = temp_buf; + b_out = residual_buf; + } + else + { + b_in = residual_buf; + b_out = temp_buf; + } + + // Final scale writes to output buffer + if(s == 0) b_out = &buf_out; + + int pde_err = 
_halide_run_pde(backend, &buf_hf[s], b_in, &buf_mask, b_out, + w, h, mult, has_mask ? 1 : 0, + anisotropy, isotropy_type, ABCD, + strength, regularization_factor, variance_threshold); + if(pde_err) + { + dt_print(DT_DEBUG_ALWAYS, + "[diffuse] Halide PDE failed at scale %d (err=%d)", s, pde_err); + goto cleanup_fail; + } + count++; } - count++; } - return success; + // --- Copy only the final output to host --- + if(backend != HALIDE_BACKEND_CPU) + halide_copy_to_host(NULL, &buf_out); + + // --- Free all device memory --- + if(backend != HALIDE_BACKEND_CPU) + { + halide_device_free(NULL, &buf_in); + halide_device_free(NULL, &buf_lf_odd); + halide_device_free(NULL, &buf_lf_even); + halide_device_free(NULL, &buf_out); + halide_device_free(NULL, &buf_mask); + for(int s = 0; s < scales; s++) + halide_device_free(NULL, &buf_hf[s]); + } + + return TRUE; + +cleanup_fail: + if(backend != HALIDE_BACKEND_CPU) + { + halide_device_free(NULL, &buf_in); + halide_device_free(NULL, &buf_lf_odd); + halide_device_free(NULL, &buf_lf_even); + halide_device_free(NULL, &buf_out); + halide_device_free(NULL, &buf_mask); + for(int s = 0; s < scales; s++) + halide_device_free(NULL, &buf_hf[s]); + } + return FALSE; } __attribute__((visibility("default"))) From 582b1487e559261a6f4e933f29d8317851959365 Mon Sep 17 00:00:00 2001 From: Mario Zimmermann Date: Wed, 22 Apr 2026 17:37:23 +0200 Subject: [PATCH 4/4] bug fixes to avoid artifacts --- src/iop/diffuse.c | 77 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 16 deletions(-) diff --git a/src/iop/diffuse.c b/src/iop/diffuse.c index 28565a399d59..c99842fc3c2d 100644 --- a/src/iop/diffuse.c +++ b/src/iop/diffuse.c @@ -1390,7 +1390,7 @@ void process(dt_iop_module_t *self, float *restrict HF[MAX_NUM_SCALES]; for(int s = 0; s < scales; s++) { - HF[s] = out_of_memory ? NULL : dt_alloc_align_float(width * height * 4); + HF[s] = out_of_memory ? 
NULL : dt_calloc_align_float(width * height * 4); if(!HF[s]) out_of_memory = TRUE; } @@ -1635,7 +1635,8 @@ static inline gboolean _wavelets_process_halide(const float *const restrict in, const gboolean has_mask, float *const restrict HF[MAX_NUM_SCALES], float *const restrict LF_odd, - float *const restrict LF_even) + float *const restrict LF_even, + _halide_backend_t *const backend_ptr) { const int row_floats = (int)width * 4; const int w = (int)width; @@ -1678,22 +1679,50 @@ static inline gboolean _wavelets_process_halide(const float *const restrict in, buf_in.flags = halide_buffer_flag_host_dirty; if(has_mask) buf_mask.flags = halide_buffer_flag_host_dirty; - // --- Probe backend using the first bspline decompose call --- - // This also performs scale 0 decompose: in -> LF_odd + HF[0] - _halide_backend_t backend = _halide_probe_backend( - &buf_in, &buf_lf_odd, &buf_hf[0], w, h, 1 /* mult for scale 0 */); - - if(backend == HALIDE_BACKEND_NONE) + // --- Use cached backend or probe on first call --- + _halide_backend_t backend; + gboolean probed = FALSE; + + if(backend_ptr && *backend_ptr != HALIDE_BACKEND_NONE) { - dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide: all backends failed for bspline probe"); - return FALSE; + // Backend was cached from a previous call, reuse it to avoid re-probing + backend = *backend_ptr; + probed = FALSE; // We're not probing, just using cached backend } + else + { + // First call: probe backend using the first bspline decompose call + // This also performs scale 0 decompose: in -> LF_odd + HF[0] + backend = _halide_probe_backend( + &buf_in, &buf_lf_odd, &buf_hf[0], w, h, 1 /* mult for scale 0 */); + + if(backend == HALIDE_BACKEND_NONE) + { + dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide: all backends failed for bspline probe"); + return FALSE; + } - dt_print(DT_DEBUG_PIPE, "[diffuse] Halide backend: %s (GPU-persistent buffers)", - _halide_backend_name(backend)); + dt_print(DT_DEBUG_PIPE, "[diffuse] Halide backend: %s", + 
_halide_backend_name(backend)); - // Scale 0 is already done by the probe. Continue with remaining scales. + // Cache backend for subsequent calls to avoid re-probing + if(backend_ptr) *backend_ptr = backend; + probed = TRUE; // Probe already did scale 0 + } + + // Scale 0: only run if we didn't probe (probe already did it) float *restrict residual_ptr = LF_odd; + + if(!probed) + { + // Cached backend iteration: need to run scale 0 bspline explicitly + int err = _halide_run_bspline(backend, &buf_in, &buf_lf_odd, &buf_hf[0], w, h, 1); + if(err) + { + dt_print(DT_DEBUG_ALWAYS, "[diffuse] Halide bspline failed at scale 0 (err=%d)", err); + goto cleanup_fail; + } + } for(int s = 1; s < scales; ++s) { @@ -1770,7 +1799,13 @@ static inline gboolean _wavelets_process_halide(const float *const restrict in, // --- Copy only the final output to host --- if(backend != HALIDE_BACKEND_CPU) + { halide_copy_to_host(NULL, &buf_out); + // NOTE(review): halide_copy_to_host() already blocks until the device-to-host copy completes; + // this volatile read adds no extra GPU synchronization — confirm it is needed before keeping it + volatile float sync_check = ((float *)reconstructed)[0]; + (void)sync_check; + } // --- Free all device memory --- if(backend != HALIDE_BACKEND_CPU) @@ -1849,7 +1884,13 @@ int process_halide(dt_iop_module_t *self, float *restrict HF[MAX_NUM_SCALES]; for(int s = 0; s < scales; s++) { - HF[s] = out_of_memory ? 
NULL : dt_calloc_align_float(width * height * 4); if(!HF[s]) out_of_memory = TRUE; } @@ -1863,7 +1898,7 @@ int process_halide(dt_iop_module_t *self, goto finish; } - const gboolean has_mask = (data->threshold > 0.f); + const gboolean has_mask = (data->threshold > 0.f); if(has_mask) { build_mask(in, mask, data->threshold, roi_out->width, roi_out->height); @@ -1871,8 +1906,13 @@ int process_halide(dt_iop_module_t *self, in = temp1; } + // Cache backend across iterations to skip re-probing + _halide_backend_t cached_backend = HALIDE_BACKEND_NONE; + for(int it = 0; it < iterations; it++) { + double iter_start = dt_get_wtime(); + if(it == 0) { temp_in = in; @@ -1894,12 +1934,17 @@ int process_halide(dt_iop_module_t *self, if(!_wavelets_process_halide(temp_in, temp_out, mask, roi_out->width, roi_out->height, - data, final_radius, scale, scales, has_mask, HF, LF_odd, LF_even)) + data, final_radius, scale, scales, has_mask, HF, LF_odd, LF_even, + &cached_backend)) { // Halide failed, signal fallback to regular process() ret = -1; goto finish; } + + double iter_time = dt_get_wtime() - iter_start; + dt_print(DT_DEBUG_PIPE, "[diffuse] Iteration %d/%d took %.3f secs", + it + 1, iterations, iter_time); } finish: