Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .ci/Brewfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ brew 'gtk+3'
brew 'gtk-mac-integration'
brew 'icu4c'
brew 'graphicsmagick'
# Supplies the Halide package that data/halide/CMakeLists.txt locates with
# find_package(Halide REQUIRED).
brew 'halide'
brew 'imagemagick'
brew 'intltool'
brew 'iso-codes'
Expand Down
1 change: 1 addition & 0 deletions DefineOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ option(USE_LIBSECRET "Build libsecret password storage back-end" ON)
option(USE_UNITY "Use libunity to report progress in the launcher" OFF)
option(USE_OPENMP "Use OpenMP threading support." ON)
option(USE_OPENCL "Use OpenCL support." ON)
# Halide support is opt-in: the default is OFF.
option(USE_HALIDE "Use Halide for optimized image processing." OFF)
option(USE_GRAPHICSMAGICK "Use GraphicsMagick library for image import." ON)
option(USE_IMAGEMAGICK "Use ImageMagick library for image import." OFF)
# option() expects (<variable> "<help text>" [value]). Give this option a real
# help string and an explicit default instead of letting the literal OFF be
# taken as the help text; the effective default stays OFF.
option(USE_DARKTABLE_PROFILING "Enable darktable profiling." OFF)
Expand Down
108 changes: 108 additions & 0 deletions data/halide/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Halide-based image processing kernels for darktable
#
# Two things happen at build time:
#   1. add_halide_generator() compiles each generator source into an executable
#      that runs on the host.
#   2. add_halide_library() runs a generator to produce an optimized static
#      library (.h + .a) for the requested target platform.
# The generated code is linked into individual IOP plugin modules.

find_package(Halide REQUIRED)

# ---------------------------------------------------------------------------
# Generator executables (run on the host at build time)
# ---------------------------------------------------------------------------
add_halide_generator(dt_halide_generators
  SOURCES generators/exposure_generator.cpp)
add_halide_generator(dt_halide_bspline_generators
  SOURCES generators/bspline_generator.cpp)
add_halide_generator(dt_halide_diffuse_generators
  SOURCES generators/diffuse_pde_generator.cpp)

# ---------------------------------------------------------------------------
# CPU libraries
# "host" auto-detects arch + OS + SIMD features (NEON on ARM64, AVX on x86, etc.)
# ---------------------------------------------------------------------------

# Exposure
add_halide_library(halide_exposure
  FROM dt_halide_generators
  GENERATOR exposure
  FUNCTION_NAME dt_halide_exposure
  TARGETS host)

# B-spline wavelet decomposition for the diffuse/sharpen module:
# separable 5-tap blur + detail extraction in one pass
add_halide_library(halide_bspline_decompose
  FROM dt_halide_bspline_generators
  GENERATOR bspline_decompose
  FUNCTION_NAME dt_halide_bspline_decompose
  TARGETS host)

# Heat PDE diffusion kernel for the diffuse/sharpen module
add_halide_library(halide_diffuse_pde
  FROM dt_halide_diffuse_generators
  GENERATOR diffuse_pde
  FUNCTION_NAME dt_halide_diffuse_pde
  TARGETS host)

# ---------------------------------------------------------------------------
# Metal GPU libraries for Apple Silicon (macOS only)
# ---------------------------------------------------------------------------
if(APPLE)
  # Exposure uses the "flat" 2D generator to avoid stride issues with Metal's
  # buffer handling. The flat generator treats each row as width*4 contiguous
  # floats.
  add_halide_library(halide_exposure_metal
    FROM dt_halide_generators
    GENERATOR exposure_flat
    FUNCTION_NAME dt_halide_exposure_metal
    TARGETS host-metal)

  add_halide_library(halide_bspline_decompose_metal
    FROM dt_halide_bspline_generators
    GENERATOR bspline_decompose
    FUNCTION_NAME dt_halide_bspline_decompose_metal
    TARGETS host-metal)

  add_halide_library(halide_diffuse_pde_metal
    FROM dt_halide_diffuse_generators
    GENERATOR diffuse_pde
    FUNCTION_NAME dt_halide_diffuse_pde_metal
    TARGETS host-metal)
endif()

# ---------------------------------------------------------------------------
# Vulkan GPU libraries (cross-platform: Linux, Windows, macOS via MoltenVK)
# Exposure uses the same flat 2D generator as Metal. Vulkan loader is dlopen'd
# at runtime, so no link-time dependency — if no Vulkan driver is present, the
# call simply fails and we fall through to CPU Halide.
# ---------------------------------------------------------------------------
find_package(Vulkan QUIET)
if(Vulkan_FOUND)
  add_halide_library(halide_exposure_vulkan
    FROM dt_halide_generators
    GENERATOR exposure_flat
    FUNCTION_NAME dt_halide_exposure_vulkan
    TARGETS host-vulkan)

  add_halide_library(halide_bspline_decompose_vulkan
    FROM dt_halide_bspline_generators
    GENERATOR bspline_decompose
    FUNCTION_NAME dt_halide_bspline_decompose_vulkan
    TARGETS host-vulkan)

  add_halide_library(halide_diffuse_pde_vulkan
    FROM dt_halide_diffuse_generators
    GENERATOR diffuse_pde
    FUNCTION_NAME dt_halide_diffuse_pde_vulkan
    TARGETS host-vulkan)

  # Let the including directory know that the Vulkan variants were generated.
  set(HALIDE_HAS_VULKAN TRUE PARENT_SCOPE)
endif()
131 changes: 131 additions & 0 deletions data/halide/generators/bspline_generator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
/*
This file is part of darktable,
Copyright (C) 2026 darktable developers.

darktable is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

darktable is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with darktable. If not, see <http://www.gnu.org/licenses/>.
*/

/*
* Halide generator for B-spline wavelet decomposition used by
* darktable's diffuse/sharpen module.
*
* Algorithm: separable 5-tap [1,4,6,4,1]/16 blur with strided access
* (à-trous wavelet transform). Produces both the low-frequency (blurred)
* and high-frequency (detail = input - blurred) components.
*
* darktable pixel layout: interleaved float RGBA, 4 channels per pixel.
* We use the "flat" 2D approach (xi = x*4+c, y) that works on both
* CPU and GPU, matching the exposure_flat pattern.
*
* The stride parameter "mult" controls the spacing between filter taps:
* mult = 1 << scale (scale 0 -> mult=1, scale 1 -> mult=2, etc.)
*
* Boundary handling: clamp to edge (matching darktable's MIN/MAX clamping).
*
* Negative values in the blur output are clipped to 0 (matching darktable's
* clip_negatives=TRUE in decompose_2D_Bspline).
*/

#include "Halide.h"

using namespace Halide;

// B-spline decomposition on the flat 2D layout (xi, y): xi indexes the floats
// of a row, i.e. it runs over width_pixels * 4 values (4 interleaved channels).
// Separable filter: vertical pass then horizontal pass.
// mult is the tap spacing in pixels (= 1 << scale).
// width_pixels is the image width in pixels (not floats).
//
// Outputs:
//   lf - the blurred image with negative values clipped to 0
//   hf - input minus lf
class BsplineDecomposeGenerator : public Generator<BsplineDecomposeGenerator> {
public:
  Input<Buffer<float, 2>> input{"input"};        // (xi, y) where xi = width*4
  Input<int32_t> width_pixels{"width_pixels"};   // image width in pixels
  Input<int32_t> height{"height"};               // image height in pixels
  Input<int32_t> mult{"mult"};                   // tap spacing = 1 << scale
  Output<Buffer<float, 2>> lf{"lf"};             // low-frequency (blurred)
  Output<Buffer<float, 2>> hf{"hf"};             // high-frequency (detail)

  // Intermediate stage (result of the vertical pass), kept as a member so
  // that schedule() can refer to it.
  Func vert{"vert"};

  // Defines the algorithm: vertical 5-tap pass into vert, horizontal 5-tap
  // pass on vert, then lf = max(blur, 0) and hf = input - lf.
  void generate() {
    Var xi("xi"), y("y");

    // B-spline filter weights [1,4,6,4,1]/16 (symmetric, so three constants)
    const float w0 = 1.0f / 16.0f;
    const float w1 = 4.0f / 16.0f;
    const float w2 = 6.0f / 16.0f;

    // Extract pixel x and channel from flat index
    Expr px = xi / 4;
    Expr ch = xi % 4;

    // Vertical pass: blur along y with spacing mult.
    // Neighbouring row indices are clamped to [0, height-1]; the centre tap
    // uses y unchanged.
    Expr r0 = clamp(y - 2 * mult, 0, height - 1);
    Expr r1 = clamp(y - mult, 0, height - 1);
    Expr r2 = y;
    Expr r3 = clamp(y + mult, 0, height - 1);
    Expr r4 = clamp(y + 2 * mult, 0, height - 1);

    vert(xi, y) = w0 * input(xi, r0)
                + w1 * input(xi, r1)
                + w2 * input(xi, r2)
                + w1 * input(xi, r3)
                + w0 * input(xi, r4);

    // Horizontal pass: blur along x with spacing mult on the vertical result.
    // Clamping is done on the pixel index, then converted back to a flat index
    // with the channel offset re-applied, so taps never mix channels.
    Expr p0 = clamp(px - 2 * mult, 0, width_pixels - 1) * 4 + ch;
    Expr p1 = clamp(px - mult, 0, width_pixels - 1) * 4 + ch;
    Expr p2 = xi;
    Expr p3 = clamp(px + mult, 0, width_pixels - 1) * 4 + ch;
    Expr p4 = clamp(px + 2 * mult, 0, width_pixels - 1) * 4 + ch;

    Expr blur = w0 * vert(p0, y)
              + w1 * vert(p1, y)
              + w2 * vert(p2, y)
              + w1 * vert(p3, y)
              + w0 * vert(p4, y);

    // Clip negatives (matching darktable behavior)
    lf(xi, y) = max(blur, 0.0f);
    hf(xi, y) = input(xi, y) - lf(xi, y);
  }

  // Chooses a GPU or CPU schedule depending on the generator target.
  void schedule() {
    Var xi = lf.args()[0];
    Var y = lf.args()[1];

    if(get_target().has_gpu_feature()) {
      // GPU schedule: vert is deliberately left unscheduled so that it is
      // inlined (fused) into the horizontal pass of lf. It must not be
      // compute_at a thread loop: the taps are spaced by the runtime value
      // mult, so the span of vert a thread touches is not a compile-time
      // constant.
      //
      // In this gpu_tile overload xi/y are reused as the block indices and
      // tx/ty are the thread indices within each 64x16 tile.
      Var tx("tx"), ty("ty");
      lf.gpu_tile(xi, y, tx, ty, 64, 16);
      hf.gpu_tile(xi, y, tx, ty, 64, 16);
    } else {
      // CPU schedule: compute vert per output row for cache locality,
      // vectorize, parallelize over rows
      Var xio("xio"), xii("xii");

      // Compute vertical pass per output row of lf
      vert.compute_at(lf, y).vectorize(xi, 32);

      lf.split(xi, xio, xii, 32).vectorize(xii).parallel(y);
      // hf reads lf; both are pipeline outputs and each gets the same
      // split/vectorize/parallel schedule.
      hf.split(xi, xio, xii, 32).vectorize(xii).parallel(y);
    }
  }
};

HALIDE_REGISTER_GENERATOR(BsplineDecomposeGenerator, bspline_decompose)
Loading