From f37c21dc4f28cfed926f85890e37bdc41784c15b Mon Sep 17 00:00:00 2001 From: Daniele Pighin Date: Thu, 16 Apr 2026 21:38:50 +0200 Subject: [PATCH 1/2] Efficiency improvements in blur. --- data/kernels/blurs.cl | 67 +++++++-- src/iop/blurs.c | 312 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 311 insertions(+), 68 deletions(-) diff --git a/data/kernels/blurs.cl b/data/kernels/blurs.cl index 3a8edbebd117..53f65c9ff61b 100644 --- a/data/kernels/blurs.cl +++ b/data/kernels/blurs.cl @@ -1,6 +1,6 @@ /* This file is part of darktable, - copyright (c) 2021 darktable developers. + copyright (c) 2021-2025 darktable developers. darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,13 +18,16 @@ #include "common.h" -kernel void -convolve(read_only image2d_t in, read_only image2d_t kern, write_only image2d_t out, - const int width, const int height, const int radius) +// Dense 2D convolution for lens/motion blurs. +// kern is a flat float buffer (read-only; small enough for GPU L2 cache at typical radii). +kernel void convolve(read_only image2d_t in, + __global const float *kern, + write_only image2d_t out, + const int width, const int height, + const int radius, const int kernel_width) { const int x = get_global_id(0); const int y = get_global_id(1); - if(x >= width || y >= height) return; float4 pix_in = read_imagef(in, samplerA, (int2)(x, y)); @@ -35,16 +38,58 @@ convolve(read_only image2d_t in, read_only image2d_t kern, write_only image2d_t { const int ii = clamp(y + l, 0, height - 1); const int jj = clamp(x + m, 0, width - 1); - const float4 pix = read_imagef(in, samplerA, (int2)(jj, ii)); - const int ik = l + radius; const int jk = m + radius; - const float k = read_imagef(kern, samplerA, (int2)(jk, ik)).x; - - acc += k * pix; + acc += kern[ik * kernel_width + jk] * read_imagef(in, samplerA, (int2)(jj, ii)); } - // copy alpha acc.w = pix_in.w; write_imagef(out, (int2)(x, y), acc); } + +// Sparse 2D convolution: only the non-negligible kernel entries are applied. +// Dramatically faster for motion blur (thin arc kernel, ~1-5% fill at large radii) +// and moderate improvement for lens blur (circular aperture, ~78% fill). +// offsets_x / offsets_y are pixel coordinate deltas; values are the kernel weights. +kernel void convolve_sparse(read_only image2d_t in, + __global const int *offsets_x, + __global const int *offsets_y, + __global const float *values, + write_only image2d_t out, + const int width, const int height, + const int n_entries) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + if(x >= width || y >= height) return; + + float4 pix_in = read_imagef(in, samplerA, (int2)(x, y)); + float4 acc = 0.f; + + for(int i = 0; i < n_entries; i++) + { + const int2 coord = (int2)(clamp(x + offsets_x[i], 0, width - 1), + clamp(y + offsets_y[i], 0, height - 1)); + acc += values[i] * read_imagef(in, samplerA, coord); + } + + acc.w = pix_in.w; + write_imagef(out, (int2)(x, y), acc); +} + +// Copies RGB from blurred and alpha from original into out. +// Used after dt_gaussian_blur_cl to restore the pipeline mask channel. +kernel void restore_alpha(read_only image2d_t original, + read_only image2d_t blurred, + write_only image2d_t out, + const int width, const int height) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + if(x >= width || y >= height) return; + + float4 px_blurred = read_imagef(blurred, samplerA, (int2)(x, y)); + float4 px_orig = read_imagef(original, samplerA, (int2)(x, y)); + px_blurred.w = px_orig.w; + write_imagef(out, (int2)(x, y), px_blurred); +} diff --git a/src/iop/blurs.c b/src/iop/blurs.c index f4aeb1be0251..ab931b5d1666 100644 --- a/src/iop/blurs.c +++ b/src/iop/blurs.c @@ -24,7 +24,9 @@ #include "iop/iop_api.h" // #include // one day, include FFT convolution +#include "common/gaussian.h" #include "common/math.h" +#include "develop/tiling.h" #include #include @@ -70,6 +72,8 @@ typedef struct dt_iop_blurs_gui_data_t typedef struct dt_iop_blurs_global_data_t { int kernel_blurs_convolve; + int kernel_blurs_convolve_sparse; + int kernel_blurs_restore_alpha; } dt_iop_blurs_global_data_t; @@ -93,7 +97,7 @@ const char **description(dt_iop_module_t *self) int flags() { - return IOP_FLAGS_INCLUDE_IN_STYLES | IOP_FLAGS_SUPPORTS_BLENDING; + return IOP_FLAGS_INCLUDE_IN_STYLES | IOP_FLAGS_SUPPORTS_BLENDING | IOP_FLAGS_ALLOW_TILING; } @@ -116,6 +120,26 @@ void commit_params(dt_iop_module_t *self, dt_iop_params_t *p1, dt_dev_pixelpipe_ memcpy(piece->data, p1, self->params_size); } +void tiling_callback(dt_iop_module_t *self, + dt_dev_pixelpipe_iop_t *piece, + const dt_iop_roi_t *roi_in, + const dt_iop_roi_t *roi_out, + dt_develop_tiling_t *tiling) +{ + const dt_iop_blurs_params_t *p = piece->data; + const float scale = fmaxf(piece->iscale / roi_in->scale, 1.f); + const int radius = MAX(roundf(p->radius / scale), 2); + + // Gaussian uses IIR which handles tiling internally (no halo needed). + // Lens/motion need full radius overlap so the convolution neighbourhood is intact. + tiling->factor = 2.5f; + tiling->factor_cl = 3.5f; + tiling->maxbuf = 1.0f; + tiling->overhead = 0; + tiling->overlap = (p->type == DT_BLUR_GAUSSIAN) ? 0 : radius; + tiling->align = 1; +} + // B spline filter #define FSIZE 5 @@ -533,12 +557,12 @@ static void process_fft(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *pi } #endif -// Spatial convolution should be slower for large blurs because it is o(N²) where N is the width of the kernel -// but code is much simpler and easier to debug - -void process(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, - const void *const restrict ivoid, void *const restrict ovoid, - const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) +void process(dt_iop_module_t *self, + dt_dev_pixelpipe_iop_t *piece, + const void *const restrict ivoid, + void *const restrict ovoid, + const dt_iop_roi_t *const roi_in, + const dt_iop_roi_t *const roi_out) { dt_iop_blurs_params_t *p = piece->data; const float scale = fmaxf(piece->iscale / roi_in->scale, 1.f); @@ -549,64 +573,136 @@ void process(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const float *const restrict in = DT_IS_ALIGNED(ivoid); float *const restrict out = DT_IS_ALIGNED(ovoid); - // Init the blur kernel const int radius = MAX(roundf(p->radius / scale), 2); + + // ── Gaussian fast path: IIR separable filter, O(n) regardless of radius ── + if(p->type == DT_BLUR_GAUSSIAN) + { + // sigma matched to the original expf(-4*(d/radius)²) kernel shape + const float sigma = (float)(radius - 1) / (2.0f * sqrtf(2.0f)); + const float range = 1.0e9f; + const dt_aligned_pixel_t maxv = { range, range, range, range }; + const dt_aligned_pixel_t minv = { -range, -range, -range, -range }; + + dt_gaussian_t *g = + dt_gaussian_init(roi_in->width, roi_in->height, 4, maxv, minv, sigma, DT_IOP_GAUSSIAN_ZERO); + if(g) + { + dt_gaussian_blur_4c(g, in, out); + dt_gaussian_free(g); + } + else + { + dt_iop_copy_image_roi(ovoid, ivoid, 4, roi_in, roi_out); + } + + // dt_gaussian_blur_4c also blurs alpha; restore original pipeline mask + const size_t npix = (size_t)roi_out->width * roi_out->height; + DT_OMP_FOR_SIMD(aligned(in, out : 64)) + for(size_t k = 0; k < npix; k++) + out[k * 4 + 3] = in[k * 4 + 3]; + + return; + } + + // ── Lens / motion: sparse spatial convolution ── + // + // With tiling_callback declaring overlap = radius, the pipeline provides + // roi_in with a halo of (at least) radius pixels on each side. + // ox/oy are the position of roi_out's origin within the roi_in buffer. + const int in_width = roi_in->width; + const int in_height = roi_in->height; + const int ox = roi_out->x - roi_in->x; // >= 0 + const int oy = roi_out->y - roi_in->y; // >= 0 + const size_t kernel_width = 2 * radius + 1; + const size_t max_entries = kernel_width * kernel_width; + + float *const restrict kernel = dt_alloc_align_float(max_entries); + ptrdiff_t *const restrict offsets = dt_alloc_aligned(max_entries * sizeof(ptrdiff_t)); + float *const restrict values = dt_alloc_align_float(max_entries); + + if(!kernel || !offsets || !values) + { + dt_free_align(kernel); + dt_free_align(offsets); + dt_free_align(values); + dt_iop_copy_image_roi(ovoid, ivoid, 4, roi_in, roi_out); + return; + } - float *const restrict kernel = dt_alloc_align_float(kernel_width * kernel_width); _build_pixel_kernel(kernel, kernel_width, kernel_width, p); + // Build sparse list: pixel-offset (in floats) from center, for the interior + // fast path where no clamping is needed. + int n_nonzero = 0; + for(int l = -radius; l <= radius; l++) + for(int m = -radius; m <= radius; m++) + { + const float k = kernel[(l + radius) * kernel_width + (m + radius)]; + if(k > 1e-6f) + { + offsets[n_nonzero] = ((ptrdiff_t)l * in_width + m) * 4; + values[n_nonzero] = k; + n_nonzero++; + } + } + DT_OMP_FOR(collapse(2)) for(int i = 0; i < roi_out->height; i++) for(int j = 0; j < roi_out->width; j++) { - const size_t index = ((i * roi_out->width) + j) * 4; + const size_t idx_out = ((size_t)i * roi_out->width + j) * 4; float DT_ALIGNED_PIXEL acc[4] = { 0.f }; - if(i >= radius && j >= radius && i < roi_out->height - radius && j < roi_out->width - radius) - { - // We are in the safe area, no need to check for out-of-bounds - for(int l = -radius; l <= radius; l++) - for(int m = -radius; m <= radius; m++) - { - const int ii = i + l; - const int jj = j + m; - const size_t idx_shift = ((ii * roi_out->width) + jj) * 4; + // Position of this output pixel inside the roi_in buffer + const int ci = i + oy; + const int cj = j + ox; - const int ik = l + radius; - const int jk = m + radius; - const size_t idx_kernel = (ik * kernel_width) + jk; - const float k = kernel[idx_kernel]; - - for_four_channels(c, aligned(in : 64)) acc[c] += k * in[idx_shift + c]; - } + if(ci >= radius && cj >= radius && ci < in_height - radius && cj < in_width - radius) + { + // Interior: all neighbours are within roi_in — no clamping needed + const float *in_center = in + ((size_t)ci * in_width + cj) * 4; + for(int s = 0; s < n_nonzero; s++) + { + const float *p_src = in_center + offsets[s]; + const float w = values[s]; + acc[0] += w * p_src[0]; + acc[1] += w * p_src[1]; + acc[2] += w * p_src[2]; + acc[3] += w * p_src[3]; + } } else { - // We are close to borders, we need to clamp indices to bounds - // assume constant boundary conditions + // Edge: clamp neighbours to roi_in bounds for(int l = -radius; l <= radius; l++) for(int m = -radius; m <= radius; m++) { - const int ii = CLAMP((int)i + l, (int)0, (int)roi_out->height - 1); - const int jj = CLAMP((int)j + m, (int)0, (int)roi_out->width - 1); - const size_t idx_shift = ((ii * roi_out->width) + jj) * 4; - - const int ik = l + radius; - const int jk = m + radius; - const size_t idx_kernel = (ik * kernel_width) + jk; - const float k = kernel[idx_kernel]; - - for_four_channels(c, aligned(in : 64)) acc[c] += k * in[idx_shift + c]; + const float k = kernel[(l + radius) * kernel_width + (m + radius)]; + if(k > 1e-6f) + { + const int ii = CLAMP(ci + l, 0, in_height - 1); + const int jj = CLAMP(cj + m, 0, in_width - 1); + const float *p_src = in + ((size_t)ii * in_width + jj) * 4; + acc[0] += k * p_src[0]; + acc[1] += k * p_src[1]; + acc[2] += k * p_src[2]; + acc[3] += k * p_src[3]; + } } } - for_each_channel(c, aligned(out : 64) aligned(acc : 16)) out[index + c] = acc[c]; - - // copy alpha - out[index + 3] = in[index + 3]; + out[idx_out + 0] = acc[0]; + out[idx_out + 1] = acc[1]; + out[idx_out + 2] = acc[2]; + // preserve original alpha (pipeline mask) + out[idx_out + 3] = in[((size_t)ci * in_width + cj) * 4 + 3]; } + dt_free_align(kernel); + dt_free_align(offsets); + dt_free_align(values); } @@ -618,38 +714,138 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_ const dt_iop_blurs_global_data_t *const gd = self->global_data; cl_int err = DT_OPENCL_SYSMEM_ALLOCATION; - cl_mem kernel_cl = NULL; const int devid = piece->pipe->devid; - const int width = roi_in->width; - const int height = roi_in->height; + const int width = roi_out->width; + const int height = roi_out->height; - - // Init the blur kernel const float scale = fmaxf(piece->iscale / roi_in->scale, 1.f); const int radius = MAX(roundf(p->radius / scale), 2); + + // ── Gaussian fast path: IIR separable filter ── + if(p->type == DT_BLUR_GAUSSIAN) + { + const float sigma = (float)(radius - 1) / (2.0f * sqrtf(2.0f)); + const float range = 1.0e9f; + const dt_aligned_pixel_t maxv = { range, range, range, range }; + const dt_aligned_pixel_t minv = { -range, -range, -range, -range }; + + cl_mem dev_blurred = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4); + dt_gaussian_cl_t *g = + dt_gaussian_init_cl(devid, width, height, 4, maxv, minv, sigma, DT_IOP_GAUSSIAN_ZERO); + if(dev_blurred && g) + { + err = dt_gaussian_blur_cl(g, dev_in, dev_blurred); + if(err == CL_SUCCESS) + err = dt_opencl_enqueue_kernel_2d_args(devid, + gd->kernel_blurs_restore_alpha, + width, + height, + CLARG(dev_in), + CLARG(dev_blurred), + CLARG(dev_out), + CLARG(width), + CLARG(height)); + } + dt_gaussian_free_cl(g); + dt_opencl_release_mem_object(dev_blurred); + return err; + } + + // ── Lens / motion: sparse convolution ── const size_t kernel_width = 2 * radius + 1; + const size_t max_entries = kernel_width * kernel_width; - float *const restrict kernel = dt_alloc_align_float(kernel_width * kernel_width); - if(kernel) + float *const restrict kernel = dt_alloc_align_float(max_entries); + int *const restrict offsets_x = dt_alloc_aligned(max_entries * sizeof(int)); + int *const restrict offsets_y = dt_alloc_aligned(max_entries * sizeof(int)); + float *const restrict values = dt_alloc_align_float(max_entries); + + cl_mem dev_offsets_x = NULL; + cl_mem dev_offsets_y = NULL; + cl_mem dev_values = NULL; + cl_mem dev_kernel = NULL; + + if(!kernel || !offsets_x || !offsets_y || !values) + goto cleanup_cl; + + _build_pixel_kernel(kernel, kernel_width, kernel_width, p); + + // Build sparse list + int n_nonzero = 0; + for(int l = -radius; l <= radius; l++) + for(int m = -radius; m <= radius; m++) + { + const float k = kernel[(l + radius) * kernel_width + (m + radius)]; + if(k > 1e-6f) + { + offsets_x[n_nonzero] = m; + offsets_y[n_nonzero] = l; + values[n_nonzero] = k; + n_nonzero++; + } + } + + // Try sparse path first; fall back to dense if allocation fails + dev_offsets_x = dt_opencl_copy_host_to_device_constant(devid, n_nonzero * sizeof(int), offsets_x); + dev_offsets_y = dt_opencl_copy_host_to_device_constant(devid, n_nonzero * sizeof(int), offsets_y); + dev_values = dt_opencl_copy_host_to_device_constant(devid, n_nonzero * sizeof(float), values); + + if(dev_offsets_x && dev_offsets_y && dev_values) + { + err = dt_opencl_enqueue_kernel_2d_args(devid, + gd->kernel_blurs_convolve_sparse, + width, + height, + CLARG(dev_in), + CLARG(dev_offsets_x), + CLARG(dev_offsets_y), + CLARG(dev_values), + CLARG(dev_out), + CLARG(width), + CLARG(height), + CLARG(n_nonzero)); + } + else { - _build_pixel_kernel(kernel, kernel_width, kernel_width, p); - kernel_cl = dt_opencl_copy_host_to_device(devid, kernel, kernel_width, kernel_width, sizeof(float)); - if(kernel_cl) - err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_blurs_convolve, width, height, - CLARG(dev_in), CLARG(kernel_cl), CLARG(dev_out), CLARG(roi_out->width), CLARG(roi_out->height), - CLARG(radius)); + // Dense fallback: kernel as read-only buffer (L2-cacheable on GPU) + dev_kernel = dt_opencl_copy_host_to_device_constant(devid, max_entries * sizeof(float), kernel); + if(dev_kernel) + { + const int kw = (int)kernel_width; + err = dt_opencl_enqueue_kernel_2d_args(devid, + gd->kernel_blurs_convolve, + width, + height, + CLARG(dev_in), + CLARG(dev_kernel), + CLARG(dev_out), + CLARG(width), + CLARG(height), + CLARG(radius), + CLARG(kw)); + } } + +cleanup_cl: dt_free_align(kernel); - dt_opencl_release_mem_object(kernel_cl); + dt_free_align(offsets_x); + dt_free_align(offsets_y); + dt_free_align(values); + dt_opencl_release_mem_object(dev_offsets_x); + dt_opencl_release_mem_object(dev_offsets_y); + dt_opencl_release_mem_object(dev_values); + dt_opencl_release_mem_object(dev_kernel); return err; } void init_global(dt_iop_module_so_t *self) { - const int program = 34; + const int program = 34; // blurs.cl dt_iop_blurs_global_data_t *gd = malloc(sizeof(dt_iop_blurs_global_data_t)); self->data = gd; gd->kernel_blurs_convolve = dt_opencl_create_kernel(program, "convolve"); + gd->kernel_blurs_convolve_sparse = dt_opencl_create_kernel(program, "convolve_sparse"); + gd->kernel_blurs_restore_alpha = dt_opencl_create_kernel(program, "restore_alpha"); } @@ -657,6 +853,8 @@ void cleanup_global(dt_iop_module_so_t *self) { const dt_iop_blurs_global_data_t *gd = self->data; dt_opencl_free_kernel(gd->kernel_blurs_convolve); + dt_opencl_free_kernel(gd->kernel_blurs_convolve_sparse); + dt_opencl_free_kernel(gd->kernel_blurs_restore_alpha); free(self->data); self->data = NULL; } From 42e71b7a009f72efeff72508fc9cf8e94739e338 Mon Sep 17 00:00:00 2001 From: Daniele Pighin Date: Tue, 5 May 2026 10:15:58 +0200 Subject: [PATCH 2/2] Address reviewer comments. --- data/kernels/blurs.cl | 14 ++-- src/iop/blurs.c | 171 +++++++++++++++++++++--------------------- 2 files changed, 96 insertions(+), 89 deletions(-) diff --git a/data/kernels/blurs.cl b/data/kernels/blurs.cl index 53f65c9ff61b..176da6c9cf7f 100644 --- a/data/kernels/blurs.cl +++ b/data/kernels/blurs.cl @@ -1,6 +1,6 @@ /* This file is part of darktable, - copyright (c) 2021-2025 darktable developers. + copyright (c) 2021-2026 darktable developers. darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -23,8 +23,10 @@ kernel void convolve(read_only image2d_t in, __global const float *kern, write_only image2d_t out, - const int width, const int height, - const int radius, const int kernel_width) + const int width, + const int height, + const int radius, + const int kernel_width) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -56,7 +58,8 @@ kernel void convolve_sparse(read_only image2d_t in, __global const int *offsets_y, __global const float *values, write_only image2d_t out, - const int width, const int height, + const int width, + const int height, const int n_entries) { const int x = get_global_id(0); @@ -82,7 +85,8 @@ kernel void convolve_sparse(read_only image2d_t in, kernel void restore_alpha(read_only image2d_t original, read_only image2d_t blurred, write_only image2d_t out, - const int width, const int height) + const int width, + const int height) { const int x = get_global_id(0); const int y = get_global_id(1); diff --git a/src/iop/blurs.c b/src/iop/blurs.c index ab931b5d1666..286041ee653c 100644 --- a/src/iop/blurs.c +++ b/src/iop/blurs.c @@ -601,108 +601,111 @@ void process(dt_iop_module_t *self, DT_OMP_FOR_SIMD(aligned(in, out : 64)) for(size_t k = 0; k < npix; k++) out[k * 4 + 3] = in[k * 4 + 3]; - - return; } + else + { - // ── Lens / motion: sparse spatial convolution ── - // - // With tiling_callback declaring overlap = radius, the pipeline provides - // roi_in with a halo of (at least) radius pixels on each side. - // ox/oy are the position of roi_out's origin within the roi_in buffer. - const int in_width = roi_in->width; - const int in_height = roi_in->height; - const int ox = roi_out->x - roi_in->x; // >= 0 - const int oy = roi_out->y - roi_in->y; // >= 0 - - const size_t kernel_width = 2 * radius + 1; - const size_t max_entries = kernel_width * kernel_width; + // ── Lens / motion: sparse spatial convolution ── + // + // With tiling_callback declaring overlap = radius, the pipeline provides + // roi_in with a halo of (at least) radius pixels on each side. + // ox/oy are the position of roi_out's origin within the roi_in buffer. + const int in_width = roi_in->width; + const int in_height = roi_in->height; + const int ox = roi_out->x - roi_in->x; // >= 0 + const int oy = roi_out->y - roi_in->y; // >= 0 - float *const restrict kernel = dt_alloc_align_float(max_entries); - ptrdiff_t *const restrict offsets = dt_alloc_aligned(max_entries * sizeof(ptrdiff_t)); - float *const restrict values = dt_alloc_align_float(max_entries); + const size_t kernel_width = 2 * radius + 1; + const size_t max_entries = kernel_width * kernel_width; - if(!kernel || !offsets || !values) - { - dt_free_align(kernel); - dt_free_align(offsets); - dt_free_align(values); - dt_iop_copy_image_roi(ovoid, ivoid, 4, roi_in, roi_out); - return; - } + float *const restrict kernel = dt_alloc_align_float(max_entries); + ptrdiff_t *const restrict offsets = dt_alloc_aligned(max_entries * sizeof(ptrdiff_t)); + float *const restrict values = dt_alloc_align_float(max_entries); - _build_pixel_kernel(kernel, kernel_width, kernel_width, p); - - // Build sparse list: pixel-offset (in floats) from center, for the interior - // fast path where no clamping is needed. - int n_nonzero = 0; - for(int l = -radius; l <= radius; l++) - for(int m = -radius; m <= radius; m++) + if(!kernel || !offsets || !values) { - const float k = kernel[(l + radius) * kernel_width + (m + radius)]; - if(k > 1e-6f) - { - offsets[n_nonzero] = ((ptrdiff_t)l * in_width + m) * 4; - values[n_nonzero] = k; - n_nonzero++; - } + dt_free_align(kernel); + dt_free_align(offsets); + dt_free_align(values); + dt_iop_copy_image_roi(ovoid, ivoid, 4, roi_in, roi_out); } - - DT_OMP_FOR(collapse(2)) - for(int i = 0; i < roi_out->height; i++) - for(int j = 0; j < roi_out->width; j++) + else { - const size_t idx_out = ((size_t)i * roi_out->width + j) * 4; - float DT_ALIGNED_PIXEL acc[4] = { 0.f }; - // Position of this output pixel inside the roi_in buffer - const int ci = i + oy; - const int cj = j + ox; + _build_pixel_kernel(kernel, kernel_width, kernel_width, p); - if(ci >= radius && cj >= radius && ci < in_height - radius && cj < in_width - radius) - { - // Interior: all neighbours are within roi_in — no clamping needed - const float *in_center = in + ((size_t)ci * in_width + cj) * 4; - for(int s = 0; s < n_nonzero; s++) + // Build sparse list: pixel-offset (in floats) from center, for the interior + // fast path where no clamping is needed. + int n_nonzero = 0; + for(int l = -radius; l <= radius; l++) + for(int m = -radius; m <= radius; m++) { - const float *p_src = in_center + offsets[s]; - const float w = values[s]; - acc[0] += w * p_src[0]; - acc[1] += w * p_src[1]; - acc[2] += w * p_src[2]; - acc[3] += w * p_src[3]; + const float k = kernel[(l + radius) * kernel_width + (m + radius)]; + if(k > 1e-6f) + { + offsets[n_nonzero] = ((ptrdiff_t)l * in_width + m) * 4; + values[n_nonzero] = k; + n_nonzero++; + } } - } - else - { - // Edge: clamp neighbours to roi_in bounds - for(int l = -radius; l <= radius; l++) - for(int m = -radius; m <= radius; m++) + + DT_OMP_FOR(collapse(2)) + for(int i = 0; i < roi_out->height; i++) + for(int j = 0; j < roi_out->width; j++) + { + const size_t idx_out = ((size_t)i * roi_out->width + j) * 4; + float DT_ALIGNED_PIXEL acc[4] = { 0.f }; + + // Position of this output pixel inside the roi_in buffer + const int ci = i + oy; + const int cj = j + ox; + + if(ci >= radius && cj >= radius && ci < in_height - radius && cj < in_width - radius) { - const float k = kernel[(l + radius) * kernel_width + (m + radius)]; - if(k > 1e-6f) + // Interior: all neighbours are within roi_in — no clamping needed + const float *in_center = in + ((size_t)ci * in_width + cj) * 4; + for(int s = 0; s < n_nonzero; s++) { - const int ii = CLAMP(ci + l, 0, in_height - 1); - const int jj = CLAMP(cj + m, 0, in_width - 1); - const float *p_src = in + ((size_t)ii * in_width + jj) * 4; - acc[0] += k * p_src[0]; - acc[1] += k * p_src[1]; - acc[2] += k * p_src[2]; - acc[3] += k * p_src[3]; + const float *p_src = in_center + offsets[s]; + const float w = values[s]; + acc[0] += w * p_src[0]; + acc[1] += w * p_src[1]; + acc[2] += w * p_src[2]; + acc[3] += w * p_src[3]; } } - } + else + { + // Edge: clamp neighbours to roi_in bounds + for(int l = -radius; l <= radius; l++) + for(int m = -radius; m <= radius; m++) + { + const float k = kernel[(l + radius) * kernel_width + (m + radius)]; + if(k > 1e-6f) + { + const int ii = CLAMP(ci + l, 0, in_height - 1); + const int jj = CLAMP(cj + m, 0, in_width - 1); + const float *p_src = in + ((size_t)ii * in_width + jj) * 4; + acc[0] += k * p_src[0]; + acc[1] += k * p_src[1]; + acc[2] += k * p_src[2]; + acc[3] += k * p_src[3]; + } + } + } - out[idx_out + 0] = acc[0]; - out[idx_out + 1] = acc[1]; - out[idx_out + 2] = acc[2]; - // preserve original alpha (pipeline mask) - out[idx_out + 3] = in[((size_t)ci * in_width + cj) * 4 + 3]; - } + out[idx_out + 0] = acc[0]; + out[idx_out + 1] = acc[1]; + out[idx_out + 2] = acc[2]; + // preserve original alpha (pipeline mask) + out[idx_out + 3] = in[((size_t)ci * in_width + cj) * 4 + 3]; + } - dt_free_align(kernel); - dt_free_align(offsets); - dt_free_align(values); + dt_free_align(kernel); + dt_free_align(offsets); + dt_free_align(values); + } + } }