From 896accf7fcef92be3e364e9d43f6525d883b1014 Mon Sep 17 00:00:00 2001 From: Andrii Ryzhkov Date: Thu, 23 Apr 2026 13:33:44 +0200 Subject: [PATCH 1/9] Add AI raw denoise (RawNIND, Bayer + X-Trans) --- data/ai_models.json | 8 + data/darktableconfig.xml.in | 4 +- dev-doc/AI.md | 16 +- dev-doc/AI_Tasks.md | 4 +- src/CMakeLists.txt | 4 + src/ai/backend.h | 11 +- src/ai/backend_common.c | 56 +- src/ai/backend_onnx.c | 50 +- src/common/ai/restore.c | 1479 ++++++++-------- src/common/ai/restore.h | 560 ++++--- src/common/ai/restore_common.h | 236 +++ src/common/ai/restore_raw_bayer.c | 803 +++++++++ src/common/ai/restore_raw_bayer.h | 134 ++ src/common/ai/restore_raw_linear.c | 1032 ++++++++++++ src/common/ai/restore_raw_linear.h | 161 ++ src/common/ai/restore_rgb.c | 832 +++++++++ src/common/ai/restore_rgb.h | 180 ++ src/common/ai/segmentation.c | 4 +- src/common/dng_writer.c | 373 +++++ src/common/dng_writer.h | 105 ++ src/libs/neural_restore.c | 1949 +++++++++++++++++++--- src/tests/unittests/ai/test_ai_backend.c | 4 +- 22 files changed, 6689 insertions(+), 1316 deletions(-) create mode 100644 src/common/ai/restore_common.h create mode 100644 src/common/ai/restore_raw_bayer.c create mode 100644 src/common/ai/restore_raw_bayer.h create mode 100644 src/common/ai/restore_raw_linear.c create mode 100644 src/common/ai/restore_raw_linear.h create mode 100644 src/common/ai/restore_rgb.c create mode 100644 src/common/ai/restore_rgb.h create mode 100644 src/common/dng_writer.c create mode 100644 src/common/dng_writer.h diff --git a/data/ai_models.json b/data/ai_models.json index 12d46c1cb66d..08fa95e6acad 100644 --- a/data/ai_models.json +++ b/data/ai_models.json @@ -35,6 +35,14 @@ "github_asset": "denoise-nafnet.dtmodel", "default": false }, + { + "id": "rawdenoise-nind", + "name": "raw denoise nind", + "description": "UtNet2 raw denoiser trained on RawNIND dataset", + "task": "rawdenoise", + "github_asset": "rawdenoise-nind.dtmodel", + "default": true + }, { "id": 
"upscale-bsrgan", "name": "upscale bsrgan", diff --git a/data/darktableconfig.xml.in b/data/darktableconfig.xml.in index d4bf3bb7cc87..e14605fe16fc 100644 --- a/data/darktableconfig.xml.in +++ b/data/darktableconfig.xml.in @@ -3772,9 +3772,9 @@ plugins/lighttable/neural_restore/detail_recovery_bands string - 0.5,0.3,0.1,0.05,0.02 + 0.25,0.15,0.05,0.02,0.01 detail recovery wavelet band thresholds - comma-separated sigma multipliers for wavelet detail recovery bands (finest to coarsest). controls how much noise vs texture is recovered by the detail recovery slider + comma-separated sigma multipliers for wavelet detail recovery bands (finest to coarsest). controls how much noise vs texture passes through the DWT filter when strength is below 100 plugins/lighttable/neural_restore/preview_height diff --git a/dev-doc/AI.md b/dev-doc/AI.md index 18270ff25382..4c5c2454924c 100644 --- a/dev-doc/AI.md +++ b/dev-doc/AI.md @@ -17,7 +17,12 @@ src/ai/ ONNX Runtime backend (darktable_ai static lib) src/common/ai/ higher-level AI modules (compiled in lib_darktable) segmentation.c/.h SAM/SegNext interactive masking - restore.c/.h denoise/upscale tiled inference + restore.c/.h generic env/ctx lifecycle + model loaders + restore_common.h private struct defs shared by restore_* + restore_rgb.c/.h RGB-path denoise + upscale (tiled inference, + shadow boost, DWT detail recovery) + restore_raw_bayer.c/.h RawNIND Bayer denoise (batch + piped preview) + restore_raw_linear.c/.h RawNIND linear/X-Trans denoise src/common/ai_models.c/.h model registry, download, preferences integration src/gui/preferences_ai.c AI preferences tab @@ -402,6 +407,9 @@ FILE(GLOB SOURCE_FILES_AI "common/ai_models.c" "common/ai/segmentation.c" "common/ai/restore.c" + "common/ai/restore_rgb.c" + "common/ai/restore_raw_bayer.c" + "common/ai/restore_raw_linear.c" "common/ai/your_task.c" # add here ... 
) @@ -455,8 +463,10 @@ dt_your_task_free(ctx); | Task | Key | API | Consumer | |------|-----|-----|----------| | Object Mask | `"mask"` | `src/common/ai/segmentation.h` | `src/develop/masks/object.c` | -| Denoise | `"denoise"` | `src/common/ai/restore.h` | `src/libs/neural_restore.c` | -| Upscale | `"upscale"` | `src/common/ai/restore.h` | `src/libs/neural_restore.c` | +| Denoise | `"denoise"` | `src/common/ai/restore_rgb.h` | `src/libs/neural_restore.c` | +| Upscale | `"upscale"` | `src/common/ai/restore_rgb.h` | `src/libs/neural_restore.c` | +| Raw Denoise (Bayer) | `"rawdenoise"` | `src/common/ai/restore_raw_bayer.h` | `src/libs/neural_restore.c` | +| Raw Denoise (Linear) | `"rawdenoise"` | `src/common/ai/restore_raw_linear.h` | `src/libs/neural_restore.c` | For model requirements, I/O specifications, tiling strategies, color space conventions, ONNX export instructions, and config.json examples diff --git a/dev-doc/AI_Tasks.md b/dev-doc/AI_Tasks.md index 8720f9d9b2ef..9a034a70be3c 100644 --- a/dev-doc/AI_Tasks.md +++ b/dev-doc/AI_Tasks.md @@ -131,7 +131,7 @@ repository. Requirements for the decoder export: Removes noise from developed images using neural network inference. **Task key**: `"denoise"` -**API**: `src/common/ai/restore.h` (`dt_restore_load_denoise`) +**API**: `src/common/ai/restore.h` (loader: `dt_restore_load_denoise`), `src/common/ai/restore_rgb.h` (processing: `dt_restore_process_tiled`) **Consumer**: `src/libs/neural_restore.c` ### How It Works @@ -222,7 +222,7 @@ torch.onnx.export(model, dummy_input, "model.onnx", Super-resolution upscaling of developed images (2x or 4x). 
**Task key**: `"upscale"` -**API**: `src/common/ai/restore.h` (`dt_restore_load_upscale_x2`, `dt_restore_load_upscale_x4`) +**API**: `src/common/ai/restore.h` (loaders: `dt_restore_load_upscale_x2`, `dt_restore_load_upscale_x4`), `src/common/ai/restore_rgb.h` (processing: `dt_restore_process_tiled`) **Consumer**: `src/libs/neural_restore.c` ### How It Works diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e2d15b5b16cc..035104ed95d4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -462,6 +462,10 @@ if(USE_AI) "common/ai_models.c" "common/ai/segmentation.c" "common/ai/restore.c" + "common/ai/restore_rgb.c" + "common/ai/restore_raw_bayer.c" + "common/ai/restore_raw_linear.c" + "common/dng_writer.c" "develop/masks/object.c" "gui/preferences_ai.c" ) diff --git a/src/ai/backend.h b/src/ai/backend.h index 776bf8a4c523..93ca4fde67fe 100644 --- a/src/ai/backend.h +++ b/src/ai/backend.h @@ -166,6 +166,14 @@ double dt_ai_model_attribute_double(const dt_ai_model_info_t *info, char *dt_ai_model_attribute_string(const dt_ai_model_info_t *info, const char *key); +/** Return a newly-allocated int array from the JSON-array attribute + * named `key`. *out_count is set to the array length; NULL is returned + * (and *out_count = 0) when the key is absent or not a JSON array. + * Caller frees the returned array with g_free(). 
*/ +int *dt_ai_model_attribute_int_array(const dt_ai_model_info_t *info, + const char *key, + int *out_count); + /* --- Discovery --- */ /** @@ -267,7 +275,8 @@ dt_ai_context_t *dt_ai_load_model_ext(dt_ai_environment_t *env, dt_ai_provider_t provider, dt_ai_opt_level_t opt_level, const dt_ai_dim_override_t *dim_overrides, - int n_overrides); + int n_overrides, + uint32_t ep_flags); /** * @brief Tensor Data Types diff --git a/src/ai/backend_common.c b/src/ai/backend_common.c index 1e9ae773e924..f7a594f90d49 100644 --- a/src/ai/backend_common.c +++ b/src/ai/backend_common.c @@ -357,7 +357,8 @@ dt_ai_provider_t dt_ai_env_get_provider(dt_ai_environment_t *env) extern dt_ai_context_t * dt_ai_onnx_load_ext(const char *model_dir, const char *model_file, dt_ai_provider_t provider, dt_ai_opt_level_t opt_level, - const dt_ai_dim_override_t *dim_overrides, int n_overrides); + const dt_ai_dim_override_t *dim_overrides, int n_overrides, + uint32_t ep_flags); // model loading with backend dispatch @@ -367,7 +368,7 @@ dt_ai_context_t *dt_ai_load_model(dt_ai_environment_t *env, dt_ai_provider_t provider) { return dt_ai_load_model_ext(env, model_id, model_file, provider, - DT_AI_OPT_ALL, NULL, 0); + DT_AI_OPT_ALL, NULL, 0, 0); } dt_ai_context_t *dt_ai_load_model_ext(dt_ai_environment_t *env, @@ -376,7 +377,8 @@ dt_ai_context_t *dt_ai_load_model_ext(dt_ai_environment_t *env, dt_ai_provider_t provider, dt_ai_opt_level_t opt_level, const dt_ai_dim_override_t *dim_overrides, - int n_overrides) + int n_overrides, + uint32_t ep_flags) { if(!env || !model_id) return NULL; @@ -428,7 +430,7 @@ dt_ai_context_t *dt_ai_load_model_ext(dt_ai_environment_t *env, if(strcmp(backend_copy, "onnx") == 0) { ctx = dt_ai_onnx_load_ext(model_dir, model_file, provider, opt_level, - dim_overrides, n_overrides); + dim_overrides, n_overrides, ep_flags); } else { @@ -448,6 +450,10 @@ dt_ai_context_t *dt_ai_load_model_ext(dt_ai_environment_t *env, // _attribute_node returns the parsed JsonParser plus a borrowed 
JsonNode* // for the named key; caller must g_object_unref the returned parser; // returns NULL parser if the attribute set is absent or the key is missing +// +// the key accepts a dotted path ("variants.bayer.onnx"): each segment +// except the last must resolve to a JSON object; the final segment is +// the leaf lookup and may hold any JSON value type static JsonParser *_attribute_node(const dt_ai_model_info_t *info, const char *key, JsonNode **out_node) @@ -467,12 +473,26 @@ static JsonParser *_attribute_node(const dt_ai_model_info_t *info, return NULL; } JsonObject *obj = json_node_get_object(root); - if(!json_object_has_member(obj, key)) + gchar **segments = g_strsplit(key, ".", -1); + const int n = g_strv_length(segments); + JsonNode *node = NULL; + for(int i = 0; i < n; i++) + { + if(!json_object_has_member(obj, segments[i])) goto out; + node = json_object_get_member(obj, segments[i]); + if(i == n - 1) break; + // intermediate segments must be objects to descend further + if(!node || !JSON_NODE_HOLDS_OBJECT(node)) { node = NULL; goto out; } + obj = json_node_get_object(node); + } +out: + g_strfreev(segments); + if(!node) { g_object_unref(parser); return NULL; } - *out_node = json_object_get_member(obj, key); + *out_node = node; return parser; } @@ -529,6 +549,30 @@ char *dt_ai_model_attribute_string(const dt_ai_model_info_t *info, return result; } +int *dt_ai_model_attribute_int_array(const dt_ai_model_info_t *info, + const char *key, + int *out_count) +{ + if(out_count) *out_count = 0; + JsonNode *v = NULL; + JsonParser *p = _attribute_node(info, key, &v); + int *result = NULL; + if(v && JSON_NODE_HOLDS_ARRAY(v)) + { + JsonArray *arr = json_node_get_array(v); + const guint n = json_array_get_length(arr); + if(n > 0) + { + result = g_new(int, n); + for(guint i = 0; i < n; i++) + result[i] = (int)json_array_get_int_element(arr, i); + if(out_count) *out_count = (int)n; + } + } + if(p) g_object_unref(p); + return result; +} + // provider string conversion 
const char *dt_ai_provider_to_string(dt_ai_provider_t provider) diff --git a/src/ai/backend_onnx.c b/src/ai/backend_onnx.c index a6ce05ca2236..80cf93072f67 100644 --- a/src/ai/backend_onnx.c +++ b/src/ai/backend_onnx.c @@ -801,7 +801,8 @@ static float _half_to_float(uint16_t h) static gboolean _try_provider(OrtSessionOptions *session_opts, const char *symbol_name, const char *provider_name, - const char *device_type) + const char *device_type, + uint32_t flags) { OrtStatus *status = NULL; gboolean ok = FALSE; @@ -851,7 +852,7 @@ static gboolean _try_provider(OrtSessionOptions *session_opts, // integer-argument providers (CUDA, CoreML, DML, MIGraphX, ROCm) typedef OrtStatus *(*ProviderAppenderInt)(OrtSessionOptions *, uint32_t); ProviderAppenderInt appender = (ProviderAppenderInt)func_ptr; - status = appender(session_opts, 0); + status = appender(session_opts, flags); } if(!status) { @@ -880,7 +881,9 @@ static gboolean _try_provider(OrtSessionOptions *session_opts, } static void -_enable_acceleration(OrtSessionOptions *session_opts, dt_ai_provider_t provider) +_enable_acceleration(OrtSessionOptions *session_opts, + dt_ai_provider_t provider, + uint32_t coreml_flags) { switch(provider) { @@ -894,14 +897,14 @@ _enable_acceleration(OrtSessionOptions *session_opts, dt_ai_provider_t provider) _try_provider( session_opts, "OrtSessionOptionsAppendExecutionProvider_CoreML", - "Apple CoreML", NULL); + "Apple CoreML", NULL, coreml_flags); #else dt_print(DT_DEBUG_AI, "[darktable_ai] apple CoreML not available on this platform"); #endif break; case DT_AI_PROVIDER_CUDA: - _try_provider(session_opts, "OrtSessionOptionsAppendExecutionProvider_CUDA", "NVIDIA CUDA", NULL); + _try_provider(session_opts, "OrtSessionOptionsAppendExecutionProvider_CUDA", "NVIDIA CUDA", NULL, 0); break; case DT_AI_PROVIDER_MIGRAPHX: @@ -909,13 +912,13 @@ _enable_acceleration(OrtSessionOptions *session_opts, dt_ai_provider_t provider) // load time, so they must be set before CreateEnv() — see // 
_setup_amd_caches() above. OpenVINO (below) takes options // per-session, so its cache path is passed inline here - if(!_try_provider(session_opts, "OrtSessionOptionsAppendExecutionProvider_MIGraphX", "AMD MIGraphX", NULL)) - _try_provider(session_opts, "OrtSessionOptionsAppendExecutionProvider_ROCM", "AMD ROCm (legacy)", NULL); + if(!_try_provider(session_opts, "OrtSessionOptionsAppendExecutionProvider_MIGraphX", "AMD MIGraphX", NULL, 0)) + _try_provider(session_opts, "OrtSessionOptionsAppendExecutionProvider_ROCM", "AMD ROCm (legacy)", NULL, 0); break; case DT_AI_PROVIDER_OPENVINO: if(!_try_openvino_with_cache(session_opts)) - _try_provider(session_opts, "OrtSessionOptionsAppendExecutionProvider_OpenVINO", "Intel OpenVINO", "AUTO"); + _try_provider(session_opts, "OrtSessionOptionsAppendExecutionProvider_OpenVINO", "Intel OpenVINO", "AUTO", 0); break; case DT_AI_PROVIDER_DIRECTML: @@ -923,7 +926,7 @@ _enable_acceleration(OrtSessionOptions *session_opts, dt_ai_provider_t provider) _try_provider( session_opts, "OrtSessionOptionsAppendExecutionProvider_DML", - "Windows DirectML", NULL); + "Windows DirectML", NULL, 0); #else dt_print(DT_DEBUG_AI, "[darktable_ai] windows DirectML not available on this platform"); #endif @@ -936,27 +939,27 @@ _enable_acceleration(OrtSessionOptions *session_opts, dt_ai_provider_t provider) _try_provider( session_opts, "OrtSessionOptionsAppendExecutionProvider_CoreML", - "Apple CoreML", NULL); + "Apple CoreML", NULL, coreml_flags); #elif defined(_WIN32) _try_provider( session_opts, "OrtSessionOptionsAppendExecutionProvider_DML", - "Windows DirectML", NULL); + "Windows DirectML", NULL, 0); #elif defined(__linux__) // try CUDA first, then MIGraphX (cache configured at env init) if(!_try_provider( session_opts, "OrtSessionOptionsAppendExecutionProvider_CUDA", - "NVIDIA CUDA", NULL)) + "NVIDIA CUDA", NULL, 0)) { if(!_try_provider( session_opts, "OrtSessionOptionsAppendExecutionProvider_MIGraphX", - "AMD MIGraphX", NULL)) + "AMD MIGraphX", 
NULL, 0)) _try_provider( session_opts, "OrtSessionOptionsAppendExecutionProvider_ROCM", - "AMD ROCm (legacy)", NULL); + "AMD ROCm (legacy)", NULL, 0); } #endif break; @@ -996,20 +999,20 @@ int dt_ai_probe_provider(dt_ai_provider_t provider) switch(provider) { case DT_AI_PROVIDER_COREML: - ok = _try_provider(opts, "OrtSessionOptionsAppendExecutionProvider_CoreML", "Apple CoreML", NULL); + ok = _try_provider(opts, "OrtSessionOptionsAppendExecutionProvider_CoreML", "Apple CoreML", NULL, 0); break; case DT_AI_PROVIDER_CUDA: - ok = _try_provider(opts, "OrtSessionOptionsAppendExecutionProvider_CUDA", "NVIDIA CUDA", NULL); + ok = _try_provider(opts, "OrtSessionOptionsAppendExecutionProvider_CUDA", "NVIDIA CUDA", NULL, 0); break; case DT_AI_PROVIDER_MIGRAPHX: - ok = _try_provider(opts, "OrtSessionOptionsAppendExecutionProvider_MIGraphX", "AMD MIGraphX", NULL) - || _try_provider(opts, "OrtSessionOptionsAppendExecutionProvider_ROCM", "AMD ROCm (legacy)", NULL); + ok = _try_provider(opts, "OrtSessionOptionsAppendExecutionProvider_MIGraphX", "AMD MIGraphX", NULL, 0) + || _try_provider(opts, "OrtSessionOptionsAppendExecutionProvider_ROCM", "AMD ROCm (legacy)", NULL, 0); break; case DT_AI_PROVIDER_OPENVINO: - ok = _try_provider(opts, "OrtSessionOptionsAppendExecutionProvider_OpenVINO", "Intel OpenVINO", "AUTO"); + ok = _try_provider(opts, "OrtSessionOptionsAppendExecutionProvider_OpenVINO", "Intel OpenVINO", "AUTO", 0); break; case DT_AI_PROVIDER_DIRECTML: - ok = _try_provider(opts, "OrtSessionOptionsAppendExecutionProvider_DML", "Windows DirectML", NULL); + ok = _try_provider(opts, "OrtSessionOptionsAppendExecutionProvider_DML", "Windows DirectML", NULL, 0); break; default: break; @@ -1026,7 +1029,8 @@ int dt_ai_probe_provider(dt_ai_provider_t provider) dt_ai_context_t * dt_ai_onnx_load_ext(const char *model_dir, const char *model_file, dt_ai_provider_t provider, dt_ai_opt_level_t opt_level, - const dt_ai_dim_override_t *dim_overrides, int n_overrides) + const 
dt_ai_dim_override_t *dim_overrides, int n_overrides, + uint32_t ep_flags) { if(!model_dir) return NULL; @@ -1111,7 +1115,7 @@ dt_ai_onnx_load_ext(const char *model_dir, const char *model_file, } // optimize: enable hardware acceleration (AMD caches set at env init) - _enable_acceleration(session_opts, provider); + _enable_acceleration(session_opts, provider, ep_flags); #ifdef _WIN32 // on windows, CreateSession expects a wide character string @@ -1176,7 +1180,7 @@ dt_ai_onnx_load_ext(const char *model_dir, const char *model_file, if(s) g_ort->ReleaseStatus(s); } if(fallbacks[fb].prov != DT_AI_PROVIDER_CPU) - _enable_acceleration(session_opts, fallbacks[fb].prov); + _enable_acceleration(session_opts, fallbacks[fb].prov, ep_flags); #ifdef _WIN32 status = g_ort->CreateSession(g_env, onnx_path_wide, session_opts, &ctx->session); #else diff --git a/src/common/ai/restore.c b/src/common/ai/restore.c index 53ceab0b4863..7be560413db9 100644 --- a/src/common/ai/restore.c +++ b/src/common/ai/restore.c @@ -17,120 +17,34 @@ */ #include "common/ai/restore.h" +#include "common/ai/restore_common.h" #include "ai/backend.h" #include "common/darktable.h" #include "common/ai_models.h" #include "common/colorspaces.h" -#include "common/colorspaces_inline_conversions.h" -#include "common/imagebuf.h" -#include "common/math.h" -#include "common/matrices.h" +#include "common/iop_order.h" +#include "control/control.h" #include "control/jobs.h" +#include "develop/develop.h" +#include "develop/imageop.h" +#include "develop/pixelpipe_hb.h" +#include "imageio/imageio_common.h" -// forward-declare to avoid pulling in dwt.h (which -// includes OpenCL types when HAVE_OPENCL is defined) -extern void dwt_denoise(float *buf, int width, int height, - int bands, const float *noise); #include #include #define OVERLAP_DENOISE 64 #define OVERLAP_UPSCALE 16 -#define MAX_MODEL_INPUTS 4 -#define DWT_DETAIL_BANDS 5 // candidate tile sizes from largest to smallest, used by both the // startup memory-budget 
selector and the runtime OOM-retry fallback. // the memory-budget check gates which entry is chosen at startup; // the tile size cache persists the result so JIT-compiling EPs -// (MIGraphX, CoreML, TensorRT) only pay the compile cost once. +// (MIGraphX, CoreML, TensorRT) only pay the compile cost once #define DT_RESTORE_TILE_LADDER_1X {2048, 1536, 1024, 768, 512, 384, 256} #define DT_RESTORE_TILE_LADDER_SR {768, 512, 384, 256, 192} -/* --- opaque struct definitions --- */ - -struct dt_restore_env_t -{ - dt_ai_environment_t *ai_env; -}; - -struct dt_restore_context_t -{ - dt_ai_context_t *ai_ctx; - dt_restore_env_t *env; - char *model_id; - char *model_file; - char *task; - int tile_size; // tile size used to create the current session - char *dim_h; // symbolic height dim name used for session overrides - char *dim_w; // symbolic width dim name used for session overrides - // color management: convert from working profile to sRGB before - // inference (model was trained on sRGB primaries) and back after. - // if has_profile is FALSE, fall back to gamma-only conversion - // (treats working-profile numbers as if they were sRGB) - gboolean has_profile; - float wp_to_srgb[9]; // working profile RGB -> sRGB linear (row-major) - float srgb_to_wp[9]; // sRGB linear -> working profile RGB (row-major) - // when TRUE (default), out-of-sRGB-gamut pixels pass through unchanged - // during denoise. 
when FALSE, every pixel uses the model output and - // wide-gamut colors get clipped to sRGB but everything is denoised - gboolean preserve_wide_gamut; - // shadow_boost_capable: TRUE when the model declares the - // "shadow_boost" attribute in its config.json; set once at load - gboolean shadow_boost_capable; - // shadow_boost: the effective flag used at inference; recomputed - // per-image inside dt_restore_process_tiled() when capable, based - // on a luminance check (bright images skip the curve) - gboolean shadow_boost; - gint ref_count; -}; - -// default multipliers of residual sigma for each wavelet band. -// band 0 (finest) gets the strongest suppression since fine-scale -// features are hardest to distinguish from noise. coarser bands -// preserve more because they capture real texture. -// tunable via darktablerc: plugins/lighttable/neural_restore/detail_recovery_bands -static const float _dwt_sigma_mul_default[DWT_DETAIL_BANDS] = { - 0.5f, // band 0 (finest) — suppress fine luminance noise - 0.3f, // band 1 - 0.1f, // band 2 - 0.05f, // band 3 - 0.02f // band 4 (coarsest) — keep almost everything -}; - -// compute adaptive noise thresholds from residual standard deviation -static void _compute_adaptive_noise(const float *const restrict buf, - const size_t npix, - float noise[DWT_DETAIL_BANDS]) -{ - // read band multipliers from config (comma-separated list). - // e.g. 
"0.5,0.3,0.1,0.05,0.02" in darktablerc - float sigma_mul[DWT_DETAIL_BANDS]; - memcpy(sigma_mul, _dwt_sigma_mul_default, sizeof(sigma_mul)); - gchar *val = dt_conf_get_string("plugins/lighttable/neural_restore/detail_recovery_bands"); - if(val && val[0]) - { - gchar **parts = g_strsplit(val, ",", DWT_DETAIL_BANDS); - for(int b = 0; parts[b] && b < DWT_DETAIL_BANDS; b++) - sigma_mul[b] = g_ascii_strtod(g_strstrip(parts[b]), NULL); - g_strfreev(parts); - } - g_free(val); - - double sum = 0.0, sum2 = 0.0; - for(size_t i = 0; i < npix; i++) - { - sum += (double)buf[i]; - sum2 += (double)buf[i] * (double)buf[i]; - } - const double mean = sum / (double)npix; - const float sigma = (float)sqrt(sum2 / (double)npix - mean * mean); - - for(int b = 0; b < DWT_DETAIL_BANDS; b++) - noise[b] = sigma * sigma_mul[b]; -} - -/* --- environment lifecycle --- */ +// --- environment lifecycle --- dt_restore_env_t *dt_restore_env_init(void) { @@ -156,12 +70,131 @@ void dt_restore_env_destroy(dt_restore_env_t *env) g_free(env); } -/* --- model lifecycle --- */ +// --- model lifecycle --- + +#define TASK_DENOISE "denoise" +#define TASK_RAWDENOISE "rawdenoise" +#define TASK_UPSCALE "upscale" + +// --- manifest policy parsers --- +// +// parse a variant's string attribute into the matching enum. 
unknown +// values return UNKNOWN (for input_kind — caller validates), or the +// supplied default (for the other three — caller has already decided +// the default matches today's RawNIND behavior) + +static dt_restore_input_kind_t _parse_input_kind(const char *s) +{ + if(!s) return DT_RESTORE_INPUT_KIND_UNKNOWN; + if(!g_strcmp0(s, "bayer_v1")) return DT_RESTORE_INPUT_KIND_BAYER_V1; + if(!g_strcmp0(s, "xtrans_v1")) return DT_RESTORE_INPUT_KIND_XTRANS_V1; + if(!g_strcmp0(s, "linear_v1")) return DT_RESTORE_INPUT_KIND_LINEAR_V1; + return DT_RESTORE_INPUT_KIND_UNKNOWN; +} + +static const char *_input_kind_name(dt_restore_input_kind_t k) +{ + switch(k) + { + case DT_RESTORE_INPUT_KIND_BAYER_V1: return "bayer_v1"; + case DT_RESTORE_INPUT_KIND_XTRANS_V1: return "xtrans_v1"; + case DT_RESTORE_INPUT_KIND_LINEAR_V1: return "linear_v1"; + default: return "unknown"; + } +} + +static dt_restore_colorspace_t _parse_colorspace(const char *s, + dt_restore_colorspace_t dflt) +{ + if(!s) return dflt; + if(!g_strcmp0(s, "lin_rec2020")) return DT_RESTORE_CS_LIN_REC2020; + if(!g_strcmp0(s, "camRGB")) return DT_RESTORE_CS_CAMRGB; + if(!g_strcmp0(s, "srgb_linear")) return DT_RESTORE_CS_SRGB_LINEAR; + dt_print(DT_DEBUG_AI, + "[restore] unknown input_colorspace '%s', using default", s); + return dflt; +} + +static dt_restore_wb_mode_t _parse_wb_mode(const char *s, + dt_restore_wb_mode_t dflt) +{ + if(!s) return dflt; + if(!g_strcmp0(s, "daylight")) return DT_RESTORE_WB_DAYLIGHT; + if(!g_strcmp0(s, "as_shot")) return DT_RESTORE_WB_AS_SHOT; + if(!g_strcmp0(s, "none")) return DT_RESTORE_WB_NONE; + dt_print(DT_DEBUG_AI, + "[restore] unknown wb_norm '%s', using default", s); + return dflt; +} + +static dt_restore_output_scale_t _parse_output_scale(const char *s, + dt_restore_output_scale_t dflt) +{ + if(!s) return dflt; + if(!g_strcmp0(s, "match_gain")) return DT_RESTORE_OUT_MATCH_GAIN; + if(!g_strcmp0(s, "absolute")) return DT_RESTORE_OUT_ABSOLUTE; + dt_print(DT_DEBUG_AI, + "[restore] unknown 
output_scale '%s', using default", s); + return dflt; +} + +static dt_restore_bayer_orientation_t _parse_bayer_orientation( + const char *s, dt_restore_bayer_orientation_t dflt) +{ + if(!s) return dflt; + if(!g_strcmp0(s, "force_rggb")) return DT_RESTORE_BAYER_FORCE_RGGB; + if(!g_strcmp0(s, "native")) return DT_RESTORE_BAYER_NATIVE; + dt_print(DT_DEBUG_AI, + "[restore] unknown bayer_orientation '%s', using default", s); + return dflt; +} -#define TASK_DENOISE "denoise" -#define TASK_UPSCALE "upscale" +static dt_restore_edge_pad_t _parse_edge_pad(const char *s, + dt_restore_edge_pad_t dflt) +{ + if(!s) return dflt; + if(!g_strcmp0(s, "mirror_cropped")) return DT_RESTORE_EDGE_MIRROR_CROPPED; + if(!g_strcmp0(s, "mirror")) return DT_RESTORE_EDGE_MIRROR; + dt_print(DT_DEBUG_AI, + "[restore] unknown edge_pad '%s', using default", s); + return dflt; +} + +// target_mean accepts "null" as an explicit disable; missing key falls +// back to the per-variant default (NAN for bayer, 0.3 for linear). +// a numeric string parses via g_ascii_strtod +static float _parse_target_mean(const dt_ai_model_info_t *info, + const char *key, float dflt) +{ + char *s = dt_ai_model_attribute_string(info, key); + if(!s) return dflt; + if(!g_strcmp0(s, "null") || !g_strcmp0(s, "none")) + { + g_free(s); + return NAN; + } + char *endp = NULL; + const double v = g_ascii_strtod(s, &endp); + if(endp == s || !endp || *endp != '\0') + { + dt_print(DT_DEBUG_AI, + "[restore] target_mean '%s' not parseable, using default", s); + g_free(s); + return dflt; + } + g_free(s); + return (float)v; +} -static int _select_tile_size(int scale); +static int _select_tile_size(const int *ladder, int n_ladder, int scale); +// resolve the tile ladder for a model: prefer the "input_sizes" JSON +// array attribute from config.json when present, otherwise fall back +// to the built-in ladder for the model's scale. 
always returns a +// freshly-allocated int[] + count that the caller owns and g_free()s +static void _resolve_tile_ladder(const dt_ai_model_info_t *info, + int scale, + int **out_sizes, + int *out_count); // returns the cached tile size for model_id+scale+provider combo, or 0 if not set static int _get_cached_tile_size(const char *model_id, int scale) @@ -187,13 +220,14 @@ static void _set_cached_tile_size(const char *model_id, int scale, int tile_size } // internal: create an ORT session for model_id/model_file with spatial dims -// fixed to tile_size. returns a new ai_ctx, or NULL on failure. +// fixed to tile_size. returns a new ai_ctx, or NULL on failure static dt_ai_context_t *_create_session(dt_ai_environment_t *ai_env, const char *model_id, const char *model_file, const char *dim_h, const char *dim_w, - int tile_size) + int tile_size, + uint32_t ep_flags) { const dt_ai_dim_override_t overrides[] = { { "batch_size", 1 }, @@ -204,13 +238,14 @@ static dt_ai_context_t *_create_session(dt_ai_environment_t *ai_env, return dt_ai_load_model_ext( ai_env, model_id, model_file, DT_AI_PROVIDER_CONFIGURED, DT_AI_OPT_ALL, - overrides, (int)G_N_ELEMENTS(overrides)); + overrides, (int)G_N_ELEMENTS(overrides), ep_flags); } // internal: resolve task -> model_id -> load with tile size dim overrides static dt_restore_context_t *_load(dt_restore_env_t *env, const char *task, - const char *model_file, + const char *variant, + const char *default_file, int scale) { if(!env) return NULL; @@ -227,16 +262,158 @@ static dt_restore_context_t *_load(dt_restore_env_t *env, dt_ai_models_get_spatial_dims(darktable.ai_registry, model_id, &dim_h, &dim_w); - // select tile size from cache or memory budget + const dt_ai_model_info_t *info + = dt_ai_get_model_info_by_id(env->ai_env, model_id); + + // variant-aware config lookup: variant models must declare their ONNX + // filename under variants..onnx. 
input_kind is stashed on ctx + // so raw paths can sanity-check they're pointing at the right model. + // non-variant models (denoise, upscale) pass variant=NULL and supply + // the filename directly via default_file + char *variant_file = NULL; + char *input_kind = NULL; + // policy strings (all optional; NULL falls through to defaults) + char *cs_str = NULL, *wb_str = NULL, *scale_str = NULL; + char *bo_str = NULL, *edge_str = NULL; + // expected input_kind for this variant slot. raw variants MUST match + // one of the declared v1 contracts; non-variant tasks pass UNKNOWN + // and skip the contract check entirely + dt_restore_input_kind_t expected_kind = DT_RESTORE_INPUT_KIND_UNKNOWN; + if(variant) + { + char *k_file = g_strdup_printf("variants.%s.onnx", variant); + char *k_kind = g_strdup_printf("variants.%s.input_kind", variant); + char *k_cs = g_strdup_printf("variants.%s.input_colorspace", variant); + char *k_wb = g_strdup_printf("variants.%s.wb_norm", variant); + char *k_scale = g_strdup_printf("variants.%s.output_scale", variant); + char *k_bo = g_strdup_printf("variants.%s.bayer_orientation", variant); + char *k_edge = g_strdup_printf("variants.%s.edge_pad", variant); + variant_file = dt_ai_model_attribute_string(info, k_file); + input_kind = dt_ai_model_attribute_string(info, k_kind); + cs_str = dt_ai_model_attribute_string(info, k_cs); + wb_str = dt_ai_model_attribute_string(info, k_wb); + scale_str = dt_ai_model_attribute_string(info, k_scale); + bo_str = dt_ai_model_attribute_string(info, k_bo); + edge_str = dt_ai_model_attribute_string(info, k_edge); + g_free(k_file); + g_free(k_kind); + g_free(k_cs); + g_free(k_wb); + g_free(k_scale); + g_free(k_bo); + g_free(k_edge); + + if(!variant_file) + { + dt_print(DT_DEBUG_AI, + "[restore] model %s declares no variants.%s.onnx — " + "cannot load variant", + model_id, variant); + g_free(input_kind); + g_free(cs_str); + g_free(wb_str); + g_free(scale_str); + g_free(bo_str); + g_free(edge_str); + 
g_free(model_id); + return NULL; + } + + // contract check: the variant slot name pins which input_kind we + // expect. older manifests predate the label; if unset, assume the + // expected one (back-compat). a declared-but-wrong label is a hard + // error — refusing to load keeps mis-packaged ONNX from crashing + // at inference with a confusing shape-mismatch + if(!g_strcmp0(task, TASK_RAWDENOISE)) + { + if(!g_strcmp0(variant, "bayer")) + expected_kind = DT_RESTORE_INPUT_KIND_BAYER_V1; + else if(!g_strcmp0(variant, "xtrans")) + expected_kind = DT_RESTORE_INPUT_KIND_XTRANS_V1; + else if(!g_strcmp0(variant, "linear")) + expected_kind = DT_RESTORE_INPUT_KIND_LINEAR_V1; + } + if(expected_kind != DT_RESTORE_INPUT_KIND_UNKNOWN) + { + const dt_restore_input_kind_t declared = _parse_input_kind(input_kind); + const gboolean missing = (input_kind == NULL); + const gboolean mismatch + = !missing && declared != expected_kind; + if(mismatch || (!missing && declared == DT_RESTORE_INPUT_KIND_UNKNOWN)) + { + dt_print(DT_DEBUG_AI, + "[restore] model %s variant '%s': input_kind '%s' " + "does not match expected '%s' — refusing to load", + model_id, variant, input_kind, + _input_kind_name(expected_kind)); + dt_control_log(_("raw denoise model %s: incompatible input_kind"), + model_id); + g_free(input_kind); + g_free(cs_str); + g_free(wb_str); + g_free(scale_str); + g_free(bo_str); + g_free(edge_str); + g_free(variant_file); + g_free(model_id); + return NULL; + } + } + + dt_print(DT_DEBUG_AI, + "[restore] variant '%s': file=%s input_kind=%s", + variant, variant_file, + input_kind ? input_kind : "(none)"); + } + const char *model_file = variant ? 
variant_file : default_file; + + // resolve the tile ladder: model-declared input_sizes if present, + // otherwise a copy of the built-in ladder for this scale + int *tile_ladder = NULL; + int n_tile_ladder = 0; + _resolve_tile_ladder(info, scale, &tile_ladder, &n_tile_ladder); + + // select tile size from cache, but only if the cached value is still + // a member of the ladder — otherwise a model upgrade that narrowed + // its supported input_sizes would load with a stale size and fail + // at graph shape inference (U-Nets are strict about spatial dims) int tile_size = _get_cached_tile_size(model_id, scale); - if(tile_size <= 0) - tile_size = _select_tile_size(scale); + gboolean cached_ok = FALSE; + for(int i = 0; i < n_tile_ladder && !cached_ok; i++) + if(tile_ladder[i] == tile_size) cached_ok = TRUE; + if(!cached_ok) + { + if(tile_size > 0) + dt_print(DT_DEBUG_AI, + "[restore] cached tile size %d not in ladder, re-selecting", + tile_size); + tile_size = _select_tile_size(tile_ladder, n_tile_ladder, scale); + } + + // CoreML CPU-only flag: models whose intermediate activations + // overflow FP16 (e.g. raw denoise) declare this in config.json + // to force CoreML's CPU path which runs FP32 + const uint32_t ep_flags + = dt_ai_model_attribute_bool(info, "coreml_cpu_only") ? 
1 : 0; + if(ep_flags) + dt_print(DT_DEBUG_AI, + "[restore] model %s: coreml_cpu_only=true (ep_flags=%u)", + model_id, ep_flags); dt_ai_context_t *ai_ctx = _create_session( - env->ai_env, model_id, model_file, dim_h, dim_w, tile_size); + env->ai_env, model_id, model_file, dim_h, dim_w, tile_size, + ep_flags); if(!ai_ctx) { g_free(model_id); + g_free(tile_ladder); + g_free(variant_file); + g_free(input_kind); + g_free(cs_str); + g_free(wb_str); + g_free(scale_str); + g_free(bo_str); + g_free(edge_str); return NULL; } @@ -245,18 +422,78 @@ static dt_restore_context_t *_load(dt_restore_env_t *env, ctx->ai_ctx = ai_ctx; ctx->env = env; ctx->task = g_strdup(task); + ctx->input_kind = input_kind; // take ownership + ctx->scale = scale; ctx->model_id = model_id; ctx->model_file = g_strdup(model_file); ctx->tile_size = tile_size; + ctx->tile_ladder = tile_ladder; + ctx->n_tile_ladder = n_tile_ladder; + ctx->ep_flags = ep_flags; ctx->dim_h = g_strdup(dim_h); ctx->dim_w = g_strdup(dim_w); ctx->preserve_wide_gamut = TRUE; + + // resolve policy enums: per-variant defaults reproduce today's + // RawNIND behavior exactly, so manifests that declare none of these + // keys keep working unchanged. bayer path defaults to daylight WB + // (training distribution); linear path defaults to as-shot WB (its + // re-imported DNG benefits from matching the source tonemap — see + // the rationale in dt_restore_raw_linear). output_scale defaults to + // match_gain for both. linear gets a 0.30 exposure target; bayer + // doesn't use one (NAN = disabled). input_colorspace only applies + // to the linear path + ctx->input_kind_enum = expected_kind; + { + // linear_v1 and xtrans_v1 share the demosaic-based pipeline + // defaults (as-shot WB, lin_rec2020 colorspace, 0.30 training- + // brightness exposure target). 
When a dedicated xtrans model + // ships these defaults may need to diverge — override in the + // manifest if so + const gboolean demosaic_pipeline + = (expected_kind == DT_RESTORE_INPUT_KIND_LINEAR_V1) + || (expected_kind == DT_RESTORE_INPUT_KIND_XTRANS_V1); + const dt_restore_wb_mode_t default_wb + = demosaic_pipeline ? DT_RESTORE_WB_AS_SHOT : DT_RESTORE_WB_DAYLIGHT; + const dt_restore_colorspace_t default_cs + = demosaic_pipeline ? DT_RESTORE_CS_LIN_REC2020 : DT_RESTORE_CS_CAMRGB; + const float default_tm = demosaic_pipeline ? 0.30f : NAN; + ctx->wb_mode = _parse_wb_mode(wb_str, default_wb); + ctx->output_scale = _parse_output_scale(scale_str, DT_RESTORE_OUT_MATCH_GAIN); + ctx->input_colorspace = _parse_colorspace(cs_str, default_cs); + char *k_tm = variant + ? g_strdup_printf("variants.%s.target_mean", variant) : NULL; + ctx->target_mean = k_tm + ? _parse_target_mean(info, k_tm, default_tm) : default_tm; + g_free(k_tm); + + // bayer-only packing knobs. bayer_v1's contract pairs with + // force_rggb + mirror_cropped (matches RawNIND training which + // physically crops to RGGB before tiling — so corner-tile mirror + // reflections must happen in the cropped frame). a future + // 'native' orientation would let a model see non-RGGB sensors + // without any origin shift; paired default is mirror_absolute + // since there's no cropped frame to reflect within + const dt_restore_bayer_orientation_t default_bo + = (expected_kind == DT_RESTORE_INPUT_KIND_BAYER_V1) + ? DT_RESTORE_BAYER_FORCE_RGGB + : DT_RESTORE_BAYER_NATIVE; + ctx->bayer_orientation = _parse_bayer_orientation(bo_str, default_bo); + const dt_restore_edge_pad_t default_edge + = (ctx->bayer_orientation == DT_RESTORE_BAYER_FORCE_RGGB) + ? 
DT_RESTORE_EDGE_MIRROR_CROPPED + : DT_RESTORE_EDGE_MIRROR; + ctx->edge_pad = _parse_edge_pad(edge_str, default_edge); + } + g_free(cs_str); + g_free(wb_str); + g_free(scale_str); + g_free(bo_str); + g_free(edge_str); // shadow boost capability is declared per-model via the // "attributes": { "shadow_boost": true } object in config.json; // models that hallucinate in dark patches opt in this way; // other models run as-is - const dt_ai_model_info_t *info - = dt_ai_get_model_info_by_id(env->ai_env, model_id); ctx->shadow_boost_capable = dt_ai_model_attribute_bool(info, "shadow_boost"); ctx->shadow_boost = ctx->shadow_boost_capable; @@ -264,19 +501,28 @@ static dt_restore_context_t *_load(dt_restore_env_t *env, dt_print(DT_DEBUG_AI, "[restore] model %s declares shadow_boost attribute", model_id); + g_free(variant_file); return ctx; } // internal: recreate the ORT session with a smaller tile size after OOM. // updates ctx->ai_ctx and ctx->tile_size in place. // returns TRUE on success, FALSE if the reload also fails. +// +// unload the old session BEFORE creating the new one: after a GPU OOM +// the old session is still holding VRAM, and trying to allocate even +// a tiny new session on top triggers a cascade of init failures in +// ORT's provider-fallback retry path. 
freeing first lets the new +// session fit without the retries static gboolean _reload_session(dt_restore_context_t *ctx, int new_tile_size) { + dt_ai_unload_model(ctx->ai_ctx); + ctx->ai_ctx = NULL; + dt_ai_context_t *new_ctx = _create_session( ctx->env->ai_env, ctx->model_id, ctx->model_file, - ctx->dim_h, ctx->dim_w, new_tile_size); + ctx->dim_h, ctx->dim_w, new_tile_size, ctx->ep_flags); if(!new_ctx) return FALSE; - dt_ai_unload_model(ctx->ai_ctx); ctx->ai_ctx = new_ctx; ctx->tile_size = new_tile_size; return TRUE; @@ -284,17 +530,63 @@ static gboolean _reload_session(dt_restore_context_t *ctx, int new_tile_size) dt_restore_context_t *dt_restore_load_denoise(dt_restore_env_t *env) { - return _load(env, TASK_DENOISE, NULL, 1); + return _load(env, TASK_DENOISE, NULL, NULL, 1); +} + +dt_restore_sensor_class_t dt_restore_classify_sensor(const dt_image_t *img) +{ + if(!img || !(img->flags & DT_IMAGE_RAW)) + return DT_RESTORE_SENSOR_CLASS_UNSUPPORTED; + if(img->flags & (DT_IMAGE_MONOCHROME | DT_IMAGE_MONOCHROME_BAYER)) + return DT_RESTORE_SENSOR_CLASS_UNSUPPORTED; + const uint32_t filters = img->buf_dsc.filters; + if(filters == 9u) return DT_RESTORE_SENSOR_CLASS_XTRANS; + if(filters != 0u) return DT_RESTORE_SENSOR_CLASS_BAYER; + return DT_RESTORE_SENSOR_CLASS_LINEAR; +} + +dt_restore_context_t *dt_restore_load_rawdenoise_bayer(dt_restore_env_t *env) +{ + // scale 1x, same pipeline as denoise; filename comes from the model's + // variants.bayer.onnx attribute. 
loading fails if the YAML doesn't + // declare it — no silent fallback for broken model packages + return _load(env, TASK_RAWDENOISE, "bayer", NULL, 1); +} + +dt_restore_context_t *dt_restore_load_rawdenoise_linear(dt_restore_env_t *env) +{ + // generic-demosaic fallback: Foveon, monochrome-with-pattern, and + // currently also X-Trans (until dt_restore_load_rawdenoise_xtrans + // gets a dedicated variant to load) + return _load(env, TASK_RAWDENOISE, "linear", NULL, 1); +} + +dt_restore_context_t *dt_restore_load_rawdenoise_xtrans(dt_restore_env_t *env) +{ + // prefer a dedicated xtrans variant when the manifest declares one; + // fall back to the linear pipeline otherwise. this lets a future + // RawNIND release ship a dedicated X-Trans model via just a manifest + // update — no code changes in darktable (assuming the dedicated model + // shares the linear pipeline; a structurally different X-Trans input + // format would still need its own preprocessing code) + dt_restore_context_t *ctx = _load(env, TASK_RAWDENOISE, "xtrans", NULL, 1); + if(!ctx) + { + dt_print(DT_DEBUG_AI, + "[restore] no dedicated xtrans variant; using linear as fallback"); + ctx = _load(env, TASK_RAWDENOISE, "linear", NULL, 1); + } + return ctx; } dt_restore_context_t *dt_restore_load_upscale_x2(dt_restore_env_t *env) { - return _load(env, TASK_UPSCALE, "model_x2.onnx", 2); + return _load(env, TASK_UPSCALE, NULL, "model_x2.onnx", 2); } dt_restore_context_t *dt_restore_load_upscale_x4(dt_restore_env_t *env) { - return _load(env, TASK_UPSCALE, "model_x4.onnx", 4); + return _load(env, TASK_UPSCALE, NULL, "model_x4.onnx", 4); } dt_restore_context_t *dt_restore_ref(dt_restore_context_t *ctx) @@ -310,76 +602,16 @@ void dt_restore_unref(dt_restore_context_t *ctx) { dt_ai_unload_model(ctx->ai_ctx); g_free(ctx->task); + g_free(ctx->input_kind); g_free(ctx->model_id); g_free(ctx->model_file); g_free(ctx->dim_h); g_free(ctx->dim_w); + g_free(ctx->tile_ladder); g_free(ctx); } } -void 
dt_restore_set_profile(dt_restore_context_t *ctx, void *profile) -{ - if(!ctx) return; - if(!profile) - { - ctx->has_profile = FALSE; - return; - } - - float primaries[3][2], whitepoint[2]; - if(!dt_colorspaces_get_primaries_and_whitepoint_from_profile( - (cmsHPROFILE)profile, primaries, whitepoint)) - { - dt_print(DT_DEBUG_AI, - "[restore] could not read primaries from working profile, " - "falling back to gamma-only conversion"); - ctx->has_profile = FALSE; - return; - } - - // build WP -> XYZ (stored transposed by dt, convert to row-major) - dt_colormatrix_t wp_to_xyz_T; - dt_make_transposed_matrices_from_primaries_and_whitepoint(primaries, - whitepoint, - wp_to_xyz_T); - float wp_to_xyz[9]; - for(int i = 0; i < 3; i++) - for(int j = 0; j < 3; j++) - wp_to_xyz[3 * i + j] = wp_to_xyz_T[j][i]; - - // transpose dt's sRGB<->XYZ matrices (Bradford D50) to row-major - float xyz_to_srgb[9], srgb_to_xyz[9]; - for(int i = 0; i < 3; i++) - for(int j = 0; j < 3; j++) - { - xyz_to_srgb[3 * i + j] = xyz_to_srgb_transposed[j][i]; - srgb_to_xyz[3 * i + j] = sRGB_to_xyz_transposed[j][i]; - } - - // WP -> sRGB = (XYZ -> sRGB) * (WP -> XYZ) - mat3mul(ctx->wp_to_srgb, xyz_to_srgb, wp_to_xyz); - - // invert WP -> XYZ to get XYZ -> WP, then compose sRGB -> WP - float xyz_to_wp[9]; - if(mat3inv(xyz_to_wp, wp_to_xyz) != 0) - { - dt_print(DT_DEBUG_AI, - "[restore] singular WP->XYZ matrix, falling back to gamma-only"); - ctx->has_profile = FALSE; - return; - } - mat3mul(ctx->srgb_to_wp, xyz_to_wp, srgb_to_xyz); - - ctx->has_profile = TRUE; - dt_print(DT_DEBUG_AI, "[restore] working profile color matrices ready"); -} - -void dt_restore_set_preserve_wide_gamut(dt_restore_context_t *ctx, gboolean preserve) -{ - if(ctx) ctx->preserve_wide_gamut = preserve; -} - static gboolean _model_available(dt_restore_env_t *env, const char *task) { @@ -403,64 +635,31 @@ gboolean dt_restore_denoise_available(dt_restore_env_t *env) return _model_available(env, TASK_DENOISE); } -gboolean 
dt_restore_upscale_available(dt_restore_env_t *env) +gboolean dt_restore_rawdenoise_available(dt_restore_env_t *env) { - return _model_available(env, TASK_UPSCALE); + return _model_available(env, TASK_RAWDENOISE); } -/* --- color conversion --- */ - -// sRGB transfer function (gamma curve only, no primaries change). -// values > 1.0 are allowed to preserve wide-gamut colors -static inline float _linear_to_srgb(const float v) -{ - if(v <= 0.0f) return 0.0f; - return (v <= 0.0031308f) - ? 12.92f * v - : 1.055f * powf(v, 1.0f / 2.4f) - 0.055f; -} - -static inline float _srgb_to_linear(const float v) -{ - if(v <= 0.0f) return 0.0f; - return (v <= 0.04045f) - ? v / 12.92f - : powf((v + 0.055f) / 1.055f, 2.4f); -} - -/* --- helpers --- */ - -static inline int _mirror(int v, int max) +gboolean dt_restore_upscale_available(dt_restore_env_t *env) { - if(v < 0) v = -v; - if(v >= max) v = 2 * max - 2 - v; - if(v < 0) return 0; - if(v >= max) return max - 1; - return v; + return _model_available(env, TASK_UPSCALE); } -/* --- public API --- */ +// --- public API --- int dt_restore_get_overlap(int scale) { return (scale > 1) ? OVERLAP_UPSCALE : OVERLAP_DENOISE; } -static int _select_tile_size(int scale) +static int _select_tile_size(const int *ladder, int n_ladder, int scale) { - const int ladder_1x[] = DT_RESTORE_TILE_LADDER_1X; - const int ladder_sr[] = DT_RESTORE_TILE_LADDER_SR; - const int *candidates = (scale > 1) ? ladder_sr : ladder_1x; - const int n_candidates = (scale > 1) - ? 
(int)(sizeof(ladder_sr) / sizeof(int)) - : (int)(sizeof(ladder_1x) / sizeof(int)); - const size_t avail = dt_get_available_mem(); const size_t budget = avail / 4; - for(int i = 0; i < n_candidates; i++) + for(int i = 0; i < n_ladder; i++) { - const size_t T = (size_t)candidates[i]; + const size_t T = (size_t)ladder[i]; const size_t T_out = T * scale; const size_t tile_in = T * T * 3 * sizeof(float); const size_t tile_out @@ -474,659 +673,299 @@ static int _select_tile_size(int scale) { dt_print(DT_DEBUG_AI, "[restore] tile size %d (scale=%d, need %zuMB, budget %zuMB)", - candidates[i], scale, + ladder[i], scale, total / (1024 * 1024), budget / (1024 * 1024)); - return candidates[i]; + return ladder[i]; } } dt_print(DT_DEBUG_AI, "[restore] using minimum tile size %d (budget %zuMB)", - candidates[n_candidates - 1], + ladder[n_ladder - 1], budget / (1024 * 1024)); - return candidates[n_candidates - 1]; + return ladder[n_ladder - 1]; } -// Rec.709 / sRGB luminance weights (Y row of sRGB->XYZ D65); -// applied to working-profile-linear pixels in the pass-through -// blending below; exact only when the working profile is -// sRGB/Rec.709, but correct enough for luminance deltas -static inline float _luma_rec709(float r, float g, float b) -{ - return 0.2126f * r + 0.7152f * g + 0.0722f * b; +static void _resolve_tile_ladder(const dt_ai_model_info_t *info, + int scale, + int **out_sizes, + int *out_count) +{ + // prefer the model's declared input_sizes if present: some exports + // ship with a fixed set of supported tile sizes (e.g. 
the model was + // compiled for specific spatial dims) and using anything outside + // that list will either refuse to run or produce garbage + int n = 0; + int *sizes = dt_ai_model_attribute_int_array(info, "input_sizes", &n); + if(sizes && n > 0) + { + *out_sizes = sizes; + *out_count = n; + return; + } + g_free(sizes); + + // fall back to the built-in ladder for the model's scale + static const int ladder_1x[] = DT_RESTORE_TILE_LADDER_1X; + static const int ladder_sr[] = DT_RESTORE_TILE_LADDER_SR; + const int *src = (scale > 1) ? ladder_sr : ladder_1x; + const int src_n = (scale > 1) + ? (int)(sizeof(ladder_sr) / sizeof(int)) + : (int)(sizeof(ladder_1x) / sizeof(int)); + int *copy = g_new(int, src_n); + memcpy(copy, src, src_n * sizeof(int)); + *out_sizes = copy; + *out_count = src_n; } -int dt_restore_run_patch(dt_restore_context_t *ctx, - const float *in_patch, - int w, int h, - float *out_patch, - int scale) +int dt_restore_run_patch_bayer(dt_restore_context_t *ctx, + const float *in_4ch, + int w, int h, + float *out_3ch) { if(!ctx || !ctx->ai_ctx) return 1; - const size_t in_pixels = (size_t)w * h * 3; - const int out_w = w * scale; - const int out_h = h * scale; - const size_t out_pixels = (size_t)out_w * out_h * 3; - const size_t plane = (size_t)w * h; - - // convert to sRGB gamma-encoded. If a working profile is set, - // first convert primaries (working profile -> sRGB linear) so the - // model sees the image as if it were native sRGB. Otherwise only - // apply the gamma curve (legacy path, shifts hues for wide-gamut). - // input layout is planar NCHW: R plane, then G plane, then B plane. 
- // in_gamut_mask records which pixels were in sRGB gamut (scale==1 - // only) so the output pass can skip recomputing WP->sRGB - float *srgb_in = g_try_malloc(in_pixels * sizeof(float)); - uint8_t *in_gamut_mask = NULL; - if(!srgb_in) return 1; - // only allocate the gamut mask when denoise pass-through is requested - const gboolean need_gamut_mask - = ctx->has_profile && scale == 1 && ctx->preserve_wide_gamut; - if(need_gamut_mask) - { - in_gamut_mask = g_try_malloc(plane); - if(!in_gamut_mask) - { - g_free(srgb_in); - return 1; - } - } - if(ctx->has_profile) - { - const float *M = ctx->wp_to_srgb; - const gboolean boost = ctx->shadow_boost; - for(size_t p = 0; p < plane; p++) - { - const float r = in_patch[p]; - const float g = in_patch[p + plane]; - const float b = in_patch[p + 2 * plane]; - float sr = M[0] * r + M[1] * g + M[2] * b; - float sg = M[3] * r + M[4] * g + M[5] * b; - float sb = M[6] * r + M[7] * g + M[8] * b; - // gamut check uses pre-boost values so pass-through decisions - // reflect the original color - if(in_gamut_mask) - { - const float m = 0.01f; // ~1% margin beyond [0, 1] - in_gamut_mask[p] = (sr >= -m && sr <= 1.0f + m - && sg >= -m && sg <= 1.0f + m - && sb >= -m && sb <= 1.0f + m) ? 1 : 0; - } - if(boost) - { - sr = sr > 0.0f ? sqrtf(sr) : 0.0f; - sg = sg > 0.0f ? sqrtf(sg) : 0.0f; - sb = sb > 0.0f ? sqrtf(sb) : 0.0f; - } - srgb_in[p] = _linear_to_srgb(sr); - srgb_in[p + plane] = _linear_to_srgb(sg); - srgb_in[p + 2 * plane] = _linear_to_srgb(sb); - } - } - else if(ctx->shadow_boost) - { - // no profile: still boost shadows so the model stays within its - // comfort zone, even though we treat WP values as sRGB - for(size_t i = 0; i < in_pixels; i++) - { - const float v = in_patch[i]; - const float boosted = v > 0.0f ? 
sqrtf(v) : 0.0f; - srgb_in[i] = _linear_to_srgb(boosted); - } - } - else - { - for(size_t i = 0; i < in_pixels; i++) - srgb_in[i] = _linear_to_srgb(in_patch[i]); - } - - const int num_inputs = dt_ai_get_input_count(ctx->ai_ctx); - if(num_inputs > MAX_MODEL_INPUTS) - { - g_free(srgb_in); - return 1; - } + int64_t in_shape[] = { 1, 4, h, w }; + int64_t out_shape[] = { 1, 3, 2 * h, 2 * w }; + dt_ai_tensor_t input = { + .data = (void *)in_4ch, + .shape = in_shape, + .ndim = 4, + .type = DT_AI_FLOAT, + }; + dt_ai_tensor_t output = { + .data = out_3ch, + .shape = out_shape, + .ndim = 4, + .type = DT_AI_FLOAT, + }; + return dt_ai_run(ctx->ai_ctx, &input, 1, &output, 1); +} - int64_t input_shape[] = {1, 3, h, w}; - dt_ai_tensor_t inputs[MAX_MODEL_INPUTS]; - memset(inputs, 0, sizeof(inputs)); - inputs[0] = (dt_ai_tensor_t){ - .data = (void *)srgb_in, - .shape = input_shape, - .ndim = 4, - .type = DT_AI_FLOAT}; - - // noise level map for multi-input models - float *noise_map = NULL; - int64_t noise_shape[] = {1, 1, h, w}; - if(num_inputs >= 2) - { - const size_t map_size = (size_t)w * h; - noise_map = g_try_malloc(map_size * sizeof(float)); - if(!noise_map) - { - g_free(srgb_in); - return 1; - } - const float sigma_norm = 25.0f / 255.0f; - for(size_t i = 0; i < map_size; i++) - noise_map[i] = sigma_norm; - inputs[1] = (dt_ai_tensor_t){ - .data = (void *)noise_map, - .shape = noise_shape, - .ndim = 4, - .type = DT_AI_FLOAT}; - } +int dt_restore_run_patch_3ch_raw(dt_restore_context_t *ctx, + const float *in_3ch, + int w, int h, + float *out_3ch) +{ + if(!ctx || !ctx->ai_ctx) return 1; - int64_t output_shape[] = {1, 3, out_h, out_w}; + int64_t in_shape[] = { 1, 3, h, w }; + int64_t out_shape[] = { 1, 3, h, w }; + dt_ai_tensor_t input = { + .data = (void *)in_3ch, + .shape = in_shape, + .ndim = 4, + .type = DT_AI_FLOAT, + }; dt_ai_tensor_t output = { - .data = (void *)out_patch, - .shape = output_shape, - .ndim = 4, - .type = DT_AI_FLOAT}; - - int ret = dt_ai_run(ctx->ai_ctx, 
inputs, num_inputs, - &output, 1); - g_free(srgb_in); - g_free(noise_map); - if(ret != 0) - { - g_free(in_gamut_mask); - return ret; - } + .data = out_3ch, + .shape = out_shape, + .ndim = 4, + .type = DT_AI_FLOAT, + }; + return dt_ai_run(ctx->ai_ctx, &input, 1, &output, 1); +} - // convert model output back to the working profile - // - // with profile: apply inverse sRGB gamma, then check if the ORIGINAL - // input pixel (converted to sRGB linear) is representable in sRGB - // gamut. if yes, use model output converted back to working profile. - // if no, pass through the original pixel (wide-gamut colors preserved, - // no denoising on those pixels). upscale has no pixel-to-pixel - // correspondence so pass-through is not possible — always use the - // model output - // - // without profile: fall back to per-channel pass-through in the - // original (working-profile-as-sRGB) space - const gboolean boost = ctx->shadow_boost; - if(ctx->has_profile && scale == 1 && ctx->preserve_wide_gamut) - { - const size_t out_plane = (size_t)out_w * out_h; - const float *Mi = ctx->srgb_to_wp; - // pass 1: write denoised values for in-gamut pixels; out-of-gamut - // pixels get plain pass-through as a fallback (used only when no - // in-gamut neighbors are found in pass 2) - for(size_t p = 0; p < out_plane; p++) - { - if(in_gamut_mask[p]) - { - float sr = _srgb_to_linear(out_patch[p]); - float sg = _srgb_to_linear(out_patch[p + out_plane]); - float sb = _srgb_to_linear(out_patch[p + 2 * out_plane]); - if(boost) { sr *= sr; sg *= sg; sb *= sb; } - out_patch[p] = Mi[0] * sr + Mi[1] * sg + Mi[2] * sb; - out_patch[p + out_plane] = Mi[3] * sr + Mi[4] * sg + Mi[5] * sb; - out_patch[p + 2 * out_plane] = Mi[6] * sr + Mi[7] * sg + Mi[8] * sb; - } - else - { - out_patch[p] = in_patch[p]; - out_patch[p + out_plane] = in_patch[p + plane]; - out_patch[p + 2 * out_plane] = in_patch[p + 2 * plane]; - } - } - // pass 2: luminance-only smoothing for out-of-gamut pixels. 
the - // original pixel keeps its chroma (wide-gamut color preserved - // exactly) but its brightness is shifted to match the local - // average luminance of denoised in-gamut neighbors; this kills - // the single-pixel speckles that pass-through would otherwise - // leave visible against the denoised background - const int radius = 2; // 5x5 window - for(int y = 0; y < out_h; y++) - { - for(int x = 0; x < out_w; x++) - { - const size_t p = (size_t)y * out_w + x; - if(in_gamut_mask[p]) continue; - const float r0 = in_patch[p]; - const float g0 = in_patch[p + plane]; - const float b0 = in_patch[p + 2 * plane]; - const float Y_orig = _luma_rec709(r0, g0, b0); - float sumY = 0.0f; - int count = 0; - const int y0 = y - radius < 0 ? 0 : y - radius; - const int y1 = y + radius >= out_h ? out_h - 1 : y + radius; - const int x0 = x - radius < 0 ? 0 : x - radius; - const int x1 = x + radius >= out_w ? out_w - 1 : x + radius; - for(int yy = y0; yy <= y1; yy++) - { - for(int xx = x0; xx <= x1; xx++) - { - const size_t q = (size_t)yy * out_w + xx; - if(!in_gamut_mask[q]) continue; - const float rq = out_patch[q]; - const float gq = out_patch[q + out_plane]; - const float bq = out_patch[q + 2 * out_plane]; - sumY += _luma_rec709(rq, gq, bq); - count++; - } - } - if(count > 0) - { - const float dY = sumY / (float)count - Y_orig; - out_patch[p] = r0 + dY; - out_patch[p + out_plane] = g0 + dY; - out_patch[p + 2 * out_plane] = b0 + dY; - } - } - } - } - else if(ctx->has_profile && scale == 1) - { - // denoise with profile but NO pass-through: apply the inverse - // matrix to every pixel. 
wide-gamut inputs will have been clipped - // by the model, but we get denoising everywhere - const size_t out_plane = (size_t)out_w * out_h; - const float *Mi = ctx->srgb_to_wp; - for(size_t p = 0; p < out_plane; p++) - { - float sr = _srgb_to_linear(out_patch[p]); - float sg = _srgb_to_linear(out_patch[p + out_plane]); - float sb = _srgb_to_linear(out_patch[p + 2 * out_plane]); - if(boost) { sr *= sr; sg *= sg; sb *= sb; } - out_patch[p] = Mi[0] * sr + Mi[1] * sg + Mi[2] * sb; - out_patch[p + out_plane] = Mi[3] * sr + Mi[4] * sg + Mi[5] * sb; - out_patch[p + 2 * out_plane] = Mi[6] * sr + Mi[7] * sg + Mi[8] * sb; - } - } - else if(scale == 1) - { - // no profile set: per-channel pass-through, treats working-profile - // numbers as if they were sRGB. colors will be slightly shifted - // for wide-gamut working profiles — rely on the profile path above - // when possible. pass-through still honored via preserve_wide_gamut - for(size_t i = 0; i < out_pixels; i++) - { - const float in = in_patch[i]; - if(ctx->preserve_wide_gamut && (in < 0.0f || in > 1.0f)) - { - out_patch[i] = in; - } - else - { - float v = _srgb_to_linear(out_patch[i]); - if(boost) v *= v; - out_patch[i] = v; - } - } - } - else +const int *dt_restore_get_tile_ladder(const dt_restore_context_t *ctx, + int *out_count) +{ + if(!ctx) { - // upscale: no pixel-to-pixel correspondence, use model output as-is - if(ctx->has_profile) - { - const size_t out_plane = (size_t)out_w * out_h; - const float *Mi = ctx->srgb_to_wp; - for(size_t p = 0; p < out_plane; p++) - { - float sr = _srgb_to_linear(out_patch[p]); - float sg = _srgb_to_linear(out_patch[p + out_plane]); - float sb = _srgb_to_linear(out_patch[p + 2 * out_plane]); - if(boost) { sr *= sr; sg *= sg; sb *= sb; } - out_patch[p] = Mi[0] * sr + Mi[1] * sg + Mi[2] * sb; - out_patch[p + out_plane] = Mi[3] * sr + Mi[4] * sg + Mi[5] * sb; - out_patch[p + 2 * out_plane] = Mi[6] * sr + Mi[7] * sg + Mi[8] * sb; - } - } - else - { - for(size_t i = 0; i < 
out_pixels; i++) - { - float v = _srgb_to_linear(out_patch[i]); - if(boost) v *= v; - out_patch[i] = v; - } - } + if(out_count) *out_count = 0; + return NULL; } + if(out_count) *out_count = ctx->n_tile_ladder; + return ctx->tile_ladder; +} - g_free(in_gamut_mask); - return 0; +int dt_restore_get_tile_size(const dt_restore_context_t *ctx) +{ + return ctx ? ctx->tile_size : 0; } -// per-image gate for the shadow-boost curve; enable only when the image -// has substantial near-black area to protect — bright images would only -// pay the curve cost (minor highlight compression) for no gain; -// thresholds tuned so localized very-dark features (a tree hollow, a -// silhouette) do NOT trigger; only broad noisy shadow regions do -// -// in_data is interleaved float4 RGBA -#define _SHADOW_BOOST_THRESHOLD 0.005f // 0.5% linear luminance -#define _SHADOW_BOOST_FRACTION 0.10f // 10% of sampled pixels -static gboolean _image_has_deep_shadows(const float *in_data, int w, int h) +gboolean dt_restore_reload_session(dt_restore_context_t *ctx, + int new_tile_size) { - const size_t stride = 16; // sample 1/256 of pixels for speed - size_t dark = 0, total = 0; - for(size_t y = 0; y < (size_t)h; y += stride) - for(size_t x = 0; x < (size_t)w; x += stride) - { - const size_t p = ((size_t)y * w + x) * 4; - const float luma = 0.2126f * in_data[p] - + 0.7152f * in_data[p + 1] - + 0.0722f * in_data[p + 2]; - if(luma < _SHADOW_BOOST_THRESHOLD) dark++; - total++; - } - return total > 0 && (float)dark / total >= _SHADOW_BOOST_FRACTION; + if(!ctx) return FALSE; + return _reload_session(ctx, new_tile_size); } -int dt_restore_process_tiled(dt_restore_context_t *ctx, - const float *in_data, - int width, int height, - int scale, - dt_restore_row_writer_t row_writer, - void *writer_data, - struct _dt_job_t *control_job) +void dt_restore_persist_tile_size(const dt_restore_context_t *ctx) { - if(!ctx || !in_data || !row_writer) - return 1; + if(ctx && ctx->model_id) + 
_set_cached_tile_size(ctx->model_id, ctx->scale, ctx->tile_size); +} - // for shadow-boost-capable models, decide per-image whether the - // curve is worth applying; one analysis per call, before tiling, - // so all tiles see the same flag (avoids per-tile seams) - if(ctx->shadow_boost_capable) - { - const gboolean dark = _image_has_deep_shadows(in_data, width, height); - ctx->shadow_boost = dark; - dt_print(DT_DEBUG_AI, "[restore] shadow boost %s", - dark ? "enabled" : "disabled"); - } +// shared bridge: run the user's darktable pixelpipe on an arbitrary sensor +// buffer, capture the display-referred RGB at an ROI. used by both raw- +// denoise preview paths (Bayer CFA after re-mosaic, X-Trans CFA after +// re-mosaic) so the preview before/after match what the user sees in +// darkroom after Process + DNG re-import +int dt_restore_run_user_pipe_roi(dt_imgid_t imgid, + void *input_native, + int iw, + int ih, + int roi_x, int roi_y, + int roi_w, int roi_h, + int *out_w, int *out_h, + float **out_rgb) +{ + if(out_rgb) *out_rgb = NULL; + if(out_w) *out_w = 0; + if(out_h) *out_h = 0; + if(!input_native || iw <= 0 || ih <= 0 + || roi_w <= 0 || roi_h <= 0) + return 1; - const int O = (scale > 1) ? OVERLAP_UPSCALE : OVERLAP_DENOISE; - const int S = scale; - const int out_w = width * S; - const int ladder_1x[] = DT_RESTORE_TILE_LADDER_1X; - const int ladder_sr[] = DT_RESTORE_TILE_LADDER_SR; - const int *ladder = (scale > 1) ? ladder_sr : ladder_1x; - const int n_ladder = (scale > 1) - ? (int)(sizeof(ladder_sr) / sizeof(int)) - : (int)(sizeof(ladder_1x) / sizeof(int)); - int T = ctx->tile_size; - - // outer retry loop: on inference failure (e.g. 
GPU OOM) drop to the - // next smaller candidate in the shared ladder and try again -retry:; - int step = T - 2 * O; - int T_out = T * S; - int O_out = O * S; - int step_out = step * S; - size_t in_plane = (size_t)T * T; - size_t out_plane = (size_t)T_out * T_out; - int cols = (width + step - 1) / step; - int rows = (height + step - 1) / step; - int total_tiles = cols * rows; + dt_develop_t dev; + dt_dev_init(&dev, FALSE); + dt_dev_load_image(&dev, imgid); - dt_print(DT_DEBUG_AI, - "[restore] tiling %dx%d (scale=%d)" - " -> %dx%d, %dx%d grid (%d tiles, T=%d)", - width, height, S, out_w, height * S, - cols, rows, total_tiles, T); - - float *tile_in = g_try_malloc( - in_plane * 3 * sizeof(float)); - float *tile_out = g_try_malloc( - out_plane * 3 * sizeof(float)); - float *row_buf = g_try_malloc( - (size_t)out_w * step_out * 3 * sizeof(float)); - if(!tile_in || !tile_out || !row_buf) + dt_dev_pixelpipe_t pipe; + if(!dt_dev_pixelpipe_init_export(&pipe, iw, ih, IMAGEIO_FLOAT, FALSE)) { - g_free(tile_in); - g_free(tile_out); - g_free(row_buf); + dt_dev_cleanup(&dev); return 1; } - int res = 0; - int tile_count = 0; - - for(int ty = 0; ty < rows; ty++) + // force output to linear Rec.709 (sRGB primaries, linear transfer) + // so the widget's sRGB-gamma encoder displays the right colours. + // MUST be called before create_nodes / synch_all: colorout reads + // pipe->icc_type during commit_params at synch_all time. setting it + // afterwards leaves colorout committed with the user's working + // profile (often Rec.2020 / ProPhoto) → the cairo path then + // applies sRGB gamma to wrong-primaries numbers → preview comes + // out noticeably brighter / wrong colours vs. 
the batch DNG that + // re-imports through the normal pipe + dt_dev_pixelpipe_set_icc(&pipe, DT_COLORSPACE_LIN_REC709, NULL, + DT_INTENT_PERCEPTUAL); + + dt_ioppr_resync_modules_order(&dev); + dt_dev_pixelpipe_set_input(&pipe, &dev, (float *)input_native, + iw, ih, 1.0f); + dt_dev_pixelpipe_create_nodes(&pipe, &dev); + dt_dev_pixelpipe_synch_all(&pipe, &dev); + + // skip rawdenoise — neural denoise already happened upstream. + // safe to do this after synch_all: this only flips piece->enabled, + // which the per-iop process loop checks at run time + for(GList *n = pipe.nodes; n; n = g_list_next(n)) { - const int y = ty * step; - const int valid_h = (y + step > height) - ? height - y : step; - const int valid_h_out = valid_h * S; - - memset(row_buf, 0, - (size_t)out_w * valid_h_out * 3 - * sizeof(float)); - - for(int tx = 0; tx < cols; tx++) - { - if(control_job - && dt_control_job_get_state(control_job) - == DT_JOB_STATE_CANCELLED) - { - res = 1; - goto cleanup; - } - - const int x = tx * step; - const int in_x = x - O; - const int in_y = y - O; - const int needs_mirror - = (in_x < 0 || in_y < 0 - || in_x + T > width - || in_y + T > height); - - // interleaved RGBx -> planar RGB - if(needs_mirror) - { - for(int dy = 0; dy < T; ++dy) - { - const int sy = _mirror(in_y + dy, height); - for(int dx = 0; dx < T; ++dx) - { - const int sx - = _mirror(in_x + dx, width); - const size_t po = (size_t)dy * T + dx; - const size_t si - = ((size_t)sy * width + sx) * 4; - tile_in[po] = in_data[si + 0]; - tile_in[po + in_plane] - = in_data[si + 1]; - tile_in[po + 2 * in_plane] - = in_data[si + 2]; - } - } - } - else - { - for(int dy = 0; dy < T; ++dy) - { - const float *row - = in_data - + ((size_t)(in_y + dy) * width - + in_x) * 4; - const size_t ro = (size_t)dy * T; - for(int dx = 0; dx < T; ++dx) - { - tile_in[ro + dx] = row[dx * 4 + 0]; - tile_in[ro + dx + in_plane] - = row[dx * 4 + 1]; - tile_in[ro + dx + 2 * in_plane] - = row[dx * 4 + 2]; - } - } - } - - 
if(dt_restore_run_patch( - ctx, tile_in, T, T, tile_out, S) != 0) - { - // retry with the next smaller ladder entry if no rows have - // been delivered yet (safe to restart). once rows are written - // we can't rewind the row_writer (e.g. TIFF is sequential). - // _reload_session() recreates the ORT session for the smaller - // tile size (dim overrides are shape-specific). - int next_T = 0; - for(int i = 0; i < n_ladder; i++) - if(ladder[i] < T) { next_T = ladder[i]; break; } - if(next_T > 0 && ty == 0 - && _reload_session(ctx, next_T)) - { - dt_print(DT_DEBUG_AI, - "[restore] inference failed at tile %d,%d " - "(T=%d), retrying with T=%d", - x, y, T, next_T); - g_free(tile_in); - g_free(tile_out); - g_free(row_buf); - T = next_T; - goto retry; - } - dt_print(DT_DEBUG_AI, - "[restore] inference failed at" - " tile %d,%d (T=%d, minimum reached)", x, y, T); - res = 1; - goto cleanup; - } - - // valid region -> row buffer - const int valid_w = (x + step > width) - ? width - x : step; - const int valid_w_out = valid_w * S; - - for(int dy = 0; dy < valid_h_out; ++dy) - { - const size_t src_row - = (size_t)(O_out + dy) * T_out + O_out; - const size_t dst_row - = ((size_t)dy * out_w + x * S) * 3; - for(int dx = 0; dx < valid_w_out; ++dx) - { - row_buf[dst_row + dx * 3 + 0] - = tile_out[src_row + dx]; - row_buf[dst_row + dx * 3 + 1] - = tile_out[src_row + dx + out_plane]; - row_buf[dst_row + dx * 3 + 2] - = tile_out[src_row + dx - + 2 * out_plane]; - } - } - - tile_count++; - if(control_job) - dt_control_job_set_progress(control_job, - (double)tile_count / total_tiles); - } + dt_dev_pixelpipe_iop_t *piece = n->data; + if(dt_iop_module_is(piece->module->so, "rawdenoise")) + piece->enabled = FALSE; + } - // deliver completed scanlines via callback - for(int dy = 0; dy < valid_h_out; dy++) + int pw = 0, ph = 0; + dt_dev_pixelpipe_get_dimensions(&pipe, &dev, iw, ih, &pw, &ph); + if(pw <= 0 || ph <= 0) + { + dt_dev_pixelpipe_cleanup(&pipe); + dt_dev_cleanup(&dev); + return 1; + 
} + pipe.processed_width = pw; + pipe.processed_height = ph; + + // the ROI passed to process_no_gamma is in POST-pipe (final output) + // coords, but the caller hands us sensor (input) coords so the ROI + // lines up with the denoised CFA patch it built. forward-transform + // the crop rectangle's 4 corners through the user's full geometry + // chain (rawprepare + clipping + ashift + lens + rotatepixels + ...) + // and use the INSCRIBED axis-aligned rectangle of the transformed + // quad as the pipe ROI. the circumscribed AABB would include corner + // triangles that back-project to sensor positions OUTSIDE the + // denoised region — they'd render as noisy strips at the edges of + // the preview. the inscribed rect is strictly inside the quad so + // every sample back-projects within the patched region + float corners[8] = { + (float)roi_x, (float)roi_y, + (float)(roi_x + roi_w), (float)roi_y, + (float)roi_x, (float)(roi_y + roi_h), + (float)(roi_x + roi_w), (float)(roi_y + roi_h), + }; + dt_dev_distort_transform_plus(&dev, &pipe, 0.0, + DT_DEV_TRANSFORM_DIR_ALL_GEOMETRY, + corners, 4); + + // inscribed AABB: second-smallest x/y and second-largest x/y of the + // 4 transformed corners. for a parallelogram these are the innermost + // of each pair; for small lens distortions they're still safe (i.e. 
+ // lie inside the quad) because the quad stays nearly rectangular + float xs[4] = { corners[0], corners[2], corners[4], corners[6] }; + float ys[4] = { corners[1], corners[3], corners[5], corners[7] }; + for(int i = 0; i < 3; i++) + for(int j = i + 1; j < 4; j++) { - const float *src = row_buf + (size_t)dy * out_w * 3; - if(row_writer(src, out_w, y * S + dy, - writer_data) != 0) - { - res = 1; - goto cleanup; - } + if(xs[i] > xs[j]) { float t = xs[i]; xs[i] = xs[j]; xs[j] = t; } + if(ys[i] > ys[j]) { float t = ys[i]; ys[i] = ys[j]; ys[j] = t; } } - } - - // persist tile size on first full success so subsequent runs skip OOM retry - if(res == 0) - _set_cached_tile_size(ctx->model_id, S, ctx->tile_size); - -cleanup: - g_free(tile_in); - g_free(tile_out); - g_free(row_buf); - return res; -} - -void dt_restore_apply_detail_recovery(const float *original_4ch, - float *denoised_4ch, - int width, int height, - float alpha) -{ - const size_t npix = (size_t)width * height; - - float *const restrict lum_residual - = dt_alloc_align_float(npix); - if(!lum_residual) return; - -#ifdef _OPENMP -#pragma omp parallel for simd default(none) \ - dt_omp_firstprivate(original_4ch, denoised_4ch, \ - lum_residual, npix) \ - schedule(simd:static) \ - aligned(original_4ch, denoised_4ch, lum_residual:64) -#endif - for(size_t i = 0; i < npix; i++) + // round inward (ceil for inner min, floor for inner max) so the + // chosen rect stays strictly inside the transformed quad + int pipe_roi_x = (int)ceilf(xs[1]); + int pipe_roi_y = (int)ceilf(ys[1]); + int pipe_roi_w = (int)floorf(xs[2]) - pipe_roi_x; + int pipe_roi_h = (int)floorf(ys[2]) - pipe_roi_y; + + // clamp to the pipe's actual processed extent; a sensor ROI near + // the edge may transform to a post-pipe ROI that spills past pw/ph + if(pipe_roi_x < 0) { pipe_roi_w += pipe_roi_x; pipe_roi_x = 0; } + if(pipe_roi_y < 0) { pipe_roi_h += pipe_roi_y; pipe_roi_y = 0; } + if(pipe_roi_x + pipe_roi_w > pw) pipe_roi_w = pw - pipe_roi_x; + 
if(pipe_roi_y + pipe_roi_h > ph) pipe_roi_h = ph - pipe_roi_y; + if(pipe_roi_w <= 0 || pipe_roi_h <= 0) { - const size_t p = i * 4; - const float lum_orig - = 0.2126f * original_4ch[p + 0] - + 0.7152f * original_4ch[p + 1] - + 0.0722f * original_4ch[p + 2]; - const float lum_den - = 0.2126f * denoised_4ch[p + 0] - + 0.7152f * denoised_4ch[p + 1] - + 0.0722f * denoised_4ch[p + 2]; - lum_residual[i] = lum_orig - lum_den; + dt_dev_pixelpipe_cleanup(&pipe); + dt_dev_cleanup(&dev); + return 1; } - float noise[DWT_DETAIL_BANDS]; - _compute_adaptive_noise(lum_residual, npix, noise); - dwt_denoise(lum_residual, width, height, - DWT_DETAIL_BANDS, noise); - -#ifdef _OPENMP -#pragma omp parallel for simd default(none) \ - dt_omp_firstprivate(denoised_4ch, lum_residual, \ - npix, alpha) \ - schedule(simd:static) \ - aligned(denoised_4ch, lum_residual:64) -#endif - for(size_t i = 0; i < npix; i++) + // NB: process_no_gamma's return value signals "pipe altered + // mid-flight", NOT success — check backbuf instead + dt_dev_pixelpipe_process_no_gamma(&pipe, &dev, + pipe_roi_x, pipe_roi_y, + pipe_roi_w, pipe_roi_h, 1.0f); + + const int bw = pipe.backbuf_width; + const int bh = pipe.backbuf_height; + if(!pipe.backbuf || bw <= 0 || bh <= 0) { - const size_t p = i * 4; - const float d = alpha * lum_residual[i]; - denoised_4ch[p + 0] += d; - denoised_4ch[p + 1] += d; - denoised_4ch[p + 2] += d; + dt_dev_pixelpipe_cleanup(&pipe); + dt_dev_cleanup(&dev); + return 1; } - dt_free_align(lum_residual); -} - -float *dt_restore_compute_dwt_detail(const float *before_3ch, - const float *after_3ch, - int width, int height) -{ - const size_t npix = (size_t)width * height; - float *lum_residual = dt_alloc_align_float(npix); - if(!lum_residual) return NULL; + // actual rendered dims may differ from the geometry-transformed + // pipe ROI if the pipe is trimmed mid-chain (rare but possible). 
+ // callers must read *out_w / *out_h instead of assuming anything + if(bw != pipe_roi_w || bh != pipe_roi_h) + dt_print(DT_DEBUG_AI, + "[restore] pipe ROI %dx%d -> backbuf %dx%d", + pipe_roi_w, pipe_roi_h, bw, bh); - for(size_t i = 0; i < npix; i++) + // pipe.backbuf is 4ch interleaved RGBA; repack to 3ch for the + // preview blend / display path + float *rgb = g_try_malloc((size_t)bw * bh * 3 * sizeof(float)); + if(rgb) { - const size_t si = i * 3; - const float lum_orig - = 0.2126f * before_3ch[si + 0] - + 0.7152f * before_3ch[si + 1] - + 0.0722f * before_3ch[si + 2]; - const float lum_den - = 0.2126f * after_3ch[si + 0] - + 0.7152f * after_3ch[si + 1] - + 0.0722f * after_3ch[si + 2]; - lum_residual[i] = lum_orig - lum_den; + const float *src = (const float *)pipe.backbuf; + for(size_t i = 0; i < (size_t)bw * bh; i++) + { + rgb[i * 3 + 0] = src[i * 4 + 0]; + rgb[i * 3 + 1] = src[i * 4 + 1]; + rgb[i * 3 + 2] = src[i * 4 + 2]; + } } - float noise[DWT_DETAIL_BANDS]; - _compute_adaptive_noise(lum_residual, npix, noise); - dwt_denoise(lum_residual, width, height, - DWT_DETAIL_BANDS, noise); + dt_dev_pixelpipe_cleanup(&pipe); + dt_dev_cleanup(&dev); - return lum_residual; + if(!rgb) return 1; + *out_rgb = rgb; + if(out_w) *out_w = bw; + if(out_h) *out_h = bh; + return 0; } // clang-format off diff --git a/src/common/ai/restore.h b/src/common/ai/restore.h index 7d1587846ac4..e98e6d4a15bc 100644 --- a/src/common/ai/restore.h +++ b/src/common/ai/restore.h @@ -16,268 +16,372 @@ along with darktable. If not, see . */ -/* - restore — reusable AI denoise and upscale processing - - this module provides the core inference, tiling, and detail - recovery logic for AI-based image restoration. it is part of - the darktable_ai shared library and has no GUI dependencies. - - consumers: - - src/libs/neural_restore.c (lighttable batch + preview) - - pixel pipeline: - input is linear Rec.709 float4 RGBA (from darktable export). 
- dt_restore_run_patch() converts linear->sRGB before inference - and sRGB->linear after. models operate in planar NCHW layout. - dt_restore_process_tiled() handles interleaved-to-planar - conversion, mirror padding at boundaries, and overlap blending. - - detail recovery: - dt_restore_apply_detail_recovery() uses wavelet (DWT) - decomposition to separate noise from texture in the luminance - residual (original - denoised). fine bands are thresholded; - coarser bands are preserved and blended back. -*/ +// restore — generic AI restore environment and model lifecycle. +// +// this module provides the shared scaffolding that all AI restore +// paths sit on top of: environment init, model loading with tile +// ladder selection + OOM retry, reference-counted contexts, tile +// size persistence, and the user-pipe ROI bridge used by raw- +// denoise previews. RGB denoise/upscale inference lives in +// restore_rgb.{c,h}; raw variants live in restore_raw_*.{c,h}. +// +// consumers: +// - src/libs/neural_restore.c (lighttable batch + preview) +// - src/common/ai/restore_rgb.c (RGB denoise + upscale) +// - src/common/ai/restore_raw_bayer.c (RawNIND Bayer) +// - src/common/ai/restore_raw_linear.c (RawNIND linear/X-Trans) #pragma once #include -struct _dt_job_t; +#include "common/image.h" // for dt_imgid_t -/* --- opaque types --- */ +// --- opaque types --- typedef struct dt_restore_env_t dt_restore_env_t; typedef struct dt_restore_context_t dt_restore_context_t; -/* --- environment lifecycle --- */ - -/** - * @brief initialize the restore environment - * - * wraps dt_ai_env_init(). returns NULL when AI is disabled. - * - * @return environment handle, or NULL - */ +// --- sensor classification --- + +// BAYER is any standard 2x2 Bayer (RGGB / BGGR / GRBG / GBRG). +// XTRANS is Fuji's 6x6 pattern (filters == 9u). LINEAR is the +// generic-demosaic fallback used for Foveon, monochrome-with-pattern, +// and anything else without a dedicated pipeline. 
UNSUPPORTED means +// the image can't be routed to any denoise variant (non-raw, pure +// monochrome, etc.). pick the loader matching the class: +// BAYER -> dt_restore_load_rawdenoise_bayer +// XTRANS -> dt_restore_load_rawdenoise_xtrans +// LINEAR -> dt_restore_load_rawdenoise_linear +typedef enum +{ + DT_RESTORE_SENSOR_CLASS_BAYER = 0, + DT_RESTORE_SENSOR_CLASS_XTRANS, + DT_RESTORE_SENSOR_CLASS_LINEAR, + DT_RESTORE_SENSOR_CLASS_UNSUPPORTED, +} dt_restore_sensor_class_t; + +// classify a raw image by its CFA pattern. pure function of img flags +// and buf_dsc.filters; caller is expected to have a raw-loaded image +// (buf_dsc.filters populated by rawspeed). returns UNSUPPORTED when +// the image isn't a raw darktable can denoise +dt_restore_sensor_class_t dt_restore_classify_sensor(const dt_image_t *img); + +// --- environment lifecycle --- + +// @brief initialize the restore environment +// +// wraps dt_ai_env_init(). returns NULL when AI is disabled. +// +// @return environment handle, or NULL dt_restore_env_t *dt_restore_env_init(void); -/** - * @brief refresh model list after downloads/installs - * @param env environment handle - */ +// @brief refresh model list after downloads/installs +// @param env environment handle void dt_restore_env_refresh(dt_restore_env_t *env); -/** - * @brief destroy the environment and free resources - * @param env environment handle (NULL-safe) - */ +// @brief destroy the environment and free resources +// @param env environment handle (NULL-safe) void dt_restore_env_destroy(dt_restore_env_t *env); -/* --- model lifecycle --- */ +// --- model lifecycle --- -/** - * @brief load denoise model (scale 1x) - * @param env environment handle - * @return context handle, or NULL if no model available - */ +// @brief load denoise model (scale 1x) +// @param env environment handle +// @return context handle, or NULL if no model available dt_restore_context_t *dt_restore_load_denoise(dt_restore_env_t *env); -/** - * @brief load upscale model 
at 2x - * @param env environment handle - * @return context handle, or NULL if no model available - */ +// @brief load raw-denoise bayer model (scale 1x) +// +// raw denoise reuses the full scale==1 denoise pipeline (tile size, +// color conversion, shadow boost, wide-gamut pass-through); only the +// model's task string ("rawdenoise") differs. the bayer and linear +// ONNX files ship together in one "rawdenoise" package and the caller +// picks which variant to load. +// +// the filename is read from the model's variants.bayer.onnx attribute; +// a model package that doesn't declare this attribute fails to load +// (no silent fallback). +// +// --- bayer_v1 input contract --- +// +// variants declaring `input_kind: bayer_v1` must satisfy: +// +// INPUT: NCHW, 4 channels, T×T (packed half-resolution, where the +// sensor tile is 2T × 2T). channel order: R, G1, G2, B — +// extraction starts at the CFA's R origin so non-RGGB sensors +// (BGGR, GRBG, GBRG) get packed as if they were RGGB. this +// matches RawNIND training, which physically crops non-RGGB +// sensors to an RGGB origin before tiling. overridable via +// variants.bayer.bayer_orientation (force_rggb | native); +// default: force_rggb. +// values: (raw - black[site]) / range[site] * wb_norm[ch]. +// wb_norm defaults to daylight (D65 derived from the camera +// adobe_XYZ_to_CAM), overridable via variants.bayer.wb_norm +// (daylight | as_shot | none). +// edge tiles that extend past the image bounds are mirror- +// padded inside the effective-RGGB-cropped rectangle +// (variants.bayer.edge_pad: mirror_cropped | mirror). default +// for bayer_v1 is mirror_cropped so corner tiles see the same +// reflections the model's training did. +// +// OUTPUT: NCHW, 3 channels, 2T × 2T (model internally demosaics via +// PixelShuffle). values are camRGB in the same (WB, exposure) +// frame as the input. 
output scale is arbitrary unless the +// variant declares output_scale: absolute — by default the +// loader applies match_gain (scalar mean-match) before +// re-mosaicing. input_colorspace and target_mean are ignored +// on this path. +// +// a declared-but-unknown input_kind (or one that contradicts the slot) +// is a hard error — the loader refuses to open a mis-packaged ONNX. +// manifests predating the contract label (input_kind missing) are +// accepted for back-compat and treated as bayer_v1. +// +// @param env environment handle +// @return context handle, or NULL if no model available / misconfigured +dt_restore_context_t *dt_restore_load_rawdenoise_bayer(dt_restore_env_t *env); + +// @brief load raw-denoise X-Trans model (scale 1x) +// +// prefers a dedicated xtrans variant when the manifest declares +// variants.xtrans.onnx; falls back transparently to the linear variant +// otherwise. callers pick this loader for X-Trans sensors so a future +// RawNIND release can swap in a dedicated model via manifest-only +// changes. +// +// --- xtrans_v1 input contract (reserved) --- +// +// variants declaring `input_kind: xtrans_v1` are accepted by the +// loader but the actual preprocessing contract (channel layout, WB +// convention, output-space semantics) is TBD until Benoit's dedicated +// X-Trans model stabilizes. until then this loader's first call +// returns NULL for any manifest lacking a variants.xtrans slot, and +// the fallback path produces a linear_v1 context +// +// @param env environment handle +// @return context handle, or NULL if neither an xtrans nor a linear +// variant is available. +dt_restore_context_t *dt_restore_load_rawdenoise_xtrans(dt_restore_env_t *env); + +// @brief load raw-denoise linear model (scale 1x) +// +// generic-demosaic-based denoise: used for Foveon, monochrome sensors +// with a CFA-ish pattern, and any raw whose CFA pattern doesn't fit +// the bayer or xtrans pipelines. 
also the fallback pipeline for +// X-Trans sensors (via dt_restore_load_rawdenoise_xtrans) until a +// dedicated xtrans_v1 model is available. +// +// --- linear_v1 input contract --- +// +// variants declaring `input_kind: linear_v1` must satisfy: +// +// INPUT: NCHW, 3 channels, T × T planar. colorspace is +// variants.linear.input_colorspace (default lin_rec2020; +// alternatives: camRGB, srgb_linear). preprocessing applies +// WB in camRGB first — mode via variants.linear.wb_norm +// (default as_shot; see _resolve_linear_wb) — then the +// camRGB → input-space 3×3 matrix derived from +// adobe_XYZ_to_CAM, then an optional scalar exposure boost +// to variants.linear.target_mean (default 0.30 for the +// training distribution; set "null" to disable). +// +// OUTPUT: NCHW, 3 channels, T × T in the same input-space. output +// scale is arbitrary unless the variant declares +// output_scale: absolute — default is per-channel match_gain +// against the boosted input. the caller then inverts the +// exposure boost, the matrix, and the WB to recover a raw +// camRGB DNG that renders identically under the importing +// pipeline. +// +// same contract-label semantics as the bayer variant: missing label +// accepted as linear_v1, declared-but-mismatched label refuses +// to load with dt_control_log feedback. 
+// +// @param env environment handle +// @return context handle, or NULL if no model available / misconfigured +dt_restore_context_t *dt_restore_load_rawdenoise_linear(dt_restore_env_t *env); + +// @brief load upscale model at 2x +// @param env environment handle +// @return context handle, or NULL if no model available dt_restore_context_t *dt_restore_load_upscale_x2(dt_restore_env_t *env); -/** - * @brief load upscale model at 4x - * @param env environment handle - * @return context handle, or NULL if no model available - */ +// @brief load upscale model at 4x +// @param env environment handle +// @return context handle, or NULL if no model available dt_restore_context_t *dt_restore_load_upscale_x4(dt_restore_env_t *env); -/** - * @brief increment the reference count for shared ownership. - * multiple threads can share the same context for concurrent - * inference via dt_restore_run_patch(). - * @param ctx context handle - * @return the same pointer (for convenience) - */ +// @brief increment the reference count for shared ownership. +// multiple threads can share the same context for concurrent +// inference. +// @param ctx context handle +// @return the same pointer (for convenience) dt_restore_context_t *dt_restore_ref(dt_restore_context_t *ctx); -/** - * @brief decrement the reference count. frees the context and all - * resources when the count reaches zero. - * @param ctx context handle (NULL-safe) - */ +// @brief decrement the reference count. frees the context and all +// resources when the count reaches zero. +// @param ctx context handle (NULL-safe) void dt_restore_unref(dt_restore_context_t *ctx); -/** - * @brief Set the working color profile for the context. - * - * The AI model was trained on sRGB primaries. If the input pixels are - * in a different working profile (e.g. Rec.2020), we must convert to - * sRGB before inference and back after to avoid hue shifts. 
Call this - * before running inference on each image that may use a different - * working profile. - * - * If profile is NULL, the pipeline falls back to gamma-only conversion - * (treating working-profile numbers as if they were sRGB), which can - * cause color shifts for wide-gamut working profiles. - * - * Thread-safety: must not be called concurrently with - * dt_restore_run_patch() or dt_restore_process_tiled(). Set the - * profile before dispatching inference on a given image. - * - * @param ctx context handle (NULL-safe) - * @param profile lcms2 cmsHPROFILE handle cast to void*; NULL to disable - */ -void dt_restore_set_profile(dt_restore_context_t *ctx, - void *profile); - -/** - * @brief Enable/disable wide-gamut pass-through for denoise. - * - * When TRUE (default): pixels that would be out of sRGB gamut pass - * through unchanged, preserving color but not denoising them. When - * FALSE: all pixels use the model output, wide-gamut colors are - * clipped to sRGB but everything gets denoised. - * - * Affects denoise only (scale == 1). Upscale always uses the model - * output because there is no pixel-to-pixel correspondence to - * pass through. 
- * - * @param ctx context handle (NULL-safe) - * @param preserve TRUE to enable pass-through, FALSE to denoise everything - */ -void dt_restore_set_preserve_wide_gamut(dt_restore_context_t *ctx, - gboolean preserve); - -/** - * @brief check if a denoise model is available - * @param env environment handle - * @return TRUE if a denoise model is configured and present - */ +// @brief check if a denoise model is available +// @param env environment handle +// @return TRUE if a denoise model is configured and present gboolean dt_restore_denoise_available(dt_restore_env_t *env); -/** - * @brief check if an upscale model is available - * @param env environment handle - * @return TRUE if an upscale model is configured and present - */ +// @brief check if a raw-denoise model is available +// @param env environment handle +// @return TRUE if a raw-denoise model is configured and present +gboolean dt_restore_rawdenoise_available(dt_restore_env_t *env); + +// @brief check if an upscale model is available +// @param env environment handle +// @return TRUE if an upscale model is configured and present gboolean dt_restore_upscale_available(dt_restore_env_t *env); -/* --- tile size --- */ +// --- tile size --- -/** - * @brief get tile overlap for a given scale factor - * @param scale upscale factor (1 for denoise) - * @return overlap in pixels - */ +// @brief get tile overlap for a given scale factor +// @param scale upscale factor (1 for denoise) +// @return overlap in pixels int dt_restore_get_overlap(int scale); -/* --- inference --- */ - -/** - * @brief row writer callback for dt_restore_process_tiled - * - * called once per tile-row with 3ch interleaved float scanlines. - * the callback can write to a buffer, TIFF, or any other sink. 
- * - * @param scanline 3ch interleaved float data (out_w pixels) - * @param out_w output width in pixels - * @param y scanline index in the output image - * @param user_data caller-provided context - * @return 0 on success, non-zero to abort - */ -typedef int (*dt_restore_row_writer_t)(const float *scanline, - int out_w, - int y, - void *user_data); - -/** - * @brief run a single inference patch with sRGB conversion - * - * converts linear RGB input to sRGB, runs ONNX inference, - * converts output back to linear. input is planar NCHW float. - * - * @param ctx loaded restore context - * @param in_patch input tile (planar RGB, 3 * w * h floats) - * @param w tile width - * @param h tile height - * @param out_patch output buffer (planar RGB, 3 * w*s * h*s) - * @param scale upscale factor (1 for denoise) - * @return 0 on success - */ -int dt_restore_run_patch(dt_restore_context_t *ctx, - const float *in_patch, - int w, int h, - float *out_patch, - int scale); - -/** - * @brief process an image with tiled inference - * - * tiles the input, runs inference on each tile, and delivers - * completed scanlines via the row_writer callback. input is - * float4 RGBA interleaved (from dt export). 
- * - * @param ctx loaded restore context (tile_size is stored in ctx) - * @param in_data input pixels (float4 RGBA, width * height) - * @param width input width - * @param height input height - * @param scale upscale factor (1 for denoise) - * @param row_writer callback receiving 3ch float scanlines - * @param writer_data user data passed to row_writer - * @param control_job job handle for progress/cancellation (NULL-safe) - * @return 0 on success - */ -int dt_restore_process_tiled(dt_restore_context_t *ctx, - const float *in_data, - int width, int height, - int scale, - dt_restore_row_writer_t row_writer, - void *writer_data, - struct _dt_job_t *control_job); - -/* --- detail recovery --- */ - -/** - * @brief apply DWT-based detail recovery after denoising - * - * extracts luminance residual, filters noise with wavelet - * decomposition, and blends preserved texture back. - * both buffers are float4 RGBA at the same dimensions. - * - * @param original_4ch original input pixels (read-only) - * @param denoised_4ch denoised pixels (modified in-place) - * @param width image width - * @param height image height - * @param alpha blend strength (0 = none, 1 = full) - */ -void dt_restore_apply_detail_recovery(const float *original_4ch, - float *denoised_4ch, - int width, int height, - float alpha); - -/** - * @brief compute DWT-filtered luminance detail from 3ch buffers - * - * returns a 1ch float array with wavelet-filtered luminance - * residual (noise removed, texture preserved). used for - * preview split visualization. - * - * @param before_3ch original image (3ch interleaved float) - * @param after_3ch processed image (3ch interleaved float) - * @param width image width - * @param height image height - * @return newly allocated 1ch buffer, or NULL. 
caller frees - * with dt_free_align() - */ -float *dt_restore_compute_dwt_detail(const float *before_3ch, - const float *after_3ch, - int width, int height); +// --- inference --- + +// @brief run a single RawNIND bayer inference patch +// +// thin wrapper over dt_ai_run for bayer-packed input: NO colorspace +// or gamma conversion, NO WB handling, NO shadow boost. caller is +// responsible for black-subtract / normalize / WB / RGGB pack. +// input is planar 4ch NCHW at packed half-resolution, output is +// planar 3ch at full sensor resolution (model internally upscales +// 2x via PixelShuffle). output is in camRGB — the camera ColorMatrix +// is NOT applied in the graph (training applies it externally for +// loss, so re-mosaic + DNG write works natively). +// +// @param ctx loaded restore context (bayer model) +// @param in_4ch packed input (planar 4ch: R, G1, G2, B; 4 * w * h) +// @param w packed-space tile width (= sensor_w / 2) +// @param h packed-space tile height (= sensor_h / 2) +// @param out_3ch output buffer (planar 3ch at 2w * 2h) +// @return 0 on success +int dt_restore_run_patch_bayer(dt_restore_context_t *ctx, + const float *in_4ch, + int w, int h, + float *out_3ch); + +// @brief run a single RawNIND linear inference patch +// +// 3ch in, 3ch out, SAME spatial dims (no internal upscale). like +// _run_patch_bayer: no sRGB / gamma / WP conversion, no shadow boost. +// caller prepares input in the colorspace the linear model was +// trained on (lin_rec2020 per config.json) and gain-matches the +// output afterward (model output is arbitrary-scale camRGB-in-that- +// space, matching the behavior already observed on the bayer variant). 
+// +// @param ctx loaded restore context (linear model) +// @param in_3ch planar 3ch input (3 * w * h floats, NCHW order) +// @param w tile width +// @param h tile height +// @param out_3ch output buffer (planar 3ch, 3 * w * h floats) +// @return 0 on success +int dt_restore_run_patch_3ch_raw(dt_restore_context_t *ctx, + const float *in_3ch, + int w, int h, + float *out_3ch); + +// @brief look up the tile ladder for a restore context +// +// exposes the model-declared (or default) input_sizes list in +// packed-space. used by the bayer pipeline to pick a starting +// tile size that respects the model's declared shapes. +// +// @param ctx loaded restore context +// @param out_count filled with number of entries (may be NULL) +// @return pointer to the ladder (owned by ctx; do not free). NULL +// if ctx is NULL. +const int *dt_restore_get_tile_ladder(const dt_restore_context_t *ctx, + int *out_count); + +// @brief current tile size stored in the loaded session +// +// @param ctx loaded restore context +// @return tile size in packed-space, or 0 if ctx is NULL +int dt_restore_get_tile_size(const dt_restore_context_t *ctx); + +// @brief recreate the ORT session for a different tile size +// +// used by the bayer OOM-retry loop to step down the ladder +// when inference fails. keeps the same model/provider; only the +// H/W dim overrides change. the old session is unloaded first +// (avoids VRAM cascade on GPU OOM). +// +// @param ctx loaded restore context +// @param new_tile_size new tile size (must be a ladder member) +// @return TRUE on success +gboolean dt_restore_reload_session(dt_restore_context_t *ctx, + int new_tile_size); + +// @brief persist the current tile size to darktablerc +// +// once the bayer pipeline has processed an image end-to-end at +// ctx->tile_size without OOM, call this so the next run skips the +// retry loop and JIT-compiling providers don't pay the compile +// cost again. 
+// +// @param ctx loaded restore context +void dt_restore_persist_tile_size(const dt_restore_context_t *ctx); + +// @brief run darktable's real user pixelpipe on a sensor buffer, ROI-clipped. +// +// Shared bridge for the raw-denoise preview paths. Both Bayer and +// X-Trans previews need to run the user's full iop stack on a +// (possibly neural-denoised and re-mosaiced) raw buffer so the +// displayed before/after pixels match what the user would see after +// batch processing and DNG re-import. The pipe runs natively — +// rawprepare + demosaic + temperature + colorin + filmic + output +// profile — with rawdenoise skipped since the neural denoiser has +// already done its work. +// +// @param imgid image id (the pipe is built per image) +// @param input_native buffer matching the image's native raw format +// (uint16 CFA or 3ch float LinearRaw). pipe only +// reads this; caller retains ownership. +// @param iw buffer width in native samples +// @param ih buffer height +// @param roi_x ROI top-left x in sensor (input) coords — same +// coordinate system the caller used to patch the +// denoised CFA into input_native. the bridge +// forward-transforms this through the user's +// geometry chain (rawprepare + clipping + ashift + +// lens + ...) so the pipe renders the same sensor +// area the caller patched. +// @param roi_y ROI top-left y (sensor coords) +// @param roi_w ROI width (sensor coords) +// @param roi_h ROI height (sensor coords) +// @param out_w receives actual rendered width (may differ from +// roi_w when user history contains geometry- +// modifying modules like clipping/ashift/lens, or +// when rawprepare trims; NULL to skip) +// @param out_h receives actual rendered height (as out_w; NULL +// to skip) +// @param out_rgb caller-frees with g_free. 3ch interleaved +// (*out_w * *out_h * 3 floats) in linear Rec.709, +// ready for sRGB-gamma display. callers must use +// *out_w / *out_h (not the requested roi_w/roi_h) +// for subsequent indexing. 
+// @return 0 on success; *out_rgb set to NULL on failure. +int dt_restore_run_user_pipe_roi(dt_imgid_t imgid, + void *input_native, + int iw, + int ih, + int roi_x, + int roi_y, + int roi_w, + int roi_h, + int *out_w, + int *out_h, + float **out_rgb); // clang-format off // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py diff --git a/src/common/ai/restore_common.h b/src/common/ai/restore_common.h new file mode 100644 index 000000000000..2a5138b6ac4d --- /dev/null +++ b/src/common/ai/restore_common.h @@ -0,0 +1,236 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. + + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with darktable. If not, see . +*/ + +// restore_common — internal shared definitions for the restore_* +// module family (restore.c, restore_rgb.c). NOT a public API: +// consumers must continue to treat dt_restore_context_t / +// dt_restore_env_t as opaque and go through the accessor functions +// declared in restore.h. + +#pragma once + +#include "ai/backend.h" +#include "common/image.h" + +#include +#include + +// --- preprocessing policy enums --- +// +// ctx fields keyed off these enums replace what used to be compile-time +// RawNIND assumptions. Manifest keys (variants..{input_kind, wb_norm, +// output_scale, input_colorspace, target_mean}) resolve to these values +// at load; defaults reproduce RawNIND v1 behavior so legacy manifests +// keep working. 
See restore.h for the per-variant contract. + +// identifies the full preprocessing contract (layout + WB + scaling + +// training distribution) the ONNX graph was trained against. treated as +// a string match, not a feature set: a model declaring bayer_v1 must +// comply with everything documented for that label in restore.h +typedef enum +{ + DT_RESTORE_INPUT_KIND_UNKNOWN = 0, + DT_RESTORE_INPUT_KIND_BAYER_V1, + // reserved for a future dedicated X-Trans denoise model. accepted as + // a manifest value so packages shipping an xtrans variant can be + // validated; the actual preprocessing pipeline is TBD and filed in + // restore_raw_xtrans.c when Benoit's model format stabilizes. until + // then X-Trans sensors route to the linear pipeline as a fallback + // via dt_restore_load_rawdenoise_xtrans + DT_RESTORE_INPUT_KIND_XTRANS_V1, + DT_RESTORE_INPUT_KIND_LINEAR_V1, +} dt_restore_input_kind_t; + +// color space the linear path feeds to the model. bayer path ignores +// this (4ch-packed layout fixes the space to camRGB by construction) +typedef enum +{ + DT_RESTORE_CS_LIN_REC2020 = 0, // default for linear path + DT_RESTORE_CS_CAMRGB, + DT_RESTORE_CS_SRGB_LINEAR, +} dt_restore_colorspace_t; + +// how WB is normalized before inference (and inverted after). DAYLIGHT +// uses the D65 coefficients derived from adobe_XYZ_to_CAM; AS_SHOT uses +// the raw's wb_coeffs; NONE leaves camRGB untouched +typedef enum +{ + DT_RESTORE_WB_DAYLIGHT = 0, + DT_RESTORE_WB_AS_SHOT, + DT_RESTORE_WB_NONE, +} dt_restore_wb_mode_t; + +// post-inference output scale handling. MATCH_GAIN rescales the model +// output so its mean matches the model input mean (compensates for the +// arbitrary output scale RawNIND's L1 loss produces); ABSOLUTE trusts +// the model output as-is +typedef enum +{ + DT_RESTORE_OUT_MATCH_GAIN = 0, + DT_RESTORE_OUT_ABSOLUTE, +} dt_restore_output_scale_t; + +// how the 4-channel packed Bayer input is oriented. 
FORCE_RGGB extracts +// from the CFA's R origin so channel 0 is always R regardless of the +// sensor pattern — matches RawNIND v1 training, which cropped non-RGGB +// sensors to an RGGB origin before packing. NATIVE packs in the sensor's +// own CFA order (channel 0 at the top-left of each 2x2 block) for models +// that accept any Bayer pattern unchanged +typedef enum +{ + DT_RESTORE_BAYER_FORCE_RGGB = 0, + DT_RESTORE_BAYER_NATIVE, +} dt_restore_bayer_orientation_t; + +// edge handling when a tile extends past the image boundary. MIRROR is +// darktable's historical periodic reflection on absolute sensor coords. +// MIRROR_CROPPED reflects in the effective-cropped frame (post FORCE_RGGB +// shift) so the reflected content matches what a training pipeline that +// physically crops the sensor before tiling would see — required for +// bit-identical corner tiles on non-RGGB sensors under bayer_v1 +typedef enum +{ + DT_RESTORE_EDGE_MIRROR_CROPPED = 0, + DT_RESTORE_EDGE_MIRROR, +} dt_restore_edge_pad_t; + +// dt_restore_sensor_class_t and _classify_sensor now live in restore.h +// (part of the public API so callers picking a variant loader can use +// it without pulling in restore_common.h's internal struct layouts) + +// --- struct definitions shared across the restore_* module family --- + +struct dt_restore_env_t +{ + dt_ai_environment_t *ai_env; +}; + +struct dt_restore_context_t +{ + dt_ai_context_t *ai_ctx; + struct dt_restore_env_t *env; + char *model_id; + char *model_file; + char *task; + char *input_kind; // variant-declared input kind (e.g. "packed_bayer", + // "lin_rec2020"); NULL if the model doesn't declare one + // policy enums resolved from the manifest at load time; see comments + // on each enum in this file and the per-variant contract in restore.h. 
+ // defaults (0-init from g_new0) reproduce RawNIND v1 behavior, except + // target_mean which needs explicit initialization — see _load + dt_restore_input_kind_t input_kind_enum; + dt_restore_colorspace_t input_colorspace; + dt_restore_wb_mode_t wb_mode; + dt_restore_output_scale_t output_scale; + dt_restore_bayer_orientation_t bayer_orientation; + dt_restore_edge_pad_t edge_pad; + float target_mean; // NAN = no exposure boost + int scale; // model upscale factor (1 for denoise, 2/4 for upscale) + int tile_size; // tile size used to create the current session + char *dim_h; // symbolic height dim name used for session overrides + char *dim_w; // symbolic width dim name used for session overrides + // color management (RGB path): convert working profile → sRGB before + // inference and back after. if has_profile is FALSE, fall back to + // gamma-only conversion (treats working-profile numbers as if sRGB). + gboolean has_profile; + float wp_to_srgb[9]; // working profile RGB -> sRGB linear + float srgb_to_wp[9]; // sRGB linear -> working profile RGB + // RGB path: when TRUE (default), out-of-sRGB-gamut pixels pass + // through unchanged during denoise. when FALSE, every pixel uses + // the model output and wide-gamut colors get clipped to sRGB. + gboolean preserve_wide_gamut; + // RGB path: shadow_boost_capable is set once at load from the + // model's "shadow_boost" attribute; shadow_boost is re-computed + // per image inside dt_restore_process_tiled() based on luminance. + gboolean shadow_boost_capable; + gboolean shadow_boost; + // tile ladder candidates from largest to smallest; either the + // model's "input_sizes" attribute from config.json (when declared) + // or a copy of the built-in ladder for the model's scale. both the + // startup budget selector and the runtime OOM retry loop iterate it + int *tile_ladder; + int n_tile_ladder; + uint32_t ep_flags; // execution provider flags (e.g. 
CoreML CPU-only) + gint ref_count; +}; + +// DWT detail-recovery band count (used by restore_rgb.c) +#define DWT_DETAIL_BANDS 5 + +// compute per-site black level (4 entries) and raw-ADC range +// (white − black) for this image. prefers the per-site +// raw_black_level_separate when any entry is non-zero, otherwise +// falls back to the single raw_black_level. range entries are +// guarded against non-positive values so callers can divide safely. +// shared by the Bayer prep helper (restore_raw_bayer.c) and by the +// linear path's raw re-mosaic step (restore_raw_linear.c) +static inline void _compute_cfa_black_range(const dt_image_t *img, + float black[4], + float range[4], + float *out_white) +{ + const float white = img->raw_white_point + ? (float)img->raw_white_point : 65535.0f; + if(out_white) *out_white = white; + + const gboolean have_separate + = (img->raw_black_level_separate[0] != 0 + || img->raw_black_level_separate[1] != 0 + || img->raw_black_level_separate[2] != 0 + || img->raw_black_level_separate[3] != 0); + for(int i = 0; i < 4; i++) + black[i] = have_separate + ? (float)img->raw_black_level_separate[i] + : (float)img->raw_black_level; + for(int i = 0; i < 4; i++) + { + range[i] = white - black[i]; + if(range[i] <= 0.0f) range[i] = 1.0f; + } +} + +// periodic mirror-pad index reflection, shared by every restore_* +// consumer that needs edge padding for tile reads (RGB, raw bayer, +// raw linear). fully periodic: any input index maps into [0, n) +static inline int _mirror(int i, int n) +{ + if(n <= 1) return 0; + if(i < 0) i = -i; + const int period = 2 * (n - 1); + i = i % period; + if(i < 0) i += period; + if(i >= n) i = period - i; + return i; +} + +// mirror-pad reflection within an arbitrary sub-range [lo, hi) of the +// underlying 1D array (exclusive hi). 
used by the Bayer edge-pad mode +// MIRROR_CROPPED so reflections happen inside the RGGB-forced crop +// rectangle rather than the original sensor buffer — matches training +// pipelines that physically crop the sensor to RGGB before tiling +static inline int _mirror_in_range(int i, int lo, int hi) +{ + const int n = hi - lo; + return lo + _mirror(i - lo, n); +} + +// clang-format off +// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py +// vim: shiftwidth=2 expandtab tabstop=2 cindent +// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified; +// clang-format on diff --git a/src/common/ai/restore_raw_bayer.c b/src/common/ai/restore_raw_bayer.c new file mode 100644 index 000000000000..91cff804936b --- /dev/null +++ b/src/common/ai/restore_raw_bayer.c @@ -0,0 +1,803 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. + + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with darktable. If not, see . 
+*/ + +#include "common/ai/restore_raw_bayer.h" +#include "common/ai/restore.h" +#include "common/ai/restore_common.h" +#include "common/colorspaces.h" +#include "common/darktable.h" +#include "common/image.h" +#include "common/image_cache.h" +#include "common/iop_order.h" +#include "common/mipmap_cache.h" +#include "control/jobs.h" +#include "develop/develop.h" +#include "develop/imageop.h" +#include "develop/imageop_math.h" +#include "develop/pixelpipe_hb.h" +#include "imageio/imageio_common.h" + +#include +#include +#include + +#define OVERLAP_PACKED 32 // tile overlap in packed (half-res) pixels + +// find (y0, x0) in {0,1}^2 such that FC(y0, x0, filters) == 0 (R). +// returns TRUE for standard Bayer patterns; FALSE for non-Bayer (filters +// == 0) or X-Trans (filters == 9u), which this pipeline does not handle +static gboolean _bayer_origin(uint32_t filters, int *y0, int *x0) +{ + if(filters == 0u || filters == 9u) return FALSE; + for(int y = 0; y < 2; y++) + for(int x = 0; x < 2; x++) + if(FC(y, x, filters) == 0) + { + *y0 = y; + *x0 = x; + return TRUE; + } + return FALSE; +} + +// shared prep data for Bayer batch + preview +// resolves everything the CFA→packed-input pipeline needs from the +// image metadata: CFA pattern + origin, per-site black/white/range, +// and daylight WB multipliers. both the batch path +// (dt_restore_raw_bayer) and the piped preview use this identical +// pre-processing; keeping it in one helper stops the two copies from +// drifting +typedef struct _bayer_prep_t +{ + uint32_t filters; + int y0, x0; + float white; + float black[4]; + float range[4]; + float wb_norm[3]; // daylight WB, G normalised to 1 + float clip_max; // = white; kept separate for readability +} _bayer_prep_t; + +// populate prep from img metadata. returns 0 on success, 1 when the +// CFA pattern is unsupported (X-Trans or monochrome) +// compute daylight WB (D65 derived from adobe_XYZ_to_CAM). 
on success +// writes R/B multipliers with G=1 into wb[0..2] and returns TRUE +static gboolean _bayer_wb_daylight(const dt_image_t *img, float wb[3]) +{ + const float D65[3] = { 0.9504f, 1.0f, 1.0889f }; + float resp[3] = { 0.0f, 0.0f, 0.0f }; + float mag = 0.0f; + for(int c = 0; c < 3; c++) + { + resp[c] = img->adobe_XYZ_to_CAM[c][0] * D65[0] + + img->adobe_XYZ_to_CAM[c][1] * D65[1] + + img->adobe_XYZ_to_CAM[c][2] * D65[2]; + mag += fabsf(img->adobe_XYZ_to_CAM[c][0]) + + fabsf(img->adobe_XYZ_to_CAM[c][1]) + + fabsf(img->adobe_XYZ_to_CAM[c][2]); + } + if(mag <= 0.0f || resp[0] <= 0.0f || resp[1] <= 0.0f || resp[2] <= 0.0f) + return FALSE; + wb[0] = resp[1] / resp[0]; + wb[1] = 1.0f; + wb[2] = resp[1] / resp[2]; + return TRUE; +} + +// as-shot WB from img->wb_coeffs normalized to G=1 +static gboolean _bayer_wb_as_shot(const dt_image_t *img, float wb[3]) +{ + if(img->wb_coeffs[0] <= 0.0f + || img->wb_coeffs[1] <= 0.0f + || img->wb_coeffs[2] <= 0.0f) + return FALSE; + const float g = img->wb_coeffs[1]; + wb[0] = img->wb_coeffs[0] / g; + wb[1] = 1.0f; + wb[2] = img->wb_coeffs[2] / g; + return TRUE; +} + +static int _compute_bayer_prep(const dt_restore_context_t *ctx, + const dt_image_t *img, _bayer_prep_t *p) +{ + if(!img || !p) return 1; + p->filters = img->buf_dsc.filters; + if(!_bayer_origin(p->filters, &p->y0, &p->x0)) + { + dt_print(DT_DEBUG_AI, + "[restore_raw_bayer] unsupported CFA pattern (filters=0x%x)", + p->filters); + return 1; + } + + _compute_cfa_black_range(img, p->black, p->range, &p->white); + p->clip_max = p->white; + + // WB normalization keyed off ctx->wb_mode. RawNIND's v1 weights were + // trained on daylight-WB'd data, so the default is DAYLIGHT (derive + // D65 multipliers from adobe_XYZ_to_CAM) with as-shot as the fallback + // when the matrix is missing. AS_SHOT flips the order (as-shot first, + // daylight fallback) for models trained on as-shot distributions. + // NONE leaves camRGB untouched. 
The same wb_norm is inverted in + // postprocess so the round-trip is consistent regardless of mode. + p->wb_norm[0] = p->wb_norm[1] = p->wb_norm[2] = 1.0f; + const dt_restore_wb_mode_t mode + = ctx ? ctx->wb_mode : DT_RESTORE_WB_DAYLIGHT; + if(mode == DT_RESTORE_WB_DAYLIGHT) + { + if(!_bayer_wb_daylight(img, p->wb_norm)) + _bayer_wb_as_shot(img, p->wb_norm); + } + else if(mode == DT_RESTORE_WB_AS_SHOT) + { + if(!_bayer_wb_as_shot(img, p->wb_norm)) + _bayer_wb_daylight(img, p->wb_norm); + } + // DT_RESTORE_WB_NONE: leave at {1, 1, 1} + return 0; +} + +// shared re-mosaic per-pixel math: model camRGB value → raw ADC +// value (reverses WB, normalisation and black-level shift). caller +// supplies (r, c, ch) from its own FC() dispatch and reads model_val +// from the 2T × 2T tile_out; the caller-side blend / clip / store +// differs per path (batch writes uint16 CFA with strength blend, +// preview writes uint16 or float into a patched sensor buffer) so we +// keep just the pure pixel math shared +static inline float _bayer_remosaic_raw(int r, int c, int ch, + float model_val, + const _bayer_prep_t *prep) +{ + const float normalized = model_val / prep->wb_norm[ch]; + const int bl_idx = ((r & 1) << 1) | (c & 1); + return normalized * prep->range[bl_idx] + prep->black[bl_idx]; +} + +// shared scalar match_gain: scales 3ch model output (2T × 2T) so +// its mean equals the 4ch input mean. 
identical algorithm for batch +// and preview; batch uses the returned means/gain to log a per-tile +// diagnostic +static void _bayer_gain_match(const float *tile_in, + float *tile_out, + int T, + double *out_in_mean, + double *out_out_mean, + float *out_gain) +{ + const size_t tile_in_plane = (size_t)T * T; + const size_t tile_out_plane = (size_t)(2 * T) * (size_t)(2 * T); + double in_sum = 0.0, out_sum = 0.0; + for(int k = 0; k < 4; k++) + { + const float *p = tile_in + (size_t)k * tile_in_plane; + for(size_t i = 0; i < tile_in_plane; i++) in_sum += p[i]; + } + for(int k = 0; k < 3; k++) + { + const float *p = tile_out + (size_t)k * tile_out_plane; + for(size_t i = 0; i < tile_out_plane; i++) out_sum += p[i]; + } + const double in_mean = in_sum / (double)(4 * tile_in_plane); + const double out_mean = out_sum / (double)(3 * tile_out_plane); + // allow negative gain too: the RawNIND model output scale is + // arbitrary by design (match_gain post-step during training absorbs + // it); in some variants the sign is also inverted. guard only + // against near-zero mean + const float gain = (fabsf((float)out_mean) > 1e-8f) + ? (float)(in_mean / out_mean) : 1.0f; + if(gain != 1.0f) + { + const size_t total_out = tile_out_plane * 3; + for(size_t i = 0; i < total_out; i++) tile_out[i] *= gain; + } + if(out_in_mean) *out_in_mean = in_mean; + if(out_out_mean) *out_out_mean = out_mean; + if(out_gain) *out_gain = gain; +} + +// shared 4ch packing: CFA → planar [R, G1, G2, B] at T×T packed +// compute the mirror-reflection bounds + oriented tile origin based on +// the packing policy on ctx. 
sr0_base / sc0_base are the caller's +// sensor-space base coords for the tile's top-left 2x2 *before* any +// RGGB-forcing shift (batch passes 2*(py_base - O), preview passes +// the user-centred even-snapped inf_y/inf_x) +// - FORCE_RGGB + MIRROR_CROPPED (bayer_v1 default): origin shifts by +// (y0, x0) so channel 0 always hits R; mirror reflects within the +// cropped [y0, H - y0?1:0) x [x0, W - x0?1:0) rectangle — matches +// training pipelines that physically crop to RGGB before tiling +// - FORCE_RGGB + MIRROR: same origin shift, but reflections happen +// against the full buffer (legacy darktable behavior; equivalent +// to training that doesn't use mirror padding at all) +// - NATIVE + *: no origin shift; each 4ch slot holds the sensor's +// native CFA position. mirror is always full-buffer +static void _bayer_tile_geometry(const dt_restore_context_t *ctx, + const _bayer_prep_t *prep, + int sr0_base, int sc0_base, + int width, int height, + int *sr0_origin, int *sc0_origin, + int *mir_y_lo, int *mir_y_hi, + int *mir_x_lo, int *mir_x_hi) +{ + const gboolean force_rggb + = !ctx || ctx->bayer_orientation == DT_RESTORE_BAYER_FORCE_RGGB; + const int y0 = force_rggb ? prep->y0 : 0; + const int x0 = force_rggb ? prep->x0 : 0; + *sr0_origin = sr0_base + y0; + *sc0_origin = sc0_base + x0; + + const gboolean cropped_mirror + = ctx && force_rggb + && ctx->edge_pad == DT_RESTORE_EDGE_MIRROR_CROPPED; + *mir_y_lo = cropped_mirror ? y0 : 0; + *mir_x_lo = cropped_mirror ? x0 : 0; + *mir_y_hi = cropped_mirror ? (height - (y0 ? 1 : 0)) : height; + *mir_x_hi = cropped_mirror ? (width - (x0 ? 1 : 0)) : width; +} + +// sr0_origin / sc0_origin are sensor-space top-left coords of the packed +// block's (0, 0); edges are mirror-padded via _mirror_in_range(). +// batch and preview paths call this with different origins (tile grid +// vs. a single centred inference tile) but the per-pixel math is identical +// pack a T x T packed-half-res 4-channel tile from the full CFA buffer. 
+// sr0_origin / sc0_origin is the starting sensor-space (row, col) of the +// tile's top-left 2x2 block; for force_rggb orientation the caller shifts +// by (y0, x0) so channel 0 always hits R. +// [mir_y_lo, mir_y_hi) and [mir_x_lo, mir_x_hi) are the mirror-reflection +// bounds. for EDGE_MIRROR these are [0, height) / [0, width); for +// EDGE_MIRROR_CROPPED they shrink to the effective-RGGB-cropped +// rectangle so reflections match what a crop-then-tile training pipeline +// would see +static void _pack_bayer_tile(const float *cfa, + int width, int height, + int sr0_origin, int sc0_origin, + int mir_y_lo, int mir_y_hi, + int mir_x_lo, int mir_x_hi, + int T, + const _bayer_prep_t *prep, + float *tile_in) +{ + const uint32_t filters = prep->filters; + const float *const black = prep->black; + const float *const range = prep->range; + const float *const wb_norm = prep->wb_norm; + const size_t tile_in_plane = (size_t)T * T; + + for(int dy = 0; dy < T; dy++) + { + const int sr0 = sr0_origin + 2 * dy; + for(int dx = 0; dx < T; dx++) + { + const int sc0 = sc0_origin + 2 * dx; + for(int k = 0; k < 4; k++) + { + const int dr = (k >> 1) & 1; + const int dc = k & 1; + const int r = _mirror_in_range(sr0 + dr, mir_y_lo, mir_y_hi); + const int c = _mirror_in_range(sc0 + dc, mir_x_lo, mir_x_hi); + const float val = cfa[(size_t)r * width + c]; + const int bl_idx = ((r & 1) << 1) | (c & 1); + const float normalized = (val - black[bl_idx]) / range[bl_idx]; + const int ch = FC(r, c, filters); + tile_in[k * tile_in_plane + (size_t)dy * T + dx] + = normalized * wb_norm[ch]; + } + } + } +} + +int dt_restore_raw_bayer(dt_restore_context_t *ctx, + const dt_image_t *img, + const float *cfa_in, + int width, + int height, + uint16_t *cfa_out, + float strength, + struct _dt_job_t *control_job) +{ + if(!ctx || !img || !cfa_in || !cfa_out + || width <= 0 || height <= 0) + return 1; + + const float alpha = strength < 0.0f ? 0.0f + : (strength > 1.0f ? 
1.0f : strength); + const float inv_alpha = 1.0f - alpha; + + _bayer_prep_t prep; + if(_compute_bayer_prep(ctx, img, &prep)) return 1; + const uint32_t filters = prep.filters; + const int y0 = prep.y0; + const int x0 = prep.x0; + const float white = prep.white; + const float *const black = prep.black; + const float *const wb_norm = prep.wb_norm; + const float clip_max = prep.clip_max; + + // initialize output with source CFA (covers margins directly) + // margins are the 0-2 rows/cols outside the bayer-aligned working + // region; the model doesn't see them, so we keep original sensor + // values there + for(size_t i = 0; i < (size_t)width * height; i++) + { + const float v = cfa_in[i]; + const float cv = v < 0.0f ? 0.0f : (v > clip_max ? clip_max : v); + cfa_out[i] = (uint16_t)(cv + 0.5f); + } + + // working region in sensor coords: [y0..y0+2*Hh) x [x0..x0+2*Wh) + const int Hh = (height - y0) / 2; + const int Wh = (width - x0) / 2; + if(Hh <= 0 || Wh <= 0) return 0; // too small; output == input + + // tile setup in packed (half-res) space + const int O = OVERLAP_PACKED; + int T = dt_restore_get_tile_size(ctx); + int n_ladder = 0; + const int *ladder = dt_restore_get_tile_ladder(ctx, &n_ladder); + if(T <= 2 * O) T = 256; // defensive fallback + +retry:; + const int step = T - 2 * O; + if(step <= 0) return 1; + const size_t tile_in_plane = (size_t)T * T; + const size_t tile_out_w = 2 * (size_t)T; + const size_t tile_out_plane = tile_out_w * tile_out_w; + const int cols = (Wh + step - 1) / step; + const int rows = (Hh + step - 1) / step; + const int total_tiles = cols * rows; + + dt_print(DT_DEBUG_AI, + "[restore_raw_bayer] %dx%d sensor (CFA origin %d,%d), " + "working %dx%d packed, tile T=%d, %dx%d grid (%d tiles)", + width, height, y0, x0, Wh, Hh, + T, cols, rows, total_tiles); + + // diagnostic: raw CFA range and preprocessing params + { + const size_t npix_dbg = (size_t)width * height; + float in_min = cfa_in[0], in_max = cfa_in[0]; + const size_t step_ = 
(npix_dbg < 1000000) ? 1 : (npix_dbg / 1000000); + for(size_t i = 0; i < npix_dbg; i += step_) + { + if(cfa_in[i] < in_min) in_min = cfa_in[i]; + if(cfa_in[i] > in_max) in_max = cfa_in[i]; + } + dt_print(DT_DEBUG_AI, + "[restore_raw_bayer] raw CFA range [%.1f, %.1f], " + "black=[%.0f,%.0f,%.0f,%.0f] white=%.0f " + "wb_coeffs=[%.3f,%.3f,%.3f,%.3f] wb_norm=[%.3f,%.3f,%.3f]", + in_min, in_max, + black[0], black[1], black[2], black[3], white, + img->wb_coeffs[0], img->wb_coeffs[1], + img->wb_coeffs[2], img->wb_coeffs[3], + wb_norm[0], wb_norm[1], wb_norm[2]); + } + + float *tile_in = g_try_malloc(tile_in_plane * 4 * sizeof(float)); + float *tile_out = g_try_malloc(tile_out_plane * 3 * sizeof(float)); + if(!tile_in || !tile_out) + { + g_free(tile_in); + g_free(tile_out); + return 1; + } + + int res = 0; + int tile_count = 0; + + for(int ty = 0; ty < rows && res == 0; ty++) + { + for(int tx = 0; tx < cols && res == 0; tx++) + { + if(control_job + && dt_control_job_get_state(control_job) + == DT_JOB_STATE_CANCELLED) + { + res = 1; + break; + } + + const int py_base = ty * step; // core-valid packed start (within working) + const int px_base = tx * step; + const int py_end = (py_base + step > Hh) ? Hh : py_base + step; + const int px_end = (px_base + step > Wh) ? Wh : px_base + step; + const int core_h = py_end - py_base; + const int core_w = px_end - px_base; + + // build 4ch input at packed half-res (T x T). 
geometry picks + // the right origin and mirror-reflection bounds based on + // ctx->bayer_orientation + ctx->edge_pad + int sr0_origin, sc0_origin; + int mir_y_lo, mir_y_hi, mir_x_lo, mir_x_hi; + _bayer_tile_geometry(ctx, &prep, + 2 * (py_base - O), 2 * (px_base - O), + width, height, + &sr0_origin, &sc0_origin, + &mir_y_lo, &mir_y_hi, &mir_x_lo, &mir_x_hi); + _pack_bayer_tile(cfa_in, width, height, + sr0_origin, sc0_origin, + mir_y_lo, mir_y_hi, mir_x_lo, mir_x_hi, + T, &prep, tile_in); + + // diagnostic: tile 0 pre-inference (4ch packed input) + if(tx == 0 && ty == 0) + { + float mn[4] = {tile_in[0], tile_in[0], tile_in[0], tile_in[0]}; + float mx[4] = {tile_in[0], tile_in[0], tile_in[0], tile_in[0]}; + for(int k = 0; k < 4; k++) + { + const float *p = tile_in + (size_t)k * tile_in_plane; + mn[k] = mx[k] = p[0]; + for(size_t i = 0; i < tile_in_plane; i++) + { + if(p[i] < mn[k]) mn[k] = p[i]; + if(p[i] > mx[k]) mx[k] = p[i]; + } + } + dt_print(DT_DEBUG_AI, + "[restore_raw_bayer] tile0 model_input range " + "R=[%.3f,%.3f] G1=[%.3f,%.3f] G2=[%.3f,%.3f] B=[%.3f,%.3f]", + mn[0], mx[0], mn[1], mx[1], + mn[2], mx[2], mn[3], mx[3]); + } + + // inference + if(dt_restore_run_patch_bayer(ctx, tile_in, T, T, tile_out) != 0) + { + // step down the ladder if possible. first tile only so we + // don't rewrite pixels we've already delivered + int next_T = 0; + for(int i = 0; i < n_ladder; i++) + if(ladder[i] < T) { next_T = ladder[i]; break; } + if(next_T > 0 && ty == 0 && tx == 0 + && dt_restore_reload_session(ctx, next_T)) + { + dt_print(DT_DEBUG_AI, + "[restore_raw_bayer] inference failed at T=%d, retrying T=%d", + T, next_T); + g_free(tile_in); + g_free(tile_out); + T = next_T; + goto retry; + } + dt_print(DT_DEBUG_AI, + "[restore_raw_bayer] inference failed at tile %d,%d (T=%d)", + tx, ty, T); + res = 1; + break; + } + + // match_gain: scale model output so its mean equals the + // preprocessed input mean. 
the RawNIND model output has an + // arbitrary scale (up to ~10^6) — the Python inference path + // applies match_gain() after every forward pass. we match + // per-tile which is stable: the gain factor is a property of + // the trained weights, approximately constant across tiles of + // the same image. applied in place in tile_out. skipped for + // ABSOLUTE-scale models whose output is already calibrated + double in_mean = 0.0, out_mean = 0.0; + float gain = 1.0f; + if(ctx->output_scale == DT_RESTORE_OUT_MATCH_GAIN) + _bayer_gain_match(tile_in, tile_out, T, + &in_mean, &out_mean, &gain); + + // diagnostic: tile 0 post-gain model-output ranges + gain info + if(tx == 0 && ty == 0) + { + float mn[3] = {tile_out[0], tile_out[0], tile_out[0]}; + float mx[3] = {tile_out[0], tile_out[0], tile_out[0]}; + for(int k = 0; k < 3; k++) + { + const float *p = tile_out + (size_t)k * tile_out_plane; + mn[k] = mx[k] = p[0]; + for(size_t i = 0; i < tile_out_plane; i++) + { + if(p[i] < mn[k]) mn[k] = p[i]; + if(p[i] > mx[k]) mx[k] = p[i]; + } + } + dt_print(DT_DEBUG_AI, + "[restore_raw_bayer] tile0 model_output range " + "R=[%.3f,%.3f] G=[%.3f,%.3f] B=[%.3f,%.3f] " + "in_mean=%.3f out_mean=%.3f gain=%.3e", + mn[0], mx[0], mn[1], mx[1], mn[2], mx[2], + in_mean, out_mean, (double)gain); + } + + // re-mosaic the core-valid region and un-preprocess + // model output dims: 2T x 2T (sensor pixels) for T x T packed tile. 
+ // core valid region in model output starts at (2*O, 2*O) and spans + // (2*core_h) x (2*core_w) sensor pixels + const int core_sh = 2 * core_h; // sensor height of core + const int core_sw = 2 * core_w; + for(int dy = 0; dy < core_sh; dy++) + { + const int r = y0 + 2 * py_base + dy; // sensor row + const int my = 2 * O + dy; // model-output row + const size_t row_off = (size_t)my * tile_out_w; + for(int dx = 0; dx < core_sw; dx++) + { + const int c = x0 + 2 * px_base + dx; // sensor col + const int mx = 2 * O + dx; + + const int ch = FC(r, c, filters); // 0=R, 1=G, 2=B + const float model_val + = tile_out[(size_t)ch * tile_out_plane + row_off + mx]; + + // reverse WB + normalisation → raw ADC + const float raw_val + = _bayer_remosaic_raw(r, c, ch, model_val, &prep); + + // strength blend: α=1 → denoised, α=0 → source CFA + const size_t pidx = (size_t)r * width + c; + const float blended + = alpha * raw_val + inv_alpha * cfa_in[pidx]; + + const float clipped + = blended < 0.0f ? 0.0f + : (blended > clip_max ? clip_max : blended); + cfa_out[pidx] = (uint16_t)(clipped + 0.5f); + } + } + + tile_count++; + if(control_job) + dt_control_job_set_progress(control_job, + (double)tile_count / total_tiles); + } + } + + g_free(tile_in); + g_free(tile_out); + + if(res == 0) + { + // diagnostic: sample cfa_out to confirm values are in a sensible + // raw-ADC range matching BlackLevel/WhiteLevel the DNG advertises + const size_t npix_dbg = (size_t)width * height; + uint16_t omin = cfa_out[0], omax = cfa_out[0]; + uint64_t osum = 0; + const size_t step_ = (npix_dbg < 1000000) ? 1 : (npix_dbg / 1000000); + size_t n = 0; + for(size_t i = 0; i < npix_dbg; i += step_) + { + if(cfa_out[i] < omin) omin = cfa_out[i]; + if(cfa_out[i] > omax) omax = cfa_out[i]; + osum += cfa_out[i]; + n++; + } + dt_print(DT_DEBUG_AI, + "[restore_raw_bayer] cfa_out u16 range [%u, %u] mean=%.0f " + "(DNG will advertise black~%.0f white=%.0f)", + (unsigned)omin, (unsigned)omax, + n ? 
(double)osum / n : 0.0, + black[0], white); + + dt_restore_persist_tile_size(ctx); + } + + return res; +} + +// preview: single-tile bayer inference + re-mosaic onto a patched CFA, +// then run the user's pipe (via dt_restore_run_user_pipe_roi) twice — +// once on the original mbuf for "before", once on the patched copy for +// "after". the pipe runs at ROI = displayed crop so refreshes stay fast. +// the "after" display-referred output matches what the user would see +// after Process + DNG re-import +int dt_restore_raw_bayer_preview_piped(dt_restore_context_t *ctx, + const dt_image_t *img, + dt_imgid_t imgid, + const float *cfa_full, + int width, int height, + int crop_x, int crop_y, + int crop_w, int crop_h, + float **out_before_rgb, + float **out_denoised_rgb, + int *out_w, + int *out_h) +{ + if(!ctx || !img || !cfa_full || !out_before_rgb || !out_denoised_rgb) + return 1; + *out_before_rgb = NULL; + *out_denoised_rgb = NULL; + if(out_w) *out_w = 0; + if(out_h) *out_h = 0; + + if(width <= 0 || height <= 0 || crop_w <= 0 || crop_h <= 0) return 1; + + _bayer_prep_t prep; + if(_compute_bayer_prep(ctx, img, &prep)) return 1; + const uint32_t filters = prep.filters; + const float clip_max = prep.clip_max; + + const int T = dt_restore_get_tile_size(ctx); + if(T <= 0) return 1; + const int sensor_T = 2 * T; + const int max_disp = sensor_T - 4 * OVERLAP_PACKED; + if(crop_w > max_disp || crop_h > max_disp) return 1; + + // snap crop to CFA grid + crop_x = (crop_x / 2) * 2; + crop_y = (crop_y / 2) * 2; + crop_w = (crop_w / 2) * 2; + crop_h = (crop_h / 2) * 2; + if(crop_w <= 0 || crop_h <= 0) return 1; + + int inf_x = crop_x + crop_w / 2 - sensor_T / 2; + int inf_y = crop_y + crop_h / 2 - sensor_T / 2; + inf_x = (inf_x / 2) * 2; + inf_y = (inf_y / 2) * 2; + + // inference (single tile) + const size_t tile_in_plane = (size_t)T * T; + const size_t tile_out_w = 2 * (size_t)T; + const size_t tile_out_plane = tile_out_w * tile_out_w; + + float *tile_in = 
g_try_malloc(tile_in_plane * 4 * sizeof(float)); + float *tile_out = g_try_malloc(tile_out_plane * 3 * sizeof(float)); + if(!tile_in || !tile_out) + { + g_free(tile_in); + g_free(tile_out); + return 1; + } + + // geometry applies the same orientation + mirror policy as the batch + // path. sr0_base / sc0_base for the preview is the user-centred, + // even-snapped inference tile origin in sensor coords + int pp_sr0, pp_sc0, pp_mir_y_lo, pp_mir_y_hi, pp_mir_x_lo, pp_mir_x_hi; + _bayer_tile_geometry(ctx, &prep, inf_y, inf_x, width, height, + &pp_sr0, &pp_sc0, + &pp_mir_y_lo, &pp_mir_y_hi, + &pp_mir_x_lo, &pp_mir_x_hi); + _pack_bayer_tile(cfa_full, width, height, + pp_sr0, pp_sc0, + pp_mir_y_lo, pp_mir_y_hi, pp_mir_x_lo, pp_mir_x_hi, + T, &prep, tile_in); + + if(dt_restore_run_patch_bayer(ctx, tile_in, T, T, tile_out) != 0) + { + g_free(tile_in); + g_free(tile_out); + return 1; + } + + // gain-match: same scalar correction as the batch path (gated on + // output_scale; ABSOLUTE-scale models skip it) + if(ctx->output_scale == DT_RESTORE_OUT_MATCH_GAIN) + _bayer_gain_match(tile_in, tile_out, T, NULL, NULL, NULL); + g_free(tile_in); + + // fetch source sensor buffer in native dtype + dt_mipmap_buffer_t mbuf; + dt_mipmap_cache_get(&mbuf, imgid, DT_MIPMAP_FULL, + DT_MIPMAP_BLOCKING, 'r'); + if(!mbuf.buf || mbuf.width != width || mbuf.height != height) + { + dt_mipmap_cache_release(&mbuf); + g_free(tile_out); + return 1; + } + + const int is_uint16 = (img->buf_dsc.datatype == TYPE_UINT16); + const int is_float = (img->buf_dsc.datatype == TYPE_FLOAT); + if(!is_uint16 && !is_float) + { + dt_print(DT_DEBUG_AI, + "[restore_raw_bayer] preview_piped: unsupported raw datatype %d", + img->buf_dsc.datatype); + dt_mipmap_cache_release(&mbuf); + g_free(tile_out); + return 1; + } + const size_t pixel_sz = is_uint16 ? 
2 : 4; + const size_t total_bytes = (size_t)width * height * pixel_sz; + + // build denoised-patched CFA: copy original, overwrite the + // entire inference region (2T × 2T sensor pixels) with denoised data. + // patching beyond the display crop gives the pipe's geometry chain + // ~64 px of slop on each side — enough to absorb the few-pixel ROI + // drift that the inscribed-AABB trick alone can't eliminate (pipe + // sampling slightly outside the quad's interior due to floor/ceil + // rounding or modules whose distort_transform returns approximations) + void *patched = g_try_malloc(total_bytes); + if(!patched) + { + dt_mipmap_cache_release(&mbuf); + g_free(tile_out); + return 1; + } + memcpy(patched, mbuf.buf, total_bytes); + + // patch the full 2T × 2T inference region, clamped to the sensor. + // pp_sr0 / pp_sc0 is the oriented tile origin (inf_y/x + y0/x0 under + // FORCE_RGGB, or inf_y/x under NATIVE) — this is where the *output* + // tile's (0, 0) lives in sensor coords, so the patch rectangle and + // the tile_out index must use it consistently + const int patch_x0 = (pp_sc0 < 0) ? 0 : pp_sc0; + const int patch_y0 = (pp_sr0 < 0) ? 0 : pp_sr0; + const int patch_x1 = (pp_sc0 + sensor_T > width) ? width : pp_sc0 + sensor_T; + const int patch_y1 = (pp_sr0 + sensor_T > height) ? height : pp_sr0 + sensor_T; + + for(int sr = patch_y0; sr < patch_y1; sr++) + { + const size_t mo_row = (size_t)(sr - pp_sr0) * tile_out_w; + for(int sc = patch_x0; sc < patch_x1; sc++) + { + const int ch = FC(sr, sc, filters); + const float model_val + = tile_out[(size_t)ch * tile_out_plane + mo_row + (sc - pp_sc0)]; + const float raw_val + = _bayer_remosaic_raw(sr, sc, ch, model_val, &prep); + const float clipped = raw_val < 0.0f ? 0.0f + : (raw_val > clip_max ? 
clip_max : raw_val); + const size_t idx = (size_t)sr * width + sc; + if(is_uint16) + ((uint16_t *)patched)[idx] = (uint16_t)(clipped + 0.5f); + else + ((float *)patched)[idx] = clipped; + } + } + + g_free(tile_out); + + // run pipe on patched CFA → out_denoised_rgb + int dw = 0, dh = 0, bw = 0, bh = 0; + int err = dt_restore_run_user_pipe_roi(imgid, patched, width, height, + crop_x, crop_y, crop_w, crop_h, + &dw, &dh, out_denoised_rgb); + g_free(patched); + + // run pipe on original mbuf → out_before_rgb + // mbuf.buf is const from our perspective (read-only cache entry) but the + // pipe set_input API isn't marked const; cast to writable pointer with + // the understanding that the pipe doesn't mutate its input buffer + if(err == 0) + { + err = dt_restore_run_user_pipe_roi(imgid, (void *)mbuf.buf, width, height, + crop_x, crop_y, crop_w, crop_h, + &bw, &bh, out_before_rgb); + } + + dt_mipmap_cache_release(&mbuf); + + if(err || dw != bw || dh != bh) + { + // dims must match between the two passes so the caller can blend + // them; mismatch shouldn't happen (same pipe, same ROI) but guard + // anyway so we never hand back inconsistent buffers + if(dw != bw || dh != bh) + dt_print(DT_DEBUG_AI, + "[restore_raw_bayer] preview_piped: before/after dim " + "mismatch (%dx%d vs %dx%d) — aborting", + bw, bh, dw, dh); + g_free(*out_before_rgb); *out_before_rgb = NULL; + g_free(*out_denoised_rgb); *out_denoised_rgb = NULL; + return 1; + } + if(out_w) *out_w = dw; + if(out_h) *out_h = dh; + return 0; +} + +// clang-format off +// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py +// vim: shiftwidth=2 expandtab tabstop=2 cindent +// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified; +// clang-format on diff --git a/src/common/ai/restore_raw_bayer.h b/src/common/ai/restore_raw_bayer.h new file mode 100644 index 000000000000..bd25ff48ae57 --- /dev/null +++ 
b/src/common/ai/restore_raw_bayer.h
@@ -0,0 +1,134 @@
+/*
+ This file is part of darktable,
+ Copyright (C) 2026 darktable developers.
+
+ darktable is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ darktable is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with darktable. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+// raw_restore — RawNIND bayer-denoise pipeline
+//
+// wraps a loaded RawNIND bayer model and runs the whole raw->raw
+// denoise pipeline: preprocessing (black level, normalize, per-channel
+// WB, 2x2 pack), tiled inference with overlap blending, postprocessing
+// (un-WB, un-normalize), and re-mosaic back to the original CFA
+// pattern. produces a uint16 sensor-sized mosaic that is written to
+// DNG by dt_dng_write_cfa_bayer().
+//
+// this is kept separate from the RGB denoise/upscale path in restore.c
+// because:
+// - input is single-channel CFA, not RGB; no sRGB gamma, no wide-gamut
+// - preprocessing is raw-specific (per-channel black, WB normalization)
+// - output is re-mosaiced to a CFA, not scanlines of interleaved RGB
+// - tile dims are in packed half-res space, not sensor resolution
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common/ai/restore.h"
+#include "common/image.h" // for dt_imgid_t
+
+struct dt_image_t;
+struct _dt_job_t;
+
+// @brief Run the RawNIND bayer denoise pipeline end-to-end.
+// +// @param ctx loaded bayer context (dt_restore_load_rawdenoise_bayer) +// @param img source image (metadata for preprocessing + re-mosaic) +// @param cfa_in sensor CFA as float (full sensor resolution, row-major, +// unnormalized: values in raw ADC units, no black +// subtracted). This is what rawspeed delivers in +// DT_MIPMAP_FULL for raw images. +// @param width sensor width (img->width) +// @param height sensor height (img->height) +// @param cfa_out caller-allocated uint16 buffer of width*height samples. +// On success, contains the denoised mosaic in the same +// CFA layout and raw ADC range as the input. +// @param strength linear blend between original and denoised CFA in +// [0, 1]. 0 = pass-through the source CFA, 1 = full +// model output. Applied per sample at the end of the +// tile postprocess so tile boundaries stay seamless. +// @param control_job job handle for progress/cancellation (NULL-safe) +// @return 0 on success +int dt_restore_raw_bayer(dt_restore_context_t *ctx, + const struct dt_image_t *img, + const float *cfa_in, + int width, + int height, + uint16_t *cfa_out, + float strength, + struct _dt_job_t *control_job); + +// @brief Bayer preview through darktable's real pixelpipe — "preview = batch". +// +// Runs model inference on the displayed crop, re-mosaics the output back +// to CFA (same un-WB / un-normalise / clip logic as dt_restore_raw_bayer), +// patches it into a full-sensor copy of the source, then runs darktable's +// full pixelpipe TWICE — once on the patched (denoised) CFA for the +// "after" view, once on the original CFA for the "before" view. Both +// results go through the image's complete history stack (including +// temperature / filmic / output profile), so the displayed preview +// matches what the user will see after Process + re-import. +// +// The strength slider should blend out_before_rgb and out_denoised_rgb +// at display time; this entry always returns the "strength = 1" denoised +// result. 
+// +// Expensive: two full pipelined renders per refresh on top of the model +// inference. Typically 2–5 seconds depending on sensor size and iop stack +// complexity. Use dt_restore_raw_bayer_preview for cheaper (but +// colour-approximate) previews. +// +// @param ctx loaded bayer context +// @param img source image metadata +// @param imgid image id (used by the pixelpipe) +// @param cfa_full full-sensor CFA as float (cache in neural_restore) +// @param width sensor width +// @param height sensor height +// @param crop_x displayed crop top-left x (sensor coords, snapped mod 2) +// @param crop_y displayed crop top-left y (snapped mod 2) +// @param crop_w displayed crop width (snapped mod 2) +// @param crop_h displayed crop height (snapped mod 2) +// @param out_before_rgb caller-frees with g_free. 3ch interleaved +// (*out_w * *out_h * 3 floats), linear Rec.709, +// pipe output for the original CFA. +// @param out_denoised_rgb caller-frees with g_free. same shape, pipe +// output for the denoised-patched CFA at α=1. +// @param out_w receives actual rendered width (may differ +// from crop_w when user history contains +// geometry-modifying modules; both returned +// buffers share these dims). +// @param out_h receives actual rendered height. +// @return 0 on success; both outputs NULL on failure. 
+int dt_restore_raw_bayer_preview_piped(dt_restore_context_t *ctx,
+                                       const struct dt_image_t *img,
+                                       dt_imgid_t imgid,
+                                       const float *cfa_full,
+                                       int width,
+                                       int height,
+                                       int crop_x,
+                                       int crop_y,
+                                       int crop_w,
+                                       int crop_h,
+                                       float **out_before_rgb,
+                                       float **out_denoised_rgb,
+                                       int *out_w,
+                                       int *out_h);
+
+// clang-format off
+// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py
+// vim: shiftwidth=2 expandtab tabstop=2 cindent
+// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
+// clang-format on
diff --git a/src/common/ai/restore_raw_linear.c b/src/common/ai/restore_raw_linear.c
new file mode 100644
index 000000000000..23a63ea378ef
--- /dev/null
+++ b/src/common/ai/restore_raw_linear.c
@@ -0,0 +1,1032 @@
+/*
+ This file is part of darktable,
+ Copyright (C) 2026 darktable developers.
+
+ darktable is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ darktable is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with darktable. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#include "common/ai/restore_raw_linear.h"
+#include "common/ai/restore.h"
+#include "common/ai/restore_common.h"
+#include "common/darktable.h"
+#include "common/image.h"
+#include "common/image_cache.h"
+#include "common/iop_order.h"
+#include "common/math.h"
+#include "common/matrices.h"
+#include "common/mipmap_cache.h"
+#include "control/jobs.h"
+#include "develop/develop.h"
+#include "develop/imageop.h"
+#include "develop/imageop_math.h"
+#include "develop/pixelpipe_hb.h"
+#include "imageio/imageio_common.h"
+
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define OVERLAP_LINEAR 32 // sensor pixels; same scale as input
+
+// derive daylight WB multipliers from the camera's XYZ->CAM matrix:
+// at D65 white, the camera response per channel is
+// resp[c] = sum_i M[c][i] * D65[i]
+// and wb_norm[c] = resp[G] / resp[c] normalizes green to 1.
+// returns TRUE when a usable matrix is available
+static gboolean _daylight_wb(const dt_image_t *img, float wb_norm[3])
+{
+  const float D65[3] = { 0.9504f, 1.0f, 1.0889f };
+  float resp[3];
+  float mag = 0.0f;
+  for(int c = 0; c < 3; c++)
+  {
+    resp[c] = img->adobe_XYZ_to_CAM[c][0] * D65[0]
+              + img->adobe_XYZ_to_CAM[c][1] * D65[1]
+              + img->adobe_XYZ_to_CAM[c][2] * D65[2];
+    mag += fabsf(img->adobe_XYZ_to_CAM[c][0])
+           + fabsf(img->adobe_XYZ_to_CAM[c][1])
+           + fabsf(img->adobe_XYZ_to_CAM[c][2]);
+  }
+  if(mag <= 0.0f || resp[0] <= 0.0f || resp[1] <= 0.0f || resp[2] <= 0.0f)
+  {
+    wb_norm[0] = wb_norm[1] = wb_norm[2] = 1.0f;
+    return FALSE;
+  }
+  wb_norm[0] = resp[1] / resp[0];
+  wb_norm[1] = 1.0f;
+  wb_norm[2] = resp[1] / resp[2];
+  return TRUE;
+}
+
+// build the combined "input-space → camRGB + undo exposure boost + undo
+// WB" 3×3 used in the final un-matrix pass. folds three linear ops into
+// one per-pixel multiplication for speed.
caller provides input_to_cam +// (built by _build_cam_matrices for the ctx's input_colorspace), +// inv_boost (= 1 / exposure_boost) and wb_norm +static void _linear_build_M_boosted(const float input_to_cam[9], + float inv_boost, + const float wb_norm[3], + float M[9]) +{ + for(int k = 0; k < 3; k++) + for(int i = 0; i < 3; i++) + M[k * 3 + i] = input_to_cam[k * 3 + i] * inv_boost / wb_norm[k]; +} + +// per-channel scalar match_gain: tile_out[c] *= in_mean[c]/out_mean[c]. +// RawNIND linear output is arbitrary-scale camRGB-in-lin_rec2020 and +// match_gain() is the canonical post-step that puts it back on the +// input's scale. applied in place. out_gain[3] optional (batch uses it +// for a tile0 diagnostic) +static void _linear_gain_match_3ch(const float *tile_in, + float *tile_out, + size_t per_ch, + float out_gain[3]) +{ + for(int k = 0; k < 3; k++) + { + const float *pi = tile_in + (size_t)k * per_ch; + float *po = tile_out + (size_t)k * per_ch; + double in_sum = 0.0, out_sum = 0.0; + for(size_t i = 0; i < per_ch; i++) + { + in_sum += pi[i]; + out_sum += po[i]; + } + const double im = in_sum / (double)per_ch; + const double om = out_sum / (double)per_ch; + const float g = (fabs(om) > 1e-8) ? (float)(im / om) : 1.0f; + if(g != 1.0f) + for(size_t i = 0; i < per_ch; i++) po[i] *= g; + if(out_gain) out_gain[k] = g; + } +} + +// derive + apply an exposure boost to a planar 3ch lin_rec2020 buffer. +// RawNIND training data was exposed at editorial brightness (mean ~0.3 +// in lin_rec2020); low-light raws land near ~0.02, which is >10× darker +// than the training distribution. the UtNet2 weights diverge on such +// OOD input (observed: model output range ±1e10 with negative mean, +// breaking match_gain). we boost to the training mean pre-inference +// and un-boost at the very end; the multiplication commutes with the +// linear un-matrix and un-WB steps so correctness holds. 
+// target_mean = NAN disables the boost entirely for models that don't +// need a brightness-normalized input. otherwise boost is capped at +// [1, 100] (never dim bright scenes). returned mean / boost are filled +// for optional diagnostics (boost=1 when disabled) +static void _linear_exposure_boost(const dt_restore_context_t *ctx, + float *rgb_planar, + size_t plane, + float *out_mean, + float *out_boost) +{ + const size_t total = plane * 3; + double sum = 0.0; + for(size_t i = 0; i < total; i++) sum += rgb_planar[i]; + const float scene_mean = (float)(sum / (double)total); + const float target = ctx ? ctx->target_mean : 0.30f; + float boost = 1.0f; + if(!isnan(target) && target > 0.0f && scene_mean > 1e-4f) + { + boost = target / scene_mean; + if(boost < 1.0f) boost = 1.0f; + if(boost > 100.0f) boost = 100.0f; + } + if(boost != 1.0f) + for(size_t i = 0; i < total; i++) rgb_planar[i] *= boost; + if(out_mean) *out_mean = scene_mean; + if(out_boost) *out_boost = boost; +} + +// as-shot WB from img->wb_coeffs normalized to G=1 +static gboolean _as_shot_wb(const dt_image_t *img, float wb_norm[3]) +{ + if(img->wb_coeffs[0] <= 0.0f + || img->wb_coeffs[1] <= 0.0f + || img->wb_coeffs[2] <= 0.0f) + return FALSE; + const float g = img->wb_coeffs[1]; + wb_norm[0] = img->wb_coeffs[0] / g; + wb_norm[1] = 1.0f; + wb_norm[2] = img->wb_coeffs[2] / g; + return TRUE; +} + +// resolve WB for the linear path keyed off ctx->wb_mode. Default for +// this path is AS_SHOT (as-shot beats daylight for re-imported DNGs +// because the denoised output's tonal character then matches the +// source — see the long rationale in dt_restore_raw_linear). Fallback +// order swaps per mode; NONE skips normalization entirely +static void _resolve_linear_wb(const dt_restore_context_t *ctx, + const dt_image_t *img, float wb_norm[3]) +{ + wb_norm[0] = wb_norm[1] = wb_norm[2] = 1.0f; + const dt_restore_wb_mode_t mode + = ctx ? 
ctx->wb_mode : DT_RESTORE_WB_AS_SHOT; + if(mode == DT_RESTORE_WB_AS_SHOT) + { + if(!_as_shot_wb(img, wb_norm)) + _daylight_wb(img, wb_norm); + } + else if(mode == DT_RESTORE_WB_DAYLIGHT) + { + if(!_daylight_wb(img, wb_norm)) + _as_shot_wb(img, wb_norm); + } + // DT_RESTORE_WB_NONE: leave at {1, 1, 1} +} + +// D65 XYZ -> linear Rec.2020 (ITU-R BT.2020), row-major 3x3. +// matches the lin_rec2020 color profile the RawNIND linear variant +// was trained on +static const float _xyz_to_rec2020[9] = { + 1.7166511880f, -0.3556707838f, -0.2533662814f, + -0.6666843518f, 1.6164812366f, 0.0157685458f, + 0.0176398574f, -0.0427706133f, 0.9421031212f, +}; + +static const float _rec2020_to_xyz[9] = { + 0.6369580483f, 0.1446169036f, 0.1688809752f, + 0.2627002120f, 0.6779980715f, 0.0593017165f, + 0.0000000000f, 0.0280726930f, 1.0609850577f, +}; + +// D65 XYZ -> linear sRGB / Rec.709 (IEC 61966-2-1), row-major 3x3. +// used when a variant declares input_colorspace: srgb_linear +static const float _xyz_to_srgb[9] = { + 3.2404542f, -1.5371385f, -0.4985314f, + -0.9692660f, 1.8760108f, 0.0415560f, + 0.0556434f, -0.2040259f, 1.0572252f, +}; + +static const float _srgb_to_xyz[9] = { + 0.4124564f, 0.3575761f, 0.1804375f, + 0.2126729f, 0.7151522f, 0.0721750f, + 0.0193339f, 0.1191920f, 0.9503041f, +}; + +// build the per-image camRGB<->input-space matrices, where input-space +// is chosen by ctx->input_colorspace: +// LIN_REC2020 (default): xyz_to_rec2020 · inverse(adobe_XYZ_to_CAM) +// SRGB_LINEAR: xyz_to_srgb · inverse(adobe_XYZ_to_CAM) +// CAMRGB: identity (model runs directly on camRGB) +// returns TRUE when the input-space transform could be built; FALSE +// when the camera's color matrix is absent or singular (CAMRGB always +// succeeds since it skips the matrix entirely). 
on FALSE the caller +// falls back to identity (color cast but at least no garbage) +static gboolean _build_cam_matrices(const dt_restore_context_t *ctx, + const dt_image_t *img, + float cam_to_input[9], + float input_to_cam[9]) +{ + const dt_restore_colorspace_t cs + = ctx ? ctx->input_colorspace : DT_RESTORE_CS_LIN_REC2020; + + if(cs == DT_RESTORE_CS_CAMRGB) + { + for(int i = 0; i < 9; i++) + cam_to_input[i] = input_to_cam[i] = (i % 4 == 0) ? 1.0f : 0.0f; + return TRUE; + } + + float cam_from_xyz[9]; + float mag = 0.0f; + for(int k = 0; k < 3; k++) + for(int i = 0; i < 3; i++) + { + const float v = img->adobe_XYZ_to_CAM[k][i]; + cam_from_xyz[k * 3 + i] = v; + mag += fabsf(v); + } + if(mag <= 0.0f) return FALSE; + + float xyz_from_cam[9]; + if(mat3inv(xyz_from_cam, cam_from_xyz) != 0) + return FALSE; + + const float *xyz_to_input = (cs == DT_RESTORE_CS_SRGB_LINEAR) + ? _xyz_to_srgb : _xyz_to_rec2020; + const float *input_to_xyz = (cs == DT_RESTORE_CS_SRGB_LINEAR) + ? _srgb_to_xyz : _rec2020_to_xyz; + + mat3mul(cam_to_input, xyz_to_input, xyz_from_cam); + mat3mul(input_to_cam, cam_from_xyz, input_to_xyz); + return TRUE; +} + +// run the minimal darktable pixelpipe: rawprepare + highlights + +// demosaic, nothing after, no temperature (so output is raw-native +// camRGB without WB applied). output is a newly-allocated 4ch float +// RGBA buffer at the pipeline's processed_{width,height}; caller frees +// with dt_free_align(). 
returns 0 on success +static int _run_demosaic_pipe(const dt_imgid_t imgid, + float **out_buf, + int *out_w, + int *out_h) +{ + dt_develop_t dev; + dt_dev_init(&dev, FALSE); + dt_dev_load_image(&dev, imgid); + + dt_mipmap_buffer_t mbuf; + dt_mipmap_cache_get(&mbuf, imgid, DT_MIPMAP_FULL, + DT_MIPMAP_BLOCKING, 'r'); + if(!mbuf.buf || !mbuf.width || !mbuf.height) + { + dt_print(DT_DEBUG_AI, + "[restore_raw_linear] could not load raw for imgid %d", + imgid); + dt_mipmap_cache_release(&mbuf); + dt_dev_cleanup(&dev); + return 1; + } + + const int iw = mbuf.width; + const int ih = mbuf.height; + + dt_dev_pixelpipe_t pipe; + if(!dt_dev_pixelpipe_init_export(&pipe, iw, ih, + IMAGEIO_FLOAT, FALSE)) + { + dt_print(DT_DEBUG_AI, + "[restore_raw_linear] pipe init_export failed (%dx%d)", iw, ih); + dt_mipmap_cache_release(&mbuf); + dt_dev_cleanup(&dev); + return 1; + } + + // the export code sequences this as: resync_modules_order -> set_input + // -> create_nodes -> synch_all. resync builds the iop-order table + // from the loaded image's history; without it, create_nodes sees an + // empty/misaligned list and leaves pipe->nodes NULL, which then + // crashes dt_dev_pixelpipe_disable_after when it dereferences + // g_list_last(pipe->nodes) + dt_ioppr_resync_modules_order(&dev); + dt_dev_pixelpipe_set_input(&pipe, &dev, (float *)mbuf.buf, + iw, ih, mbuf.iscale); + dt_dev_pixelpipe_create_nodes(&pipe, &dev); + dt_dev_pixelpipe_synch_all(&pipe, &dev); + + if(!pipe.nodes) + { + dt_print(DT_DEBUG_AI, + "[restore_raw_linear] pipe has no nodes — aborting"); + dt_dev_pixelpipe_cleanup(&pipe); + dt_mipmap_cache_release(&mbuf); + dt_dev_cleanup(&dev); + return 1; + } + + // keep rawprepare + highlights (clip) + demosaic; skip temperature + // (we apply our own daylight WB later) and everything after demosaic + dt_dev_pixelpipe_disable_after(&pipe, "demosaic"); + for(GList *n = pipe.nodes; n; n = g_list_next(n)) + { + dt_dev_pixelpipe_iop_t *piece = n->data; + 
if(dt_iop_module_is(piece->module->so, "temperature") + || dt_iop_module_is(piece->module->so, "rawdenoise")) + piece->enabled = FALSE; + } + + dt_dev_pixelpipe_get_dimensions(&pipe, &dev, iw, ih, + &pipe.processed_width, + &pipe.processed_height); + const int pw = pipe.processed_width; + const int ph = pipe.processed_height; + + // process CPU-side at full scale. no_gamma keeps float output + dt_dev_pixelpipe_process_no_gamma(&pipe, &dev, 0, 0, pw, ph, 1.0f); + + if(!pipe.backbuf || !pipe.backbuf_width || !pipe.backbuf_height) + { + dt_dev_pixelpipe_cleanup(&pipe); + dt_mipmap_cache_release(&mbuf); + dt_dev_cleanup(&dev); + dt_print(DT_DEBUG_AI, + "[restore_raw_linear] pipe produced no backbuffer"); + return 1; + } + + const int bw = pipe.backbuf_width; + const int bh = pipe.backbuf_height; + float *copy = dt_alloc_align_float((size_t)bw * bh * 4); + if(!copy) + { + dt_dev_pixelpipe_cleanup(&pipe); + dt_mipmap_cache_release(&mbuf); + dt_dev_cleanup(&dev); + return 1; + } + + memcpy(copy, pipe.backbuf, (size_t)bw * bh * 4 * sizeof(float)); + + *out_buf = copy; + *out_w = bw; + *out_h = bh; + + dt_dev_pixelpipe_cleanup(&pipe); + dt_mipmap_cache_release(&mbuf); + dt_dev_cleanup(&dev); + return 0; +} + +int dt_restore_raw_linear(dt_restore_context_t *ctx, + const dt_imgid_t imgid, + float **out_rgb, + int *out_w, + int *out_h, + float strength, + struct _dt_job_t *control_job) +{ + if(!ctx || !out_rgb || !out_w || !out_h) return 1; + *out_rgb = NULL; + + const float alpha = strength < 0.0f ? 0.0f + : (strength > 1.0f ? 1.0f : strength); + const float inv_alpha = 1.0f - alpha; + + // --- 1. 
produce demosaicked 4ch RGBA via minimal pipeline --- + float *rgba = NULL; + int w = 0, h = 0; + if(_run_demosaic_pipe(imgid, &rgba, &w, &h)) return 1; + + // snapshot image metadata for WB derivation (plain data members; + // don't touch heap pointers like profile/dng_gain_maps) + const dt_image_t *cached = dt_image_cache_get(imgid, 'r'); + if(!cached) + { + dt_free_align(rgba); + return 1; + } + dt_image_t img_meta = *cached; + dt_image_cache_read_release(cached); + + // WB normalization per ctx->wb_mode (default AS_SHOT; see + // _resolve_linear_wb). AS_SHOT beats DAYLIGHT for this path because + // match_gain + the negative-gain hack absorb the training-distribution + // mismatch, so the WB choice mostly shapes the final DNG's tonal look + // and we want the re-imported DNG to render with the same tone/contrast + // as the source + float wb_norm[3]; + _resolve_linear_wb(ctx, &img_meta, wb_norm); + + // feed the model in ctx->input_colorspace (default lin_rec2020, + // matches RawNIND training preprocessing). identity fallback when + // the camera's color matrix is absent (rare); CAMRGB always succeeds + float cam_to_input[9]; + float input_to_cam[9]; + const gboolean matrix_ok = + _build_cam_matrices(ctx, &img_meta, cam_to_input, input_to_cam); + if(!matrix_ok) + { + for(int i = 0; i < 9; i++) + cam_to_input[i] = input_to_cam[i] = (i % 4 == 0) ? 1.0f : 0.0f; + } + dt_print(DT_DEBUG_AI, + "[restore_raw_linear] wb_norm=[%.3f,%.3f,%.3f], " + "colorspace matrix: %s", + wb_norm[0], wb_norm[1], wb_norm[2], + matrix_ok ? "cam->input from adobe_XYZ_to_CAM" + : "identity (no color matrix)"); + + const size_t npix = (size_t)w * h; + + // allocate planar 3ch buffers for tile I/O + the preserved + // pre-inference source for the strength blend + float *rgb_src = dt_alloc_align_float(npix * 3); // planar R,G,B + if(!rgb_src) + { + dt_free_align(rgba); + return 1; + } + + // interleaved RGBA -> planar RGB. 
apply daylight WB first (matches + // RawNIND training: WB in camRGB space, then camRGB->lin_rec2020), + // then the matrix transform so the model sees lin_rec2020 directly + const size_t plane = npix; + for(size_t i = 0; i < npix; i++) + { + const float cam[3] = { + rgba[i * 4 + 0] * wb_norm[0], + rgba[i * 4 + 1] * wb_norm[1], + rgba[i * 4 + 2] * wb_norm[2], + }; + float input_rgb[3]; + mat3mulv(input_rgb, cam_to_input, cam); + rgb_src[i] = input_rgb[0]; + rgb_src[i + plane] = input_rgb[1]; + rgb_src[i + 2 * plane] = input_rgb[2]; + } + + // diagnostic min/max sweep + exposure boost. sweep is separate from + // the shared boost helper because only the batch diagnostic needs the + // per-channel min/max; the helper just computes the mean + float dbg_min[3], dbg_max[3]; + for(int k = 0; k < 3; k++) + { + const float *p = rgb_src + (size_t)k * plane; + dbg_min[k] = dbg_max[k] = p[0]; + for(size_t i = 0; i < plane; i++) + { + if(p[i] < dbg_min[k]) dbg_min[k] = p[i]; + if(p[i] > dbg_max[k]) dbg_max[k] = p[i]; + } + } + float scene_mean = 0.0f, exposure_boost = 1.0f; + _linear_exposure_boost(ctx, rgb_src, plane, &scene_mean, &exposure_boost); + dt_print(DT_DEBUG_AI, + "[restore_raw_linear] %dx%d, lin_rec2020 input range " + "R=[%.3f,%.3f] G=[%.3f,%.3f] B=[%.3f,%.3f] " + "mean=%.4f boost=%.2fx", + w, h, + dbg_min[0], dbg_max[0], dbg_min[1], dbg_max[1], + dbg_min[2], dbg_max[2], + scene_mean, exposure_boost); + + // allocate planar output buffer that tiles blend into + float *rgb_out = dt_alloc_align_float(npix * 3); + if(!rgb_out) + { + dt_free_align(rgb_src); + dt_free_align(rgba); + return 1; + } + + // initialize output with WB'd source so strength = 0 is exact + // pass-through and tile-edge gaps don't leave uninitialized data + memcpy(rgb_out, rgb_src, npix * 3 * sizeof(float)); + + // tile setup + const int O = OVERLAP_LINEAR; + int T = dt_restore_get_tile_size(ctx); + int n_ladder = 0; + const int *ladder = dt_restore_get_tile_ladder(ctx, &n_ladder); + if(T <= 2 * 
O) T = 256; + +retry:; + const int step = T - 2 * O; + if(step <= 0) + { + dt_free_align(rgb_src); + dt_free_align(rgb_out); + dt_free_align(rgba); + return 1; + } + const size_t tile_plane = (size_t)T * T; + const int cols = (w + step - 1) / step; + const int rows = (h + step - 1) / step; + const int total_tiles = cols * rows; + + dt_print(DT_DEBUG_AI, + "[restore_raw_linear] tile T=%d step=%d, grid %dx%d (%d tiles)", + T, step, cols, rows, total_tiles); + + float *tile_in = g_try_malloc(tile_plane * 3 * sizeof(float)); + float *tile_out = g_try_malloc(tile_plane * 3 * sizeof(float)); + if(!tile_in || !tile_out) + { + g_free(tile_in); + g_free(tile_out); + dt_free_align(rgb_src); + dt_free_align(rgb_out); + dt_free_align(rgba); + return 1; + } + + int res = 0; + int tile_count = 0; + + for(int ty = 0; ty < rows && res == 0; ty++) + { + for(int tx = 0; tx < cols && res == 0; tx++) + { + if(control_job + && dt_control_job_get_state(control_job) + == DT_JOB_STATE_CANCELLED) + { + res = 1; + break; + } + + const int y_base = ty * step; + const int x_base = tx * step; + const int y_end = (y_base + step > h) ? h : y_base + step; + const int x_end = (x_base + step > w) ? 
w : x_base + step; + const int core_h = y_end - y_base; + const int core_w = x_end - x_base; + + // extract T x T tile with mirror-pad at boundaries, planar + for(int dy = 0; dy < T; dy++) + { + const int sy = _mirror(y_base - O + dy, h); + for(int dx = 0; dx < T; dx++) + { + const int sx = _mirror(x_base - O + dx, w); + const size_t src = (size_t)sy * w + sx; + const size_t dst = (size_t)dy * T + dx; + tile_in[dst] = rgb_src[src]; + tile_in[dst + tile_plane] = rgb_src[src + plane]; + tile_in[dst + 2 * tile_plane] = rgb_src[src + 2 * plane]; + } + } + + // inference + if(dt_restore_run_patch_3ch_raw(ctx, tile_in, T, T, tile_out) != 0) + { + int next_T = 0; + for(int i = 0; i < n_ladder; i++) + if(ladder[i] < T) { next_T = ladder[i]; break; } + if(next_T > 0 && ty == 0 && tx == 0 + && dt_restore_reload_session(ctx, next_T)) + { + dt_print(DT_DEBUG_AI, + "[restore_raw_linear] inference failed at T=%d, retry T=%d", + T, next_T); + g_free(tile_in); + g_free(tile_out); + T = next_T; + goto retry; + } + dt_print(DT_DEBUG_AI, + "[restore_raw_linear] inference failed at tile %d,%d " + "(T=%d)", tx, ty, T); + res = 1; + break; + } + + // scalar match_gain per channel: tile_out *= in_mean / out_mean + // (applied in place by the helper). skipped for ABSOLUTE-scale + // models whose output is already calibrated + const size_t per_ch = tile_plane; + float gain_ch[3] = { 1.0f, 1.0f, 1.0f }; + if(ctx->output_scale == DT_RESTORE_OUT_MATCH_GAIN) + _linear_gain_match_3ch(tile_in, tile_out, per_ch, gain_ch); + if(tx == 0 && ty == 0) + { + dt_print(DT_DEBUG_AI, + "[restore_raw_linear] tile0 match_gain " + "R=%.3e G=%.3e B=%.3e", + gain_ch[0], gain_ch[1], gain_ch[2]); + } + + // blend: write (α·denoised + (1-α)·source) per channel into + // the core-valid region. 
rgb_out was pre-filled with rgb_src + // so overlap gaps stay as source + for(int dy = 0; dy < core_h; dy++) + { + const int y = y_base + dy; + const int my = O + dy; + for(int dx = 0; dx < core_w; dx++) + { + const int x = x_base + dx; + const int mx = O + dx; + const size_t tloc = (size_t)my * T + mx; + const size_t dst = (size_t)y * w + x; + + for(int k = 0; k < 3; k++) + { + const float model_v + = tile_out[tloc + (size_t)k * per_ch]; + const float src_v = rgb_src[dst + (size_t)k * plane]; + rgb_out[dst + (size_t)k * plane] + = alpha * model_v + inv_alpha * src_v; + } + } + } + + tile_count++; + if(control_job) + dt_control_job_set_progress(control_job, + (double)tile_count / total_tiles); + } + } + + g_free(tile_in); + g_free(tile_out); + + if(res == 0) + { + // final undo pass: input-space -> camRGB (matrix), divide by + // exposure boost, divide by WB. the DNG writer expects un-WB'd + // normalized camRGB in [0, 1] — AsShotNeutral tells the consumer + // what WB to apply + // out = (input_to_cam · in) / (boost · wb_norm[k]) + // all ops are linear, fold into a single per-pixel 3x3 mul + const float inv_boost = 1.0f / exposure_boost; + float M[9]; + _linear_build_M_boosted(input_to_cam, inv_boost, wb_norm, M); + + for(size_t i = 0; i < npix; i++) + { + const float input_rgb[3] = { + rgb_out[i], + rgb_out[i + plane], + rgb_out[i + 2 * plane], + }; + float cam[3]; + mat3mulv(cam, M, input_rgb); + rgb_out[i] = cam[0]; + rgb_out[i + plane] = cam[1]; + rgb_out[i + 2 * plane] = cam[2]; + } + + dt_restore_persist_tile_size(ctx); + } + + dt_free_align(rgb_src); + dt_free_align(rgba); + + if(res != 0) + { + dt_free_align(rgb_out); + return res; + } + + // convert planar RGB back to interleaved for caller convenience + float *interleaved = dt_alloc_align_float(npix * 3); + if(!interleaved) + { + dt_free_align(rgb_out); + return 1; + } + for(size_t i = 0; i < npix; i++) + { + interleaved[i * 3 + 0] = rgb_out[i]; + interleaved[i * 3 + 1] = rgb_out[i + plane]; + 
interleaved[i * 3 + 2] = rgb_out[i + 2 * plane]; + } + dt_free_align(rgb_out); + + *out_rgb = interleaved; + *out_w = w; + *out_h = h; + return 0; +} + +// preview prep: demosaic-once per image +// +// dt_restore_raw_linear_prepare runs the full per-image demosaic + +// WB + camRGB->lin_rec2020 once and returns a 3ch interleaved buffer at +// sensor resolution; neural_restore.c caches it across previews of the +// same image +int dt_restore_raw_linear_prepare(const dt_imgid_t imgid, + float **out_rgb, + int *out_w, + int *out_h) +{ + if(!out_rgb || !out_w || !out_h) return 1; + *out_rgb = NULL; + + // 1. demosaic via minimal darktable pipe (rawprepare + highlights + + // demosaic; no temperature, no post-demosaic modules) + float *rgba = NULL; + int w = 0, h = 0; + if(_run_demosaic_pipe(imgid, &rgba, &w, &h)) return 1; + + // 2. snapshot image metadata for WB + matrix derivation + const dt_image_t *cached = dt_image_cache_get(imgid, 'r'); + if(!cached) { dt_free_align(rgba); return 1; } + dt_image_t img_meta = *cached; + dt_image_cache_read_release(cached); + + // this prepare path has no ctx; use the default (AS_SHOT) WB. the + // cached lin_rec2020 buffer assumes this mode, so a future model + // that needs a different WB would require keying the cache on ctx + // too (or adding ctx to this API) + float wb_norm[3]; + _resolve_linear_wb(NULL, &img_meta, wb_norm); + + // 3. camRGB -> input-space matrix. this prepare path has no ctx + // so we use the default (LIN_REC2020); a second model expecting a + // different input space would need the cache keyed on it too + float cam_to_input[9]; + float input_to_cam[9]; + if(!_build_cam_matrices(NULL, &img_meta, cam_to_input, input_to_cam)) + { + for(int i = 0; i < 9; i++) + cam_to_input[i] = (i % 4 == 0) ? 1.0f : 0.0f; + } + + // 4. 
interleaved RGBA -> interleaved RGB in input-space + WB + const size_t npix = (size_t)w * h; + float *interleaved = dt_alloc_align_float(npix * 3); + if(!interleaved) { dt_free_align(rgba); return 1; } + + for(size_t i = 0; i < npix; i++) + { + const float cam[3] = { + rgba[i * 4 + 0] * wb_norm[0], + rgba[i * 4 + 1] * wb_norm[1], + rgba[i * 4 + 2] * wb_norm[2], + }; + float input_rgb[3]; + mat3mulv(input_rgb, cam_to_input, cam); + interleaved[i * 3 + 0] = input_rgb[0]; + interleaved[i * 3 + 1] = input_rgb[1]; + interleaved[i * 3 + 2] = input_rgb[2]; + } + dt_free_align(rgba); + + *out_rgb = interleaved; + *out_w = w; + *out_h = h; + return 0; +} + +// preview: single-tile X-Trans/linear inference, un-matrix + un-WB + +// un-boost back to raw-ADC, re-mosaic onto the X-Trans CFA, then run +// the user's pipe twice (via dt_restore_run_user_pipe_roi) on the +// patched vs. original CFA to produce display-referred before/after +// crops matching the darkroom render +int dt_restore_raw_linear_preview_piped(dt_restore_context_t *ctx, + const dt_image_t *img, + dt_imgid_t imgid, + const float *full_rgb, + int width, int height, + int crop_x, int crop_y, + int crop_w, int crop_h, + float **out_before_rgb, + float **out_denoised_rgb, + int *out_w, + int *out_h) +{ + if(!ctx || !img || !full_rgb || !out_before_rgb || !out_denoised_rgb) + return 1; + *out_before_rgb = NULL; + *out_denoised_rgb = NULL; + if(out_w) *out_w = 0; + if(out_h) *out_h = 0; + + if(width <= 0 || height <= 0 || crop_w <= 0 || crop_h <= 0) return 1; + + const int T = dt_restore_get_tile_size(ctx); + if(T <= 0) return 1; + const int max_disp = T - 2 * OVERLAP_LINEAR; + if(crop_w > max_disp || crop_h > max_disp) return 1; + + int inf_x = crop_x + crop_w / 2 - T / 2; + int inf_y = crop_y + crop_h / 2 - T / 2; + + // WB + matrix prep (same as dt_restore_raw_linear_prepare / + // dt_restore_raw_linear_preview — but we also need the REVERSE + // transforms to go back to camRGB raw for pipe input) + float 
wb_norm[3]; + _resolve_linear_wb(ctx, img, wb_norm); + + // NOTE: full_rgb comes from dt_restore_raw_linear_prepare, which + // always caches in LIN_REC2020. if ctx->input_colorspace is something + // else, the reverse-matrix below won't undo what _prepare did and + // output will be wrong. until the cache keys on colorspace, this + // branch is only correct for LIN_REC2020. we still thread ctx so + // the invocation shape is right for future work + float cam_to_input[9]; + float input_to_cam[9]; + if(!_build_cam_matrices(ctx, img, cam_to_input, input_to_cam)) + { + for(int i = 0; i < 9; i++) + cam_to_input[i] = input_to_cam[i] = (i % 4 == 0) ? 1.0f : 0.0f; + } + + // extract crop + overlap from cached full lin_rec2020 -> tile_in + // apply exposure boost (same as preview), run inference + const size_t tile_plane = (size_t)T * T; + float *tile_in = g_try_malloc(tile_plane * 3 * sizeof(float)); + float *tile_out = g_try_malloc(tile_plane * 3 * sizeof(float)); + if(!tile_in || !tile_out) + { + g_free(tile_in); + g_free(tile_out); + return 1; + } + + for(int dy = 0; dy < T; dy++) + { + const int sy = _mirror(inf_y + dy, height); + for(int dx = 0; dx < T; dx++) + { + const int sx = _mirror(inf_x + dx, width); + const size_t src = ((size_t)sy * width + sx) * 3; + const size_t dst = (size_t)dy * T + dx; + tile_in[dst] = full_rgb[src + 0]; + tile_in[dst + tile_plane] = full_rgb[src + 1]; + tile_in[dst + 2 * tile_plane] = full_rgb[src + 2]; + } + } + + float exposure_boost = 1.0f; + _linear_exposure_boost(ctx, tile_in, tile_plane, NULL, &exposure_boost); + + if(dt_restore_run_patch_3ch_raw(ctx, tile_in, T, T, tile_out) != 0) + { + g_free(tile_in); + g_free(tile_out); + return 1; + } + + if(ctx->output_scale == DT_RESTORE_OUT_MATCH_GAIN) + _linear_gain_match_3ch(tile_in, tile_out, tile_plane, NULL); + g_free(tile_in); + + // build matrix to reverse matrix + WB + boost + normalise + // tile_out came from a boosted tile_in (gain_match matches boosted + // magnitudes). 
to write it back to the native CFA, we reverse the + // whole prepare chain: + // input-space → (input_to_cam) → cam[k] = sum_i M[k][i] * in[i] + // → /wb_norm[k] → un-WB'd raw scale (normalised) + // → *range[?]+black[?] → raw ADC range + // → rounded uint16 CFA value + // folding WB undo and boost undo into one matrix applied per-pixel + const float inv_boost = 1.0f / exposure_boost; + float M_boosted[9]; + _linear_build_M_boosted(input_to_cam, inv_boost, wb_norm, M_boosted); + + // tile_in was already freed after the gain-match loop above + + // fetch native raw buffer + rawprepare params for un-normalise + // mbuf is at RAW sensor dims (e.g. 6336x4182), which are larger than + // the post-rawprepare dims the caller passed (e.g. 6240x4160). the + // rawprepare crop offset lives in img->crop_x / img->crop_y + dt_mipmap_buffer_t mbuf; + dt_mipmap_cache_get(&mbuf, imgid, DT_MIPMAP_FULL, + DT_MIPMAP_BLOCKING, 'r'); + if(!mbuf.buf || mbuf.width <= 0 || mbuf.height <= 0) + { + dt_mipmap_cache_release(&mbuf); + g_free(tile_out); + return 1; + } + const int raw_w = mbuf.width; + const int raw_h = mbuf.height; + const int raw_off_x = img->crop_x; + const int raw_off_y = img->crop_y; + const int is_uint16 = (img->buf_dsc.datatype == TYPE_UINT16); + const int is_float = (img->buf_dsc.datatype == TYPE_FLOAT); + if(!is_uint16 && !is_float) + { + dt_mipmap_cache_release(&mbuf); + g_free(tile_out); + return 1; + } + const size_t pixel_sz = is_uint16 ? 2 : 4; + const size_t total_bytes = (size_t)raw_w * raw_h * pixel_sz; + + // rawprepare's normalisation: pipe will do (value - sub) / div where + // sub is per-CFA-site black level and div is (white - black). 
to get + // back to raw ADC space we compute per-site (value * range[idx]) + black[idx] + // NOTE: raw_black_level_separate is indexed by CFA position k in 0..3 + // (even if X-Trans has 6 colours, darktable's per-sensel black is 4- + // entry; typical cameras use one value for all positions anyway) + float black[4], range[4], white; + _compute_cfa_black_range(img, black, range, &white); + + // build patched CFA: copy original, overwrite crop with re-mosaiced denoised + void *patched = g_try_malloc(total_bytes); + if(!patched) + { + dt_mipmap_cache_release(&mbuf); + g_free(tile_out); + return 1; + } + memcpy(patched, mbuf.buf, total_bytes); + + // patch the full T × T inference region (clamped to the post-rawprepare + // buffer extent) rather than just the display crop. this gives the + // pipe's geometry chain ~tile-size/2 pixels of slop on each side so + // any residual coordinate drift falls inside denoised data instead of + // showing original CFA at the preview edge + const int patch_x0 = (inf_x < 0) ? 0 : inf_x; + const int patch_y0 = (inf_y < 0) ? 0 : inf_y; + const int patch_x1 = (inf_x + T > width) ? width : inf_x + T; + const int patch_y1 = (inf_y + T > height) ? height : inf_y + T; + + for(int py = patch_y0; py < patch_y1; py++) + { + const int sr_raw = raw_off_y + py; + const size_t mo_row = (size_t)(py - inf_y) * T; + for(int px = patch_x0; px < patch_x1; px++) + { + const int sc_raw = raw_off_x + px; + const size_t mx = (size_t)(px - inf_x); + const float rec[3] = { + tile_out[0 * tile_plane + mo_row + mx], + tile_out[1 * tile_plane + mo_row + mx], + tile_out[2 * tile_plane + mo_row + mx], + }; + // rec → cam (un-matrix + un-WB + un-boost); clamp to [0, 1] + float cam[3]; + mat3mulv(cam, M_boosted, rec); + for(int c = 0; c < 3; c++) + { + if(cam[c] < 0.0f) cam[c] = 0.0f; + if(cam[c] > 1.0f) cam[c] = 1.0f; + } + // re-mosaic: pick the single colour that the X-Trans pattern + // wants at this sensor position, scaled back to raw ADC range. 
+ // FCxtrans uses raw-sensor parity (since xtrans[6][6] is aligned + // with the raw, not the post-crop buffer) + const int ch = FCxtrans(sr_raw, sc_raw, NULL, img->buf_dsc.xtrans); + const int bl_idx = ((sr_raw & 1) << 1) | (sc_raw & 1); + const float adc = cam[ch] * range[bl_idx] + black[bl_idx]; + const float clipped + = adc < 0.0f ? 0.0f : (adc > white ? white : adc); + const size_t idx = (size_t)sr_raw * raw_w + sc_raw; + if(is_uint16) + ((uint16_t *)patched)[idx] = (uint16_t)(clipped + 0.5f); + else + ((float *)patched)[idx] = clipped; + } + } + + g_free(tile_out); + + // run pipe twice on raw-sensor-sized buffers + // ROI is in sensor coords (matching the patched region we built + // above); dt_restore_run_user_pipe_roi forward-transforms it + // through the user's geometry chain before handing to the pipe + int dw = 0, dh = 0, bw = 0, bh = 0; + int err = dt_restore_run_user_pipe_roi(imgid, patched, raw_w, raw_h, + crop_x, crop_y, crop_w, crop_h, + &dw, &dh, out_denoised_rgb); + g_free(patched); + + if(err == 0) + { + err = dt_restore_run_user_pipe_roi(imgid, (void *)mbuf.buf, raw_w, raw_h, + crop_x, crop_y, crop_w, crop_h, + &bw, &bh, out_before_rgb); + } + dt_mipmap_cache_release(&mbuf); + + if(err || dw != bw || dh != bh) + { + if(dw != bw || dh != bh) + dt_print(DT_DEBUG_AI, + "[restore_raw_linear] preview_piped: before/after dim " + "mismatch (%dx%d vs %dx%d) — aborting", + bw, bh, dw, dh); + g_free(*out_before_rgb); *out_before_rgb = NULL; + g_free(*out_denoised_rgb); *out_denoised_rgb = NULL; + return 1; + } + if(out_w) *out_w = dw; + if(out_h) *out_h = dh; + return 0; +} + +// clang-format off +// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py +// vim: shiftwidth=2 expandtab tabstop=2 cindent +// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified; +// clang-format on diff --git a/src/common/ai/restore_raw_linear.h 
b/src/common/ai/restore_raw_linear.h new file mode 100644 index 000000000000..53f3eaeec980 --- /dev/null +++ b/src/common/ai/restore_raw_linear.h @@ -0,0 +1,161 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. + + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with darktable. If not, see . +*/ + +// raw_restore_linear — RawNIND linear-variant denoise pipeline +// +// for sensors the bayer variant can't handle (X-Trans in particular), +// we feed the linear variant of the RawNIND model, which expects a +// 3-channel demosaicked image in lin_rec2020 space at raw scale. +// +// input is produced by running a minimal darktable pipeline: +// rawprepare -> highlights -> demosaic +// while skipping temperature (so we apply our own daylight WB later) +// and every iop after demosaic. this reuses darktable's sensor-aware +// demosaic (AMaZE / VNG / Markesteijn / …) instead of rolling our own, +// which matters for X-Trans quality. +// +// output is a 3ch float RGB buffer at full sensor resolution, in the +// same camRGB + raw ADC range as the source. the neural_restore batch +// path re-mosaics nothing (this sensor type can't be round-tripped +// through a CFA DNG) and writes a LinearRaw DNG via dng_writer. + +#pragma once + +#include + +#include "common/ai/restore.h" +#include "common/darktable.h" + +struct _dt_job_t; +struct dt_image_t; + +// @brief Run the RawNIND linear denoise pipeline end-to-end. +// +// Internally: +// 1. 
builds a minimal darktable pixelpipe (rawprepare + highlights +// + demosaic, nothing after), disables temperature so no WB is +// baked in; +// 2. allocates the 3ch float demosaicked output at sensor res; +// 3. applies daylight WB + camRGB -> lin_rec2020 matrix; +// 4. tiles the image and calls dt_restore_run_patch_3ch_raw on each, +// gain-matching the output per tile; +// 5. inverts the matrix + WB; +// 6. strength-blends with the pre-inference demosaicked buffer. +// +// @param ctx loaded linear context +// (dt_restore_load_rawdenoise_linear) +// @param imgid image id (pipeline is built per image) +// @param out_rgb caller-allocated 3ch float buffer, +// 3 * sensor_w * sensor_h floats (interleaved RGB). +// on success contains the denoised image in camRGB +// raw-ADC units (same range as the source pre-demosaic +// pipeline would produce). +// @param out_w out: sensor width at which the buffer is filled +// @param out_h out: sensor height at which the buffer is filled +// @param strength 0..1 blend between the demosaicked source (0) and +// the denoised result (1) +// @param control_job job handle for progress/cancellation (NULL-safe) +// @return 0 on success; out_rgb left untouched on failure +int dt_restore_raw_linear(dt_restore_context_t *ctx, + const dt_imgid_t imgid, + float **out_rgb, + int *out_w, + int *out_h, + float strength, + struct _dt_job_t *control_job); + +// @brief Once-per-image demosaic + WB + camRGB->lin_rec2020 prep. +// +// Runs the same minimal pipeline as dt_restore_raw_linear (rawprepare + +// highlights + demosaic, no temperature, no post-demosaic modules) and +// returns a 3ch interleaved lin_rec2020 buffer at sensor resolution. +// +// Slow (full-image demosaic via darktable's pipeline). neural_restore +// caches the result across multiple preview refreshes of the same image. +// +// @param imgid image id +// @param out_rgb caller-frees with dt_free_align. 
3ch interleaved +// (sensor_w * sensor_h * 3 floats), in lin_rec2020 with +// as-shot WB applied. +// @param out_w out: sensor width +// @param out_h out: sensor height +// @return 0 on success +int dt_restore_raw_linear_prepare(const dt_imgid_t imgid, + float **out_rgb, + int *out_w, + int *out_h); + +// @brief Linear preview through darktable's real pixelpipe — "preview = +// batch" for X-Trans / non-Bayer sensors. +// +// Runs inference on the crop, un-matrix / un-WB / un-boost the denoised +// crop back to raw-ADC space, re-mosaics onto the X-Trans CFA grid at +// the original sensor positions, then runs darktable's full pixelpipe +// twice on the raw-sensor-sized CFA — once on the patched CFA for +// "after", once on the original for "before". The pipe runs natively +// (rawprepare + highlights + X-Trans demosaic + temperature + colorin +// + filmic + output profile), so the output matches what the user sees +// in darkroom. +// +// Expensive: two full pipelined renders per refresh plus a full-sensor +// un-matrix pass. First refresh on a new image also pays one demosaic +// via dt_restore_raw_linear_prepare. +// +// @param ctx loaded linear context +// @param img source image metadata (for WB / matrix derivation) +// @param imgid image id (used by the pixelpipe) +// @param full_rgb 3ch interleaved lin_rec2020 buffer covering the +// whole sensor (from dt_restore_raw_linear_prepare) +// @param width sensor width +// @param height sensor height +// @param crop_x displayed crop top-left x +// @param crop_y displayed crop top-left y +// @param crop_w displayed crop width (≤ tile_size - 2*OVERLAP_LINEAR) +// @param crop_h displayed crop height +// @param out_before_rgb caller-frees with g_free. 3ch interleaved +// (*out_w * *out_h * 3 floats), linear Rec.709, +// pipe output on the original camRGB raw. +// @param out_denoised_rgb caller-frees with g_free. same shape, pipe +// output on the denoised-patched camRGB raw +// at α = 1. 
+// @param out_w receives actual rendered width (may differ +// from crop_w when user history contains +// geometry-modifying modules; both returned +// buffers share these dims). +// @param out_h receives actual rendered height. +// @return 0 on success; both outputs NULL on failure. +int dt_restore_raw_linear_preview_piped(dt_restore_context_t *ctx, + const struct dt_image_t *img, + dt_imgid_t imgid, + const float *full_rgb, + int width, + int height, + int crop_x, + int crop_y, + int crop_w, + int crop_h, + float **out_before_rgb, + float **out_denoised_rgb, + int *out_w, + int *out_h); + +// clang-format off +// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py +// vim: shiftwidth=2 expandtab tabstop=2 cindent +// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified; +// clang-format on diff --git a/src/common/ai/restore_rgb.c b/src/common/ai/restore_rgb.c new file mode 100644 index 000000000000..640486208aaf --- /dev/null +++ b/src/common/ai/restore_rgb.c @@ -0,0 +1,832 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. + + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with darktable. If not, see . +*/ + +// restore_rgb — RGB-path glue for the AI denoise + upscale tasks. +// +// inputs here are linear-working-profile float4 RGBA (from darktable +// export). 
this file owns: +// - color management: working-profile → sRGB before inference and +// back after, with optional wide-gamut preservation mask +// - shadow boost: per-image luminance curve to protect deep shadows +// during sRGB round-trip (opt-in via model attribute) +// - wavelet (DWT) detail recovery: preserve high-frequency texture +// in the luminance residual after denoise +// - dt_restore_process_tiled driver that ties together tiling, +// gamut masking, shadow boost and the per-patch inference call +// (dt_restore_run_patch). the low-level inference helpers live in +// restore.c; this file composes them for RGB. +// +// the raw denoise variants (Bayer / X-Trans) do their own pre/post- +// processing (per-CFA-site black / WB / re-mosaic) and live in +// restore_raw_bayer.c / restore_raw_linear.c. they share the generic +// pipeline-bridge dt_restore_run_user_pipe_roi() in restore.c + +#include "common/ai/restore_rgb.h" +#include "common/ai/restore_common.h" +#include "ai/backend.h" +#include "common/darktable.h" +#include "common/colorspaces.h" +#include "common/colorspaces_inline_conversions.h" +#include "common/imagebuf.h" +#include "common/math.h" +#include "common/matrices.h" +#include "control/conf.h" +#include "control/jobs.h" + +#include +#include +#include + +// forward-declare to avoid pulling in dwt.h (which includes OpenCL +// types when HAVE_OPENCL is defined — and the AI shared library +// is built without OpenCL) +extern void dwt_denoise(float *buf, int width, int height, + int bands, const float *noise); + +#define MAX_MODEL_INPUTS 4 + +// default multipliers of residual sigma for each wavelet band. +// band 0 (finest) gets the strongest suppression since fine-scale +// features are hardest to distinguish from noise. coarser bands +// preserve more because they capture real texture. 
+// tunable via darktablerc: plugins/lighttable/neural_restore/detail_recovery_bands +static const float _dwt_sigma_mul_default[DWT_DETAIL_BANDS] = { + 0.25f, // band 0 (finest) — suppress fine luminance noise + 0.15f, // band 1 + 0.05f, // band 2 + 0.02f, // band 3 + 0.01f // band 4 (coarsest) — keep almost everything +}; + +// sRGB transfer function (gamma curve only, no primaries change). +// values > 1.0 are allowed to preserve wide-gamut colors +static inline float _linear_to_srgb(const float v) +{ + if(v <= 0.0f) return 0.0f; + return (v <= 0.0031308f) + ? 12.92f * v + : 1.055f * powf(v, 1.0f / 2.4f) - 0.055f; +} + +static inline float _srgb_to_linear(const float v) +{ + if(v <= 0.0f) return 0.0f; + return (v <= 0.04045f) + ? v / 12.92f + : powf((v + 0.055f) / 1.055f, 2.4f); +} + +// Rec.709 / sRGB luminance weights (Y row of sRGB->XYZ D65); +// applied to working-profile-linear pixels in the pass-through +// blending below; exact only when the working profile is +// sRGB/Rec.709, but correct enough for luminance deltas +static inline float _luma_rec709(float r, float g, float b) +{ + return 0.2126f * r + 0.7152f * g + 0.0722f * b; +} + +// compute adaptive noise thresholds from residual standard deviation +static void _compute_adaptive_noise(const float *const restrict buf, + const size_t npix, + float noise[DWT_DETAIL_BANDS]) +{ + // read band multipliers from config (comma-separated list). + // e.g. 
"0.5,0.3,0.1,0.05,0.02" in darktablerc + float sigma_mul[DWT_DETAIL_BANDS]; + memcpy(sigma_mul, _dwt_sigma_mul_default, sizeof(sigma_mul)); + gchar *val = dt_conf_get_string("plugins/lighttable/neural_restore/detail_recovery_bands"); + if(val && val[0]) + { + gchar **parts = g_strsplit(val, ",", DWT_DETAIL_BANDS); + for(int b = 0; parts[b] && b < DWT_DETAIL_BANDS; b++) + sigma_mul[b] = g_ascii_strtod(g_strstrip(parts[b]), NULL); + g_strfreev(parts); + } + g_free(val); + + double sum = 0.0, sum2 = 0.0; + for(size_t i = 0; i < npix; i++) + { + sum += (double)buf[i]; + sum2 += (double)buf[i] * (double)buf[i]; + } + const double mean = sum / (double)npix; + const float sigma = (float)sqrt(sum2 / (double)npix - mean * mean); + + for(int b = 0; b < DWT_DETAIL_BANDS; b++) + noise[b] = sigma * sigma_mul[b]; +} + +void dt_restore_set_profile(dt_restore_context_t *ctx, void *profile) +{ + if(!ctx) return; + if(!profile) + { + ctx->has_profile = FALSE; + return; + } + + float primaries[3][2], whitepoint[2]; + if(!dt_colorspaces_get_primaries_and_whitepoint_from_profile( + (cmsHPROFILE)profile, primaries, whitepoint)) + { + dt_print(DT_DEBUG_AI, + "[restore_rgb] could not read primaries from working profile, " + "falling back to gamma-only conversion"); + ctx->has_profile = FALSE; + return; + } + + // build WP -> XYZ (stored transposed by dt, convert to row-major) + dt_colormatrix_t wp_to_xyz_T; + dt_make_transposed_matrices_from_primaries_and_whitepoint(primaries, + whitepoint, + wp_to_xyz_T); + float wp_to_xyz[9]; + for(int i = 0; i < 3; i++) + for(int j = 0; j < 3; j++) + wp_to_xyz[3 * i + j] = wp_to_xyz_T[j][i]; + + // transpose dt's sRGB<->XYZ matrices (Bradford D50) to row-major + float xyz_to_srgb[9], srgb_to_xyz[9]; + for(int i = 0; i < 3; i++) + for(int j = 0; j < 3; j++) + { + xyz_to_srgb[3 * i + j] = xyz_to_srgb_transposed[j][i]; + srgb_to_xyz[3 * i + j] = sRGB_to_xyz_transposed[j][i]; + } + + // WP -> sRGB = (XYZ -> sRGB) * (WP -> XYZ) + mat3mul(ctx->wp_to_srgb, 
xyz_to_srgb, wp_to_xyz); + + // invert WP -> XYZ to get XYZ -> WP, then compose sRGB -> WP + float xyz_to_wp[9]; + if(mat3inv(xyz_to_wp, wp_to_xyz) != 0) + { + dt_print(DT_DEBUG_AI, + "[restore_rgb] singular WP->XYZ matrix, falling back to gamma-only"); + ctx->has_profile = FALSE; + return; + } + mat3mul(ctx->srgb_to_wp, xyz_to_wp, srgb_to_xyz); + + ctx->has_profile = TRUE; + dt_print(DT_DEBUG_AI, "[restore_rgb] working profile color matrices ready"); +} + +void dt_restore_set_preserve_wide_gamut(dt_restore_context_t *ctx, gboolean preserve) +{ + if(ctx) ctx->preserve_wide_gamut = preserve; +} + +int dt_restore_run_patch(dt_restore_context_t *ctx, + const float *in_patch, + int w, int h, + float *out_patch, + int scale) +{ + if(!ctx || !ctx->ai_ctx) return 1; + const size_t in_pixels = (size_t)w * h * 3; + const int out_w = w * scale; + const int out_h = h * scale; + const size_t out_pixels = (size_t)out_w * out_h * 3; + const size_t plane = (size_t)w * h; + + // convert to sRGB gamma-encoded. If a working profile is set, + // first convert primaries (working profile -> sRGB linear) so the + // model sees the image as if it were native sRGB. Otherwise only + // apply the gamma curve (legacy path, shifts hues for wide-gamut). + // input layout is planar NCHW: R plane, then G plane, then B plane. 
+ // in_gamut_mask records which pixels were in sRGB gamut (scale==1 + // only) so the output pass can skip recomputing WP->sRGB + float *srgb_in = g_try_malloc(in_pixels * sizeof(float)); + uint8_t *in_gamut_mask = NULL; + if(!srgb_in) return 1; + // only allocate the gamut mask when denoise pass-through is requested + const gboolean need_gamut_mask + = ctx->has_profile && scale == 1 && ctx->preserve_wide_gamut; + if(need_gamut_mask) + { + in_gamut_mask = g_try_malloc(plane); + if(!in_gamut_mask) + { + g_free(srgb_in); + return 1; + } + } + + if(ctx->has_profile) + { + const float *M = ctx->wp_to_srgb; + const gboolean boost = ctx->shadow_boost; + for(size_t p = 0; p < plane; p++) + { + const float r = in_patch[p]; + const float g = in_patch[p + plane]; + const float b = in_patch[p + 2 * plane]; + float sr = M[0] * r + M[1] * g + M[2] * b; + float sg = M[3] * r + M[4] * g + M[5] * b; + float sb = M[6] * r + M[7] * g + M[8] * b; + // gamut check uses pre-boost values so pass-through decisions + // reflect the original color + if(in_gamut_mask) + { + const float m = 0.01f; // ~1% margin beyond [0, 1] + in_gamut_mask[p] = (sr >= -m && sr <= 1.0f + m + && sg >= -m && sg <= 1.0f + m + && sb >= -m && sb <= 1.0f + m) ? 1 : 0; + } + if(boost) + { + sr = sr > 0.0f ? sqrtf(sr) : 0.0f; + sg = sg > 0.0f ? sqrtf(sg) : 0.0f; + sb = sb > 0.0f ? sqrtf(sb) : 0.0f; + } + srgb_in[p] = _linear_to_srgb(sr); + srgb_in[p + plane] = _linear_to_srgb(sg); + srgb_in[p + 2 * plane] = _linear_to_srgb(sb); + } + } + else if(ctx->shadow_boost) + { + // no profile: still boost shadows so the model stays within its + // comfort zone, even though we treat WP values as sRGB + for(size_t i = 0; i < in_pixels; i++) + { + const float v = in_patch[i]; + const float boosted = v > 0.0f ? 
sqrtf(v) : 0.0f; + srgb_in[i] = _linear_to_srgb(boosted); + } + } + else + { + for(size_t i = 0; i < in_pixels; i++) + srgb_in[i] = _linear_to_srgb(in_patch[i]); + } + + const int num_inputs = dt_ai_get_input_count(ctx->ai_ctx); + if(num_inputs > MAX_MODEL_INPUTS) + { + g_free(srgb_in); + return 1; + } + + int64_t input_shape[] = {1, 3, h, w}; + dt_ai_tensor_t inputs[MAX_MODEL_INPUTS]; + memset(inputs, 0, sizeof(inputs)); + inputs[0] = (dt_ai_tensor_t){ + .data = (void *)srgb_in, + .shape = input_shape, + .ndim = 4, + .type = DT_AI_FLOAT}; + + // noise level map for multi-input models + float *noise_map = NULL; + int64_t noise_shape[] = {1, 1, h, w}; + if(num_inputs >= 2) + { + const size_t map_size = (size_t)w * h; + noise_map = g_try_malloc(map_size * sizeof(float)); + if(!noise_map) + { + g_free(srgb_in); + return 1; + } + const float sigma_norm = 25.0f / 255.0f; + for(size_t i = 0; i < map_size; i++) + noise_map[i] = sigma_norm; + inputs[1] = (dt_ai_tensor_t){ + .data = (void *)noise_map, + .shape = noise_shape, + .ndim = 4, + .type = DT_AI_FLOAT}; + } + + int64_t output_shape[] = {1, 3, out_h, out_w}; + dt_ai_tensor_t output = { + .data = (void *)out_patch, + .shape = output_shape, + .ndim = 4, + .type = DT_AI_FLOAT}; + + int ret = dt_ai_run(ctx->ai_ctx, inputs, num_inputs, + &output, 1); + g_free(srgb_in); + g_free(noise_map); + if(ret != 0) + { + g_free(in_gamut_mask); + return ret; + } + + // convert model output back to the working profile + // + // with profile: apply inverse sRGB gamma, then check if the ORIGINAL + // input pixel (converted to sRGB linear) is representable in sRGB + // gamut. if yes, use model output converted back to working profile. + // if no, pass through the original pixel (wide-gamut colors preserved, + // no denoising on those pixels). 
upscale has no pixel-to-pixel + // correspondence so pass-through is not possible — always use the + // model output + // + // without profile: fall back to per-channel pass-through in the + // original (working-profile-as-sRGB) space + const gboolean boost = ctx->shadow_boost; + if(ctx->has_profile && scale == 1 && ctx->preserve_wide_gamut) + { + const size_t out_plane = (size_t)out_w * out_h; + const float *Mi = ctx->srgb_to_wp; + // pass 1: write denoised values for in-gamut pixels; out-of-gamut + // pixels get plain pass-through as a fallback (used only when no + // in-gamut neighbors are found in pass 2) + for(size_t p = 0; p < out_plane; p++) + { + if(in_gamut_mask[p]) + { + float sr = _srgb_to_linear(out_patch[p]); + float sg = _srgb_to_linear(out_patch[p + out_plane]); + float sb = _srgb_to_linear(out_patch[p + 2 * out_plane]); + if(boost) { sr *= sr; sg *= sg; sb *= sb; } + out_patch[p] = Mi[0] * sr + Mi[1] * sg + Mi[2] * sb; + out_patch[p + out_plane] = Mi[3] * sr + Mi[4] * sg + Mi[5] * sb; + out_patch[p + 2 * out_plane] = Mi[6] * sr + Mi[7] * sg + Mi[8] * sb; + } + else + { + out_patch[p] = in_patch[p]; + out_patch[p + out_plane] = in_patch[p + plane]; + out_patch[p + 2 * out_plane] = in_patch[p + 2 * plane]; + } + } + // pass 2: luminance-only smoothing for out-of-gamut pixels. 
the + // original pixel keeps its chroma (wide-gamut color preserved + // exactly) but its brightness is shifted to match the local + // average luminance of denoised in-gamut neighbors; this kills + // the single-pixel speckles that pass-through would otherwise + // leave visible against the denoised background + const int radius = 2; // 5x5 window + for(int y = 0; y < out_h; y++) + { + for(int x = 0; x < out_w; x++) + { + const size_t p = (size_t)y * out_w + x; + if(in_gamut_mask[p]) continue; + const float r0 = in_patch[p]; + const float g0 = in_patch[p + plane]; + const float b0 = in_patch[p + 2 * plane]; + const float Y_orig = _luma_rec709(r0, g0, b0); + float sumY = 0.0f; + int count = 0; + const int y0 = y - radius < 0 ? 0 : y - radius; + const int y1 = y + radius >= out_h ? out_h - 1 : y + radius; + const int x0 = x - radius < 0 ? 0 : x - radius; + const int x1 = x + radius >= out_w ? out_w - 1 : x + radius; + for(int yy = y0; yy <= y1; yy++) + { + for(int xx = x0; xx <= x1; xx++) + { + const size_t q = (size_t)yy * out_w + xx; + if(!in_gamut_mask[q]) continue; + const float rq = out_patch[q]; + const float gq = out_patch[q + out_plane]; + const float bq = out_patch[q + 2 * out_plane]; + sumY += _luma_rec709(rq, gq, bq); + count++; + } + } + if(count > 0) + { + const float dY = sumY / (float)count - Y_orig; + out_patch[p] = r0 + dY; + out_patch[p + out_plane] = g0 + dY; + out_patch[p + 2 * out_plane] = b0 + dY; + } + } + } + } + else if(ctx->has_profile && scale == 1) + { + // denoise with profile but NO pass-through: apply the inverse + // matrix to every pixel. 
wide-gamut inputs will have been clipped + // by the model, but we get denoising everywhere + const size_t out_plane = (size_t)out_w * out_h; + const float *Mi = ctx->srgb_to_wp; + for(size_t p = 0; p < out_plane; p++) + { + float sr = _srgb_to_linear(out_patch[p]); + float sg = _srgb_to_linear(out_patch[p + out_plane]); + float sb = _srgb_to_linear(out_patch[p + 2 * out_plane]); + if(boost) { sr *= sr; sg *= sg; sb *= sb; } + out_patch[p] = Mi[0] * sr + Mi[1] * sg + Mi[2] * sb; + out_patch[p + out_plane] = Mi[3] * sr + Mi[4] * sg + Mi[5] * sb; + out_patch[p + 2 * out_plane] = Mi[6] * sr + Mi[7] * sg + Mi[8] * sb; + } + } + else if(scale == 1) + { + // no profile set: per-channel pass-through, treats working-profile + // numbers as if they were sRGB. colors will be slightly shifted + // for wide-gamut working profiles — rely on the profile path above + // when possible. pass-through still honored via preserve_wide_gamut + for(size_t i = 0; i < out_pixels; i++) + { + const float in = in_patch[i]; + if(ctx->preserve_wide_gamut && (in < 0.0f || in > 1.0f)) + { + out_patch[i] = in; + } + else + { + float v = _srgb_to_linear(out_patch[i]); + if(boost) v *= v; + out_patch[i] = v; + } + } + } + else + { + // upscale: no pixel-to-pixel correspondence, use model output as-is + if(ctx->has_profile) + { + const size_t out_plane = (size_t)out_w * out_h; + const float *Mi = ctx->srgb_to_wp; + for(size_t p = 0; p < out_plane; p++) + { + float sr = _srgb_to_linear(out_patch[p]); + float sg = _srgb_to_linear(out_patch[p + out_plane]); + float sb = _srgb_to_linear(out_patch[p + 2 * out_plane]); + if(boost) { sr *= sr; sg *= sg; sb *= sb; } + out_patch[p] = Mi[0] * sr + Mi[1] * sg + Mi[2] * sb; + out_patch[p + out_plane] = Mi[3] * sr + Mi[4] * sg + Mi[5] * sb; + out_patch[p + 2 * out_plane] = Mi[6] * sr + Mi[7] * sg + Mi[8] * sb; + } + } + else + { + for(size_t i = 0; i < out_pixels; i++) + { + float v = _srgb_to_linear(out_patch[i]); + if(boost) v *= v; + out_patch[i] = v; + } + } 
+ } + + g_free(in_gamut_mask); + return 0; +} + +// per-image gate for the shadow-boost curve; enable only when the image +// has substantial near-black area to protect — bright images would only +// pay the curve cost (minor highlight compression) for no gain; +// thresholds tuned so localized very-dark features (a tree hollow, a +// silhouette) do NOT trigger; only broad noisy shadow regions do +// +// in_data is interleaved float4 RGBA +#define _SHADOW_BOOST_THRESHOLD 0.005f // 0.5% linear luminance +#define _SHADOW_BOOST_FRACTION 0.10f // 10% of sampled pixels +static gboolean _image_has_deep_shadows(const float *in_data, int w, int h) +{ + const size_t stride = 16; // sample 1/256 of pixels for speed + size_t dark = 0, total = 0; + for(size_t y = 0; y < (size_t)h; y += stride) + for(size_t x = 0; x < (size_t)w; x += stride) + { + const size_t p = ((size_t)y * w + x) * 4; + const float luma = 0.2126f * in_data[p] + + 0.7152f * in_data[p + 1] + + 0.0722f * in_data[p + 2]; + if(luma < _SHADOW_BOOST_THRESHOLD) dark++; + total++; + } + return total > 0 && (float)dark / total >= _SHADOW_BOOST_FRACTION; +} + +int dt_restore_process_tiled(dt_restore_context_t *ctx, + const float *in_data, + int width, int height, + int scale, + dt_restore_row_writer_t row_writer, + void *writer_data, + struct _dt_job_t *control_job) +{ + if(!ctx || !ctx->ai_ctx || !in_data || !row_writer) + return 1; + + // for shadow-boost-capable models, decide per-image whether the + // curve is worth applying; one analysis per call, before tiling, + // so all tiles see the same flag (avoids per-tile seams) + if(ctx->shadow_boost_capable) + { + const gboolean dark = _image_has_deep_shadows(in_data, width, height); + ctx->shadow_boost = dark; + dt_print(DT_DEBUG_AI, "[restore_rgb] shadow boost %s", + dark ? 
"enabled" : "disabled"); + } + + const int O = dt_restore_get_overlap(scale); + const int S = scale; + const int out_w = width * S; + // ladder was resolved at load time (either model's input_sizes or + // the built-in default for this scale) and travels with the context + const int *ladder = ctx->tile_ladder; + const int n_ladder = ctx->n_tile_ladder; + int T = ctx->tile_size; + + // outer retry loop: on inference failure (e.g. GPU OOM) drop to the + // next smaller candidate in the shared ladder and try again +retry:; + int step = T - 2 * O; + int T_out = T * S; + int O_out = O * S; + int step_out = step * S; + size_t in_plane = (size_t)T * T; + size_t out_plane = (size_t)T_out * T_out; + int cols = (width + step - 1) / step; + int rows = (height + step - 1) / step; + int total_tiles = cols * rows; + + dt_print(DT_DEBUG_AI, + "[restore_rgb] tiling %dx%d (scale=%d)" + " -> %dx%d, %dx%d grid (%d tiles, T=%d)", + width, height, S, out_w, height * S, + cols, rows, total_tiles, T); + + float *tile_in = g_try_malloc( + in_plane * 3 * sizeof(float)); + float *tile_out = g_try_malloc( + out_plane * 3 * sizeof(float)); + float *row_buf = g_try_malloc( + (size_t)out_w * step_out * 3 * sizeof(float)); + if(!tile_in || !tile_out || !row_buf) + { + g_free(tile_in); + g_free(tile_out); + g_free(row_buf); + return 1; + } + + int res = 0; + int tile_count = 0; + + for(int ty = 0; ty < rows; ty++) + { + const int y = ty * step; + const int valid_h = (y + step > height) + ? 
height - y : step; + const int valid_h_out = valid_h * S; + + memset(row_buf, 0, + (size_t)out_w * valid_h_out * 3 + * sizeof(float)); + + for(int tx = 0; tx < cols; tx++) + { + if(control_job + && dt_control_job_get_state(control_job) + == DT_JOB_STATE_CANCELLED) + { + res = 1; + goto cleanup; + } + + const int x = tx * step; + const int in_x = x - O; + const int in_y = y - O; + const int needs_mirror + = (in_x < 0 || in_y < 0 + || in_x + T > width + || in_y + T > height); + + // interleaved RGBx -> planar RGB + if(needs_mirror) + { + for(int dy = 0; dy < T; ++dy) + { + const int sy = _mirror(in_y + dy, height); + for(int dx = 0; dx < T; ++dx) + { + const int sx + = _mirror(in_x + dx, width); + const size_t po = (size_t)dy * T + dx; + const size_t si + = ((size_t)sy * width + sx) * 4; + tile_in[po] = in_data[si + 0]; + tile_in[po + in_plane] + = in_data[si + 1]; + tile_in[po + 2 * in_plane] + = in_data[si + 2]; + } + } + } + else + { + for(int dy = 0; dy < T; ++dy) + { + const float *row + = in_data + + ((size_t)(in_y + dy) * width + + in_x) * 4; + const size_t ro = (size_t)dy * T; + for(int dx = 0; dx < T; ++dx) + { + tile_in[ro + dx] = row[dx * 4 + 0]; + tile_in[ro + dx + in_plane] + = row[dx * 4 + 1]; + tile_in[ro + dx + 2 * in_plane] + = row[dx * 4 + 2]; + } + } + } + + if(dt_restore_run_patch( + ctx, tile_in, T, T, tile_out, S) != 0) + { + // retry with the next smaller ladder entry if no rows have + // been delivered yet (safe to restart). once rows are written + // we can't rewind the row_writer (e.g. TIFF is sequential). + // _reload_session() recreates the ORT session for the smaller + // tile size (dim overrides are shape-specific). 
+ int next_T = 0; + for(int i = 0; i < n_ladder; i++) + if(ladder[i] < T) { next_T = ladder[i]; break; } + if(next_T > 0 && ty == 0 + && dt_restore_reload_session(ctx, next_T)) + { + dt_print(DT_DEBUG_AI, + "[restore_rgb] inference failed at tile %d,%d " + "(T=%d), retrying with T=%d", + x, y, T, next_T); + g_free(tile_in); + g_free(tile_out); + g_free(row_buf); + T = next_T; + goto retry; + } + dt_print(DT_DEBUG_AI, + "[restore_rgb] inference failed at" + " tile %d,%d (T=%d, minimum reached)", x, y, T); + res = 1; + goto cleanup; + } + + // valid region -> row buffer + const int valid_w = (x + step > width) + ? width - x : step; + const int valid_w_out = valid_w * S; + + for(int dy = 0; dy < valid_h_out; ++dy) + { + const size_t src_row + = (size_t)(O_out + dy) * T_out + O_out; + const size_t dst_row + = ((size_t)dy * out_w + x * S) * 3; + for(int dx = 0; dx < valid_w_out; ++dx) + { + row_buf[dst_row + dx * 3 + 0] + = tile_out[src_row + dx]; + row_buf[dst_row + dx * 3 + 1] + = tile_out[src_row + dx + out_plane]; + row_buf[dst_row + dx * 3 + 2] + = tile_out[src_row + dx + + 2 * out_plane]; + } + } + + tile_count++; + if(control_job) + dt_control_job_set_progress(control_job, + (double)tile_count / total_tiles); + } + + // deliver completed scanlines via callback + for(int dy = 0; dy < valid_h_out; dy++) + { + const float *src = row_buf + (size_t)dy * out_w * 3; + if(row_writer(src, out_w, y * S + dy, + writer_data) != 0) + { + res = 1; + goto cleanup; + } + } + } + + // persist tile size on first full success so subsequent runs skip OOM retry + if(res == 0) + dt_restore_persist_tile_size(ctx); + +cleanup: + g_free(tile_in); + g_free(tile_out); + g_free(row_buf); + return res; +} + +void dt_restore_apply_detail_recovery(const float *original_4ch, + float *denoised_4ch, + int width, int height, + float alpha) +{ + const size_t npix = (size_t)width * height; + + float *const restrict lum_residual + = dt_alloc_align_float(npix); + if(!lum_residual) return; + +#ifdef 
_OPENMP +#pragma omp parallel for simd default(none) \ + dt_omp_firstprivate(original_4ch, denoised_4ch, \ + lum_residual, npix) \ + schedule(simd:static) \ + aligned(original_4ch, denoised_4ch, lum_residual:64) +#endif + for(size_t i = 0; i < npix; i++) + { + const size_t p = i * 4; + const float lum_orig + = 0.2126f * original_4ch[p + 0] + + 0.7152f * original_4ch[p + 1] + + 0.0722f * original_4ch[p + 2]; + const float lum_den + = 0.2126f * denoised_4ch[p + 0] + + 0.7152f * denoised_4ch[p + 1] + + 0.0722f * denoised_4ch[p + 2]; + lum_residual[i] = lum_orig - lum_den; + } + + float noise[DWT_DETAIL_BANDS]; + _compute_adaptive_noise(lum_residual, npix, noise); + dwt_denoise(lum_residual, width, height, + DWT_DETAIL_BANDS, noise); + +#ifdef _OPENMP +#pragma omp parallel for simd default(none) \ + dt_omp_firstprivate(denoised_4ch, lum_residual, \ + npix, alpha) \ + schedule(simd:static) \ + aligned(denoised_4ch, lum_residual:64) +#endif + for(size_t i = 0; i < npix; i++) + { + const size_t p = i * 4; + const float d = alpha * lum_residual[i]; + denoised_4ch[p + 0] += d; + denoised_4ch[p + 1] += d; + denoised_4ch[p + 2] += d; + } + + dt_free_align(lum_residual); +} + + +float *dt_restore_compute_dwt_detail(const float *before_3ch, + const float *after_3ch, + int width, int height) +{ + const size_t npix = (size_t)width * height; + float *lum_residual = dt_alloc_align_float(npix); + if(!lum_residual) return NULL; + + for(size_t i = 0; i < npix; i++) + { + const size_t si = i * 3; + const float lum_orig + = 0.2126f * before_3ch[si + 0] + + 0.7152f * before_3ch[si + 1] + + 0.0722f * before_3ch[si + 2]; + const float lum_den + = 0.2126f * after_3ch[si + 0] + + 0.7152f * after_3ch[si + 1] + + 0.0722f * after_3ch[si + 2]; + lum_residual[i] = lum_orig - lum_den; + } + + float noise[DWT_DETAIL_BANDS]; + _compute_adaptive_noise(lum_residual, npix, noise); + dwt_denoise(lum_residual, width, height, + DWT_DETAIL_BANDS, noise); + + return lum_residual; +} + +// clang-format off 
+// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py +// vim: shiftwidth=2 expandtab tabstop=2 cindent +// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified; +// clang-format on diff --git a/src/common/ai/restore_rgb.h b/src/common/ai/restore_rgb.h new file mode 100644 index 000000000000..fa922fca5275 --- /dev/null +++ b/src/common/ai/restore_rgb.h @@ -0,0 +1,180 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. + + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with darktable. If not, see . +*/ + +// restore_rgb — public API for the RGB-path AI tasks (denoise + upscale). +// +// consumers: +// - src/libs/neural_restore.c +// +// pixel pipeline: +// input is linear working-profile float4 RGBA (from darktable export). +// dt_restore_run_patch() converts linear→sRGB before inference and +// sRGB→linear after. models operate in planar NCHW layout. +// dt_restore_process_tiled() handles interleaved→planar conversion, +// mirror padding at boundaries, gamut masking, shadow boost, and +// overlap blending. +// +// detail recovery: +// dt_restore_apply_detail_recovery() uses wavelet (DWT) decomposition +// to separate noise from texture in the luminance residual (original +// − denoised). fine bands are thresholded; coarser bands are preserved +// and blended back. 
+ +#pragma once + +#include "common/ai/restore.h" + +#include + +struct _dt_job_t; + +// --- color management (RGB path) --- + +// @brief Set the working color profile for the context. +// +// The AI model was trained on sRGB primaries. If the input pixels are +// in a different working profile (e.g. Rec.2020), we must convert to +// sRGB before inference and back after to avoid hue shifts. Call this +// before running inference on each image that may use a different +// working profile. +// +// If profile is NULL, the pipeline falls back to gamma-only conversion +// (treating working-profile numbers as if they were sRGB), which can +// cause color shifts for wide-gamut working profiles. +// +// Thread-safety: must not be called concurrently with +// dt_restore_run_patch() or dt_restore_process_tiled(). Set the +// profile before dispatching inference on a given image. +// +// @param ctx context handle (NULL-safe) +// @param profile lcms2 cmsHPROFILE handle cast to void*; NULL to disable +void dt_restore_set_profile(dt_restore_context_t *ctx, void *profile); + +// @brief Enable/disable wide-gamut pass-through for denoise. +// +// When TRUE (default): pixels that would be out of sRGB gamut pass +// through unchanged, preserving color but not denoising them. When +// FALSE: all pixels use the model output, wide-gamut colors are +// clipped to sRGB but everything gets denoised. +// +// Affects denoise only (scale == 1). Upscale always uses the model +// output because there is no pixel-to-pixel correspondence to +// pass through. +// +// @param ctx context handle (NULL-safe) +// @param preserve TRUE to enable pass-through, FALSE to denoise everything +void dt_restore_set_preserve_wide_gamut(dt_restore_context_t *ctx, + gboolean preserve); + +// --- inference --- + +// @brief row writer callback for dt_restore_process_tiled +// +// called once per tile-row with 3ch interleaved float scanlines. +// the callback can write to a buffer, TIFF, or any other sink. 
+// +// @param scanline 3ch interleaved float data (out_w pixels) +// @param out_w output width in pixels +// @param y scanline index in the output image +// @param user_data caller-provided context +// @return 0 on success, non-zero to abort +typedef int (*dt_restore_row_writer_t)(const float *scanline, + int out_w, + int y, + void *user_data); + +// @brief run a single inference patch with sRGB conversion +// +// converts linear RGB input to sRGB, runs ONNX inference, +// converts output back to linear. input is planar NCHW float. +// +// @param ctx loaded restore context +// @param in_patch input tile (planar RGB, 3 * w * h floats) +// @param w tile width +// @param h tile height +// @param out_patch output buffer (planar RGB, 3 * w*s * h*s) +// @param scale upscale factor (1 for denoise) +// @return 0 on success +int dt_restore_run_patch(dt_restore_context_t *ctx, + const float *in_patch, + int w, int h, + float *out_patch, + int scale); + +// @brief process an image with tiled inference +// +// tiles the input, runs inference on each tile, and delivers +// completed scanlines via the row_writer callback. input is +// float4 RGBA interleaved (from dt export). 
+// +// @param ctx loaded restore context (tile_size is stored in ctx) +// @param in_data input pixels (float4 RGBA, width * height) +// @param width input width +// @param height input height +// @param scale upscale factor (1 for denoise) +// @param row_writer callback receiving 3ch float scanlines +// @param writer_data user data passed to row_writer +// @param control_job job handle for progress/cancellation (NULL-safe) +// @return 0 on success +int dt_restore_process_tiled(dt_restore_context_t *ctx, + const float *in_data, + int width, int height, + int scale, + dt_restore_row_writer_t row_writer, + void *writer_data, + struct _dt_job_t *control_job); + +// --- detail recovery --- + +// @brief apply DWT-based detail recovery after denoising +// +// extracts luminance residual, filters noise with wavelet +// decomposition, and blends preserved texture back. +// both buffers are float4 RGBA at the same dimensions. +// +// @param original_4ch original input pixels (read-only) +// @param denoised_4ch denoised pixels (modified in-place) +// @param width image width +// @param height image height +// @param alpha blend strength (0 = none, 1 = full) +void dt_restore_apply_detail_recovery(const float *original_4ch, + float *denoised_4ch, + int width, int height, + float alpha); + +// @brief compute DWT-filtered luminance detail from 3ch buffers +// +// returns a 1ch float array with wavelet-filtered luminance +// residual (noise removed, texture preserved). used for +// preview split visualization. +// +// @param before_3ch original image (3ch interleaved float) +// @param after_3ch processed image (3ch interleaved float) +// @param width image width +// @param height image height +// @return newly allocated 1ch buffer, or NULL. 
caller frees +// with dt_free_align() +float *dt_restore_compute_dwt_detail(const float *before_3ch, + const float *after_3ch, + int width, int height); + +// clang-format off +// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py +// vim: shiftwidth=2 expandtab tabstop=2 cindent +// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified; +// clang-format on diff --git a/src/common/ai/segmentation.c b/src/common/ai/segmentation.c index 946d26749d4c..13cf7c06f1ef 100644 --- a/src/common/ai/segmentation.c +++ b/src/common/ai/segmentation.c @@ -225,7 +225,7 @@ dt_seg_context_t *dt_seg_load(dt_ai_environment_t *env, const char *model_id) // issues with some decoder graphs (e.g. SegNext's Concat->Reshape) dt_ai_context_t *decoder = dt_ai_load_model_ext(env, model_id, "decoder.onnx", DT_AI_PROVIDER_CPU, - DT_AI_OPT_DISABLED, NULL, 0); + DT_AI_OPT_DISABLED, NULL, 0, 0); if(!decoder) { dt_print(DT_DEBUG_AI, "[segmentation] failed to load decoder for %s", model_id); @@ -388,7 +388,7 @@ dt_seg_context_t *dt_seg_load(dt_ai_environment_t *env, const char *model_id) const dt_ai_dim_override_t overrides[] = {{"num_labels", 1}}; ctx->decoder = dt_ai_load_model_ext(env, model_id, "decoder.onnx", DT_AI_PROVIDER_CPU, DT_AI_OPT_BASIC, - overrides, 1); + overrides, 1, 0); if(!ctx->decoder) { dt_print(DT_DEBUG_AI, "[segmentation] failed to reload decoder for %s", model_id); diff --git a/src/common/dng_writer.c b/src/common/dng_writer.c new file mode 100644 index 000000000000..9c537f56c7a4 --- /dev/null +++ b/src/common/dng_writer.c @@ -0,0 +1,373 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. + + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with darktable. If not, see . +*/ + +#include "common/dng_writer.h" +#include "common/darktable.h" +#include "common/exif.h" +#include "common/image.h" +#include "develop/imageop_math.h" + +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#endif + +// DNG uses SRATIONAL / RATIONAL for matrix and WB tags. libtiff accepts +// these as float/double arrays and handles the conversion; we just pass +// the values as double + +// map the dcraw 2x2 CFA filters word to 4 single-byte channel indices +// for the DNG CFAPattern tag: 0=R, 1=G, 2=B, following DNG spec §A.3.1 +static void _cfa_bytes_from_filters(uint32_t filters, uint8_t out[4]) +{ + out[0] = FC(0, 0, filters); + out[1] = FC(0, 1, filters); + out[2] = FC(1, 0, filters); + out[3] = FC(1, 1, filters); +} + +int dt_dng_write_cfa_bayer(const char *filename, + const uint16_t *cfa, + int width, + int height, + const dt_image_t *img, + const void *exif_blob, + int exif_len) +{ + if(!filename || !cfa || !img || width <= 0 || height <= 0) + return 1; + +#ifdef _WIN32 + wchar_t *wfilename = g_utf8_to_utf16(filename, -1, NULL, NULL, NULL); + TIFF *tif = TIFFOpenW(wfilename, "wl"); + g_free(wfilename); +#else + TIFF *tif = TIFFOpen(filename, "wl"); +#endif + if(!tif) return 1; + + // required baseline TIFF tags for a single-plane raw image + TIFFSetField(tif, TIFFTAG_SUBFILETYPE, 0); + TIFFSetField(tif, TIFFTAG_IMAGEWIDTH, (uint32_t)width); + TIFFSetField(tif, TIFFTAG_IMAGELENGTH, (uint32_t)height); + TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, (uint16_t)16); + TIFFSetField(tif, TIFFTAG_SAMPLESPERPIXEL, (uint16_t)1); + TIFFSetField(tif, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG); + 
TIFFSetField(tif, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_CFA); + TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_UINT); + TIFFSetField(tif, TIFFTAG_COMPRESSION, COMPRESSION_NONE); + TIFFSetField(tif, TIFFTAG_ORIENTATION, ORIENTATION_TOPLEFT); + TIFFSetField(tif, TIFFTAG_ROWSPERSTRIP, TIFFDefaultStripSize(tif, 0)); + TIFFSetField(tif, TIFFTAG_XRESOLUTION, 300.0); + TIFFSetField(tif, TIFFTAG_YRESOLUTION, 300.0); + TIFFSetField(tif, TIFFTAG_RESOLUTIONUNIT, RESUNIT_INCH); + { + gchar *software = g_strdup_printf("darktable %s", + darktable_package_version); + TIFFSetField(tif, TIFFTAG_SOFTWARE, software); + g_free(software); + } + + // camera identification + if(img->camera_maker[0]) + TIFFSetField(tif, TIFFTAG_MAKE, img->camera_maker); + if(img->camera_model[0]) + TIFFSetField(tif, TIFFTAG_MODEL, img->camera_model); + if(img->camera_makermodel[0]) + TIFFSetField(tif, TIFFTAG_UNIQUECAMERAMODEL, img->camera_makermodel); + + // DNG identification + const uint8_t dng_version[4] = { 1, 4, 0, 0 }; + const uint8_t dng_backward[4] = { 1, 2, 0, 0 }; + TIFFSetField(tif, TIFFTAG_DNGVERSION, dng_version); + TIFFSetField(tif, TIFFTAG_DNGBACKWARDVERSION, dng_backward); + + // CFA description + const uint16_t cfa_repeat_dim[2] = { 2, 2 }; + TIFFSetField(tif, TIFFTAG_CFAREPEATPATTERNDIM, cfa_repeat_dim); + + uint8_t cfa_pattern[4]; + _cfa_bytes_from_filters(img->buf_dsc.filters, cfa_pattern); + TIFFSetField(tif, TIFFTAG_CFAPATTERN, 4, cfa_pattern); + + const uint8_t cfa_plane_color[3] = { 0, 1, 2 }; // R, G, B + TIFFSetField(tif, TIFFTAG_CFAPLANECOLOR, 3, cfa_plane_color); + TIFFSetField(tif, TIFFTAG_CFALAYOUT, (uint16_t)1); // rectangular + + // black/white levels + // BlackLevel is declared as a 2x2 repeat over the CFA pattern. 
we + // honor per-channel values when rawspeed provided them, otherwise + // fall back to the single raw_black_level broadcast to all four + const uint16_t bl_repeat_dim[2] = { 2, 2 }; + TIFFSetField(tif, TIFFTAG_BLACKLEVELREPEATDIM, bl_repeat_dim); + + float black_level[4]; + const gboolean have_separate + = (img->raw_black_level_separate[0] != 0 + || img->raw_black_level_separate[1] != 0 + || img->raw_black_level_separate[2] != 0 + || img->raw_black_level_separate[3] != 0); + for(int i = 0; i < 4; i++) + { + black_level[i] = have_separate + ? (float)img->raw_black_level_separate[i] + : (float)img->raw_black_level; + } + TIFFSetField(tif, TIFFTAG_BLACKLEVEL, 4, black_level); + + const uint32_t white = img->raw_white_point + ? img->raw_white_point : 65535u; + TIFFSetField(tif, TIFFTAG_WHITELEVEL, 1, &white); + + // AsShotNeutral (derived from wb_coeffs) + // DNG AsShotNeutral encodes the neutral white balance as a + // cameraRGB triple where smaller values mean more amplification. + // darktable's wb_coeffs are raw-to-white multipliers; AsShotNeutral + // is their inverse, normalized so the maximum element is 1 + if(img->wb_coeffs[0] > 0.0f + && img->wb_coeffs[1] > 0.0f + && img->wb_coeffs[2] > 0.0f) + { + float inv[3]; + for(int i = 0; i < 3; i++) + inv[i] = 1.0f / img->wb_coeffs[i]; + const float m = fmaxf(inv[0], fmaxf(inv[1], inv[2])); + if(m > 0.0f) + for(int i = 0; i < 3; i++) inv[i] /= m; + TIFFSetField(tif, TIFFTAG_ASSHOTNEUTRAL, 3, inv); + } + + // ColorMatrix1 (XYZ D50 -> cameraRGB, 3x3 for trichromatic) + // darktable's adobe_XYZ_to_CAM is populated from the rawspeed + // cameras.xml matrix in row-major [camRGB][XYZ] layout, which + // matches the DNG ColorMatrix1 layout exactly (row = camera axis, + // column = XYZ axis) + { + float non_zero = 0.0f; + for(int k = 0; k < 3; k++) + for(int i = 0; i < 3; i++) + non_zero += fabsf(img->adobe_XYZ_to_CAM[k][i]); + + if(non_zero > 0.0f) + { + float color_matrix[9]; + for(int k = 0; k < 3; k++) + for(int i = 0; 
i < 3; i++) + color_matrix[k * 3 + i] = img->adobe_XYZ_to_CAM[k][i]; + TIFFSetField(tif, TIFFTAG_COLORMATRIX1, 9, color_matrix); + } + } + + // default scale / crop + // DefaultScale=1,1; DefaultCropOrigin=0,0; DefaultCropSize=W,H. + // this keeps the importer from applying any additional framing; + // darktable's crop module handles downstream framing + const float default_scale[2] = { 1.0f, 1.0f }; + const float default_crop_origin[2] = { 0.0f, 0.0f }; + const float default_crop_size[2] = { (float)width, (float)height }; + TIFFSetField(tif, TIFFTAG_DEFAULTSCALE, default_scale); + TIFFSetField(tif, TIFFTAG_DEFAULTCROPORIGIN, default_crop_origin); + TIFFSetField(tif, TIFFTAG_DEFAULTCROPSIZE, default_crop_size); + + // scanline write + int res = 0; + for(int y = 0; y < height && res == 0; y++) + { + const uint16_t *row = cfa + (size_t)y * width; + if(TIFFWriteScanline(tif, (void *)row, y, 0) < 0) + res = 1; + } + + TIFFClose(tif); + + // embed source EXIF (datetime, ISO, shutter, etc.) + // dt_exif_write_blob takes a non-const pointer; we don't modify it + if(res == 0 && exif_blob && exif_len > 0) + dt_exif_write_blob((uint8_t *)exif_blob, (uint32_t)exif_len, + filename, 0); + + if(res != 0) + g_unlink(filename); + + return res; +} + +int dt_dng_write_linear(const char *filename, + const float *rgb, + int width, + int height, + const dt_image_t *img, + const void *exif_blob, + int exif_len) +{ + if(!filename || !rgb || !img || width <= 0 || height <= 0) + return 1; + +#ifdef _WIN32 + wchar_t *wfilename = g_utf8_to_utf16(filename, -1, NULL, NULL, NULL); + TIFF *tif = TIFFOpenW(wfilename, "wl"); + g_free(wfilename); +#else + TIFF *tif = TIFFOpen(filename, "wl"); +#endif + if(!tif) return 1; + + // baseline TIFF tags, 3 samples per pixel (demosaicked) + TIFFSetField(tif, TIFFTAG_SUBFILETYPE, 0); + TIFFSetField(tif, TIFFTAG_IMAGEWIDTH, (uint32_t)width); + TIFFSetField(tif, TIFFTAG_IMAGELENGTH, (uint32_t)height); + TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, (uint16_t)16); 
+ TIFFSetField(tif, TIFFTAG_SAMPLESPERPIXEL, (uint16_t)3); + TIFFSetField(tif, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG); + TIFFSetField(tif, TIFFTAG_PHOTOMETRIC, 34892); // LinearRaw + TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_UINT); + TIFFSetField(tif, TIFFTAG_COMPRESSION, COMPRESSION_NONE); + TIFFSetField(tif, TIFFTAG_ORIENTATION, ORIENTATION_TOPLEFT); + TIFFSetField(tif, TIFFTAG_ROWSPERSTRIP, TIFFDefaultStripSize(tif, 0)); + TIFFSetField(tif, TIFFTAG_XRESOLUTION, 300.0); + TIFFSetField(tif, TIFFTAG_YRESOLUTION, 300.0); + TIFFSetField(tif, TIFFTAG_RESOLUTIONUNIT, RESUNIT_INCH); + { + gchar *software = g_strdup_printf("darktable %s", + darktable_package_version); + TIFFSetField(tif, TIFFTAG_SOFTWARE, software); + g_free(software); + } + + // camera identification + if(img->camera_maker[0]) + TIFFSetField(tif, TIFFTAG_MAKE, img->camera_maker); + if(img->camera_model[0]) + TIFFSetField(tif, TIFFTAG_MODEL, img->camera_model); + if(img->camera_makermodel[0]) + TIFFSetField(tif, TIFFTAG_UNIQUECAMERAMODEL, img->camera_makermodel); + + // DNG identification + const uint8_t dng_version[4] = { 1, 4, 0, 0 }; + const uint8_t dng_backward[4] = { 1, 2, 0, 0 }; + TIFFSetField(tif, TIFFTAG_DNGVERSION, dng_version); + TIFFSetField(tif, TIFFTAG_DNGBACKWARDVERSION, dng_backward); + + // NO CFA tags: this is demosaicked data. + // encode as normalized: BlackLevel=0, WhiteLevel=65535. the + // pixel data is already un-WB'd camRGB in [0, 1] range (the + // raw_restore_linear pipeline does matrix + un-boost + un-WB + // before handing off). the consumer applies WB via + // AsShotNeutral, reads uint16 as [0, 65535] and normalizes to + // [0, 1] via black/white + const uint32_t white_norm = 65535u; + const float black3[3] = { 0.0f, 0.0f, 0.0f }; + TIFFSetField(tif, TIFFTAG_BLACKLEVEL, 3, black3); + TIFFSetField(tif, TIFFTAG_WHITELEVEL, 1, &white_norm); + + // AsShotNeutral = inverse of WB multipliers, normalized so max=1. 
+ // on re-import, darktable reads this and derives WB coeffs via + // wb[c] = 1/AsShotNeutral[c] / wb[G-normalized]. the temperature + // iop then applies this WB to our un-WB'd data, giving the standard + // raw-pipeline result + if(img->wb_coeffs[0] > 0.0f + && img->wb_coeffs[1] > 0.0f + && img->wb_coeffs[2] > 0.0f) + { + float inv[3]; + for(int i = 0; i < 3; i++) inv[i] = 1.0f / img->wb_coeffs[i]; + const float m = fmaxf(inv[0], fmaxf(inv[1], inv[2])); + if(m > 0.0f) for(int i = 0; i < 3; i++) inv[i] /= m; + TIFFSetField(tif, TIFFTAG_ASSHOTNEUTRAL, 3, inv); + } + else + { + const float neutral[3] = { 1.0f, 1.0f, 1.0f }; + TIFFSetField(tif, TIFFTAG_ASSHOTNEUTRAL, 3, neutral); + } + + // ColorMatrix1 from camera's XYZ->CAM (3x3 portion) + { + float non_zero = 0.0f; + for(int k = 0; k < 3; k++) + for(int i = 0; i < 3; i++) + non_zero += fabsf(img->adobe_XYZ_to_CAM[k][i]); + if(non_zero > 0.0f) + { + float color_matrix[9]; + for(int k = 0; k < 3; k++) + for(int i = 0; i < 3; i++) + color_matrix[k * 3 + i] = img->adobe_XYZ_to_CAM[k][i]; + TIFFSetField(tif, TIFFTAG_COLORMATRIX1, 9, color_matrix); + } + } + + // default scale / crop (full frame, no inset) + const float default_scale[2] = { 1.0f, 1.0f }; + const float default_crop_origin[2] = { 0.0f, 0.0f }; + const float default_crop_size[2] = { (float)width, (float)height }; + TIFFSetField(tif, TIFFTAG_DEFAULTSCALE, default_scale); + TIFFSetField(tif, TIFFTAG_DEFAULTCROPORIGIN, default_crop_origin); + TIFFSetField(tif, TIFFTAG_DEFAULTCROPSIZE, default_crop_size); + + // scanline write: float [0, 1] normalized camRGB -> uint16 + // [0, 65535]. 
BlackLevel=0 and WhiteLevel=65535 let the + // re-importer recover the [0, 1] range via the standard raw + // normalization (val - black) / (white - black) + const float clip_hi = 65535.0f; + uint16_t *scan = g_malloc((size_t)width * 3 * sizeof(uint16_t)); + int res = 0; + if(!scan) + { + TIFFClose(tif); + g_unlink(filename); + return 1; + } + for(int y = 0; y < height && res == 0; y++) + { + const float *row = rgb + (size_t)y * width * 3; + for(int x = 0; x < width; x++) + { + for(int c = 0; c < 3; c++) + { + float adc = row[x * 3 + c] * 65535.0f; + if(adc < 0.0f) adc = 0.0f; + if(adc > clip_hi) adc = clip_hi; + scan[x * 3 + c] = (uint16_t)(adc + 0.5f); + } + } + if(TIFFWriteScanline(tif, scan, y, 0) < 0) res = 1; + } + g_free(scan); + + TIFFClose(tif); + + if(res == 0 && exif_blob && exif_len > 0) + dt_exif_write_blob((uint8_t *)exif_blob, (uint32_t)exif_len, + filename, 0); + + if(res != 0) + g_unlink(filename); + + return res; +} + +// clang-format off +// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py +// vim: shiftwidth=2 expandtab tabstop=2 cindent +// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified; +// clang-format on diff --git a/src/common/dng_writer.h b/src/common/dng_writer.h new file mode 100644 index 000000000000..e039cc848a09 --- /dev/null +++ b/src/common/dng_writer.h @@ -0,0 +1,105 @@ +/* + This file is part of darktable, + Copyright (C) 2026 darktable developers. + + darktable is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + darktable is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with darktable. If not, see .
+*/
+
+// dng_writer — minimal DNG writer (bayer CFA + linear)
+//
+// writes a uint16 Bayer mosaic (or a demosaicked LinearRaw image)
+// plus enough DNG metadata
+// to let a raw processor (darktable, adobe, etc.) re-import the file
+// and run the normal raw pipeline: black/white level, CFA pattern,
+// AsShotNeutral (white balance), ColorMatrix1 (camRGB -> CIE XYZ),
+// camera make/model, and a pass-through of the source EXIF blob.
+//
+// this writer is intentionally narrow in scope:
+// - bayer CFA or demosaicked LinearRaw (dt_dng_write_linear) only
+// - uncompressed strip layout
+// - single IFD (no embedded JPEG preview or thumbnails)
+// - 16-bit integer data only
+//
+// the consumer of the DNG (darktable itself) does not need more than
+// this for the neural restore round-trip.
+
+#pragma once
+
+#include 
+#include 
+
+struct dt_image_t;
+
+// @brief Write a Bayer CFA mosaic as a DNG file.
+//
+// The output file contains a single IFD with PhotometricInterpretation=CFA. 
+// All DNG metadata required for darktable re-import is sourced from @p img: +// - BlackLevel[4] from img->raw_black_level_separate +// - WhiteLevel from img->raw_white_point +// - CFAPattern / CFARepeatDim from img->buf_dsc.filters (dcraw format) +// - AsShotNeutral from img->wb_coeffs (inverted) +// - ColorMatrix1 from img->adobe_XYZ_to_CAM +// - Make / Model / UniqueModel from img->camera_maker / camera_model +// +// @param filename output path (UTF-8) +// @param cfa Bayer mosaic (uint16, width * height samples, row-major) +// @param width image width in pixels (CFA samples per row) +// @param height image height in rows +// @param img source image, for DNG metadata +// @param exif_blob optional Exif blob to embed (NULL = skip) +// @param exif_len size of exif_blob in bytes +// @return 0 on success, non-zero on failure (file is removed on failure) +int dt_dng_write_cfa_bayer(const char *filename, + const uint16_t *cfa, + int width, + int height, + const struct dt_image_t *img, + const void *exif_blob, + int exif_len); + +// @brief Write a demosaicked 3-channel linear DNG. +// +// Used for sensors the bayer DNG round-trip can't handle (X-Trans, +// Foveon-like, pre-demosaicked raws). The output file has +// PhotometricInterpretation=LinearRaw, SamplesPerPixel=3, and carries +// the camera's ColorMatrix1 / AsShotNeutral / BlackLevel / WhiteLevel +// so darktable re-imports it as a raw-origin image and skips its own +// demosaic stage. +// +// Pixel data is interpreted as float-normalized camRGB in [0, ~1+] +// (1.0 = source sensor white point after black subtract). The writer +// scales that to uint16 using black = img->raw_black_level, +// white = img->raw_white_point, so the encoding matches what the +// corresponding raw CFA data would be in ADC units. 
+// +// @param filename output path (UTF-8) +// @param rgb interleaved 3ch float RGB, width*height*3 samples +// @param width image width in pixels +// @param height image height in pixels +// @param img source image, for DNG metadata + encoding range +// @param exif_blob optional Exif blob to embed (NULL = skip) +// @param exif_len size of exif_blob in bytes +// @return 0 on success, non-zero on failure (file removed on failure) +int dt_dng_write_linear(const char *filename, + const float *rgb, + int width, + int height, + const struct dt_image_t *img, + const void *exif_blob, + int exif_len); + +// clang-format off +// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py +// vim: shiftwidth=2 expandtab tabstop=2 cindent +// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified; +// clang-format on diff --git a/src/libs/neural_restore.c b/src/libs/neural_restore.c index e11360e960a5..07ca26834410 100644 --- a/src/libs/neural_restore.c +++ b/src/libs/neural_restore.c @@ -16,114 +16,195 @@ along with darktable. If not, see . */ -/* - neural restore — lighttable module for AI-based image restoration - - overview - -------- - provides two operations via a tabbed notebook UI: - - denoise: run an ONNX denoiser (e.g. NIND UNet) on selected images - - upscale: run an ONNX super-resolution model (e.g. BSRGAN) at 2x or 4x - - the module lives in the right panel (DT_UI_CONTAINER_PANEL_RIGHT_CENTER) - and is available in both lighttable and darkroom views. it is only built - when cmake option USE_AI=ON. - - architecture - ------------ - the core AI inference, tiling, and detail recovery logic lives in - src/common/ai/restore.c (the darktable_ai library). this module handles: - - 1. preview (interactive, single-image) - triggered by clicking the preview widget or switching tabs. 
- runs on a background GThread (_preview_thread): - - exports the selected image at reduced resolution via the - darktable export pipeline (captures fully-processed pixels) - - crops a patch matching the widget aspect ratio - - runs AI inference on the patch via dt_restore_run_patch() - - delivers before/after buffers to the main thread via g_idle_add - the preview widget draws a split before/after view with a draggable - divider. for denoise, DWT-filtered detail is pre-computed so the - detail recovery slider updates instantly without re-running inference. - cancellation uses an atomic sequence counter (preview_sequence): - the thread checks it at key points and discards stale results. - - 2. batch processing (multi-image) - runs as a dt_control_job on the user background queue. - for each selected image: - - exports via the darktable pipeline with a custom format module - that intercepts the pixel buffer in _ai_write_image() - - for denoise with detail recovery: buffers the full output via - dt_restore_process_tiled(), applies dt_restore_apply_detail_recovery(), - then writes TIFF - - for plain denoise/upscale: streams tiles directly to TIFF via - _process_tiled_tiff() to avoid buffering the full output - - output TIFF embeds the selected output ICC profile and source EXIF - - imports the result into the darktable library and groups it - with the source image - when the batch finishes, a single completion toast is shown via - dt_control_log (e.g. "neural denoise: 3 images processed"). the - module deliberately does NOT raise DT_SIGNAL_VIEWMANAGER_THUMBTABLE_ACTIVATE: - lighttable ignores that signal while darkroom / map / culling / - tethering / print_settings would swap the user's current view to - the freshly-imported image and clobber any in-progress edit. - - 3. 
output parameters (collapsible section) - - bit depth: 8/16/32-bit TIFF (default 16-bit) - - output ICC profile: pick any installed profile, or keep image settings - - preserve wide-gamut: when on, out-of-sRGB-gamut pixels pass through - the denoise model unchanged (wide-gamut colors preserved exactly); - when off, those pixels are clipped to sRGB and denoised like the rest - - add to catalog: auto-import output into darktable library - - output directory: supports darktable variables (e.g. $(FILE_FOLDER)) - - threading - --------- - - preview: background GThread, one at a time. joined before starting - a new preview and in gui_cleanup. stale results are discarded via - atomic preview_sequence counter - - batch: dt_control_job on DT_JOB_QUEUE_USER_BG. supports cancellation - via dt_control_job_get_state() - - ai_registry->lock: held briefly to read provider setting - - all GTK updates go through g_idle_add to stay on the main thread - - key structs - ----------- - dt_lib_neural_restore_t — module GUI state and preview data - dt_neural_job_t — batch processing job parameters - dt_neural_format_params_t — custom format module for export interception - dt_neural_preview_data_t — preview thread input parameters - dt_neural_preview_result_t — preview thread output (delivered via idle) - dt_neural_preview_capture_t — captures export pixels for preview - - preferences - ----------- - CONF_DETAIL_RECOVERY — detail recovery slider value - CONF_ACTIVE_PAGE — last active notebook tab - CONF_BIT_DEPTH — output TIFF bit depth (0=8, 1=16, 2=32) - CONF_ADD_CATALOG — auto-import output into library - CONF_OUTPUT_DIR — output directory pattern (supports variables) - CONF_ICC_TYPE — output ICC profile type (image settings by default) - CONF_ICC_FILE — filename for file-type ICC profiles - CONF_PRESERVE_WIDE_GAMUT — pass-through out-of-sRGB-gamut pixels during denoise - CONF_PREVIEW_EXPORT_SIZE — preview export longest-edge in pixels - CONF_PREVIEW_HEIGHT — preview widget height in 
pixels - CONF_EXPAND_OUTPUT — output section collapsed/expanded state -*/ +// neural restore — lighttable module for AI-based image restoration +// +// overview +// -------- +// provides three operations via a tabbed notebook UI: +// - raw denoise: run a RawNIND UtNet2 denoiser pre-demosaic (Bayer +// CFA) or post-demosaic (X-Trans / Foveon, via lin_rec2020). +// output is a DNG (CFA Bayer or LinearRaw) that re-imports into +// the user's full pipeline. +// - denoise: run an ONNX denoiser (e.g. NIND UNet) on the user's +// processed/exported image. output is a TIFF. +// - upscale: run an ONNX super-resolution model (e.g. BSRGAN) at +// 2x or 4x. output is a TIFF. +// +// the module lives in the right panel (DT_UI_CONTAINER_PANEL_RIGHT_CENTER) +// and is available in both lighttable and darkroom views. it is only built +// when cmake option USE_AI=ON. +// +// architecture +// ------------ +// the core AI inference, tiling, color management and DWT detail +// recovery logic lives in the darktable_ai library, split across: +// - src/common/ai/restore.{c,h} env/ctx lifecycle, model +// loaders, raw patch runners, +// shared user-pipe ROI bridge +// - src/common/ai/restore_rgb.{c,h} RGB denoise + upscale tiled +// driver, sRGB wrapper, shadow +// boost, DWT detail recovery +// - src/common/ai/restore_raw_bayer. RawNIND Bayer end-to-end +// {c,h} (batch + piped preview) +// - src/common/ai/restore_raw_linear. RawNIND linear / X-Trans +// {c,h} end-to-end +// +// this module handles UI, threading, output writing, and the per-task +// preview cache. +// +// 1. preview (interactive, single-image) +// triggered by clicking the picker thumbnail (which sets a +// "preview requested" flag) or by tab switching afterwards. +// two worker functions, dispatched via _preview_thread_dispatch: +// - _preview_thread: RGB denoise + upscale. exports the +// image at reduced resolution and runs +// dt_restore_run_patch() on a crop. +// - _preview_thread_raw: raw denoise. 
reads the mipmap CFA +// (Bayer) or runs darktable's demosaic +// pipe (X-Trans → lin_rec2020), feeds +// one inference tile, then runs the +// user's full pipe twice (with +// rawdenoise disabled) on the patched +// vs original CFA so before/after match +// a re-imported DNG. +// both deliver result buffers to the main thread via g_idle_add. +// the preview widget draws a split before/after view with a +// draggable divider. for RGB denoise, DWT-filtered detail is +// pre-computed once per inference so the strength slider can +// re-blend interactively without re-running the model. +// +// responsiveness: +// - tab switch routes through a 150 ms debounce +// (_schedule_preview_refresh) so rapid cycling collapses to +// one preview run +// - per-task preview cache keyed by (imgid, patch_center): on +// tab switch back to a previously-seen state we install the +// cached buffers and skip inference entirely +// - new triggers do NOT join the previous worker (would freeze +// the UI for the duration of an in-flight pipe call). the +// previous thread is detached; preview_inference_lock +// serialises inference so the new worker queues without +// fighting for the GPU; preview_sequence is bumped so any +// in-flight result is discarded by its idle callback when it +// eventually arrives. +// +// 2. batch processing (multi-image) +// runs as a dt_control_job on the user background queue. +// for each selected image, dispatches by task: +// - raw denoise (Bayer): pre-process the sensor CFA (black/WB/ +// pack), run tiled inference via dt_restore_raw_bayer(), un- +// process and re-mosaic, write a CFA DNG via +// dt_dng_write_cfa_bayer(). +// - raw denoise (X-Trans / linear): demosaic via the darktable +// pipe (rawprepare + highlights + demosaic only), run +// dt_restore_raw_linear(), write a LinearRaw DNG via +// dt_dng_write_linear(). 
+// - denoise / upscale (RGB): export via the darktable pipeline +// with a custom format module that intercepts the pixel +// buffer in _ai_write_image(). when strength < 100 (so the +// DWT detail recovery is active), buffer the full denoised +// output, apply dt_restore_apply_detail_recovery(), then +// write TIFF. otherwise stream tiles directly to TIFF via +// _process_tiled_tiff() to avoid buffering the full output. +// output TIFF embeds the selected output ICC profile and +// source EXIF. +// in all cases, the output is imported into the darktable library +// and grouped with the source image (when add-to-catalog is on). +// +// when the batch finishes, a single completion toast is shown via +// dt_control_log (e.g. "neural denoise: 3 images processed"). the +// module deliberately does NOT raise DT_SIGNAL_VIEWMANAGER_THUMBTABLE_ACTIVATE: +// lighttable ignores that signal while darkroom / map / culling / +// tethering / print_settings would swap the user's current view to +// the freshly-imported image and clobber any in-progress edit. +// +// 3. output parameters (collapsible section) +// common (all three tabs): +// - add to catalog: auto-import output into darktable library +// - output directory: supports darktable variables (e.g. $(FILE_FOLDER)) +// RGB tabs only (denoise / upscale — raw denoise writes DNG): +// - bit depth: 8/16/32-bit TIFF (default 16-bit) +// - output ICC profile: pick any installed profile, or keep +// image settings +// - preserve wide-gamut: when on, out-of-sRGB-gamut pixels pass +// through the denoise model unchanged (wide-gamut colors +// preserved exactly); when off, those pixels are clipped to +// sRGB and denoised like the rest +// +// threading +// --------- +// - preview: background GThread spawned per refresh. previous worker +// is detached (not joined) so the UI thread never blocks on tab +// switch. preview_inference_lock (GMutex) serialises the actual +// inference / pipe work so two workers don't fight for the GPU. 
+// stale workers are discarded via the atomic preview_sequence +// counter, checked at the dispatcher entry and at key points +// inside the worker; idle callbacks re-check before installing. +// gui_cleanup joins the latest worker and drains the main context +// to flush any pending idle callbacks before freeing module state. +// - batch: dt_control_job on DT_JOB_QUEUE_USER_BG. supports +// cancellation via dt_control_job_get_state(). +// - ai_registry->lock: held briefly to read provider setting. +// - all GTK updates go through g_idle_add to stay on the main thread. +// +// key structs +// ----------- +// dt_lib_neural_restore_t — module GUI state, preview cache, +// inference lock, debounce timer +// dt_neural_job_t — batch processing job parameters +// dt_neural_format_params_t — custom format module for export +// interception (RGB tabs) +// dt_neural_preview_data_t — preview thread input parameters +// (shared by both workers) +// dt_neural_preview_result_t — RGB-tab preview thread output +// dt_neural_preview_result_raw_t — raw-tab preview thread output +// (also carries the cached +// full-image buffers for re-pick) +// dt_neural_preview_capture_t — captures export pixels for the +// RGB-tab preview +// +// preferences +// ----------- +// CONF_STRENGTH — RGB denoise strength slider (0 = source, +// 100 = full denoise; internally mapped to +// a DWT-filtered residual recovery amount +// so lowering strength brings back texture +// without reintroducing noise-frequency +// content) +// CONF_RAW_STRENGTH — raw denoise strength slider (0 = source +// CFA, 100 = full denoised CFA; uniform +// blend at the re-mosaic sample level) +// CONF_ACTIVE_PAGE — last active notebook tab +// CONF_BIT_DEPTH — output TIFF bit depth (0=8, 1=16, 2=32) +// CONF_ADD_CATALOG — auto-import output into library +// CONF_OUTPUT_DIR — output directory pattern (supports variables) +// CONF_ICC_TYPE — output ICC profile type (image settings by default) +// CONF_ICC_FILE — filename for 
file-type ICC profiles +// CONF_PRESERVE_WIDE_GAMUT — pass-through out-of-sRGB-gamut pixels during denoise +// CONF_PREVIEW_EXPORT_SIZE — preview export longest-edge in pixels +// CONF_PREVIEW_HEIGHT — preview widget height in pixels +// CONF_EXPAND_OUTPUT — output section collapsed/expanded state #include "common/ai/restore.h" +#include "common/ai/restore_rgb.h" +#include "common/ai/restore_raw_bayer.h" +#include "common/ai/restore_raw_linear.h" #include "control/conf.h" #include "bauhaus/bauhaus.h" #include "common/act_on.h" #include "common/collection.h" #include "common/variables.h" #include "common/colorspaces.h" +#include "common/dng_writer.h" #include "common/exif.h" #include "common/film.h" #include "common/grouping.h" +#include "common/image_cache.h" +#include "common/mipmap_cache.h" #include "control/jobs/control_jobs.h" #include "control/signal.h" #include "develop/develop.h" +#include "develop/format.h" #include "dtgtk/button.h" #include "dtgtk/paint.h" #include "gui/accelerators.h" @@ -140,7 +221,8 @@ DT_MODULE(1) // warn the user when upscaled output exceeds this many megapixels #define LARGE_OUTPUT_MP 60.0 -#define CONF_DETAIL_RECOVERY "plugins/lighttable/neural_restore/detail_recovery" +#define CONF_STRENGTH "plugins/lighttable/neural_restore/strength" +#define CONF_RAW_STRENGTH "plugins/lighttable/neural_restore/raw_strength" #define CONF_ACTIVE_PAGE "plugins/lighttable/neural_restore/active_page" #define CONF_BIT_DEPTH "plugins/lighttable/neural_restore/bit_depth" #define CONF_ADD_CATALOG "plugins/lighttable/neural_restore/add_to_catalog" @@ -156,6 +238,8 @@ typedef enum dt_neural_task_t NEURAL_TASK_DENOISE = 0, NEURAL_TASK_UPSCALE_2X, NEURAL_TASK_UPSCALE_4X, + NEURAL_TASK_RAW_DENOISE, + NEURAL_TASK_COUNT, // sentinel; used to size per-task arrays } dt_neural_task_t; typedef enum dt_neural_bpp_t @@ -168,6 +252,7 @@ typedef enum dt_neural_bpp_t typedef struct dt_lib_neural_restore_t { GtkNotebook *notebook; + GtkWidget *raw_denoise_page; GtkWidget 
*denoise_page; GtkWidget *upscale_page; GtkWidget *scale_combo; @@ -177,6 +262,7 @@ typedef struct dt_lib_neural_restore_t char info_text_right[128]; char warning_text[128]; GtkWidget *recovery_slider; + GtkWidget *raw_strength_slider; dt_neural_task_t task; dt_restore_env_t *env; dt_restore_context_t *cached_ctx; @@ -212,6 +298,53 @@ typedef struct dt_lib_neural_restore_t unsigned char *export_cairo; int export_cairo_stride; + // raw denoise preview state — disjoint from the export-based preview + // above. cached per-image (CFA for Bayer, demosaicked lin_rec2020 for + // X-Trans / linear) so re-picking a new crop on the same image skips + // the slow load + demosaic; freed on imgid or sensor-type change. + dt_imgid_t preview_raw_imgid; + dt_restore_sensor_class_t preview_raw_sensor_class; + float *preview_full_cfa; // Bayer: full sensor (w*h floats) + int preview_full_w; + int preview_full_h; + float *preview_full_lin; // linear: 3ch interleaved (w*h*3 floats) + int preview_lin_w; + int preview_lin_h; + // per-refresh inference output (3ch interleaved at the displayed crop + // dims, both in lin_rec2020). cached so the strength slider can blend + // these without re-running inference. + float *preview_raw_src_rgb; + float *preview_raw_denoised_rgb; + int preview_raw_crop_w; + int preview_raw_crop_h; + // strength slider debounce: re-blend on UI thread 50 ms after the + // last value-changed emit. set/replaced via g_timeout_add. + guint preview_strength_timer; + // debounce timer for `_trigger_preview`. tab switches and rapid + // re-triggers schedule via this so a quick burst of switches + // doesn't pile up worker threads. set/replaced via g_timeout_add; + // 0 means no pending trigger + guint preview_trigger_timer; + // serializes the expensive inference / pipe work in worker threads. 
+ // a stale worker (sequence bumped while it was in flight) holds this + // until it finishes, so a freshly-spawned worker waits its turn + // rather than competing for the same GPU/CPU + GMutex preview_inference_lock; + // per-task cache of the last successful preview, keyed by + // (imgid, patch_center). on tab switch we look up the new task's + // slot; if it matches the current image+patch we install the + // cached buffers and skip inference entirely. invalidated on + // image change or patch move + struct { + gboolean valid; + dt_imgid_t imgid; + float patch_center[2]; + int crop_w, crop_h; + float *before_rgb; // 3ch interleaved, crop_w*crop_h*3 floats + float *after_rgb; // same + float *detail; // denoise: DWT luminance detail; NULL otherwise + } preview_cache[NEURAL_TASK_COUNT]; + // output settings (collapsible) dt_gui_collapsible_section_t cs_output; GtkWidget *bpp_combo; @@ -230,7 +363,12 @@ typedef struct dt_neural_job_t dt_job_t *control_job; dt_restore_context_t *ctx; int scale; - float detail_recovery; + float strength; + float raw_strength; // 0..1 blend for raw denoise + // raw denoise only: sensor class of the currently-loaded rawdenoise + // ctx. 
lets us reuse ctx across images of the same class in a batch + // and avoid reloading the ORT session for every image + dt_restore_sensor_class_t raw_ctx_sensor_class; dt_lib_module_t *self; dt_neural_bpp_t bpp; gboolean add_to_catalog; @@ -286,6 +424,14 @@ typedef struct dt_neural_preview_result_t int sequence; int width; int height; + // cache key components copied from the originating preview request: + // the worker may run after the user has switched tabs/images, so + // the idle callback uses these (not d->* live values) to decide + // whether the result is still applicable to the current state and + // to populate the per-task preview cache slot + dt_neural_task_t task; + dt_imgid_t imgid; + float patch_center[2]; } dt_neural_preview_result_t; const char *name(dt_lib_module_t *self) { return _("neural restore"); } @@ -428,6 +574,10 @@ static dt_restore_context_t *_load_for_task( { case NEURAL_TASK_DENOISE: return dt_restore_load_denoise(env); + case NEURAL_TASK_RAW_DENOISE: + // focus on bayer for now; auto-pick bayer vs linear per image + // sensor is a follow-up (see dt_restore_load_rawdenoise_linear) + return dt_restore_load_rawdenoise_bayer(env); case NEURAL_TASK_UPSCALE_2X: return dt_restore_load_upscale_x2(env); case NEURAL_TASK_UPSCALE_4X: @@ -437,6 +587,20 @@ static dt_restore_context_t *_load_for_task( } } +// short, untranslated task names for debug logs (use the localised +// labels at line ~1022 for user-visible strings) +static const char *_task_log_name(dt_neural_task_t task) +{ + switch(task) + { + case NEURAL_TASK_DENOISE: return "denoise"; + case NEURAL_TASK_RAW_DENOISE: return "raw denoise"; + case NEURAL_TASK_UPSCALE_2X: return "upscale 2x"; + case NEURAL_TASK_UPSCALE_4X: return "upscale 4x"; + default: return "?"; + } +} + // check if a model is available for a task static gboolean _task_model_available( dt_restore_env_t *env, @@ -446,6 +610,8 @@ static gboolean _task_model_available( { case NEURAL_TASK_DENOISE: return 
dt_restore_denoise_available(env); + case NEURAL_TASK_RAW_DENOISE: + return dt_restore_rawdenoise_available(env); default: return dt_restore_upscale_available(env); } @@ -579,7 +745,10 @@ static int _ai_write_image(dt_imageio_module_data_t *data, } } - const float recovery_alpha = job->detail_recovery / 100.0f; + // strength: 100 = full denoise (no recovery), 0 = source-like. + // DWT detail recovery runs whenever strength < 100, mixing + // (1 - strength/100) of the filtered residual back into the output. + const float recovery_alpha = 1.0f - job->strength / 100.0f; const gboolean need_buffer = (recovery_alpha > 0.0f && S == 1); int res; @@ -687,10 +856,11 @@ static const char *_task_suffix(dt_neural_task_t task) { switch(task) { - case NEURAL_TASK_DENOISE: return "_denoise"; - case NEURAL_TASK_UPSCALE_2X: return "_upscale-2x"; - case NEURAL_TASK_UPSCALE_4X: return "_upscale-4x"; - default: return "_restore"; + case NEURAL_TASK_DENOISE: return "_denoise"; + case NEURAL_TASK_RAW_DENOISE: return "_raw-denoise"; + case NEURAL_TASK_UPSCALE_2X: return "_upscale-2x"; + case NEURAL_TASK_UPSCALE_4X: return "_upscale-4x"; + default: return "_restore"; } } @@ -748,12 +918,237 @@ static void _job_cleanup(void *param) g_free(job); } +// raw-denoise batch path: bypasses the darktable export pipeline and +// goes directly from the source CFA mosaic to a denoised DNG. unlike +// RGB denoise/upscale, there is no demosaic / WB / tonemap / export +// involved — the raw pixels leave and re-enter the pipeline at the +// same stage, so the darktable re-import runs its normal pipeline on +// cleaner data. intentionally self-contained and free of interactions +// with the RGB denoise path +// ensure j->ctx is loaded with the rawdenoise variant matching the +// image's sensor class. reloads if needed; tracks the currently-loaded +// variant in j->raw_ctx_sensor_class so consecutive images of the same +// class don't pay the reload cost. 
returns 0 on success +static int _ensure_raw_ctx(dt_neural_job_t *j, + dt_restore_sensor_class_t cls) +{ + if(j->ctx && j->raw_ctx_sensor_class == cls) + return 0; + + if(j->ctx) + { + dt_restore_unref(j->ctx); + j->ctx = NULL; + } + const char *label = NULL; + switch(cls) + { + case DT_RESTORE_SENSOR_CLASS_BAYER: + j->ctx = dt_restore_load_rawdenoise_bayer(j->env); + label = _("bayer"); + break; + case DT_RESTORE_SENSOR_CLASS_XTRANS: + j->ctx = dt_restore_load_rawdenoise_xtrans(j->env); + label = _("x-trans"); + break; + case DT_RESTORE_SENSOR_CLASS_LINEAR: + j->ctx = dt_restore_load_rawdenoise_linear(j->env); + label = _("linear"); + break; + default: + return 1; + } + if(!j->ctx) + { + dt_control_log(_("failed to load AI raw denoise %s model"), label); + return 1; + } + j->raw_ctx_sensor_class = cls; + return 0; +} + +// bayer variant: source CFA (single-channel) -> denoise -> CFA DNG +static int _process_raw_denoise_bayer(dt_neural_job_t *j, + dt_imgid_t imgid, + const char *out_filename, + const char *src_path, + const dt_image_t *img_meta) +{ + dt_mipmap_buffer_t mbuf; + dt_mipmap_cache_get(&mbuf, imgid, DT_MIPMAP_FULL, + DT_MIPMAP_BLOCKING, 'r'); + if(!mbuf.buf) + { + dt_print(DT_DEBUG_AI, + "[neural_restore] failed to load raw mosaic for imgid %d", + imgid); + dt_mipmap_cache_release(&mbuf); + return 1; + } + + const int width = img_meta->width; + const int height = img_meta->height; + const size_t npix = (size_t)width * height; + float *cfa_in = g_try_malloc(npix * sizeof(float)); + if(!cfa_in) + { + dt_mipmap_cache_release(&mbuf); + return 1; + } + + if(img_meta->buf_dsc.datatype == TYPE_UINT16) + { + const uint16_t *const src = (const uint16_t *)mbuf.buf; + for(size_t i = 0; i < npix; i++) cfa_in[i] = (float)src[i]; + } + else if(img_meta->buf_dsc.datatype == TYPE_FLOAT) + { + memcpy(cfa_in, mbuf.buf, npix * sizeof(float)); + } + else + { + dt_control_log(_("raw denoise: unsupported raw datatype")); + g_free(cfa_in); + dt_mipmap_cache_release(&mbuf); + 
return 1; + } + + dt_mipmap_cache_release(&mbuf); + + uint16_t *cfa_out = g_try_malloc(npix * sizeof(uint16_t)); + if(!cfa_out) + { + g_free(cfa_in); + return 1; + } + + int res = dt_restore_raw_bayer(j->ctx, img_meta, cfa_in, + width, height, cfa_out, + j->raw_strength, + j->control_job); + g_free(cfa_in); + if(res != 0) + { + g_free(cfa_out); + return res; + } + + uint8_t *exif_blob = NULL; + const int exif_len = dt_exif_read_blob(&exif_blob, src_path, imgid, + FALSE, width, height, TRUE); + res = dt_dng_write_cfa_bayer(out_filename, cfa_out, + width, height, img_meta, + exif_blob, exif_len); + g_free(exif_blob); + g_free(cfa_out); + return res; +} + +// linear variant: darktable's demosaic runs inside raw_restore_linear, +// so there's no CFA buffer to hand in. output is a 3ch linear DNG +static int _process_raw_denoise_linear(dt_neural_job_t *j, + dt_imgid_t imgid, + const char *out_filename, + const char *src_path, + const dt_image_t *img_meta) +{ + float *rgb = NULL; + int w = 0, h = 0; + int res = dt_restore_raw_linear(j->ctx, imgid, &rgb, &w, &h, + j->raw_strength, j->control_job); + if(res != 0 || !rgb) + { + g_free(rgb); + return res ? res : 1; + } + + uint8_t *exif_blob = NULL; + const int exif_len = dt_exif_read_blob(&exif_blob, src_path, imgid, + FALSE, w, h, TRUE); + res = dt_dng_write_linear(out_filename, rgb, w, h, img_meta, + exif_blob, exif_len); + g_free(exif_blob); + dt_free_align(rgb); + return res; +} + +static int _process_raw_denoise_one(dt_neural_job_t *j, + dt_imgid_t imgid, + const char *out_filename, + const char *src_path) +{ + // force the raw to be loaded so buf_dsc.{filters,channels,datatype} + // are populated. 
for a fresh session, dt_image_cache_get alone may + // return a dt_image_t whose buf_dsc is zeroed because rawspeed has + // not been invoked on this id yet + dt_mipmap_buffer_t warmup; + dt_mipmap_cache_get(&warmup, imgid, DT_MIPMAP_FULL, + DT_MIPMAP_BLOCKING, 'r'); + const gboolean loaded = (warmup.buf != NULL); + dt_mipmap_cache_release(&warmup); + if(!loaded) + { + dt_print(DT_DEBUG_AI, + "[neural_restore] mipmap warmup failed for imgid %d", imgid); + dt_control_log(_("raw denoise: cannot load source image")); + return 1; + } + + const dt_image_t *cached = dt_image_cache_get(imgid, 'r'); + if(!cached) return 1; + dt_image_t img_meta = *cached; + dt_image_cache_read_release(cached); + + const uint32_t filters = img_meta.buf_dsc.filters; + const uint32_t channels = img_meta.buf_dsc.channels; + const uint32_t flags = img_meta.flags; + const dt_restore_sensor_class_t cls = dt_restore_classify_sensor(&img_meta); + + const char *cls_name + = (cls == DT_RESTORE_SENSOR_CLASS_BAYER) ? "bayer" + : (cls == DT_RESTORE_SENSOR_CLASS_XTRANS) ? "x-trans" + : (cls == DT_RESTORE_SENSOR_CLASS_LINEAR) ? "linear" + : "unsupported"; + dt_print(DT_DEBUG_AI, + "[neural_restore] imgid %d: flags=0x%x channels=%u " + "filters=0x%x (%s)", imgid, flags, channels, filters, + cls_name); + + if(cls == DT_RESTORE_SENSOR_CLASS_UNSUPPORTED) + { + dt_control_log(_("raw denoise: image is not a supported raw sensor format")); + return 1; + } + + if(_ensure_raw_ctx(j, cls)) return 1; + + switch(cls) + { + case DT_RESTORE_SENSOR_CLASS_BAYER: + return _process_raw_denoise_bayer(j, imgid, out_filename, + src_path, &img_meta); + case DT_RESTORE_SENSOR_CLASS_XTRANS: + // today: X-Trans runs through the linear pipeline. 
a future + // dedicated xtrans_v1 model with a different input format would + // get its own _process_raw_denoise_xtrans() here; the dispatch + // structure stays, just the target function swaps + return _process_raw_denoise_linear(j, imgid, out_filename, + src_path, &img_meta); + case DT_RESTORE_SENSOR_CLASS_LINEAR: + return _process_raw_denoise_linear(j, imgid, out_filename, + src_path, &img_meta); + default: + return 1; + } +} + static int32_t _process_job_run(dt_job_t *job) { dt_neural_job_t *j = dt_control_job_get_params(job); - const char *task_name = (j->task == NEURAL_TASK_DENOISE) - ? _("denoise") : _("upscale"); + const char *task_name = (j->task == NEURAL_TASK_DENOISE) ? _("denoise") + : (j->task == NEURAL_TASK_RAW_DENOISE) ? _("raw denoise") + : _("upscale"); char msg[256]; snprintf(msg, sizeof(msg), _("loading %s model..."), task_name); dt_control_job_set_progress_message(job, msg); @@ -863,16 +1258,20 @@ static int32_t _process_job_run(dt_job_t *job) } g_free(out_dir_resolved); - // find unique filename: base.tif, base_1.tif, ... + // raw denoise writes DNG; RGB denoise/upscale writes TIFF + const char *ext + = (j->task == NEURAL_TASK_RAW_DENOISE) ? "dng" : "tif"; + + // find unique filename: base., base_1., ... char filename[PATH_MAX]; - snprintf(filename, sizeof(filename), "%s.tif", base); + snprintf(filename, sizeof(filename), "%s.%s", base, ext); if(g_file_test(filename, G_FILE_TEST_EXISTS)) { gboolean found = FALSE; for(int s = 1; s < 10000; s++) { - snprintf(filename, sizeof(filename), "%s_%d.tif", base, s); + snprintf(filename, sizeof(filename), "%s_%d.%s", base, s, ext); if(!g_file_test(filename, G_FILE_TEST_EXISTS)) { found = TRUE; @@ -893,40 +1292,54 @@ static int32_t _process_job_run(dt_job_t *job) dt_print(DT_DEBUG_AI, "[neural_restore] processing imgid %d -> %s", imgid, filename); snprintf(msg, sizeof(msg), - (j->task == NEURAL_TASK_DENOISE) ? _("denoising image %d/%d...") - : (j->task == NEURAL_TASK_UPSCALE_2X) ? 
_("upscaling 2x image %d/%d...") - : _("upscaling 4x image %d/%d..."), + (j->task == NEURAL_TASK_DENOISE) ? _("denoising image %d/%d...") + : (j->task == NEURAL_TASK_RAW_DENOISE) ? _("raw denoising image %d/%d...") + : (j->task == NEURAL_TASK_UPSCALE_2X) ? _("upscaling 2x image %d/%d...") + : _("upscaling 4x image %d/%d..."), count + 1, total); dt_control_job_set_progress_message(job, msg); - const int export_err - = dt_imageio_export_with_flags(imgid, - filename, - &fmt, - (dt_imageio_module_data_t *)&fmt_params, - FALSE, // ignore_exif — pass EXIF to write_image - FALSE, // display_byteorder - TRUE, // high_quality - TRUE, // upscale - FALSE, // is_scaling - 1.0, // scale_factor - FALSE, // thumbnail_export - NULL, // filter - FALSE, // copy_metadata - FALSE, // export_masks - (j->icc_type == DT_COLORSPACE_NONE) - ? dt_colorspaces_get_work_profile(imgid)->type - : j->icc_type, - j->icc_filename, - DT_INTENT_PERCEPTUAL, - NULL, NULL, - count, total, NULL, -1); - - if(export_err) + int step_err = 0; + if(j->task == NEURAL_TASK_RAW_DENOISE) + { + step_err = _process_raw_denoise_one(j, imgid, filename, srcpath); + if(step_err) + dt_control_log(_("raw denoise failed for image %d"), imgid); + } + else + { + step_err = dt_imageio_export_with_flags( + imgid, + filename, + &fmt, + (dt_imageio_module_data_t *)&fmt_params, + FALSE, // ignore_exif — pass EXIF to write_image + FALSE, // display_byteorder + TRUE, // high_quality + TRUE, // upscale + FALSE, // is_scaling + 1.0, // scale_factor + FALSE, // thumbnail_export + NULL, // filter + FALSE, // copy_metadata + FALSE, // export_masks + (j->icc_type == DT_COLORSPACE_NONE) + ? 
dt_colorspaces_get_work_profile(imgid)->type + : j->icc_type, + j->icc_filename, + DT_INTENT_PERCEPTUAL, + NULL, NULL, + count, total, NULL, -1); + if(step_err) + { + dt_print(DT_DEBUG_AI, + "[neural_restore] export failed for imgid %d", imgid); + dt_control_log(_("neural restore: export failed")); + } + } + + if(step_err) { - dt_print(DT_DEBUG_AI, - "[neural_restore] export failed for imgid %d", imgid); - dt_control_log(_("neural restore: export failed")); dt_control_job_set_progress(job, (double)++count / total); continue; } @@ -1031,29 +1444,50 @@ static void _update_info_label(dt_lib_neural_restore_t *d) } } + // raw denoise: DNG variant batch will produce. Source of truth is + // preview_raw_sensor_class — buf_dsc.filters is zeroed until rawspeed + // decodes the image, so probing it here would misclassify an + // unloaded X-Trans RAF. both X-Trans and the linear-fallback class + // currently write LinearRaw DNG; only Bayer writes CFA DNG + if(d->task == NEURAL_TASK_RAW_DENOISE + && dt_is_valid_imgid(imgid) + && d->preview_raw_imgid == imgid) + { + const gboolean is_bayer_out + = (d->preview_raw_sensor_class == DT_RESTORE_SENSOR_CLASS_BAYER); + snprintf(d->info_text_left, sizeof(d->info_text_left), "%s", + is_bayer_out ? _("output: Bayer CFA DNG") + : _("output: LinearRaw DNG")); + } + // gamut note (informational, not a warning): reuse the same info // line as the upscale size display. for denoise, shows standalone - // in info_text_left; for upscale, appended to the size info - const dt_colorspaces_color_profile_type_t icc_type - = dt_conf_key_exists(CONF_ICC_TYPE) - ? dt_conf_get_int(CONF_ICC_TYPE) - : DT_COLORSPACE_NONE; - if(dt_image_has_wide_gamut_output_profile(imgid, icc_type)) - { - const gboolean preserve = dt_conf_key_exists(CONF_PRESERVE_WIDE_GAMUT) - ? dt_conf_get_bool(CONF_PRESERVE_WIDE_GAMUT) : TRUE; - const char *msg = (scale == 1 && preserve) - ? 
_("wide-gamut preserved, not denoised") - : _("wide-gamut clipped"); - if(d->info_text_right[0]) - { - const size_t used = strlen(d->info_text_right); - snprintf(d->info_text_right + used, sizeof(d->info_text_right) - used, - " · %s", msg); - } - else + // in info_text_left; for upscale, appended to the size info. not + // applicable to raw denoise — that path writes camRGB DNG without + // any sRGB wrapper, so there's no gamut clipping to warn about + if(d->task != NEURAL_TASK_RAW_DENOISE) + { + const dt_colorspaces_color_profile_type_t icc_type + = dt_conf_key_exists(CONF_ICC_TYPE) + ? dt_conf_get_int(CONF_ICC_TYPE) + : DT_COLORSPACE_NONE; + if(dt_image_has_wide_gamut_output_profile(imgid, icc_type)) { - snprintf(d->info_text_left, sizeof(d->info_text_left), "%s", msg); + const gboolean preserve = dt_conf_key_exists(CONF_PRESERVE_WIDE_GAMUT) + ? dt_conf_get_bool(CONF_PRESERVE_WIDE_GAMUT) : TRUE; + const char *msg = (scale == 1 && preserve) + ? _("wide-gamut preserved, not denoised") + : _("wide-gamut clipped"); + if(d->info_text_right[0]) + { + const size_t used = strlen(d->info_text_right); + snprintf(d->info_text_right + used, sizeof(d->info_text_right) - used, + " · %s", msg); + } + else + { + snprintf(d->info_text_left, sizeof(d->info_text_left), "%s", msg); + } } } @@ -1074,26 +1508,141 @@ static void _task_changed(dt_lib_neural_restore_t *d) gtk_widget_queue_draw(d->preview_area); } - // restore detail recovery slider from conf when switching to denoise, - // reset to 0 when switching away (upscale has no detail recovery). - // use _recovery_changing flag to avoid redundant conf writes from - // the slider's value-changed callback + // restore strength slider from conf when switching to a scale==1 + // task (denoise / raw denoise — both benefit from the strength + // knob), snap to 100 when switching to upscale (upscale has no + // strength semantics — see note below). 
use _strength_changing + // flag to avoid redundant conf writes from the slider's + // value-changed callback. + // raw denoise has no DWT analysis of exported preview pixels, so + // the slider is hidden there — but for DENOISE and UPSCALE we + // preserve the master behaviour (slider visible, value restored / + // reset). d->recovery_changing = TRUE; - if(d->task == NEURAL_TASK_DENOISE) + if(d->task == NEURAL_TASK_DENOISE || d->task == NEURAL_TASK_RAW_DENOISE) { - const float saved = dt_conf_get_float(CONF_DETAIL_RECOVERY); + const float saved = dt_conf_key_exists(CONF_STRENGTH) + ? dt_conf_get_float(CONF_STRENGTH) : 100.0f; dt_bauhaus_slider_set(d->recovery_slider, saved); } else { - dt_bauhaus_slider_set(d->recovery_slider, 0.0f); + dt_bauhaus_slider_set(d->recovery_slider, 100.0f); } + gtk_widget_set_visible(d->recovery_slider, + d->task != NEURAL_TASK_RAW_DENOISE); d->recovery_changing = FALSE; + // output settings that only apply to the RGB-export-based tasks: + // bit depth selects TIFF bits-per-pixel (raw denoise writes DNG), + // output ICC profile applies to the exported TIFF, and + // preserve-wide-gamut is part of the sRGB-gamma wrapper around the + // denoise-nind model. bpp + ICC apply to any TIFF-emitting task + // (denoise + upscale); preserve-wide-gamut only matters for denoise + // (upscale has no pixel-to-pixel correspondence to pass through). 
+ // raw denoise writes a DNG directly from the sensor-space inference + // result, so none of these knobs apply to it + const gboolean tiff_knobs_visible + = (d->task != NEURAL_TASK_RAW_DENOISE); + const gboolean wide_gamut_visible + = (d->task == NEURAL_TASK_DENOISE); + if(d->bpp_combo) + gtk_widget_set_visible(d->bpp_combo, tiff_knobs_visible); + if(d->profile_combo) + gtk_widget_set_visible(d->profile_combo, tiff_knobs_visible); + if(d->preserve_wide_gamut_toggle) + gtk_widget_set_visible(d->preserve_wide_gamut_toggle, wide_gamut_visible); + _update_info_label(d); _update_button_sensitivity(d); } +// per-task preview cache helpers +// +// the cache holds, per task, the buffers needed to redisplay the most +// recent successful preview without re-running inference. on tab switch +// we look up the new task's slot keyed by (imgid, patch_center). on +// hit we install the cached buffers and skip the worker; on miss we +// schedule one. invalidated wholesale on image / patch change +// +// for raw denoise, before_rgb / after_rgb hold the unblended source vs +// fully-denoised lin_rec2020 crops (= preview_raw_src_rgb / +// preview_raw_denoised_rgb). 
detail is always NULL there +// +// for RGB denoise + upscale, before_rgb / after_rgb hold the displayed +// preview_before / preview_after, and detail (denoise only) holds the +// DWT luminance residual used by the strength slider + +static void _preview_cache_free_slot(dt_lib_neural_restore_t *d, int task) +{ + g_free(d->preview_cache[task].before_rgb); + d->preview_cache[task].before_rgb = NULL; + g_free(d->preview_cache[task].after_rgb); + d->preview_cache[task].after_rgb = NULL; + dt_free_align(d->preview_cache[task].detail); + d->preview_cache[task].detail = NULL; + d->preview_cache[task].valid = FALSE; +} + +static void _preview_cache_invalidate_all(dt_lib_neural_restore_t *d) +{ + for(int t = 0; t < NEURAL_TASK_COUNT; t++) + _preview_cache_free_slot(d, t); +} + +static gboolean _preview_cache_hit(dt_lib_neural_restore_t *d, + dt_neural_task_t task, + dt_imgid_t imgid) +{ + if(task >= NEURAL_TASK_COUNT) return FALSE; + const __typeof__(d->preview_cache[0]) *e = &d->preview_cache[task]; + // exact match on patch_center (no fp tolerance: we store the exact + // value the worker received, so equality is reliable) + return e->valid + && e->imgid == imgid + && e->patch_center[0] == d->patch_center[0] + && e->patch_center[1] == d->patch_center[1]; +} + +// memcpy buffers into the cache slot for `task`. 
caller retains +// ownership of the source pointers (we duplicate) +static void _preview_cache_store(dt_lib_neural_restore_t *d, + dt_neural_task_t task, + dt_imgid_t imgid, + const float patch_center[2], + int crop_w, int crop_h, + const float *before, const float *after, + const float *detail) +{ + // task is an unsigned enum, no need for < 0 check + if(task >= NEURAL_TASK_COUNT) return; + if(crop_w <= 0 || crop_h <= 0 || !before || !after) return; + _preview_cache_free_slot(d, task); + const size_t n3 = (size_t)crop_w * crop_h * 3; + d->preview_cache[task].before_rgb = g_try_malloc(n3 * sizeof(float)); + d->preview_cache[task].after_rgb = g_try_malloc(n3 * sizeof(float)); + if(!d->preview_cache[task].before_rgb || !d->preview_cache[task].after_rgb) + { + _preview_cache_free_slot(d, task); + return; + } + memcpy(d->preview_cache[task].before_rgb, before, n3 * sizeof(float)); + memcpy(d->preview_cache[task].after_rgb, after, n3 * sizeof(float)); + if(detail) + { + const size_t n1 = (size_t)crop_w * crop_h; + d->preview_cache[task].detail = dt_alloc_align_float(n1); + if(d->preview_cache[task].detail) + memcpy(d->preview_cache[task].detail, detail, n1 * sizeof(float)); + } + d->preview_cache[task].imgid = imgid; + d->preview_cache[task].patch_center[0] = patch_center[0]; + d->preview_cache[task].patch_center[1] = patch_center[1]; + d->preview_cache[task].crop_w = crop_w; + d->preview_cache[task].crop_h = crop_h; + d->preview_cache[task].valid = TRUE; +} + // rebuild the "after" cairo surface from cached float buffers, applying // DWT-filtered detail recovery so that slider changes don't re-run inference static void _rebuild_cairo_after(dt_lib_neural_restore_t *d) @@ -1103,7 +1652,11 @@ static void _rebuild_cairo_after(dt_lib_neural_restore_t *d) const int w = d->preview_w; const int h = d->preview_h; const int stride = d->cairo_stride; - const float alpha = dt_conf_get_float(CONF_DETAIL_RECOVERY) / 100.0f; + // strength = 100 → no recovery (max denoise visible); 
strength = 0 + // → full filtered detail back. preview mirrors batch semantics. + const float strength = dt_conf_key_exists(CONF_STRENGTH) + ? dt_conf_get_float(CONF_STRENGTH) : 100.0f; + const float alpha = 1.0f - strength / 100.0f; const gboolean recover = (alpha > 0.0f && d->preview_detail); for(int y = 0; y < h; y++) @@ -1180,6 +1733,13 @@ static gboolean _preview_result_idle(gpointer data) res->width, res->height, stride); _rebuild_cairo_after(d); + // store this result in the per-task cache so a later tab switch + // back to the same task / image / patch skips inference + _preview_cache_store(d, res->task, res->imgid, res->patch_center, + res->width, res->height, + d->preview_before, d->preview_after, + d->preview_detail); + d->preview_ready = TRUE; d->preview_generating = FALSE; _update_button_sensitivity(d); @@ -1443,6 +2003,10 @@ static gpointer _preview_thread(gpointer data) result->sequence = pd->sequence; result->width = pw; result->height = ph; + result->task = pd->task; + result->imgid = pd->imgid; + result->patch_center[0] = pd->patch_center[0]; + result->patch_center[1] = pd->patch_center[1]; g_idle_add(_preview_result_idle, result); cleanup: @@ -1455,10 +2019,26 @@ static void _cancel_preview(dt_lib_module_t *self) dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; d->preview_ready = FALSE; d->preview_generating = FALSE; + // bump sequence so any in-flight worker (and its idle callback) + // discards its result. we DO NOT join here — that would block the + // UI for the full duration of a running inference. the worker + // keeps running in the background and exits silently. but: we DO + // need to wait for it before freeing export_pixels below, since the + // worker may still be reading from them. 
take + release the + // inference lock as a synchronisation barrier — the worker holds + // it during the heavy work, so once we get it we know it's done + // touching shared buffers g_atomic_int_inc(&d->preview_sequence); + if(d->preview_trigger_timer) + { + g_source_remove(d->preview_trigger_timer); + d->preview_trigger_timer = 0; + } if(d->preview_thread) { - g_thread_join(d->preview_thread); + g_mutex_lock(&d->preview_inference_lock); + g_mutex_unlock(&d->preview_inference_lock); + g_thread_unref(d->preview_thread); d->preview_thread = NULL; } // invalidate cached export (image changed) @@ -1470,6 +2050,759 @@ static void _cancel_preview(dt_lib_module_t *self) gtk_widget_queue_draw(d->preview_area); } +// ============================================================================ +// raw denoise preview path. parallel to the export-based preview above: +// * pixel source: full-resolution CFA (Bayer) or demosaicked lin_rec2020 +// (X-Trans / linear), cached per image so re-picks reuse it +// * inference: dt_restore_raw_{bayer,linear}_preview returns 3ch +// source + 3ch denoised crops (both lin_rec2020) for the displayed +// region, using one fixed-size tile that matches the JIT-compiled +// batch session +// * strength slider: re-blends the cached src/denoised on the UI thread +// (microseconds) without touching the model — debounced to 50 ms so +// fast drags don't queue up redraws +// ============================================================================ + +#define RAW_PREVIEW_STRENGTH_DEBOUNCE_MS 50 + +typedef struct dt_neural_preview_result_raw_t +{ + dt_lib_module_t *self; + float *src_rgb; // crop_w * crop_h * 3, lin_rec2020 + float *denoised_rgb; // same dims, lin_rec2020 (gain-matched) + int width; + int height; + int sequence; + // optional: full-image buffers to install into the cache (NULL when + // the worker reused an already-cached buffer for this image). 
+ // ownership transfers to d on idle install + float *take_full_cfa; // Bayer; allocated with g_malloc + float *take_full_lin; // X-Trans/linear; allocated with dt_alloc_align + int full_w; + int full_h; + dt_imgid_t full_imgid; + dt_restore_sensor_class_t full_sensor_class; + // optional: picker thumbnail (4ch interleaved float) produced via + // dt_imageio_export_with_flags. matches whatever the user sees in + // darkroom — identical colours to the preview's before/after. NULL + // when we reused an already-cached export_pixels on d. + float *take_export_pixels; + int export_thumb_w; + int export_thumb_h; + // cache key (see comment on dt_neural_preview_result_t) + float patch_center[2]; +} dt_neural_preview_result_raw_t; + +// blend cached src + denoised crops at the given strength, write into +// preview_before/after (allocating fresh buffers), rebuild cairo, and +// queue a redraw. UI thread only. +static void _blend_raw_into_preview(dt_lib_neural_restore_t *d, + float strength) +{ + if(!d->preview_raw_src_rgb || !d->preview_raw_denoised_rgb) return; + const int w = d->preview_raw_crop_w; + const int h = d->preview_raw_crop_h; + if(w <= 0 || h <= 0) return; + + if(strength < 0.0f) strength = 0.0f; + if(strength > 1.0f) strength = 1.0f; + const float a = strength; + const float ia = 1.0f - strength; + + const size_t n3 = (size_t)w * h * 3; + + // preview_before stays at the source (split widget shows pre-denoise + // on one side, blended-strength on the other) + g_free(d->preview_before); + d->preview_before = g_malloc(n3 * sizeof(float)); + memcpy(d->preview_before, d->preview_raw_src_rgb, n3 * sizeof(float)); + + // preview_after = α · denoised + (1-α) · source, per channel + g_free(d->preview_after); + d->preview_after = g_malloc(n3 * sizeof(float)); + for(size_t i = 0; i < n3; i++) + d->preview_after[i] + = a * d->preview_raw_denoised_rgb[i] + + ia * d->preview_raw_src_rgb[i]; + + // detail-recovery DWT does not apply to raw (different pipeline + // 
position; would need its own analysis pass). leave NULL so + // _rebuild_cairo_after takes the no-recovery branch. + dt_free_align(d->preview_detail); + d->preview_detail = NULL; + + d->preview_w = w; + d->preview_h = h; + + // rebuild cached cairo surfaces + g_free(d->cairo_before); + g_free(d->cairo_after); + const int stride = cairo_format_stride_for_width(CAIRO_FORMAT_RGB24, w); + d->cairo_before = g_malloc(stride * h); + d->cairo_after = g_malloc(stride * h); + d->cairo_stride = stride; + _float_rgb_to_cairo(d->preview_before, d->cairo_before, w, h, stride); + _rebuild_cairo_after(d); + + d->preview_ready = TRUE; + gtk_widget_queue_draw(d->preview_area); +} + +// install a cached preview slot into the active preview buffers and +// rebuild cairo so the widget displays it. dispatches by task: raw +// denoise needs to repopulate preview_raw_src/denoised_rgb and re-blend +// at the current strength; RGB denoise / upscale just install +// preview_before/after/detail and rebuild the after surface +static void _install_cache_slot_raw(dt_lib_module_t *self, + dt_neural_task_t task) +{ + dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; + const __typeof__(d->preview_cache[0]) *e = &d->preview_cache[task]; + if(!e->valid) return; + const size_t n3 = (size_t)e->crop_w * e->crop_h * 3; + g_free(d->preview_raw_src_rgb); + g_free(d->preview_raw_denoised_rgb); + d->preview_raw_src_rgb = g_malloc(n3 * sizeof(float)); + d->preview_raw_denoised_rgb = g_malloc(n3 * sizeof(float)); + memcpy(d->preview_raw_src_rgb, e->before_rgb, n3 * sizeof(float)); + memcpy(d->preview_raw_denoised_rgb, e->after_rgb, n3 * sizeof(float)); + d->preview_raw_crop_w = e->crop_w; + d->preview_raw_crop_h = e->crop_h; + const float strength + = dt_bauhaus_slider_get(d->raw_strength_slider) / 100.0f; + _blend_raw_into_preview(d, strength); +} + +static void _install_cache_slot_rgb(dt_lib_module_t *self, + dt_neural_task_t task) +{ + dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t 
*)self->data; + const __typeof__(d->preview_cache[0]) *e = &d->preview_cache[task]; + if(!e->valid) return; + const size_t n3 = (size_t)e->crop_w * e->crop_h * 3; + g_free(d->preview_before); + g_free(d->preview_after); + dt_free_align(d->preview_detail); + d->preview_before = g_malloc(n3 * sizeof(float)); + d->preview_after = g_malloc(n3 * sizeof(float)); + memcpy(d->preview_before, e->before_rgb, n3 * sizeof(float)); + memcpy(d->preview_after, e->after_rgb, n3 * sizeof(float)); + d->preview_detail = NULL; + if(e->detail) + { + const size_t n1 = (size_t)e->crop_w * e->crop_h; + d->preview_detail = dt_alloc_align_float(n1); + if(d->preview_detail) + memcpy(d->preview_detail, e->detail, n1 * sizeof(float)); + } + d->preview_w = e->crop_w; + d->preview_h = e->crop_h; + // rebuild cairo surfaces + g_free(d->cairo_before); + g_free(d->cairo_after); + const int stride + = cairo_format_stride_for_width(CAIRO_FORMAT_RGB24, e->crop_w); + d->cairo_before = g_malloc(stride * e->crop_h); + d->cairo_after = g_malloc(stride * e->crop_h); + d->cairo_stride = stride; + _float_rgb_to_cairo(d->preview_before, d->cairo_before, + e->crop_w, e->crop_h, stride); + _rebuild_cairo_after(d); + d->preview_ready = TRUE; + gtk_widget_queue_draw(d->preview_area); +} + +static void _install_cache_slot(dt_lib_module_t *self, dt_neural_task_t task) +{ + if(task == NEURAL_TASK_RAW_DENOISE) _install_cache_slot_raw(self, task); + else _install_cache_slot_rgb(self, task); +} + +// debounced strength-slider re-blend. returns G_SOURCE_REMOVE so the +// timer fires once. 
+static gboolean _strength_blend_timer_cb(gpointer data) +{ + dt_lib_module_t *self = (dt_lib_module_t *)data; + dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; + d->preview_strength_timer = 0; + + if(d->task != NEURAL_TASK_RAW_DENOISE) return G_SOURCE_REMOVE; + if(!d->preview_raw_src_rgb || !d->preview_raw_denoised_rgb) + return G_SOURCE_REMOVE; + + const float strength + = dt_bauhaus_slider_get(d->raw_strength_slider) / 100.0f; + _blend_raw_into_preview(d, strength); + return G_SOURCE_REMOVE; +} + +static void _schedule_raw_strength_reblend(dt_lib_module_t *self) +{ + dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; + if(d->preview_strength_timer) + g_source_remove(d->preview_strength_timer); + d->preview_strength_timer + = g_timeout_add(RAW_PREVIEW_STRENGTH_DEBOUNCE_MS, + _strength_blend_timer_cb, self); +} + +// fired when the raw worker bails before producing a result, so the UI +// doesn't get stuck with preview_generating == TRUE forever. +static gboolean _preview_raw_failed_idle(gpointer data) +{ + dt_lib_module_t *self = (dt_lib_module_t *)data; + dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; + d->preview_generating = FALSE; + _update_button_sensitivity(d); + gtk_widget_queue_draw(d->preview_area); + return G_SOURCE_REMOVE; +} + +static gboolean _preview_raw_result_idle(gpointer data) +{ + dt_neural_preview_result_raw_t *res + = (dt_neural_preview_result_raw_t *)data; + dt_lib_neural_restore_t *d + = (dt_lib_neural_restore_t *)res->self->data; + + // discard stale results + if(res->sequence != g_atomic_int_get(&d->preview_sequence)) + { + g_free(res->src_rgb); + g_free(res->denoised_rgb); + g_free(res->take_full_cfa); + dt_free_align(res->take_full_lin); + g_free(res->take_export_pixels); + g_free(res); + return G_SOURCE_REMOVE; + } + + // install the per-image full buffer if the worker freshly loaded one. 
+ // also (re)build the patch-picker thumbnail (export_pixels) from it — + // the raw path doesn't run a pipeline export so the picker needs us + // to synthesise a whole-image 4ch RGBA buffer. + gboolean refresh_thumbnail = FALSE; + if(res->take_full_cfa) + { + g_free(d->preview_full_cfa); + d->preview_full_cfa = res->take_full_cfa; + d->preview_full_w = res->full_w; + d->preview_full_h = res->full_h; + d->preview_raw_imgid = res->full_imgid; + d->preview_raw_sensor_class = res->full_sensor_class; + // free the other variant's cache (we switched sensor type) + dt_free_align(d->preview_full_lin); + d->preview_full_lin = NULL; + refresh_thumbnail = TRUE; + } + if(res->take_full_lin) + { + dt_free_align(d->preview_full_lin); + d->preview_full_lin = res->take_full_lin; + d->preview_lin_w = res->full_w; + d->preview_lin_h = res->full_h; + d->preview_raw_imgid = res->full_imgid; + d->preview_raw_sensor_class = res->full_sensor_class; + g_free(d->preview_full_cfa); + d->preview_full_cfa = NULL; + refresh_thumbnail = TRUE; + } + + // install the picker thumbnail when the worker produced a fresh + // export (triggered by new imgid / sensor-type change). matches + // exactly what the user sees in darkroom — same pipeline the RGB + // denoise preview uses for its picker thumbnail. 
+ if(res->take_export_pixels) + { + g_free(d->export_pixels); + g_free(d->export_cairo); + d->export_pixels = res->take_export_pixels; + d->export_w = res->export_thumb_w; + d->export_h = res->export_thumb_h; + d->export_cairo = NULL; // rebuilt on demand by picker + res->take_export_pixels = NULL; + } + (void)refresh_thumbnail; // legacy flag; export is handled above + + // install per-refresh inference output + g_free(d->preview_raw_src_rgb); + g_free(d->preview_raw_denoised_rgb); + d->preview_raw_src_rgb = res->src_rgb; + d->preview_raw_denoised_rgb = res->denoised_rgb; + d->preview_raw_crop_w = res->width; + d->preview_raw_crop_h = res->height; + + const float strength + = dt_bauhaus_slider_get(d->raw_strength_slider) / 100.0f; + _blend_raw_into_preview(d, strength); + + // preview_raw_sensor_class is now authoritative for this imgid; + // refresh the overlay so it shows the correct DNG output format + _update_info_label(d); + + // store unblended source / denoised in the cache. raw never has DWT + // detail; the strength slider blends src ↔ denoised on the fly via + // _blend_raw_into_preview, so the cache only needs the two anchors + _preview_cache_store(d, NEURAL_TASK_RAW_DENOISE, res->full_imgid, + res->patch_center, + res->width, res->height, + d->preview_raw_src_rgb, d->preview_raw_denoised_rgb, + NULL); + + d->preview_generating = FALSE; + _update_button_sensitivity(d); + g_free(res); + return G_SOURCE_REMOVE; +} + +static gpointer _preview_thread_raw(gpointer data) +{ + dt_neural_preview_data_t *pd = (dt_neural_preview_data_t *)data; + dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)pd->self->data; + + + // 1. load source image metadata to determine sensor type. + // on a fresh session dt_image_cache_get returns img_meta with a + // zeroed buf_dsc until rawspeed has been invoked on this id. the + // batch path (_process_raw_denoise_one) does this same warmup + // BEFORE reading metadata for the same reason. 
+ if(pd->sequence != g_atomic_int_get(&d->preview_sequence)) goto cleanup; + + dt_mipmap_buffer_t warmup; + dt_mipmap_cache_get(&warmup, pd->imgid, DT_MIPMAP_FULL, + DT_MIPMAP_BLOCKING, 'r'); + const gboolean warm_loaded = (warmup.buf != NULL); + dt_mipmap_cache_release(&warmup); + if(!warm_loaded) + { + dt_print(DT_DEBUG_AI, + "[neural_restore] raw preview: mipmap warmup failed for imgid %d", + pd->imgid); + goto cleanup; + } + + const dt_image_t *cached = dt_image_cache_get(pd->imgid, 'r'); + if(!cached) goto cleanup; + dt_image_t img_meta = *cached; + dt_image_cache_read_release(cached); + + const uint32_t filters = img_meta.buf_dsc.filters; + const dt_restore_sensor_class_t cls = dt_restore_classify_sensor(&img_meta); + const gboolean is_xtrans = (cls == DT_RESTORE_SENSOR_CLASS_XTRANS); + if(cls != DT_RESTORE_SENSOR_CLASS_BAYER + && cls != DT_RESTORE_SENSOR_CLASS_XTRANS) + { + dt_print(DT_DEBUG_AI, + "[neural_restore] raw preview: imgid %d is not bayer/xtrans " + "(filters=0x%x class=%d)", + pd->imgid, filters, cls); + goto cleanup; + } + dt_print(DT_DEBUG_AI, + "[neural_restore] raw preview: imgid=%d %s patch=(%.3f,%.3f) " + "widget=%dx%d filters=0x%x", + pd->imgid, is_xtrans ? "x-trans" : "bayer", + pd->patch_center[0], pd->patch_center[1], + pd->preview_w, pd->preview_h, filters); + + // 2. ensure the right ctx is loaded (matches batch logic in + // _ensure_raw_ctx). reload if cached_task is wrong or if the + // cached sensor class doesn't match this image. 
+ dt_restore_context_t *ctx = NULL; + dt_pthread_mutex_lock(&d->ctx_lock); + { + const gboolean cached_is_raw_correct + = d->cached_ctx + && d->cached_task == NEURAL_TASK_RAW_DENOISE + && (cls == d->preview_raw_sensor_class); + if(!cached_is_raw_correct) + { + dt_restore_unref(d->cached_ctx); + switch(cls) + { + case DT_RESTORE_SENSOR_CLASS_BAYER: + d->cached_ctx = dt_restore_load_rawdenoise_bayer(pd->env); + break; + case DT_RESTORE_SENSOR_CLASS_XTRANS: + d->cached_ctx = dt_restore_load_rawdenoise_xtrans(pd->env); + break; + default: + d->cached_ctx = NULL; + break; + } + d->cached_task = NEURAL_TASK_RAW_DENOISE; + // mark cached sensor class so a follow-up preview matches; this + // does NOT update preview_raw_imgid because we may not have a + // fresh full-image buffer for this image yet + d->preview_raw_sensor_class = cls; + } + if(d->cached_ctx) ctx = dt_restore_ref(d->cached_ctx); + } + dt_pthread_mutex_unlock(&d->ctx_lock); + + if(!ctx) + { + dt_print(DT_DEBUG_AI, + "[neural_restore] raw preview: failed to load model"); + goto cleanup; + } + if(pd->sequence != g_atomic_int_get(&d->preview_sequence)) + { + dt_restore_unref(ctx); + goto cleanup; + } + + // 3. acquire the full-image buffer. reuse cache if it matches imgid + + // sensor type; otherwise load fresh and stage into result for the + // UI to install on idle. 
+ float *take_full_cfa = NULL; + float *take_full_lin = NULL; + int full_w = 0, full_h = 0; + const float *full_cfa_use = NULL; // borrowed pointer (cache or take_*) + const float *full_lin_use = NULL; + + const gboolean cache_matches + = d->preview_raw_imgid == pd->imgid + && d->preview_raw_sensor_class == cls + && ((is_xtrans && d->preview_full_lin) + || (!is_xtrans && d->preview_full_cfa)); + + if(cache_matches) + { + if(is_xtrans) + { + full_lin_use = d->preview_full_lin; + full_w = d->preview_lin_w; + full_h = d->preview_lin_h; + } + else + { + full_cfa_use = d->preview_full_cfa; + full_w = d->preview_full_w; + full_h = d->preview_full_h; + } + } + else if(is_xtrans) + { + if(dt_restore_raw_linear_prepare(pd->imgid, &take_full_lin, + &full_w, &full_h) != 0 + || !take_full_lin) + { + dt_restore_unref(ctx); + goto cleanup; + } + full_lin_use = take_full_lin; + } + else + { + // Bayer: read CFA from mipmap cache + dt_mipmap_buffer_t mbuf; + dt_mipmap_cache_get(&mbuf, pd->imgid, DT_MIPMAP_FULL, + DT_MIPMAP_BLOCKING, 'r'); + if(!mbuf.buf) + { + dt_mipmap_cache_release(&mbuf); + dt_restore_unref(ctx); + goto cleanup; + } + full_w = img_meta.width; + full_h = img_meta.height; + const size_t npix = (size_t)full_w * full_h; + take_full_cfa = g_try_malloc(npix * sizeof(float)); + if(!take_full_cfa) + { + dt_mipmap_cache_release(&mbuf); + dt_restore_unref(ctx); + goto cleanup; + } + if(img_meta.buf_dsc.datatype == TYPE_UINT16) + { + const uint16_t *src = (const uint16_t *)mbuf.buf; + for(size_t i = 0; i < npix; i++) take_full_cfa[i] = (float)src[i]; + } + else if(img_meta.buf_dsc.datatype == TYPE_FLOAT) + { + memcpy(take_full_cfa, mbuf.buf, npix * sizeof(float)); + } + else + { + dt_mipmap_cache_release(&mbuf); + g_free(take_full_cfa); + dt_restore_unref(ctx); + goto cleanup; + } + dt_mipmap_cache_release(&mbuf); + full_cfa_use = take_full_cfa; + } + + if(pd->sequence != g_atomic_int_get(&d->preview_sequence)) + { + g_free(take_full_cfa); + dt_free_align(take_full_lin); 
+ dt_restore_unref(ctx); + goto cleanup; + } + + // 3.5. refresh the picker-thumbnail export when we loaded a fresh + // full buffer (i.e. imgid / sensor-type changed). match the RGB + // preview path: dt_imageio_export_with_flags runs the user's + // full pipeline at ~1024 long edge, giving a display-accurate + // thumbnail whose colours match what the user sees in darkroom + // (and match our before/after ROI pipe outputs). + float *take_export_pixels = NULL; + int export_thumb_w = 0; + int export_thumb_h = 0; + if(!cache_matches) + { + dt_neural_preview_capture_t cap = {0}; + const int export_size = dt_conf_get_int(CONF_PREVIEW_EXPORT_SIZE); + cap.parent.max_width = export_size; + cap.parent.max_height = export_size; + + dt_imageio_module_format_t fmt = { + .mime = _ai_get_mime, + .levels = _ai_check_levels, + .bpp = _ai_check_bpp, + .write_image = _preview_capture_write_image}; + + const dt_colorspaces_color_profile_type_t cfg_type + = dt_conf_key_exists(CONF_ICC_TYPE) + ? dt_conf_get_int(CONF_ICC_TYPE) + : DT_COLORSPACE_NONE; + gchar *cfg_file = (cfg_type == DT_COLORSPACE_FILE) + ? dt_conf_get_string(CONF_ICC_FILE) + : NULL; + dt_imageio_export_with_flags( + pd->imgid, "unused", &fmt, + (dt_imageio_module_data_t *)&cap, + TRUE, // ignore_exif + FALSE, // display_byteorder + TRUE, // high_quality + FALSE, // upscale + FALSE, // is_scaling + 1.0, // scale_factor + FALSE, // thumbnail_export + NULL, // filter + FALSE, // copy_metadata + FALSE, // export_masks + (cfg_type == DT_COLORSPACE_NONE) + ? dt_colorspaces_get_work_profile(pd->imgid)->type + : cfg_type, + cfg_file, + DT_INTENT_PERCEPTUAL, + NULL, NULL, 1, 1, NULL, -1); + g_free(cfg_file); + + if(cap.pixels && cap.cap_w > 0 && cap.cap_h > 0) + { + take_export_pixels = cap.pixels; + export_thumb_w = cap.cap_w; + export_thumb_h = cap.cap_h; + } + } + + // 4. compute crop region. widget dims define the "100% preview" size, + // capped by the model's compiled tile size minus mandatory overlap. 
+ const int T = dt_restore_get_tile_size(ctx); + // Bayer model upscales 2x; linear is 1:1. so the maximum displayed + // crop in sensor pixels: + // bayer: 2*T - 4*overlap_packed = 2*T - 128 (for OVERLAP_PACKED=32) + // linear: T - 2*overlap_linear = T - 64 (for OVERLAP_LINEAR=32) + const int max_disp = is_xtrans ? (T - 64) : (2 * T - 128); + + // the raw buffer is always landscape (sensor layout), but the preview + // thumbnail the user clicks on is oriented per EXIF. un-rotate the + // widget dims + click position into sensor coords before picking the + // crop, otherwise portrait images end up sampling the wrong area + const dt_image_orientation_t ori = dt_image_orientation(&img_meta); + const gboolean swap_xy = (ori & ORIENTATION_SWAP_XY) != 0; + + int crop_w = MIN(swap_xy ? pd->preview_h : pd->preview_w, max_disp); + int crop_h = MIN(swap_xy ? pd->preview_w : pd->preview_h, max_disp); + // Bayer: snap to mod 2 (CFA grid) + if(!is_xtrans) + { + crop_w = (crop_w / 2) * 2; + crop_h = (crop_h / 2) * 2; + } + if(crop_w <= 0 || crop_h <= 0) + { + g_free(take_full_cfa); + dt_free_align(take_full_lin); + dt_restore_unref(ctx); + goto cleanup; + } + + // display-normalised click (u, v) -> sensor pixel, inverting whatever + // combination of swap/flip the flip iop will apply during display. + // matches dt_iop_flip:distort_backtransform semantics + const int disp_w = swap_xy ? full_h : full_w; + const int disp_h = swap_xy ? 
full_w : full_h; + float dx_disp = pd->patch_center[0] * disp_w; + float dy_disp = pd->patch_center[1] * disp_h; + float sx, sy; + if(swap_xy) { sx = dy_disp; sy = dx_disp; } + else { sx = dx_disp; sy = dy_disp; } + if(ori & ORIENTATION_FLIP_X) sx = (float)full_w - sx; + if(ori & ORIENTATION_FLIP_Y) sy = (float)full_h - sy; + + int crop_x = (int)sx - crop_w / 2; + int crop_y = (int)sy - crop_h / 2; + crop_x = CLAMP(crop_x, 0, full_w - crop_w); + crop_y = CLAMP(crop_y, 0, full_h - crop_h); + if(!is_xtrans) + { + crop_x = (crop_x / 2) * 2; + crop_y = (crop_y / 2) * 2; + } + + dt_print(DT_DEBUG_AI, + "[neural_restore] raw preview: full=%dx%d ori=0x%x " + "patch_center=(%.3f,%.3f) -> sensor=(%d,%d %dx%d) %s", + full_w, full_h, (unsigned)ori, + pd->patch_center[0], pd->patch_center[1], + crop_x, crop_y, crop_w, crop_h, + is_xtrans ? "linear" : "bayer"); + + // 5. inference + // Bayer path uses the _piped variant which runs darktable's full + // pixelpipe on both the original CFA and a denoised-patched CFA, so + // "before"/"after" match what the user would see after Process + + // re-import (same history stack, same filmic/tone curve, same output + // profile). Slower (~2-5 s for two pipes) but colour-accurate. + // Linear path still uses the simpler in-space blend for now. 
+ float *src_rgb = NULL; + float *denoised_rgb = NULL; + int actual_w = 0, actual_h = 0; + int err; + if(is_xtrans) + err = dt_restore_raw_linear_preview_piped(ctx, &img_meta, pd->imgid, + full_lin_use, + full_w, full_h, + crop_x, crop_y, + crop_w, crop_h, + &src_rgb, &denoised_rgb, + &actual_w, &actual_h); + else + err = dt_restore_raw_bayer_preview_piped(ctx, &img_meta, pd->imgid, + full_cfa_use, + full_w, full_h, + crop_x, crop_y, + crop_w, crop_h, + &src_rgb, &denoised_rgb, + &actual_w, &actual_h); + + dt_restore_unref(ctx); + + dt_print(DT_DEBUG_AI, + "[neural_restore] raw preview: inference returned err=%d " + "src=%p denoised=%p requested=%dx%d actual=%dx%d", + err, (void *)src_rgb, (void *)denoised_rgb, + crop_w, crop_h, actual_w, actual_h); + + if(err || !src_rgb || !denoised_rgb || actual_w <= 0 || actual_h <= 0) + { + g_free(src_rgb); + g_free(denoised_rgb); + g_free(take_full_cfa); + dt_free_align(take_full_lin); + goto cleanup; + } + + // 6. ship to UI thread. width/height carry the ACTUAL rendered dims + // from the pipe, which can be smaller than crop_w/crop_h when the + // user's history includes geometry-modifying modules (clipping, + // ashift, lens). downstream blend + cairo render must use these. 
+ dt_neural_preview_result_raw_t *res + = g_new0(dt_neural_preview_result_raw_t, 1); + res->self = pd->self; + res->src_rgb = src_rgb; + res->denoised_rgb = denoised_rgb; + res->width = actual_w; + res->height = actual_h; + res->sequence = pd->sequence; + res->take_full_cfa = take_full_cfa; + res->take_full_lin = take_full_lin; + res->full_w = full_w; + res->full_h = full_h; + res->full_imgid = pd->imgid; + res->full_sensor_class = cls; + res->patch_center[0] = pd->patch_center[0]; + res->patch_center[1] = pd->patch_center[1]; + res->take_export_pixels = take_export_pixels; + res->export_thumb_w = export_thumb_w; + res->export_thumb_h = export_thumb_h; + g_idle_add(_preview_raw_result_idle, res); + g_free(pd); + return NULL; + +cleanup: + // worker bailed before producing a result. clear preview_generating + // on the UI thread so the user can re-trigger and the button state + // reflects reality. only schedule when the sequence is still current + // (a stale bail means a newer trigger is already in flight). + if(pd->sequence == g_atomic_int_get(&d->preview_sequence)) + g_idle_add(_preview_raw_failed_idle, pd->self); + g_free(pd); + return NULL; +} + +// thread dispatcher: serialises the actual inference / pipe work via +// preview_inference_lock so that even when an old worker is still +// running, the new one queues up rather than fighting for the GPU. 
+// also re-checks the sequence after acquiring the lock — if the +// trigger that spawned us has already been superseded while we were +// waiting, drop on the floor without doing anything expensive +static gpointer _preview_thread(gpointer data); +static gpointer _preview_thread_raw(gpointer data); +static gpointer _preview_thread_dispatch(gpointer data) +{ + dt_neural_preview_data_t *pd = (dt_neural_preview_data_t *)data; + dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)pd->self->data; + + g_mutex_lock(&d->preview_inference_lock); + + if(pd->sequence != g_atomic_int_get(&d->preview_sequence)) + { + g_mutex_unlock(&d->preview_inference_lock); + g_free(pd); + return NULL; + } + + gpointer res = (pd->task == NEURAL_TASK_RAW_DENOISE) + ? _preview_thread_raw(data) + : _preview_thread(data); + + g_mutex_unlock(&d->preview_inference_lock); + return res; +} + +// debounced trigger: rapid tab switches collapse to one preview run. +// the timer handle in d->preview_trigger_timer is replaced (and the +// previous one removed) so the trigger only fires after the user +// settles on a tab for `delay_ms` +static gboolean _trigger_preview_from_timer(gpointer user_data); +static void _trigger_preview(dt_lib_module_t *self); + +static void _schedule_preview_refresh(dt_lib_module_t *self, guint delay_ms) +{ + dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; + if(!d->model_available || !d->preview_requested) return; + if(d->preview_trigger_timer) + g_source_remove(d->preview_trigger_timer); + d->preview_trigger_timer + = g_timeout_add(delay_ms, _trigger_preview_from_timer, self); +} + +static gboolean _trigger_preview_from_timer(gpointer user_data) +{ + dt_lib_module_t *self = (dt_lib_module_t *)user_data; + dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; + d->preview_trigger_timer = 0; + _trigger_preview(self); + return G_SOURCE_REMOVE; +} + static void _trigger_preview(dt_lib_module_t *self) { dt_lib_neural_restore_t *d = 
(dt_lib_neural_restore_t *)self->data; @@ -1489,6 +2822,23 @@ static void _trigger_preview(dt_lib_module_t *self) if(!dt_is_valid_imgid(imgid)) return; + // per-task cache lookup: if we already have a result for this exact + // (task, imgid, patch_center) tuple, install it and skip the worker + if(_preview_cache_hit(d, d->task, imgid)) + { + dt_print(DT_DEBUG_AI, + "[neural_restore] preview cache hit for %s", + _task_log_name(d->task)); + _install_cache_slot(self, d->task); + d->preview_generating = FALSE; + _update_button_sensitivity(d); + // cache hit means preview_raw_sensor_class was set by an earlier + // worker run on this imgid — refresh the overlay so the DNG + // output label appears when _task_changed cleared it on tab switch + _update_info_label(d); + return; + } + // compute preview dimensions matching widget aspect ratio const int widget_w = gtk_widget_get_allocated_width(d->preview_area); const int widget_h = gtk_widget_get_allocated_height(d->preview_area); @@ -1518,36 +2868,49 @@ static void _trigger_preview(dt_lib_module_t *self) // borrow cached export pixels if available (re-pick scenario). // the pointer is valid for the thread's lifetime because - // _trigger_preview joins the previous thread before starting, - // and _cancel_preview joins before freeing export_pixels + // _cancel_preview joins before freeing export_pixels if(d->export_pixels) { pd->reuse_pixels = d->export_pixels; pd->reuse_w = d->export_w; pd->reuse_h = d->export_h; } - // join previous preview thread before starting a new one + // detach the previous worker (don't join — that would block the + // UI thread for the duration of the in-flight inference / pipe + // call). preview_inference_lock serialises the actual heavy work, + // and the bumped sequence + per-task cache lookup at the new + // worker's entry guarantees we don't run two inferences for the + // same target. gui_cleanup joins the latest worker for shutdown. 
if(d->preview_thread) { - g_thread_join(d->preview_thread); + g_thread_unref(d->preview_thread); d->preview_thread = NULL; } d->preview_thread = g_thread_new("neural_preview", - _preview_thread, pd); + _preview_thread_dispatch, pd); } -static void _update_task_from_ui(dt_lib_neural_restore_t *d) +// map notebook page index to task. pages are ordered in the notebook as: +// 0 = raw denoise, 1 = denoise, 2 = upscale (with scale_combo picking 2x/4x) +static dt_neural_task_t _task_from_page(dt_lib_neural_restore_t *d, int page) { - const int page = gtk_notebook_get_current_page(d->notebook); - if(page == 0) - d->task = NEURAL_TASK_DENOISE; - else + switch(page) { - const int scale_pos = dt_bauhaus_combobox_get(d->scale_combo); - d->task = (scale_pos == 1) ? NEURAL_TASK_UPSCALE_4X : NEURAL_TASK_UPSCALE_2X; + case 0: return NEURAL_TASK_RAW_DENOISE; + case 1: return NEURAL_TASK_DENOISE; + default: + { + const int scale_pos = dt_bauhaus_combobox_get(d->scale_combo); + return (scale_pos == 1) ? NEURAL_TASK_UPSCALE_4X : NEURAL_TASK_UPSCALE_2X; + } } } +static void _update_task_from_ui(dt_lib_neural_restore_t *d) +{ + d->task = _task_from_page(d, gtk_notebook_get_current_page(d->notebook)); +} + static void _notebook_page_changed(GtkNotebook *notebook, GtkWidget *page, guint page_num, @@ -1556,18 +2919,13 @@ static void _notebook_page_changed(GtkNotebook *notebook, dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; // switch-page fires before the page changes, so use page_num - if(page_num == 0) - d->task = NEURAL_TASK_DENOISE; - else - { - const int scale_pos = dt_bauhaus_combobox_get(d->scale_combo); - d->task = (scale_pos == 1) ? 
NEURAL_TASK_UPSCALE_4X : NEURAL_TASK_UPSCALE_2X; - } + d->task = _task_from_page(d, page_num); dt_conf_set_int(CONF_ACTIVE_PAGE, page_num); _task_changed(d); + // debounced — rapid tab cycling won't pile up worker threads if(d->preview_requested) - _trigger_preview(self); + _schedule_preview_refresh(self, 150); } static void _scale_combo_changed(GtkWidget *widget, dt_lib_module_t *self) @@ -1576,14 +2934,14 @@ static void _scale_combo_changed(GtkWidget *widget, dt_lib_module_t *self) _update_task_from_ui(d); _task_changed(d); if(d->preview_requested) - _trigger_preview(self); + _schedule_preview_refresh(self, 150); } static void _recovery_slider_changed(GtkWidget *widget, dt_lib_module_t *self) { dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; if(d->recovery_changing) return; - dt_conf_set_float(CONF_DETAIL_RECOVERY, dt_bauhaus_slider_get(d->recovery_slider)); + dt_conf_set_float(CONF_STRENGTH, dt_bauhaus_slider_get(d->recovery_slider)); if(d->preview_ready) { _rebuild_cairo_after(d); @@ -1591,6 +2949,22 @@ static void _recovery_slider_changed(GtkWidget *widget, dt_lib_module_t *self) } } +static void _raw_strength_slider_changed(GtkWidget *widget, + dt_lib_module_t *self) +{ + dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; + dt_conf_set_float(CONF_RAW_STRENGTH, + dt_bauhaus_slider_get(d->raw_strength_slider)); + + // live preview re-blend (debounced). only fires when raw denoise tab + // is active and a preview is already cached — otherwise the model + // hasn't run yet and there's nothing to blend. 
+ if(d->task == NEURAL_TASK_RAW_DENOISE + && d->preview_raw_src_rgb + && d->preview_raw_denoised_rgb) + _schedule_raw_strength_reblend(self); +} + static void _process_clicked(GtkWidget *widget, gpointer user_data) { dt_lib_module_t *self = (dt_lib_module_t *)user_data; @@ -1608,7 +2982,12 @@ static void _process_clicked(GtkWidget *widget, gpointer user_data) job_data->env = d->env; job_data->images = images; job_data->scale = _task_scale(d->task); - job_data->detail_recovery = dt_conf_get_float(CONF_DETAIL_RECOVERY); + job_data->strength = dt_conf_key_exists(CONF_STRENGTH) + ? dt_conf_get_float(CONF_STRENGTH) : 100.0f; + // raw denoise strength: 0..100 in UI, 0..1 for the pipeline + job_data->raw_strength = dt_conf_key_exists(CONF_RAW_STRENGTH) + ? dt_conf_get_float(CONF_RAW_STRENGTH) / 100.0f + : 1.0f; job_data->bpp = dt_conf_key_exists(CONF_BIT_DEPTH) ? dt_conf_get_int(CONF_BIT_DEPTH) : NEURAL_BPP_16; @@ -1769,14 +3148,48 @@ static gboolean _preview_draw(GtkWidget *widget, cairo_t *cr, dt_lib_module_t *s cairo_paint(cr); cairo_restore(cr); - // draw crop rectangle at current patch_center - const int task_scale = _task_scale(d->task); - const int crop_w = w / task_scale; - const int crop_h = h / task_scale; - const double rw = (double)crop_w / d->export_w * img_w; - const double rh = (double)crop_h / d->export_h * img_h; - const double rx = ox + d->patch_center[0] * img_w - rw / 2.0; - const double ry = oy + d->patch_center[1] * img_h - rh / 2.0; + // draw crop rectangle at current patch_center. 
the rectangle + // represents the actual displayed patch, not the full tile size: + // * RGB denoise / upscale: the preview runs on the exported image + // at 1:1, so crop pixels == widget pixels, measured against the + // thumbnail's own resolution (export_w == source for this path) + // * raw denoise: preview_raw_crop_* carries the pipe's backbuf + // dims — already in display orientation (post-flip iop) — so we + // scale them against the display-oriented thumbnail (export_w / + // export_h), NOT the sensor buffer (preview_full_*). using the + // sensor dims would draw the wrong rectangle size on portrait + // images where sensor and display axes swap + double rw, rh; + if(d->task == NEURAL_TASK_RAW_DENOISE + && d->preview_raw_crop_w > 0 && d->preview_raw_crop_h > 0 + && d->export_w > 0 && d->export_h > 0) + { + rw = (double)d->preview_raw_crop_w / d->export_w * img_w; + rh = (double)d->preview_raw_crop_h / d->export_h * img_h; + } + else + { + const int task_scale = _task_scale(d->task); + const int crop_w = w / task_scale; + const int crop_h = h / task_scale; + rw = (double)crop_w / d->export_w * img_w; + rh = (double)crop_h / d->export_h * img_h; + } + // compute rectangle top-left. for RGB denoise / upscale, the click + // / motion handlers already clamp patch_center with inner margins, + // so the rectangle always fits — match master by NOT pushing here. + // for raw denoise, patch_center is free-range in [0, 1]; push the + // rectangle inward so it still fits (matches the worker's CLAMP on + // crop_x / crop_y and keeps the picker visually honest). 
+ double rx = ox + d->patch_center[0] * img_w - rw / 2.0; + double ry = oy + d->patch_center[1] * img_h - rh / 2.0; + if(d->task == NEURAL_TASK_RAW_DENOISE) + { + if(rx < ox) rx = ox; + if(ry < oy) ry = oy; + if(rx + rw > ox + img_w) rx = ox + img_w - rw; + if(ry + rh > oy + img_h) ry = oy + img_h - rh; + } // dim area outside the rectangle cairo_save(cr); @@ -1974,18 +3387,35 @@ static gboolean _preview_button_press(GtkWidget *widget, double img_w, img_h, ox, oy; _picking_geometry(d, w, h, &img_w, &img_h, &ox, &oy); - // convert click to normalized image coords, clamped so - // the crop rectangle stays within the image - const int task_scale = _task_scale(d->task); - const float half_w = (float)w / task_scale / (2.0f * d->export_w); - const float half_h = (float)h / task_scale / (2.0f * d->export_h); + // convert click to normalized image coords. + // * RGB denoise / upscale: clamp so the crop rectangle stays + // within the image (master behaviour — the export-based preview + // needs this because the worker and draw share a single + // export_w-based scale). + // * raw denoise: no inner-margin clamp — user can pick corners. + // the raw worker CLAMPs crop_x/y, and _preview_draw pushes the + // rectangle inward to match. 
const float nx = (float)((ex - ox) / img_w); const float ny = (float)((ey - oy) / img_h); if(nx < 0.0f || nx > 1.0f || ny < 0.0f || ny > 1.0f) return TRUE; - d->patch_center[0] = CLAMP(nx, half_w, 1.0f - half_w); - d->patch_center[1] = CLAMP(ny, half_h, 1.0f - half_h); + if(d->task == NEURAL_TASK_RAW_DENOISE) + { + d->patch_center[0] = CLAMP(nx, 0.0f, 1.0f); + d->patch_center[1] = CLAMP(ny, 0.0f, 1.0f); + } + else + { + const int task_scale = _task_scale(d->task); + const float half_w = (float)w / task_scale / (2.0f * d->export_w); + const float half_h = (float)h / task_scale / (2.0f * d->export_h); + d->patch_center[0] = CLAMP(nx, half_w, 1.0f - half_w); + d->patch_center[1] = CLAMP(ny, half_h, 1.0f - half_h); + } + + // patch moved — every cached preview is now stale (different crop) + _preview_cache_invalidate_all(d); // exit picking mode d->picking_thumbnail = FALSE; @@ -2057,16 +3487,24 @@ static gboolean _preview_motion(GtkWidget *widget, double img_w, img_h, ox, oy; _picking_geometry(d, w, h, &img_w, &img_h, &ox, &oy); - // clamp so the crop rectangle stays within the image - const int task_scale = _task_scale(d->task); - const float half_w = (float)w / task_scale / (2.0f * d->export_w); - const float half_h = (float)h / task_scale / (2.0f * d->export_h); - const float nx = CLAMP((float)((ex - ox) / img_w), - half_w, 1.0f - half_w); - const float ny = CLAMP((float)((ey - oy) / img_h), - half_h, 1.0f - half_h); - d->patch_center[0] = nx; - d->patch_center[1] = ny; + // motion follows the cursor, with clamping based on task: + // * RGB denoise / upscale: inner-margin clamp (master behaviour) + // * raw denoise: free-range in [0, 1] so corners are reachable + const float rx = (float)((ex - ox) / img_w); + const float ry = (float)((ey - oy) / img_h); + if(d->task == NEURAL_TASK_RAW_DENOISE) + { + d->patch_center[0] = CLAMP(rx, 0.0f, 1.0f); + d->patch_center[1] = CLAMP(ry, 0.0f, 1.0f); + } + else + { + const int task_scale = _task_scale(d->task); + const float 
half_w = (float)w / task_scale / (2.0f * d->export_w); + const float half_h = (float)h / task_scale / (2.0f * d->export_h); + d->patch_center[0] = CLAMP(rx, half_w, 1.0f - half_w); + d->patch_center[1] = CLAMP(ry, half_h, 1.0f - half_h); + } gtk_widget_queue_draw(widget); return TRUE; } @@ -2127,6 +3565,7 @@ static void _selection_changed_callback(gpointer instance, dt_lib_module_t *self dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; d->preview_requested = FALSE; _cancel_preview(self); + _preview_cache_invalidate_all(d); _update_info_label(d); _update_button_sensitivity(d); } @@ -2136,6 +3575,7 @@ static void _image_changed_callback(gpointer instance, dt_lib_module_t *self) dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; d->preview_requested = FALSE; _cancel_preview(self); + _preview_cache_invalidate_all(d); _update_info_label(d); _update_button_sensitivity(d); } @@ -2252,6 +3692,7 @@ void gui_init(dt_lib_module_t *self) d->env = dt_restore_env_init(); d->processing_images = g_hash_table_new(g_direct_hash, g_direct_equal); dt_pthread_mutex_init(&d->ctx_lock, NULL); + g_mutex_init(&d->preview_inference_lock); d->split_pos = 0.5f; // notebook tabs (denoise / upscale) @@ -2260,21 +3701,46 @@ void gui_init(dt_lib_module_t *self) dt_action_define(DT_ACTION(self), NULL, N_("page"), GTK_WIDGET(d->notebook), ¬ebook_def); - d->denoise_page = dt_ui_notebook_page(d->notebook, N_("denoise"), - _("AI denoising")); - d->upscale_page = dt_ui_notebook_page(d->notebook, N_("upscale"), - _("AI upscaling")); - - // denoise page: detail recovery slider - const float saved_recovery = dt_conf_get_float(CONF_DETAIL_RECOVERY); + // raw denoise sits first: it runs earliest in the denoise workflow + // (before demosaic-stage processing). 
bayer / linear variant selection + // is driven by the active "rawdenoise" model rather than a UI toggle + d->raw_denoise_page = dt_ui_notebook_page(d->notebook, N_("raw denoise"), + _("AI raw denoising")); + d->denoise_page = dt_ui_notebook_page(d->notebook, N_("denoise"), + _("AI denoising")); + d->upscale_page = dt_ui_notebook_page(d->notebook, N_("upscale"), + _("AI upscaling")); + + // raw denoise page: strength slider. 100 = full model output, + // 0 = unchanged source CFA, linear blend in raw ADC space + const float saved_raw_strength = dt_conf_key_exists(CONF_RAW_STRENGTH) + ? dt_conf_get_float(CONF_RAW_STRENGTH) : 100.0f; + d->raw_strength_slider = dt_bauhaus_slider_new_action(DT_ACTION(self), + 0.0f, 100.0f, 1.0f, + saved_raw_strength, 0); + dt_bauhaus_widget_set_label(d->raw_strength_slider, NULL, N_("strength")); + dt_bauhaus_slider_set_format(d->raw_strength_slider, "%"); + gtk_widget_set_tooltip_text(d->raw_strength_slider, + _("blend between the source CFA (0%) and " + "the denoised output (100%)")); + g_signal_connect(G_OBJECT(d->raw_strength_slider), "value-changed", + G_CALLBACK(_raw_strength_slider_changed), self); + dt_gui_box_add(d->raw_denoise_page, d->raw_strength_slider); + + // denoise page: strength slider. 100 = full denoise, 0 = source-like. + // dialing below 100 brings DWT-filtered texture back without + // reintroducing the noise-frequency content. + const float saved_strength = dt_conf_key_exists(CONF_STRENGTH) + ? 
dt_conf_get_float(CONF_STRENGTH) : 100.0f; d->recovery_slider = dt_bauhaus_slider_new_action(DT_ACTION(self), 0.0f, 100.0f, 1.0f, - saved_recovery, 0); - dt_bauhaus_widget_set_label(d->recovery_slider, NULL, N_("detail recovery")); + saved_strength, 0); + dt_bauhaus_widget_set_label(d->recovery_slider, NULL, N_("strength")); dt_bauhaus_slider_set_format(d->recovery_slider, "%"); gtk_widget_set_tooltip_text(d->recovery_slider, - _("recover fine texture lost during denoising " - "while suppressing noise")); + _("100% applies the full AI model output; " + "lower values bring back luminance texture " + "and grain while keeping color noise suppressed")); g_signal_connect(G_OBJECT(d->recovery_slider), "value-changed", G_CALLBACK(_recovery_slider_changed), self); dt_gui_box_add(d->denoise_page, d->recovery_slider); @@ -2470,8 +3936,10 @@ void gui_init(dt_lib_module_t *self) DT_CONTROL_SIGNAL_HANDLE(DT_SIGNAL_DEVELOP_IMAGE_CHANGED, _image_changed_callback); DT_CONTROL_SIGNAL_HANDLE(DT_SIGNAL_AI_MODELS_CHANGED, _ai_models_changed_callback); - _update_info_label(d); - _update_button_sensitivity(d); + // sync per-task widget visibility for the initially-active tab. + // _task_changed does detail-slider + output-knobs visibility and + // info/button state — safe to call here after all widgets exist. + _task_changed(d); } void gui_cleanup(dt_lib_module_t *self) @@ -2484,13 +3952,28 @@ void gui_cleanup(dt_lib_module_t *self) if(d) { - // signal preview thread to exit and wait for it + // cancel any pending debounced trigger before tearing down state + if(d->preview_trigger_timer) + { + g_source_remove(d->preview_trigger_timer); + d->preview_trigger_timer = 0; + } + // signal preview thread to exit and wait for it. 
join blocks + // here (unlike _cancel_preview during runtime, where we can't + // afford to freeze the UI) — happens once on shutdown only g_atomic_int_inc(&d->preview_sequence); if(d->preview_thread) { g_thread_join(d->preview_thread); d->preview_thread = NULL; } + // any worker idle callbacks queued just before the join may still + // fire after this point. they check sequence and discard, but they + // dereference `d` to do that — drain the main context once so they + // run while `d` is still alive + while(g_main_context_pending(NULL)) + g_main_context_iteration(NULL, FALSE); + g_mutex_clear(&d->preview_inference_lock); g_free(d->preview_before); g_free(d->preview_after); @@ -2499,6 +3982,18 @@ void gui_cleanup(dt_lib_module_t *self) g_free(d->cairo_after); g_free(d->export_pixels); g_free(d->export_cairo); + + // raw denoise preview cache + if(d->preview_strength_timer) + { + g_source_remove(d->preview_strength_timer); + d->preview_strength_timer = 0; + } + g_free(d->preview_full_cfa); + dt_free_align(d->preview_full_lin); + g_free(d->preview_raw_src_rgb); + g_free(d->preview_raw_denoised_rgb); + _preview_cache_invalidate_all(d); if(d->processing_images) g_hash_table_destroy(d->processing_images); dt_restore_unref(d->cached_ctx); diff --git a/src/tests/unittests/ai/test_ai_backend.c b/src/tests/unittests/ai/test_ai_backend.c index d226ad1bcd89..c9df71e0b68c 100644 --- a/src/tests/unittests/ai/test_ai_backend.c +++ b/src/tests/unittests/ai/test_ai_backend.c @@ -332,7 +332,7 @@ static void test_load_opt_levels(void **state) // DT_AI_OPT_BASIC dt_ai_context_t *ctx_basic = dt_ai_load_model_ext(env, "test-multiply", NULL, - DT_AI_PROVIDER_CPU, DT_AI_OPT_BASIC, NULL, 0); + DT_AI_PROVIDER_CPU, DT_AI_OPT_BASIC, NULL, 0, 0); assert_non_null(ctx_basic); // verify inference still works with basic optimization @@ -348,7 +348,7 @@ static void test_load_opt_levels(void **state) // DT_AI_OPT_DISABLED dt_ai_context_t *ctx_none = dt_ai_load_model_ext(env, "test-multiply", 
NULL, - DT_AI_PROVIDER_CPU, DT_AI_OPT_DISABLED, NULL, 0); + DT_AI_PROVIDER_CPU, DT_AI_OPT_DISABLED, NULL, 0, 0); assert_non_null(ctx_none); dt_ai_unload_model(ctx_none); } From 2291c1e3c15f4e1a1126e225506bf75fe6cf9153 Mon Sep 17 00:00:00 2001 From: Andrii Ryzhkov Date: Fri, 24 Apr 2026 08:16:36 +0200 Subject: [PATCH 2/9] Promote neural restore output as group leader when source leads --- src/libs/neural_restore.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/libs/neural_restore.c b/src/libs/neural_restore.c index 07ca26834410..17eb52dedf51 100644 --- a/src/libs/neural_restore.c +++ b/src/libs/neural_restore.c @@ -843,7 +843,17 @@ static void _import_image(const char *filename, dt_imgid_t source_imgid) { dt_print(DT_DEBUG_AI, "[neural_restore] imported imgid=%d: %s", newid, filename); if(dt_is_valid_imgid(source_imgid)) + { dt_grouping_add_to_group(source_imgid, newid); + // promote the output as group leader, but only when the source + // was the current leader — preserves any manually-set leader the + // user deliberately chose + const dt_image_t *src = dt_image_cache_get(source_imgid, 'r'); + const gboolean source_is_leader = src && src->group_id == source_imgid; + dt_image_cache_read_release(src); + if(source_is_leader) + dt_grouping_change_representative(newid); + } // refresh the collection so the new image appears in the thumb grid dt_collection_update_query(darktable.collection, DT_COLLECTION_CHANGE_RELOAD, From 6f37d5106327a38d185e765705959a0644918034 Mon Sep 17 00:00:00 2001 From: Andrii Ryzhkov Date: Sat, 25 Apr 2026 09:50:13 +0200 Subject: [PATCH 3/9] DNG writer: advertise visible area via ACTIVEAREA tag --- src/common/dng_writer.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/common/dng_writer.c b/src/common/dng_writer.c index 9c537f56c7a4..d26c6d27181e 100644 --- a/src/common/dng_writer.c +++ b/src/common/dng_writer.c @@ -177,13 +177,23 @@ int dt_dng_write_cfa_bayer(const char 
*filename, } } - // default scale / crop - // DefaultScale=1,1; DefaultCropOrigin=0,0; DefaultCropSize=W,H. - // this keeps the importer from applying any additional framing; - // darktable's crop module handles downstream framing + // advertise the visible region inside the full raw buffer; without + // these tags the importer renders the optical-black margins too + const int crop_x = (img->crop_x > 0) ? img->crop_x : 0; + const int crop_y = (img->crop_y > 0) ? img->crop_y : 0; + const int vis_w = (img->p_width > 0 && img->p_width <= width - crop_x) + ? img->p_width : (width - crop_x); + const int vis_h = (img->p_height > 0 && img->p_height <= height - crop_y) + ? img->p_height : (height - crop_y); + + const uint32_t active_area[4] = { + (uint32_t)crop_y, (uint32_t)crop_x, + (uint32_t)(crop_y + vis_h), (uint32_t)(crop_x + vis_w), + }; const float default_scale[2] = { 1.0f, 1.0f }; const float default_crop_origin[2] = { 0.0f, 0.0f }; - const float default_crop_size[2] = { (float)width, (float)height }; + const float default_crop_size[2] = { (float)vis_w, (float)vis_h }; + TIFFSetField(tif, TIFFTAG_ACTIVEAREA, active_area); TIFFSetField(tif, TIFFTAG_DEFAULTSCALE, default_scale); TIFFSetField(tif, TIFFTAG_DEFAULTCROPORIGIN, default_crop_origin); TIFFSetField(tif, TIFFTAG_DEFAULTCROPSIZE, default_crop_size); @@ -316,10 +326,15 @@ int dt_dng_write_linear(const char *filename, } } - // default scale / crop (full frame, no inset) + // linear DNG: buffer is already at visible dims (post-demosaic); + // ACTIVEAREA covers the full buffer, no margin to crop + const uint32_t active_area[4] = { + 0, 0, (uint32_t)height, (uint32_t)width, + }; const float default_scale[2] = { 1.0f, 1.0f }; const float default_crop_origin[2] = { 0.0f, 0.0f }; const float default_crop_size[2] = { (float)width, (float)height }; + TIFFSetField(tif, TIFFTAG_ACTIVEAREA, active_area); TIFFSetField(tif, TIFFTAG_DEFAULTSCALE, default_scale); TIFFSetField(tif, TIFFTAG_DEFAULTCROPORIGIN, 
default_crop_origin); TIFFSetField(tif, TIFFTAG_DEFAULTCROPSIZE, default_crop_size); From f9afb2ad38e13437f3191831e799ee70df8dbc65 Mon Sep 17 00:00:00 2001 From: Andrii Ryzhkov Date: Sat, 25 Apr 2026 20:39:46 +0200 Subject: [PATCH 4/9] AI raw denoise: blend tile overlaps to hide per-tile gain seams --- src/common/ai/restore_common.h | 34 +++++ src/common/ai/restore_raw_bayer.c | 206 +++++++++++++++++++++++++---- src/common/ai/restore_raw_linear.c | 206 ++++++++++++++++++++++++++--- 3 files changed, 402 insertions(+), 44 deletions(-) diff --git a/src/common/ai/restore_common.h b/src/common/ai/restore_common.h index 2a5138b6ac4d..1da0b2ee3219 100644 --- a/src/common/ai/restore_common.h +++ b/src/common/ai/restore_common.h @@ -229,6 +229,40 @@ static inline int _mirror_in_range(int i, int lo, int hi) return lo + _mirror(i - lo, n); } +// tile overlap blending weights: each tile contributes ax·ay; adjacent +// tiles' ramps sum to 1, so strip accumulators recover the blended value +// with no per-pixel division. 
seam = 2*sensor_O wide, centered on the +// core boundary; returns 1.0 outside the seam (pure interior) + +static inline float _seam_ramp(int d, int sensor_O) +{ + return ((float)d + 0.5f) / (float)(2 * sensor_O); +} + +static inline float _seam_ax(int sc, + int px_base, int px_end, + int sensor_O, + gboolean has_left, gboolean has_right) +{ + if(has_left && sc < px_base + sensor_O) + return _seam_ramp(sc - (px_base - sensor_O), sensor_O); + if(has_right && sc >= px_end - sensor_O) + return 1.0f - _seam_ramp(sc - (px_end - sensor_O), sensor_O); + return 1.0f; +} + +static inline float _seam_ay(int sr, + int py_base, int py_end, + int sensor_O, + gboolean has_top, gboolean has_bot) +{ + if(has_top && sr < py_base + sensor_O) + return _seam_ramp(sr - (py_base - sensor_O), sensor_O); + if(has_bot && sr >= py_end - sensor_O) + return 1.0f - _seam_ramp(sr - (py_end - sensor_O), sensor_O); + return 1.0f; +} + // clang-format off // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py // vim: shiftwidth=2 expandtab tabstop=2 cindent diff --git a/src/common/ai/restore_raw_bayer.c b/src/common/ai/restore_raw_bayer.c index 91cff804936b..0f0d76cc294e 100644 --- a/src/common/ai/restore_raw_bayer.c +++ b/src/common/ai/restore_raw_bayer.c @@ -400,8 +400,34 @@ retry:; int res = 0; int tile_count = 0; + // overlap blending: at each tile boundary 2 (4 at corners) tiles emit + // ax·ay-weighted contributions whose ramps sum to 1. only the 2*sensor_O- + // wide seam regions accumulate; pure interior hard-writes. h-strips own + // corners. 
memory ~4 MB live, independent of image size + const int sensor_O = 2 * O; + const int hstrip_h = 2 * sensor_O; + + // h_strip_top = seam between (ty-1) and ty: built by ty-1 as bot, flushed by ty + float *h_strip_top = NULL; + int h_strip_top_sy0 = 0; + for(int ty = 0; ty < rows && res == 0; ty++) { + const gboolean has_top = ty > 0; + const gboolean has_bot = ty < rows - 1; + + float *h_strip_bot = NULL; + int h_strip_bot_sy0 = 0; + if(has_bot) + { + h_strip_bot = g_try_malloc0((size_t)width * hstrip_h * sizeof(float)); + if(!h_strip_bot) { res = 1; break; } + } + + // v_strip_left = seam (tx-1)↔tx: rotated in from tx-1's right, flushed by tx + float *v_strip_left = NULL; + int v_strip_left_sx0 = 0, v_strip_left_sy0 = 0, v_strip_left_h = 0; + for(int tx = 0; tx < cols && res == 0; tx++) { if(control_job @@ -416,8 +442,6 @@ retry:; const int px_base = tx * step; const int py_end = (py_base + step > Hh) ? Hh : py_base + step; const int px_end = (px_base + step > Wh) ? Wh : px_base + step; - const int core_h = py_end - py_base; - const int core_w = px_end - px_base; // build 4ch input at packed half-res (T x T). geometry picks // the right origin and mirror-reflection bounds based on @@ -472,6 +496,9 @@ retry:; T, next_T); g_free(tile_in); g_free(tile_out); + g_free(h_strip_top); + g_free(h_strip_bot); + g_free(v_strip_left); T = next_T; goto retry; } @@ -519,49 +546,176 @@ retry:; in_mean, out_mean, (double)gain); } - // re-mosaic the core-valid region and un-preprocess - // model output dims: 2T x 2T (sensor pixels) for T x T packed tile. 
- // core valid region in model output starts at (2*O, 2*O) and spans - // (2*core_h) x (2*core_w) sensor pixels - const int core_sh = 2 * core_h; // sensor height of core - const int core_sw = 2 * core_w; - for(int dy = 0; dy < core_sh; dy++) + const gboolean has_left = tx > 0; + const gboolean has_right = tx < cols - 1; + const int sensor_py_base = y0 + 2 * py_base; + const int sensor_py_end = y0 + 2 * py_end; + const int sensor_px_base = x0 + 2 * px_base; + const int sensor_px_end = x0 + 2 * px_end; + + // cores edge-to-edge in y → one shared h_strip_bot origin per row + if(has_bot && tx == 0) h_strip_bot_sy0 = sensor_py_end - sensor_O; + + // v-strip excludes top/bot corners (h-strips own them) → y extent = pure interior + float *v_strip_right = NULL; + int v_strip_right_sx0 = 0, v_strip_right_sy0 = 0, v_strip_right_h = 0; + if(has_right) { - const int r = y0 + 2 * py_base + dy; // sensor row - const int my = 2 * O + dy; // model-output row - const size_t row_off = (size_t)my * tile_out_w; - for(int dx = 0; dx < core_sw; dx++) + v_strip_right_sx0 = sensor_px_end - sensor_O; + v_strip_right_sy0 = sensor_py_base + (has_top ? sensor_O : 0); + const int v_y_end = sensor_py_end - (has_bot ? sensor_O : 0); + v_strip_right_h = v_y_end - v_strip_right_sy0; + if(v_strip_right_h > 0) { - const int c = x0 + 2 * px_base + dx; // sensor col - const int mx = 2 * O + dx; + v_strip_right = g_try_malloc0((size_t)(2 * sensor_O) + * v_strip_right_h * sizeof(float)); + if(!v_strip_right) { res = 1; break; } + } + } - const int ch = FC(r, c, filters); // 0=R, 1=G, 2=B - const float model_val - = tile_out[(size_t)ch * tile_out_plane + row_off + mx]; + // extended extent = core ± seam where a neighbor exists; matches model-output validity + const int ext_y0 = has_top ? sensor_py_base - sensor_O : sensor_py_base; + const int ext_y1 = has_bot ? sensor_py_end + sensor_O : sensor_py_end; + const int ext_x0 = has_left ? 
sensor_px_base - sensor_O : sensor_px_base; + const int ext_x1 = has_right? sensor_px_end + sensor_O : sensor_px_end; - // reverse WB + normalisation → raw ADC + for(int sr = ext_y0; sr < ext_y1; sr++) + { + const int my = 2 * O + (sr - sensor_py_base); + const float ay = _seam_ay(sr, sensor_py_base, sensor_py_end, + sensor_O, has_top, has_bot); + const gboolean in_horiz_seam = (ay < 1.0f); + const size_t mo_row = (size_t)my * tile_out_w; + + float *h_strip = NULL; + int h_strip_sy0 = 0; + if(in_horiz_seam) + { + if(has_top && sr < sensor_py_base + sensor_O) + { + h_strip = h_strip_top; + h_strip_sy0 = h_strip_top_sy0; + } + else if(has_bot && sr >= sensor_py_end - sensor_O) + { + h_strip = h_strip_bot; + h_strip_sy0 = h_strip_bot_sy0; + } + } + const size_t h_strip_row_off = h_strip + ? (size_t)(sr - h_strip_sy0) * width : 0; + + for(int sc = ext_x0; sc < ext_x1; sc++) + { + const int mx = 2 * O + (sc - sensor_px_base); + const float ax = _seam_ax(sc, sensor_px_base, sensor_px_end, + sensor_O, has_left, has_right); + const gboolean in_vert_seam = (ax < 1.0f); + + const int ch = FC(sr, sc, filters); // 0=R, 1=G, 2=B + const float model_val + = tile_out[(size_t)ch * tile_out_plane + mo_row + mx]; const float raw_val - = _bayer_remosaic_raw(r, c, ch, model_val, &prep); + = _bayer_remosaic_raw(sr, sc, ch, model_val, &prep); - // strength blend: α=1 → denoised, α=0 → source CFA - const size_t pidx = (size_t)r * width + c; + const size_t pidx = (size_t)sr * width + sc; const float blended = alpha * raw_val + inv_alpha * cfa_in[pidx]; - const float clipped - = blended < 0.0f ? 0.0f - : (blended > clip_max ? 
clip_max : blended); - cfa_out[pidx] = (uint16_t)(clipped + 0.5f); + if(in_horiz_seam) + { + // h-strip owns corners too; weight ax·ay (other 3 tiles complete the sum) + if(h_strip) + h_strip[h_strip_row_off + sc] += ax * ay * blended; + } + else if(in_vert_seam) + { + float *v_strip = NULL; + int v_sx0 = 0, v_sy0 = 0; + if(has_left && sc < sensor_px_base + sensor_O) + { + v_strip = v_strip_left; + v_sx0 = v_strip_left_sx0; v_sy0 = v_strip_left_sy0; + } + else if(has_right && sc >= sensor_px_end - sensor_O) + { + v_strip = v_strip_right; + v_sx0 = v_strip_right_sx0; v_sy0 = v_strip_right_sy0; + } + if(v_strip) + { + const size_t vidx + = (size_t)(sr - v_sy0) * (2 * sensor_O) + (sc - v_sx0); + v_strip[vidx] += ax * blended; + } + } + else + { + const float clipped + = blended < 0.0f ? 0.0f + : (blended > clip_max ? clip_max : blended); + cfa_out[pidx] = (uint16_t)(clipped + 0.5f); + } } } + // tx-1 + tx ramps now sum to 1; strip = final value, flush + free + if(v_strip_left) + { + for(int sr = v_strip_left_sy0; + sr < v_strip_left_sy0 + v_strip_left_h; sr++) + { + const size_t vrow = (size_t)(sr - v_strip_left_sy0) * (2 * sensor_O); + for(int dxs = 0; dxs < 2 * sensor_O; dxs++) + { + const int sc = v_strip_left_sx0 + dxs; + const float v = v_strip_left[vrow + dxs]; + const float clipped + = v < 0.0f ? 0.0f : (v > clip_max ? clip_max : v); + cfa_out[(size_t)sr * width + sc] = (uint16_t)(clipped + 0.5f); + } + } + g_free(v_strip_left); + } + v_strip_left = v_strip_right; + v_strip_left_sx0 = v_strip_right_sx0; + v_strip_left_sy0 = v_strip_right_sy0; + v_strip_left_h = v_strip_right_h; + tile_count++; if(control_job) dt_control_job_set_progress(control_job, (double)tile_count / total_tiles); } + + // defensive: should be NULL after last col, free in case of mid-row break + g_free(v_strip_left); + v_strip_left = NULL; + + // ramps sum to 1, flush. 
clamp sc to working columns — outside cells + // were never written and would overwrite the cfa_in margin copy + if(h_strip_top) + { + for(int sr = h_strip_top_sy0; sr < h_strip_top_sy0 + hstrip_h; sr++) + { + const size_t hrow = (size_t)(sr - h_strip_top_sy0) * width; + for(int sc = x0; sc < x0 + 2 * Wh; sc++) + { + const float v = h_strip_top[hrow + sc]; + const float clipped + = v < 0.0f ? 0.0f : (v > clip_max ? clip_max : v); + cfa_out[(size_t)sr * width + sc] = (uint16_t)(clipped + 0.5f); + } + } + g_free(h_strip_top); + } + h_strip_top = h_strip_bot; + h_strip_top_sy0 = h_strip_bot_sy0; } + // last row never allocates a bottom strip — defensive free + g_free(h_strip_top); + g_free(tile_in); g_free(tile_out); diff --git a/src/common/ai/restore_raw_linear.c b/src/common/ai/restore_raw_linear.c index 23a63ea378ef..5a6d1b8ece04 100644 --- a/src/common/ai/restore_raw_linear.c +++ b/src/common/ai/restore_raw_linear.c @@ -548,8 +548,31 @@ retry:; int res = 0; int tile_count = 0; + // overlap blending — see restore_raw_bayer.c for the scheme. + // sensor_O = O (1:1 with input), strips are 3-ch planar matching rgb_out + const int sensor_O = O; + const int hstrip_h = 2 * sensor_O; + const size_t hstrip_chan = (size_t)w * hstrip_h; // floats per channel + + float *h_strip_top = NULL; + int h_strip_top_sy0 = 0; + for(int ty = 0; ty < rows && res == 0; ty++) { + const gboolean has_top = ty > 0; + const gboolean has_bot = ty < rows - 1; + + float *h_strip_bot = NULL; + int h_strip_bot_sy0 = 0; + if(has_bot) + { + h_strip_bot = g_try_malloc0(hstrip_chan * 3 * sizeof(float)); + if(!h_strip_bot) { res = 1; break; } + } + + float *v_strip_left = NULL; + int v_strip_left_sx0 = 0, v_strip_left_sy0 = 0, v_strip_left_h = 0; + for(int tx = 0; tx < cols && res == 0; tx++) { if(control_job @@ -564,8 +587,6 @@ retry:; const int x_base = tx * step; const int y_end = (y_base + step > h) ? h : y_base + step; const int x_end = (x_base + step > w) ? 
w : x_base + step; - const int core_h = y_end - y_base; - const int core_w = x_end - x_base; // extract T x T tile with mirror-pad at boundaries, planar for(int dy = 0; dy < T; dy++) @@ -596,6 +617,9 @@ retry:; T, next_T); g_free(tile_in); g_free(tile_out); + g_free(h_strip_top); + g_free(h_strip_bot); + g_free(v_strip_left); T = next_T; goto retry; } @@ -621,38 +645,184 @@ retry:; gain_ch[0], gain_ch[1], gain_ch[2]); } - // blend: write (α·denoised + (1-α)·source) per channel into - // the core-valid region. rgb_out was pre-filled with rgb_src - // so overlap gaps stay as source - for(int dy = 0; dy < core_h; dy++) + const gboolean has_left = tx > 0; + const gboolean has_right = tx < cols - 1; + const int sensor_py_base = y_base; + const int sensor_py_end = y_end; + const int sensor_px_base = x_base; + const int sensor_px_end = x_end; + + if(has_bot && tx == 0) h_strip_bot_sy0 = sensor_py_end - sensor_O; + + float *v_strip_right = NULL; + int v_strip_right_sx0 = 0, v_strip_right_sy0 = 0, v_strip_right_h = 0; + if(has_right) + { + v_strip_right_sx0 = sensor_px_end - sensor_O; + v_strip_right_sy0 = sensor_py_base + (has_top ? sensor_O : 0); + const int v_y_end = sensor_py_end - (has_bot ? sensor_O : 0); + v_strip_right_h = v_y_end - v_strip_right_sy0; + if(v_strip_right_h > 0) + { + v_strip_right = g_try_malloc0((size_t)(2 * sensor_O) + * v_strip_right_h * 3 * sizeof(float)); + if(!v_strip_right) { res = 1; break; } + } + } + + const int ext_y0 = has_top ? sensor_py_base - sensor_O : sensor_py_base; + const int ext_y1 = has_bot ? sensor_py_end + sensor_O : sensor_py_end; + const int ext_x0 = has_left ? sensor_px_base - sensor_O : sensor_px_base; + const int ext_x1 = has_right? 
sensor_px_end + sensor_O : sensor_px_end; + + for(int sr = ext_y0; sr < ext_y1; sr++) { - const int y = y_base + dy; - const int my = O + dy; - for(int dx = 0; dx < core_w; dx++) + const int my = O + (sr - sensor_py_base); + const float ay = _seam_ay(sr, sensor_py_base, sensor_py_end, + sensor_O, has_top, has_bot); + const gboolean in_horiz_seam = (ay < 1.0f); + + float *h_strip = NULL; + int h_strip_sy0 = 0; + if(in_horiz_seam) + { + if(has_top && sr < sensor_py_base + sensor_O) + { + h_strip = h_strip_top; + h_strip_sy0 = h_strip_top_sy0; + } + else if(has_bot && sr >= sensor_py_end - sensor_O) + { + h_strip = h_strip_bot; + h_strip_sy0 = h_strip_bot_sy0; + } + } + const size_t h_strip_row_off = h_strip + ? (size_t)(sr - h_strip_sy0) * w : 0; + + for(int sc = ext_x0; sc < ext_x1; sc++) { - const int x = x_base + dx; - const int mx = O + dx; + const int mx = O + (sc - sensor_px_base); + const float ax = _seam_ax(sc, sensor_px_base, sensor_px_end, + sensor_O, has_left, has_right); + const gboolean in_vert_seam = (ax < 1.0f); + const size_t tloc = (size_t)my * T + mx; - const size_t dst = (size_t)y * w + x; + const size_t dst = (size_t)sr * w + sc; - for(int k = 0; k < 3; k++) + if(in_horiz_seam) { - const float model_v - = tile_out[tloc + (size_t)k * per_ch]; - const float src_v = rgb_src[dst + (size_t)k * plane]; - rgb_out[dst + (size_t)k * plane] - = alpha * model_v + inv_alpha * src_v; + if(h_strip) + { + const float wgt = ax * ay; + for(int k = 0; k < 3; k++) + { + const float model_v = tile_out[tloc + (size_t)k * per_ch]; + const float src_v = rgb_src[dst + (size_t)k * plane]; + const float blended = alpha * model_v + inv_alpha * src_v; + h_strip[h_strip_row_off + sc + (size_t)k * hstrip_chan] + += wgt * blended; + } + } + } + else if(in_vert_seam) + { + float *v_strip = NULL; + int v_sx0 = 0, v_sy0 = 0, v_h = 0; + if(has_left && sc < sensor_px_base + sensor_O) + { + v_strip = v_strip_left; + v_sx0 = v_strip_left_sx0; v_sy0 = v_strip_left_sy0; + v_h = 
v_strip_left_h; + } + else if(has_right && sc >= sensor_px_end - sensor_O) + { + v_strip = v_strip_right; + v_sx0 = v_strip_right_sx0; v_sy0 = v_strip_right_sy0; + v_h = v_strip_right_h; + } + if(v_strip) + { + const size_t vchan = (size_t)(2 * sensor_O) * v_h; + const size_t vidx + = (size_t)(sr - v_sy0) * (2 * sensor_O) + (sc - v_sx0); + for(int k = 0; k < 3; k++) + { + const float model_v = tile_out[tloc + (size_t)k * per_ch]; + const float src_v = rgb_src[dst + (size_t)k * plane]; + const float blended = alpha * model_v + inv_alpha * src_v; + v_strip[vidx + (size_t)k * vchan] += ax * blended; + } + } + } + else + { + for(int k = 0; k < 3; k++) + { + const float model_v = tile_out[tloc + (size_t)k * per_ch]; + const float src_v = rgb_src[dst + (size_t)k * plane]; + rgb_out[dst + (size_t)k * plane] + = alpha * model_v + inv_alpha * src_v; + } + } + } + } + + // tx-1 + tx ramps sum to 1; flush + free + if(v_strip_left) + { + const size_t vchan = (size_t)(2 * sensor_O) * v_strip_left_h; + for(int sr = v_strip_left_sy0; + sr < v_strip_left_sy0 + v_strip_left_h; sr++) + { + const size_t vrow = (size_t)(sr - v_strip_left_sy0) * (2 * sensor_O); + for(int dxs = 0; dxs < 2 * sensor_O; dxs++) + { + const int sc = v_strip_left_sx0 + dxs; + const size_t dst = (size_t)sr * w + sc; + for(int k = 0; k < 3; k++) + rgb_out[dst + (size_t)k * plane] + = v_strip_left[vrow + dxs + (size_t)k * vchan]; } } + g_free(v_strip_left); } + v_strip_left = v_strip_right; + v_strip_left_sx0 = v_strip_right_sx0; + v_strip_left_sy0 = v_strip_right_sy0; + v_strip_left_h = v_strip_right_h; tile_count++; if(control_job) dt_control_job_set_progress(control_job, (double)tile_count / total_tiles); } + + g_free(v_strip_left); + v_strip_left = NULL; + + // ramps sum to 1, flush. 
no column clamp needed (no working-region offset) + if(h_strip_top) + { + for(int sr = h_strip_top_sy0; sr < h_strip_top_sy0 + hstrip_h; sr++) + { + const size_t hrow = (size_t)(sr - h_strip_top_sy0) * w; + for(int sc = 0; sc < w; sc++) + { + const size_t dst = (size_t)sr * w + sc; + for(int k = 0; k < 3; k++) + rgb_out[dst + (size_t)k * plane] + = h_strip_top[hrow + sc + (size_t)k * hstrip_chan]; + } + } + g_free(h_strip_top); + } + h_strip_top = h_strip_bot; + h_strip_top_sy0 = h_strip_bot_sy0; } + g_free(h_strip_top); + g_free(tile_in); g_free(tile_out); From 7408dba74c08f77d24b2ed9759e0f101604001e5 Mon Sep 17 00:00:00 2001 From: Andrii Ryzhkov Date: Mon, 27 Apr 2026 13:25:59 +0200 Subject: [PATCH 5/9] AI raw denoise: scalar match_gain on linear path --- src/common/ai/restore_raw_linear.c | 63 +++++++++++++----------------- 1 file changed, 28 insertions(+), 35 deletions(-) diff --git a/src/common/ai/restore_raw_linear.c b/src/common/ai/restore_raw_linear.c index 5a6d1b8ece04..1d62e7be0a3d 100644 --- a/src/common/ai/restore_raw_linear.c +++ b/src/common/ai/restore_raw_linear.c @@ -85,33 +85,29 @@ static void _linear_build_M_boosted(const float input_to_cam[9], M[k * 3 + i] = input_to_cam[k * 3 + i] * inv_boost / wb_norm[k]; } -// per-channel scalar match_gain: tile_out[c] *= in_mean[c]/out_mean[c]. -// RawNIND linear output is arbitrary-scale camRGB-in-lin_rec2020 and -// match_gain() is the canonical post-step that puts it back on the -// input's scale. applied in place. out_gain[3] optional (batch uses it -// for a tile0 diagnostic) -static void _linear_gain_match_3ch(const float *tile_in, - float *tile_out, - size_t per_ch, - float out_gain[3]) +// scalar match_gain: tile_out *= in_mean / out_mean, where both means +// are taken over all 3 channels and all spatial positions. mirrors the +// upstream Python rawproc.match_gain (mean over (-1, -2, -3) dims), +// which the model was trained against. applied in place. 
out_gain +// optional (batch uses it for a tile0 diagnostic) +static void _linear_gain_match(const float *tile_in, + float *tile_out, + size_t per_ch, + float *out_gain) { - for(int k = 0; k < 3; k++) + const size_t total = per_ch * 3; + double in_sum = 0.0, out_sum = 0.0; + for(size_t i = 0; i < total; i++) { - const float *pi = tile_in + (size_t)k * per_ch; - float *po = tile_out + (size_t)k * per_ch; - double in_sum = 0.0, out_sum = 0.0; - for(size_t i = 0; i < per_ch; i++) - { - in_sum += pi[i]; - out_sum += po[i]; - } - const double im = in_sum / (double)per_ch; - const double om = out_sum / (double)per_ch; - const float g = (fabs(om) > 1e-8) ? (float)(im / om) : 1.0f; - if(g != 1.0f) - for(size_t i = 0; i < per_ch; i++) po[i] *= g; - if(out_gain) out_gain[k] = g; + in_sum += tile_in[i]; + out_sum += tile_out[i]; } + const double im = in_sum / (double)total; + const double om = out_sum / (double)total; + const float g = (fabs(om) > 1e-8) ? (float)(im / om) : 1.0f; + if(g != 1.0f) + for(size_t i = 0; i < total; i++) tile_out[i] *= g; + if(out_gain) *out_gain = g; } // derive + apply an exposure boost to a planar 3ch lin_rec2020 buffer. @@ -630,20 +626,17 @@ retry:; break; } - // scalar match_gain per channel: tile_out *= in_mean / out_mean - // (applied in place by the helper). skipped for ABSOLUTE-scale - // models whose output is already calibrated + // scalar match_gain: tile_out *= in_mean / out_mean (applied in + // place by the helper). 
skipped for ABSOLUTE-scale models whose + // output is already calibrated const size_t per_ch = tile_plane; - float gain_ch[3] = { 1.0f, 1.0f, 1.0f }; + float gain = 1.0f; if(ctx->output_scale == DT_RESTORE_OUT_MATCH_GAIN) - _linear_gain_match_3ch(tile_in, tile_out, per_ch, gain_ch); + _linear_gain_match(tile_in, tile_out, per_ch, &gain); if(tx == 0 && ty == 0) - { dt_print(DT_DEBUG_AI, - "[restore_raw_linear] tile0 match_gain " - "R=%.3e G=%.3e B=%.3e", - gain_ch[0], gain_ch[1], gain_ch[2]); - } + "[restore_raw_linear] tile0 match_gain=%.3e", + (double)gain); const gboolean has_left = tx > 0; const gboolean has_right = tx < cols - 1; @@ -1046,7 +1039,7 @@ int dt_restore_raw_linear_preview_piped(dt_restore_context_t *ctx, } if(ctx->output_scale == DT_RESTORE_OUT_MATCH_GAIN) - _linear_gain_match_3ch(tile_in, tile_out, tile_plane, NULL); + _linear_gain_match(tile_in, tile_out, tile_plane, NULL); g_free(tile_in); // build matrix to reverse matrix + WB + boost + normalise From edd9ac1ba8a6e6faac88ce72566efe19a9e36183 Mon Sep 17 00:00:00 2001 From: Andrii Ryzhkov Date: Mon, 27 Apr 2026 16:51:52 +0200 Subject: [PATCH 6/9] DNG writer: consolidate into imageio/imageio_dng.{c,h} --- src/CMakeLists.txt | 2 +- src/common/ai/restore_raw_bayer.h | 2 +- src/common/ai/restore_raw_linear.h | 2 +- src/common/dng_writer.h | 105 ------ src/control/jobs/control_jobs.c | 22 +- .../dng_writer.c => imageio/imageio_dng.c} | 274 +++++++++++++- src/imageio/imageio_dng.h | 343 ++++++------------ src/libs/neural_restore.c | 16 +- 8 files changed, 386 insertions(+), 380 deletions(-) delete mode 100644 src/common/dng_writer.h rename src/{common/dng_writer.c => imageio/imageio_dng.c} (58%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 035104ed95d4..18916433608e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -160,6 +160,7 @@ FILE(GLOB SOURCE_FILES "gui/welcome.c" "gui/styles_dialog.c" "imageio/imageio.c" + "imageio/imageio_dng.c" "imageio/imageio_jpeg.c" 
"imageio/imageio_module.c" "imageio/imageio_pfm.c" @@ -465,7 +466,6 @@ if(USE_AI) "common/ai/restore_rgb.c" "common/ai/restore_raw_bayer.c" "common/ai/restore_raw_linear.c" - "common/dng_writer.c" "develop/masks/object.c" "gui/preferences_ai.c" ) diff --git a/src/common/ai/restore_raw_bayer.h b/src/common/ai/restore_raw_bayer.h index bd25ff48ae57..43ffdabb2b4f 100644 --- a/src/common/ai/restore_raw_bayer.h +++ b/src/common/ai/restore_raw_bayer.h @@ -23,7 +23,7 @@ // WB, 2x2 pack), tiled inference with overlap blending, postprocessing // (un-WB, un-normalize), and re-mosaic back to the original CFA // pattern. produces a uint16 sensor-sized mosaic that is written to -// DNG by dt_dng_write_cfa_bayer(). +// DNG by dt_imageio_dng_write_cfa_bayer(). // // this is kept separate from the RGB denoise/upscale path in restore.c // because: diff --git a/src/common/ai/restore_raw_linear.h b/src/common/ai/restore_raw_linear.h index 53f3eaeec980..183db6d98bb7 100644 --- a/src/common/ai/restore_raw_linear.h +++ b/src/common/ai/restore_raw_linear.h @@ -32,7 +32,7 @@ // output is a 3ch float RGB buffer at full sensor resolution, in the // same camRGB + raw ADC range as the source. the neural_restore batch // path re-mosaics nothing (this sensor type can't be round-tripped -// through a CFA DNG) and writes a LinearRaw DNG via dng_writer. +// through a CFA DNG) and writes a LinearRaw DNG via imageio_dng. #pragma once diff --git a/src/common/dng_writer.h b/src/common/dng_writer.h deleted file mode 100644 index e039cc848a09..000000000000 --- a/src/common/dng_writer.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - This file is part of darktable, - Copyright (C) 2026 darktable developers. - - darktable is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - darktable is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with darktable. If not, see . -*/ - -// dng_writer — minimal DNG CFA writer -// -// writes a single-plane uint16 Bayer mosaic plus enough DNG metadata -// to let a raw processor (darktable, adobe, etc.) re-import the file -// and run the normal raw pipeline: black/white level, CFA pattern, -// AsShotNeutral (white balance), ColorMatrix1 (camRGB -> CIE XYZ), -// camera make/model, and a pass-through of the source EXIF blob. -// -// this writer is intentionally narrow in scope: -// - bayer only (no X-Trans, no LinearRaw demosaiced DNG) -// - uncompressed strip layout -// - single IFD (no embedded JPEG preview or thumbnails) -// - 16-bit integer data only -// -// the consumer of the DNG (darktable itself) does not need more than -// this for the neural restore round-trip. - -#pragma once - -#include -#include - -struct dt_image_t; - -// @brief Write a Bayer CFA mosaic as a DNG file. -// -// The output file contains a single IFD with PhotometricInterpretation=CFA. 
-// All DNG metadata required for darktable re-import is sourced from @p img: -// - BlackLevel[4] from img->raw_black_level_separate -// - WhiteLevel from img->raw_white_point -// - CFAPattern / CFARepeatDim from img->buf_dsc.filters (dcraw format) -// - AsShotNeutral from img->wb_coeffs (inverted) -// - ColorMatrix1 from img->adobe_XYZ_to_CAM -// - Make / Model / UniqueModel from img->camera_maker / camera_model -// -// @param filename output path (UTF-8) -// @param cfa Bayer mosaic (uint16, width * height samples, row-major) -// @param width image width in pixels (CFA samples per row) -// @param height image height in rows -// @param img source image, for DNG metadata -// @param exif_blob optional Exif blob to embed (NULL = skip) -// @param exif_len size of exif_blob in bytes -// @return 0 on success, non-zero on failure (file is removed on failure) -int dt_dng_write_cfa_bayer(const char *filename, - const uint16_t *cfa, - int width, - int height, - const struct dt_image_t *img, - const void *exif_blob, - int exif_len); - -// @brief Write a demosaicked 3-channel linear DNG. -// -// Used for sensors the bayer DNG round-trip can't handle (X-Trans, -// Foveon-like, pre-demosaicked raws). The output file has -// PhotometricInterpretation=LinearRaw, SamplesPerPixel=3, and carries -// the camera's ColorMatrix1 / AsShotNeutral / BlackLevel / WhiteLevel -// so darktable re-imports it as a raw-origin image and skips its own -// demosaic stage. -// -// Pixel data is interpreted as float-normalized camRGB in [0, ~1+] -// (1.0 = source sensor white point after black subtract). The writer -// scales that to uint16 using black = img->raw_black_level, -// white = img->raw_white_point, so the encoding matches what the -// corresponding raw CFA data would be in ADC units. 
-// -// @param filename output path (UTF-8) -// @param rgb interleaved 3ch float RGB, width*height*3 samples -// @param width image width in pixels -// @param height image height in pixels -// @param img source image, for DNG metadata + encoding range -// @param exif_blob optional Exif blob to embed (NULL = skip) -// @param exif_len size of exif_blob in bytes -// @return 0 on success, non-zero on failure (file removed on failure) -int dt_dng_write_linear(const char *filename, - const float *rgb, - int width, - int height, - const struct dt_image_t *img, - const void *exif_blob, - int exif_len); - -// clang-format off -// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py -// vim: shiftwidth=2 expandtab tabstop=2 cindent -// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified; -// clang-format on diff --git a/src/control/jobs/control_jobs.c b/src/control/jobs/control_jobs.c index db18726fb9f2..9c8fc02e4a63 100644 --- a/src/control/jobs/control_jobs.c +++ b/src/control/jobs/control_jobs.c @@ -693,17 +693,17 @@ static int32_t _control_merge_hdr_job_run(dt_job_t *job) char *c = pathname + strlen(pathname); while(*c != '.' 
&& c > pathname) c--; g_strlcpy(c, "-hdr.dng", sizeof(pathname) - (c - pathname)); - dt_imageio_write_dng(pathname, - d.pixels, - d.wd, - d.ht, - exif, - exif_len, - d.first_filter, - (const uint8_t (*)[6])d.first_xtrans, - 1.0f, - (const float (*))d.wb_coeffs, - d.adobe_XYZ_to_CAM); + dt_imageio_dng_write_float(pathname, + d.pixels, + d.wd, + d.ht, + exif, + exif_len, + d.first_filter, + (const uint8_t (*)[6])d.first_xtrans, + 1.0f, + (const float (*))d.wb_coeffs, + d.adobe_XYZ_to_CAM); free(exif); dt_control_job_set_progress(job, 1.0); diff --git a/src/common/dng_writer.c b/src/imageio/imageio_dng.c similarity index 58% rename from src/common/dng_writer.c rename to src/imageio/imageio_dng.c index d26c6d27181e..255b377f9539 100644 --- a/src/common/dng_writer.c +++ b/src/imageio/imageio_dng.c @@ -1,6 +1,6 @@ /* This file is part of darktable, - Copyright (C) 2026 darktable developers. + Copyright (C) 2011-2026 darktable developers. darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,7 +16,7 @@ along with darktable. If not, see . 
*/ -#include "common/dng_writer.h" +#include "imageio/imageio_dng.h" #include "common/darktable.h" #include "common/exif.h" #include "common/image.h" @@ -24,6 +24,10 @@ #include #include +#include +#include +#include +#include #include #include @@ -45,13 +49,13 @@ static void _cfa_bytes_from_filters(uint32_t filters, uint8_t out[4]) out[3] = FC(1, 1, filters); } -int dt_dng_write_cfa_bayer(const char *filename, - const uint16_t *cfa, - int width, - int height, - const dt_image_t *img, - const void *exif_blob, - int exif_len) +int dt_imageio_dng_write_cfa_bayer(const char *filename, + const uint16_t *cfa, + int width, + int height, + const dt_image_t *img, + const void *exif_blob, + int exif_len) { if(!filename || !cfa || !img || width <= 0 || height <= 0) return 1; @@ -221,13 +225,13 @@ int dt_dng_write_cfa_bayer(const char *filename, return res; } -int dt_dng_write_linear(const char *filename, - const float *rgb, - int width, - int height, - const dt_image_t *img, - const void *exif_blob, - int exif_len) +int dt_imageio_dng_write_linear(const char *filename, + const float *rgb, + int width, + int height, + const dt_image_t *img, + const void *exif_blob, + int exif_len) { if(!filename || !rgb || !img || width <= 0 || height <= 0) return 1; @@ -381,6 +385,244 @@ int dt_dng_write_linear(const char *filename, return res; } +// ============================================================================= +// dt_imageio_dng_write_float — hand-rolled byte-level TIFF/DNG writer +// +// Used by HDR merge. Writes a 32-bit float CFA DNG (Bayer or X-Trans). 
+// The helpers and macros below are private to this writer and are not +// shared with the libtiff-based uint16 writers above +// ============================================================================= + +// TIFF type codes (libtiff knows these natively, so the uint16 writers +// above don't need them) +#define BYTE 1 +#define ASCII 2 +#define SHORT 3 +#define LONG 4 +#define RATIONAL 5 +#define SRATIONAL 10 + +#define HEADBUFFSIZE 1024 + +static inline void _imageio_dng_write_buf(uint8_t *buf, const uint32_t d, const int val) +{ + if(d + 4 >= HEADBUFFSIZE) return; + buf[d] = val & 0xff; + buf[d + 1] = (val >> 8) & 0xff; + buf[d + 2] = (val >> 16) & 0xff; + buf[d + 3] = val >> 24; +} + +static inline int _imageio_dng_make_tag( + const uint16_t tag, + const uint16_t type, + const uint32_t lng, + const uint32_t fld, + uint8_t *buf, + const uint32_t b, + uint8_t *cnt) +{ + if(b + 12 < HEADBUFFSIZE) + { + _imageio_dng_write_buf(buf, b, (type << 16) | tag); + _imageio_dng_write_buf(buf, b+4, lng); + _imageio_dng_write_buf(buf, b+8, fld); + *cnt = *cnt + 1; + } + return b + 12; +} + +static inline void _imageio_dng_write_tiff_header( + FILE *fp, + uint32_t xs, + uint32_t ys, + float Tv, + float Av, + float f, + float iso, + uint32_t filter, + const uint8_t xtrans[6][6], + const float whitelevel, + const dt_aligned_pixel_t wb_coeffs, + const float adobe_XYZ_to_CAM[4][3]) +{ + const uint32_t channels = 1; + uint8_t buf[HEADBUFFSIZE]; + uint8_t cnt = 0; + + // this matrix is generic for XYZ->sRGB / D65 + int m[9] = { 3240454, -1537138, -498531, -969266, 1876010, 41556, 55643, -204025, 1057225 }; + int den = 1000000; + + memset(buf, 0, sizeof(buf)); + /* TIFF file header, little-endian */ + buf[0] = 0x49; + buf[1] = 0x49; + buf[2] = 0x2a; + buf[4] = 8; + + // If you want to add other tags written to a dng file include the the ID in the enum to + // keep track of written tags so we don't a) have leaks or b) overwrite anything in data section + const int first_tag = 
__LINE__ + 3; + enum write_tags + { + EXIF_TAG_SUBFILE = 254, /* New subfile type. */ + EXIF_TAG_IMGWIDTH = 256, /* Image width. */ + EXIF_TAG_IMGLENGTH = 257, /* Image length. */ + EXIF_TAG_BPS = 258, /* Bits per sample: 32-bit float */ + EXIF_TAG_COMPRESS = 259, /* Compression. */ + EXIF_TAG_PHOTOMINTREP = 262, /* Photo interp: CFA */ + EXIF_TAG_STRIP_OFFSET = 273, /* Strip offset. */ + EXIF_TAG_ORIENTATION = 274, /* Orientation. */ + EXIF_TAG_SAMPLES_PER_PIXEL = 277, /* Samples per pixel. */ + EXIF_TAG_ROWS_PER_STRIP = 278, /* Rows per strip. */ + EXIF_TAG_STRIP_BCOUNT = 279, /* Strip byte count. */ + EXIF_TAG_PLANAR_CONFIG = 284, /* Planar configuration. */ + EXIF_TAG_SAMPLE_FORMAT = 339, /* SampleFormat = 3 => ieee floating point */ + EXIF_TAG_REPEAT_PATTERN = 33421, /* pattern repeat */ + EXIF_TAG_SENS_PATTERN = 33422, /* sensor pattern */ + EXIF_TAG_VERSION = 50706, /* DNG Version */ + EXIF_TAG_WHITE_LEVEL = 50717, /* White level */ + EXIF_TAG_COLOR_MATRIX1 = 50721, /* ColorMatrix1 (XYZ->native cam) */ + EXIF_TAG_SHOT_NEUTRAL = 50728, /* AsShotNeutral for rawspeed Dngdecoder camera white balance */ + EXIF_TAG_ILLUMINANT1 = 50778, /* CalibrationIlluminant1 */ + }; + buf[8] = (uint8_t)(__LINE__ - first_tag - 1); /* number of entries */ + + uint32_t b = 10; + uint32_t data = 10 + buf[8] * 12 + 4; // takes care of the header, entries, and termination + + b = _imageio_dng_make_tag(EXIF_TAG_SUBFILE, LONG, 1, 0, buf, b, &cnt); + b = _imageio_dng_make_tag(EXIF_TAG_IMGWIDTH, LONG, 1, xs, buf, b, &cnt); + b = _imageio_dng_make_tag(EXIF_TAG_IMGLENGTH, LONG, 1, ys, buf, b, &cnt); + b = _imageio_dng_make_tag(EXIF_TAG_BPS, SHORT, 1, 32, buf, b, &cnt); + b = _imageio_dng_make_tag(EXIF_TAG_COMPRESS, SHORT, 1, 1, buf, b, &cnt); + b = _imageio_dng_make_tag(EXIF_TAG_PHOTOMINTREP, SHORT, 1, 32803, buf, b, &cnt); + b = _imageio_dng_make_tag(EXIF_TAG_STRIP_OFFSET, LONG, 1, 0, buf, b, &cnt); + uint32_t ofst = b - 4; /* remember buffer address for updating strip offset later */ + b 
= _imageio_dng_make_tag(EXIF_TAG_ORIENTATION, SHORT, 1, 1, buf, b, &cnt); + b = _imageio_dng_make_tag(EXIF_TAG_SAMPLES_PER_PIXEL, SHORT, 1, channels, buf, b, &cnt); + b = _imageio_dng_make_tag(EXIF_TAG_ROWS_PER_STRIP, LONG, 1, ys, buf, b, &cnt); + b = _imageio_dng_make_tag(EXIF_TAG_STRIP_BCOUNT, LONG, 1, (ys * xs * channels*4), buf, b, &cnt); + b = _imageio_dng_make_tag(EXIF_TAG_PLANAR_CONFIG, SHORT, 1, 1, buf, b, &cnt); + b = _imageio_dng_make_tag(EXIF_TAG_SAMPLE_FORMAT, SHORT, 1, 3, buf, b, &cnt); + + if(filter == 9u) // xtrans + b = _imageio_dng_make_tag(EXIF_TAG_REPEAT_PATTERN, SHORT, 2, (6 << 16) | 6, buf, b, &cnt); + else + b = _imageio_dng_make_tag(EXIF_TAG_REPEAT_PATTERN, SHORT, 2, (2 << 16) | 2, buf, b, &cnt); + + uint32_t cfapattern = 0; + switch(filter) + { + case 0x94949494: + cfapattern = (2 << 24) | (1 << 16) | (1 << 8) | 0; // rggb + break; + case 0x49494949: + cfapattern = (1 << 24) | (0 << 16) | (2 << 8) | 1; // gbrg + break; + case 0x61616161: + cfapattern = (1 << 24) | (2 << 16) | (0 << 8) | 1; // grbg + break; + default: // case 0x16161616: + cfapattern = (0 << 24) | (1 << 16) | (1 << 8) | 2; // bggr + break; + } + + if(filter == 9u) // xtrans + { + b = _imageio_dng_make_tag(EXIF_TAG_SENS_PATTERN, BYTE, 36, data, buf, b, &cnt); /* xtrans PATTERN */ + // apparently this doesn't need byteswap: + memcpy(buf + data, xtrans, sizeof(uint8_t)*36); + data += 36; + } + else // bayer + b = _imageio_dng_make_tag(EXIF_TAG_SENS_PATTERN, BYTE, 4, cfapattern, buf, b, &cnt); /* bayer PATTERN */ + + b = _imageio_dng_make_tag(EXIF_TAG_VERSION, BYTE, 4, 1 | (4 << 8), buf, b, &cnt); + + // WhiteLevel is straight integer even for float DNGs + b = _imageio_dng_make_tag(EXIF_TAG_WHITE_LEVEL, LONG, 1, (uint32_t)roundf(whitelevel), buf, b, &cnt); + + // ColorMatrix1 try to get camera matrix else m[k] like before + if(dt_is_valid_colormatrix(adobe_XYZ_to_CAM[0][0])) + { + den = 10000; + for(int k= 0; k < 3; k++) + for(int i= 0; i < 3; i++) + m[k*3+i] = 
roundf(adobe_XYZ_to_CAM[k][i] * den); + } + b = _imageio_dng_make_tag(EXIF_TAG_COLOR_MATRIX1, SRATIONAL, 9, data, buf, b, &cnt); /* ColorMatrix1 (XYZ->native cam) */ + for(int k = 0; k < 9; k++) + { + _imageio_dng_write_buf(buf, data + k*8, m[k]); + _imageio_dng_write_buf(buf, data+4 + k*8, den); + } + data += 9 * 8; + + b = _imageio_dng_make_tag(EXIF_TAG_SHOT_NEUTRAL, RATIONAL, 3, data, buf, b, &cnt); + den = 1000000; + for(int k = 0; k < 3; k++) + { + const float coeff = roundf(((float)den * wb_coeffs[1]) / wb_coeffs[k]); + _imageio_dng_write_buf(buf, data + k*8, (int)coeff); + _imageio_dng_write_buf(buf, data+4 + k*8, den); + } + data += 3 * 8; + + b = _imageio_dng_make_tag(EXIF_TAG_ILLUMINANT1, SHORT, 1, DT_LS_D65, buf, b, &cnt); + + // We have all tags using data now written so we can finally use strip offset + _imageio_dng_write_buf(buf, ofst, data); + + /* Termination is implicit: next IFD already 0 when buf initialized */ + + if(buf[8] != cnt) + { + dt_print(DT_DEBUG_ALWAYS, "[dng_write_header] can't write valid header, unexpected number of entries!"); + return; + } + + if(data >= HEADBUFFSIZE) + { + dt_print(DT_DEBUG_ALWAYS, "[dng_write_header] can't write valid header as it exceeds buffer size!"); + return; + } + + // exif is written later, by exiv2: + const int written = fwrite(buf, 1, data, fp); + if(written != data) dt_print(DT_DEBUG_ALWAYS, "[dng_write_header] failed to write image header!"); +} + + +void dt_imageio_dng_write_float( + const char *filename, const float *const pixel, const int wd, + const int ht, void *exif, const int exif_len, const uint32_t filter, + const uint8_t xtrans[6][6], + const float whitelevel, + const dt_aligned_pixel_t wb_coeffs, + const float adobe_XYZ_to_CAM[4][3]) +{ + FILE *f = g_fopen(filename, "wb"); + if(f) + { + _imageio_dng_write_tiff_header(f, wd, ht, 1.0f / 100.0f, 1.0f / 4.0f, 50.0f, 100.0f, + filter, xtrans, whitelevel, wb_coeffs, adobe_XYZ_to_CAM); + const int k = fwrite(pixel, sizeof(float), (size_t)wd * ht, 
f); + if(k != wd * ht) dt_print(DT_DEBUG_ALWAYS, "[dng_write] Error writing image data to %s", filename); + fclose(f); + if(exif) dt_exif_write_blob(exif, exif_len, filename, 0); + } +} + +#undef BYTE +#undef ASCII +#undef SHORT +#undef LONG +#undef RATIONAL +#undef SRATIONAL +#undef HEADBUFFSIZE + // clang-format off // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py // vim: shiftwidth=2 expandtab tabstop=2 cindent diff --git a/src/imageio/imageio_dng.h b/src/imageio/imageio_dng.h index a4068e549069..e72ca801ede8 100644 --- a/src/imageio/imageio_dng.h +++ b/src/imageio/imageio_dng.h @@ -1,6 +1,6 @@ /* This file is part of darktable, - Copyright (C) 2011-2023 darktable developers. + Copyright (C) 2011-2026 darktable developers. darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,244 +16,113 @@ along with darktable. If not, see . */ -#pragma once - -// writes buffers as digital negative (dng) raw images - -#include -#include -#include -#include - -#include "common/darktable.h" -#include "common/exif.h" - -#define BYTE 1 -#define ASCII 2 -#define SHORT 3 -#define LONG 4 -#define RATIONAL 5 -#define SRATIONAL 10 - -#define HEADBUFFSIZE 1024 - -static inline void _imageio_dng_write_buf(uint8_t *buf, const uint32_t d, const int val) -{ - if(d + 4 >= HEADBUFFSIZE) return; - buf[d] = val & 0xff; - buf[d + 1] = (val >> 8) & 0xff; - buf[d + 2] = (val >> 16) & 0xff; - buf[d + 3] = val >> 24; -} - -static inline int _imageio_dng_make_tag( - const uint16_t tag, - const uint16_t type, - const uint32_t lng, - const uint32_t fld, - uint8_t *buf, - const uint32_t b, - uint8_t *cnt) -{ - if(b + 12 < HEADBUFFSIZE) - { - _imageio_dng_write_buf(buf, b, (type << 16) | tag); - _imageio_dng_write_buf(buf, b+4, lng); - _imageio_dng_write_buf(buf, b+8, fld); - *cnt = *cnt + 1; - } - return b + 12; -} - -static inline void 
_imageio_dng_write_tiff_header( - FILE *fp, - uint32_t xs, - uint32_t ys, - float Tv, - float Av, - float f, - float iso, - uint32_t filter, - const uint8_t xtrans[6][6], - const float whitelevel, - const dt_aligned_pixel_t wb_coeffs, - const float adobe_XYZ_to_CAM[4][3]) -{ - const uint32_t channels = 1; - uint8_t buf[HEADBUFFSIZE]; - uint8_t cnt = 0; - - // this matrix is generic for XYZ->sRGB / D65 - int m[9] = { 3240454, -1537138, -498531, -969266, 1876010, 41556, 55643, -204025, 1057225 }; - int den = 1000000; - - memset(buf, 0, sizeof(buf)); - /* TIFF file header, little-endian */ - buf[0] = 0x49; - buf[1] = 0x49; - buf[2] = 0x2a; - buf[4] = 8; - - // If you want to add other tags written to a dng file include the the ID in the enum to - // keep track of written tags so we don't a) have leaks or b) overwrite anything in data section - const int first_tag = __LINE__ + 3; - enum write_tags - { - EXIF_TAG_SUBFILE = 254, /* New subfile type. */ - EXIF_TAG_IMGWIDTH = 256, /* Image width. */ - EXIF_TAG_IMGLENGTH = 257, /* Image length. */ - EXIF_TAG_BPS = 258, /* Bits per sample: 32-bit float */ - EXIF_TAG_COMPRESS = 259, /* Compression. */ - EXIF_TAG_PHOTOMINTREP = 262, /* Photo interp: CFA */ - EXIF_TAG_STRIP_OFFSET = 273, /* Strip offset. */ - EXIF_TAG_ORIENTATION = 274, /* Orientation. */ - EXIF_TAG_SAMPLES_PER_PIXEL = 277, /* Samples per pixel. */ - EXIF_TAG_ROWS_PER_STRIP = 278, /* Rows per strip. */ - EXIF_TAG_STRIP_BCOUNT = 279, /* Strip byte count. */ - EXIF_TAG_PLANAR_CONFIG = 284, /* Planar configuration. 
*/ - EXIF_TAG_SAMPLE_FORMAT = 339, /* SampleFormat = 3 => ieee floating point */ - EXIF_TAG_REPEAT_PATTERN = 33421, /* pattern repeat */ - EXIF_TAG_SENS_PATTERN = 33422, /* sensor pattern */ - EXIF_TAG_VERSION = 50706, /* DNG Version */ - EXIF_TAG_WHITE_LEVEL = 50717, /* White level */ - EXIF_TAG_COLOR_MATRIX1 = 50721, /* ColorMatrix1 (XYZ->native cam) */ - EXIF_TAG_SHOT_NEUTRAL = 50728, /* AsShotNeutral for rawspeed Dngdecoder camera white balance */ - EXIF_TAG_ILLUMINANT1 = 50778, /* CalibrationIlluminant1 */ - }; - buf[8] = (uint8_t)(__LINE__ - first_tag - 1); /* number of entries */ - - uint32_t b = 10; - uint32_t data = 10 + buf[8] * 12 + 4; // takes care of the header, entries, and termination - - b = _imageio_dng_make_tag(EXIF_TAG_SUBFILE, LONG, 1, 0, buf, b, &cnt); - b = _imageio_dng_make_tag(EXIF_TAG_IMGWIDTH, LONG, 1, xs, buf, b, &cnt); - b = _imageio_dng_make_tag(EXIF_TAG_IMGLENGTH, LONG, 1, ys, buf, b, &cnt); - b = _imageio_dng_make_tag(EXIF_TAG_BPS, SHORT, 1, 32, buf, b, &cnt); - b = _imageio_dng_make_tag(EXIF_TAG_COMPRESS, SHORT, 1, 1, buf, b, &cnt); - b = _imageio_dng_make_tag(EXIF_TAG_PHOTOMINTREP, SHORT, 1, 32803, buf, b, &cnt); - b = _imageio_dng_make_tag(EXIF_TAG_STRIP_OFFSET, LONG, 1, 0, buf, b, &cnt); - uint32_t ofst = b - 4; /* remember buffer address for updating strip offset later */ - b = _imageio_dng_make_tag(EXIF_TAG_ORIENTATION, SHORT, 1, 1, buf, b, &cnt); - b = _imageio_dng_make_tag(EXIF_TAG_SAMPLES_PER_PIXEL, SHORT, 1, channels, buf, b, &cnt); - b = _imageio_dng_make_tag(EXIF_TAG_ROWS_PER_STRIP, LONG, 1, ys, buf, b, &cnt); - b = _imageio_dng_make_tag(EXIF_TAG_STRIP_BCOUNT, LONG, 1, (ys * xs * channels*4), buf, b, &cnt); - b = _imageio_dng_make_tag(EXIF_TAG_PLANAR_CONFIG, SHORT, 1, 1, buf, b, &cnt); - b = _imageio_dng_make_tag(EXIF_TAG_SAMPLE_FORMAT, SHORT, 1, 3, buf, b, &cnt); - - if(filter == 9u) // xtrans - b = _imageio_dng_make_tag(EXIF_TAG_REPEAT_PATTERN, SHORT, 2, (6 << 16) | 6, buf, b, &cnt); - else - b = 
_imageio_dng_make_tag(EXIF_TAG_REPEAT_PATTERN, SHORT, 2, (2 << 16) | 2, buf, b, &cnt); +// imageio_dng — DNG writers +// +// Three entry points for three different DNG flavors: +// - dt_imageio_dng_write_float 32-bit float CFA (HDR merge) +// - dt_imageio_dng_write_cfa_bayer 16-bit uint Bayer CFA (raw round-trip) +// - dt_imageio_dng_write_linear 16-bit uint LinearRaw 3ch (demosaicked) +// +// The float writer is hand-rolled byte assembly into a small TIFF +// header buffer; the two uint16 writers use libtiff. - uint32_t cfapattern = 0; - switch(filter) - { - case 0x94949494: - cfapattern = (2 << 24) | (1 << 16) | (1 << 8) | 0; // rggb - break; - case 0x49494949: - cfapattern = (1 << 24) | (0 << 16) | (2 << 8) | 1; // gbrg - break; - case 0x61616161: - cfapattern = (1 << 24) | (2 << 16) | (0 << 8) | 1; // grbg - break; - default: // case 0x16161616: - cfapattern = (0 << 24) | (1 << 16) | (1 << 8) | 2; // bggr - break; - } - - if(filter == 9u) // xtrans - { - b = _imageio_dng_make_tag(EXIF_TAG_SENS_PATTERN, BYTE, 36, data, buf, b, &cnt); /* xtrans PATTERN */ - // apparently this doesn't need byteswap: - memcpy(buf + data, xtrans, sizeof(uint8_t)*36); - data += 36; - } - else // bayer - b = _imageio_dng_make_tag(EXIF_TAG_SENS_PATTERN, BYTE, 4, cfapattern, buf, b, &cnt); /* bayer PATTERN */ - - b = _imageio_dng_make_tag(EXIF_TAG_VERSION, BYTE, 4, 1 | (4 << 8), buf, b, &cnt); - - // WhiteLevel is straight integer even for float DNGs - b = _imageio_dng_make_tag(EXIF_TAG_WHITE_LEVEL, LONG, 1, (uint32_t)roundf(whitelevel), buf, b, &cnt); - - // ColorMatrix1 try to get camera matrix else m[k] like before - if(dt_is_valid_colormatrix(adobe_XYZ_to_CAM[0][0])) - { - den = 10000; - for(int k= 0; k < 3; k++) - for(int i= 0; i < 3; i++) - m[k*3+i] = roundf(adobe_XYZ_to_CAM[k][i] * den); - } - b = _imageio_dng_make_tag(EXIF_TAG_COLOR_MATRIX1, SRATIONAL, 9, data, buf, b, &cnt); /* ColorMatrix1 (XYZ->native cam) */ - for(int k = 0; k < 9; k++) - { - _imageio_dng_write_buf(buf, data 
+ k*8, m[k]); - _imageio_dng_write_buf(buf, data+4 + k*8, den); - } - data += 9 * 8; - - b = _imageio_dng_make_tag(EXIF_TAG_SHOT_NEUTRAL, RATIONAL, 3, data, buf, b, &cnt); - den = 1000000; - for(int k = 0; k < 3; k++) - { - const float coeff = roundf(((float)den * wb_coeffs[1]) / wb_coeffs[k]); - _imageio_dng_write_buf(buf, data + k*8, (int)coeff); - _imageio_dng_write_buf(buf, data+4 + k*8, den); - } - data += 3 * 8; - - b = _imageio_dng_make_tag(EXIF_TAG_ILLUMINANT1, SHORT, 1, DT_LS_D65, buf, b, &cnt); - - // We have all tags using data now written so we can finally use strip offset - _imageio_dng_write_buf(buf, ofst, data); - - /* Termination is implicit: next IFD already 0 when buf initialized */ - - if(buf[8] != cnt) - { - dt_print(DT_DEBUG_ALWAYS, "[dng_write_header] can't write valid header, unexpected number of entries!"); - return; - } - - if(data >= HEADBUFFSIZE) - { - dt_print(DT_DEBUG_ALWAYS, "[dng_write_header] can't write valid header as it exceeds buffer size!"); - return; - } - - // exif is written later, by exiv2: - const int written = fwrite(buf, 1, data, fp); - if(written != data) dt_print(DT_DEBUG_ALWAYS, "[dng_write_header] failed to write image header!"); -} - - -static inline void dt_imageio_write_dng( - const char *filename, const float *const pixel, const int wd, - const int ht, void *exif, const int exif_len, const uint32_t filter, - const uint8_t xtrans[6][6], - const float whitelevel, - const dt_aligned_pixel_t wb_coeffs, - const float adobe_XYZ_to_CAM[4][3]) -{ - FILE *f = g_fopen(filename, "wb"); - if(f) - { - _imageio_dng_write_tiff_header(f, wd, ht, 1.0f / 100.0f, 1.0f / 4.0f, 50.0f, 100.0f, - filter, xtrans, whitelevel, wb_coeffs, adobe_XYZ_to_CAM); - const int k = fwrite(pixel, sizeof(float), (size_t)wd * ht, f); - if(k != wd * ht) dt_print(DT_DEBUG_ALWAYS, "[dng_write] Error writing image data to %s", filename); - fclose(f); - if(exif) dt_exif_write_blob(exif, exif_len, filename, 0); - } -} +#pragma once -#undef BYTE -#undef ASCII 
-#undef SHORT -#undef LONG -#undef RATIONAL -#undef SRATIONAL +#include +#include + +#include "common/dttypes.h" // for dt_aligned_pixel_t + +struct dt_image_t; + +// @brief Write a 32-bit float CFA DNG (Bayer or X-Trans). +// +// Used by HDR merge: pixel data is float pre-normalized to +// [0, whitelevel], so values may exceed any single sensor's white +// point. The writer doesn't emit BlackLevel or ACTIVEAREA — the +// importer assumes black=0 and the buffer is at the dimensions you +// want displayed. +// +// @param filename output path (UTF-8) +// @param pixel float CFA, wd*ht samples, row-major +// @param wd image width in pixels +// @param ht image height in pixels +// @param exif optional Exif blob to embed (NULL = skip) +// @param exif_len size of @p exif in bytes +// @param filter dcraw 2x2 CFA filters word, or 9u for X-Trans +// @param xtrans X-Trans 6x6 pattern (used iff filter == 9u) +// @param whitelevel pre-normalized white level (typically 1.0f for HDR) +// @param wb_coeffs camera-RGB raw-to-white multipliers +// @param adobe_XYZ_to_CAM XYZ->cameraRGB matrix (4x3, only first 3 rows used) +void dt_imageio_dng_write_float(const char *filename, + const float *pixel, + int wd, + int ht, + void *exif, + int exif_len, + uint32_t filter, + const uint8_t xtrans[6][6], + float whitelevel, + const dt_aligned_pixel_t wb_coeffs, + const float adobe_XYZ_to_CAM[4][3]); + +// @brief Write a Bayer CFA mosaic as a DNG file. +// +// The output file contains a single IFD with PhotometricInterpretation=CFA. 
+// All DNG metadata required for darktable re-import is sourced from @p img: +// - BlackLevel[4] from img->raw_black_level_separate +// - WhiteLevel from img->raw_white_point +// - CFAPattern / CFARepeatDim from img->buf_dsc.filters (dcraw format) +// - AsShotNeutral from img->wb_coeffs (inverted) +// - ColorMatrix1 from img->adobe_XYZ_to_CAM +// - Make / Model / UniqueModel from img->camera_maker / camera_model +// +// @param filename output path (UTF-8) +// @param cfa Bayer mosaic (uint16, width * height samples, row-major) +// @param width image width in pixels (CFA samples per row) +// @param height image height in rows +// @param img source image, for DNG metadata +// @param exif_blob optional Exif blob to embed (NULL = skip) +// @param exif_len size of exif_blob in bytes +// @return 0 on success, non-zero on failure (file is removed on failure) +int dt_imageio_dng_write_cfa_bayer(const char *filename, + const uint16_t *cfa, + int width, + int height, + const struct dt_image_t *img, + const void *exif_blob, + int exif_len); + +// @brief Write a demosaicked 3-channel linear DNG. +// +// Used for sensors the bayer DNG round-trip can't handle (X-Trans, +// Foveon-like, pre-demosaicked raws). The output file has +// PhotometricInterpretation=LinearRaw, SamplesPerPixel=3, and carries +// the camera's ColorMatrix1 / AsShotNeutral / BlackLevel / WhiteLevel +// so darktable re-imports it as a raw-origin image and skips its own +// demosaic stage. +// +// Pixel data is interpreted as float-normalized camRGB in [0, ~1+] +// (1.0 = source sensor white point after black subtract). The writer +// scales that to uint16 using black = img->raw_black_level, +// white = img->raw_white_point, so the encoding matches what the +// corresponding raw CFA data would be in ADC units. 
+// +// @param filename output path (UTF-8) +// @param rgb interleaved 3ch float RGB, width*height*3 samples +// @param width image width in pixels +// @param height image height in pixels +// @param img source image, for DNG metadata + encoding range +// @param exif_blob optional Exif blob to embed (NULL = skip) +// @param exif_len size of exif_blob in bytes +// @return 0 on success, non-zero on failure (file removed on failure) +int dt_imageio_dng_write_linear(const char *filename, + const float *rgb, + int width, + int height, + const struct dt_image_t *img, + const void *exif_blob, + int exif_len); // clang-format off // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py diff --git a/src/libs/neural_restore.c b/src/libs/neural_restore.c index 17eb52dedf51..5d342090ed9b 100644 --- a/src/libs/neural_restore.c +++ b/src/libs/neural_restore.c @@ -94,11 +94,11 @@ // - raw denoise (Bayer): pre-process the sensor CFA (black/WB/ // pack), run tiled inference via dt_restore_raw_bayer(), un- // process and re-mosaic, write a CFA DNG via -// dt_dng_write_cfa_bayer(). +// dt_imageio_dng_write_cfa_bayer(). // - raw denoise (X-Trans / linear): demosaic via the darktable // pipe (rawprepare + highlights + demosaic only), run // dt_restore_raw_linear(), write a LinearRaw DNG via -// dt_dng_write_linear(). +// dt_imageio_dng_write_linear(). // - denoise / upscale (RGB): export via the darktable pipeline // with a custom format module that intercepts the pixel // buffer in _ai_write_image(). 
when strength < 100 (so the @@ -195,7 +195,7 @@ #include "common/collection.h" #include "common/variables.h" #include "common/colorspaces.h" -#include "common/dng_writer.h" +#include "imageio/imageio_dng.h" #include "common/exif.h" #include "common/film.h" #include "common/grouping.h" @@ -1046,9 +1046,9 @@ static int _process_raw_denoise_bayer(dt_neural_job_t *j, uint8_t *exif_blob = NULL; const int exif_len = dt_exif_read_blob(&exif_blob, src_path, imgid, FALSE, width, height, TRUE); - res = dt_dng_write_cfa_bayer(out_filename, cfa_out, - width, height, img_meta, - exif_blob, exif_len); + res = dt_imageio_dng_write_cfa_bayer(out_filename, cfa_out, + width, height, img_meta, + exif_blob, exif_len); g_free(exif_blob); g_free(cfa_out); return res; @@ -1075,8 +1075,8 @@ static int _process_raw_denoise_linear(dt_neural_job_t *j, uint8_t *exif_blob = NULL; const int exif_len = dt_exif_read_blob(&exif_blob, src_path, imgid, FALSE, w, h, TRUE); - res = dt_dng_write_linear(out_filename, rgb, w, h, img_meta, - exif_blob, exif_len); + res = dt_imageio_dng_write_linear(out_filename, rgb, w, h, img_meta, + exif_blob, exif_len); g_free(exif_blob); dt_free_align(rgb); return res; From ee21d811e101145c60f89bc38a27c8601c4a7db6 Mon Sep 17 00:00:00 2001 From: Andrii Ryzhkov Date: Mon, 27 Apr 2026 17:50:22 +0200 Subject: [PATCH 7/9] Surface preview init failures in widget placeholder --- src/libs/neural_restore.c | 61 ++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/src/libs/neural_restore.c b/src/libs/neural_restore.c index 5d342090ed9b..844299be0b7d 100644 --- a/src/libs/neural_restore.c +++ b/src/libs/neural_restore.c @@ -249,6 +249,14 @@ typedef enum dt_neural_bpp_t NEURAL_BPP_32 = 2, } dt_neural_bpp_t; +// preview-area placeholder state when no rendered preview exists +typedef enum dt_nr_preview_err_t +{ + DT_NR_PREVIEW_ERR_NONE = 0, + DT_NR_PREVIEW_ERR_UNSUPPORTED, // sensor class not handled by task + 
DT_NR_PREVIEW_ERR_INIT_FAILED, // mipmap / model / cache load bailed +} dt_nr_preview_err_t; + typedef struct dt_lib_neural_restore_t { GtkNotebook *notebook; @@ -280,6 +288,7 @@ typedef struct dt_lib_neural_restore_t gboolean preview_requested; gboolean dragging_split; gboolean preview_generating; + dt_nr_preview_err_t preview_error; gboolean recovery_changing; GThread *preview_thread; gint preview_sequence; @@ -1507,6 +1516,8 @@ static void _update_info_label(dt_lib_neural_restore_t *d) static void _trigger_preview(dt_lib_module_t *self); static void _cancel_preview(dt_lib_module_t *self); +static void _schedule_preview_failed(dt_lib_module_t *self, + dt_nr_preview_err_t err); static void _task_changed(dt_lib_neural_restore_t *d) { @@ -1763,6 +1774,9 @@ static gpointer _preview_thread(gpointer data) dt_neural_preview_data_t *pd = (dt_neural_preview_data_t *)data; dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)pd->self->data; + // bail reason if we hit cleanup; stale-sequence bails are dropped + dt_nr_preview_err_t err = DT_NR_PREVIEW_ERR_INIT_FAILED; + // reuse borrowed export if available (re-pick), otherwise export. // pixels points to either the borrowed buffer (not owned) or // cap.pixels (owned, must be freed on error or passed to result) @@ -2018,8 +2032,13 @@ static gpointer _preview_thread(gpointer data) result->patch_center[0] = pd->patch_center[0]; result->patch_center[1] = pd->patch_center[1]; g_idle_add(_preview_result_idle, result); + g_free(pd); + return NULL; cleanup: + // bail: clear preview_generating on UI thread (stale-sequence bails dropped) + if(pd->sequence == g_atomic_int_get(&d->preview_sequence)) + _schedule_preview_failed(pd->self, err); g_free(pd); return NULL; } @@ -2256,18 +2275,35 @@ static void _schedule_raw_strength_reblend(dt_lib_module_t *self) _strength_blend_timer_cb, self); } -// fired when the raw worker bails before producing a result, so the UI -// doesn't get stuck with preview_generating == TRUE forever. 
-static gboolean _preview_raw_failed_idle(gpointer data) +// fired when a preview worker bails: clears preview_generating and +// records the bail reason so the placeholder shows it +typedef struct { - dt_lib_module_t *self = (dt_lib_module_t *)data; - dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)self->data; + dt_lib_module_t *self; + dt_nr_preview_err_t err; +} _preview_failed_data_t; + +static gboolean _preview_failed_idle(gpointer data) +{ + _preview_failed_data_t *fd = (_preview_failed_data_t *)data; + dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)fd->self->data; d->preview_generating = FALSE; + d->preview_error = fd->err; _update_button_sensitivity(d); gtk_widget_queue_draw(d->preview_area); + g_free(fd); return G_SOURCE_REMOVE; } +static void _schedule_preview_failed(dt_lib_module_t *self, + dt_nr_preview_err_t err) +{ + _preview_failed_data_t *fd = g_new0(_preview_failed_data_t, 1); + fd->self = self; + fd->err = err; + g_idle_add(_preview_failed_idle, fd); +} + static gboolean _preview_raw_result_idle(gpointer data) { dt_neural_preview_result_raw_t *res @@ -2370,6 +2406,8 @@ static gpointer _preview_thread_raw(gpointer data) dt_neural_preview_data_t *pd = (dt_neural_preview_data_t *)data; dt_lib_neural_restore_t *d = (dt_lib_neural_restore_t *)pd->self->data; + // bail reason for cleanup path; unsupported-sensor branch overrides + dt_nr_preview_err_t bail_err = DT_NR_PREVIEW_ERR_INIT_FAILED; // 1. load source image metadata to determine sensor type. // on a fresh session dt_image_cache_get returns img_meta with a @@ -2406,6 +2444,7 @@ static gpointer _preview_thread_raw(gpointer data) "[neural_restore] raw preview: imgid %d is not bayer/xtrans " "(filters=0x%x class=%d)", pd->imgid, filters, cls); + bail_err = DT_NR_PREVIEW_ERR_UNSUPPORTED; goto cleanup; } dt_print(DT_DEBUG_AI, @@ -2747,12 +2786,9 @@ static gpointer _preview_thread_raw(gpointer data) return NULL; cleanup: - // worker bailed before producing a result. 
clear preview_generating - // on the UI thread so the user can re-trigger and the button state - // reflects reality. only schedule when the sequence is still current - // (a stale bail means a newer trigger is already in flight). + // bail: clear preview_generating on UI thread (stale-sequence bails dropped) if(pd->sequence == g_atomic_int_get(&d->preview_sequence)) - g_idle_add(_preview_raw_failed_idle, pd->self); + _schedule_preview_failed(pd->self, bail_err); g_free(pd); return NULL; } @@ -2822,6 +2858,7 @@ static void _trigger_preview(dt_lib_module_t *self) // invalidate current preview and bump sequence so running thread exits early d->preview_ready = FALSE; + d->preview_error = DT_NR_PREVIEW_ERR_NONE; g_atomic_int_inc(&d->preview_sequence); gtk_widget_queue_draw(d->preview_area); @@ -3251,6 +3288,10 @@ static gboolean _preview_draw(GtkWidget *widget, cairo_t *cr, dt_lib_module_t *s ? _("generating preview...") : !d->preview_requested ? _("click to generate preview") + : d->preview_error == DT_NR_PREVIEW_ERR_UNSUPPORTED + ? _("image not supported by this task") + : d->preview_error == DT_NR_PREVIEW_ERR_INIT_FAILED + ? 
_("preview initialization failed") : _("select an image to preview"); cairo_text_extents(cr, text, &ext); cairo_move_to(cr, (w - ext.width) / 2.0, (h + ext.height) / 2.0); From 27b3bdec846c689d759415b60b545f0958eca6fb Mon Sep 17 00:00:00 2001 From: Andrii Ryzhkov Date: Tue, 28 Apr 2026 09:41:18 +0200 Subject: [PATCH 8/9] Fix magenta X-Trans preview from WB mismatch in linear prepare --- src/common/ai/restore_raw_linear.c | 19 +++++++++---------- src/common/ai/restore_raw_linear.h | 10 +++++++--- src/libs/neural_restore.c | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/common/ai/restore_raw_linear.c b/src/common/ai/restore_raw_linear.c index 1d62e7be0a3d..338d832f3810 100644 --- a/src/common/ai/restore_raw_linear.c +++ b/src/common/ai/restore_raw_linear.c @@ -884,7 +884,8 @@ retry:; // WB + camRGB->lin_rec2020 once and returns a 3ch interleaved buffer at // sensor resolution; neural_restore.c caches it across previews of the // same image -int dt_restore_raw_linear_prepare(const dt_imgid_t imgid, +int dt_restore_raw_linear_prepare(const dt_restore_context_t *ctx, + const dt_imgid_t imgid, float **out_rgb, int *out_w, int *out_h) @@ -904,19 +905,17 @@ int dt_restore_raw_linear_prepare(const dt_imgid_t imgid, dt_image_t img_meta = *cached; dt_image_cache_read_release(cached); - // this prepare path has no ctx; use the default (AS_SHOT) WB. the - // cached lin_rec2020 buffer assumes this mode, so a future model - // that needs a different WB would require keying the cache on ctx - // too (or adding ctx to this API) + // WB + matrix derived from ctx so the cached buffer matches what the + // inference + undo paths will assume. without this, a NONE-mode model + // would see a buffer with WB baked in by the prepare default and the + // undo step (which honours ctx) would not strip it back out — magenta + // cast on re-mosaic float wb_norm[3]; - _resolve_linear_wb(NULL, &img_meta, wb_norm); + _resolve_linear_wb(ctx, &img_meta, wb_norm); - // 3. 
camRGB -> input-space matrix. this prepare path has no ctx - // so we use the default (LIN_REC2020); a second model expecting a - // different input space would need the cache keyed on it too float cam_to_input[9]; float input_to_cam[9]; - if(!_build_cam_matrices(NULL, &img_meta, cam_to_input, input_to_cam)) + if(!_build_cam_matrices(ctx, &img_meta, cam_to_input, input_to_cam)) { for(int i = 0; i < 9; i++) cam_to_input[i] = (i % 4 == 0) ? 1.0f : 0.0f; diff --git a/src/common/ai/restore_raw_linear.h b/src/common/ai/restore_raw_linear.h index 183db6d98bb7..ca6dcad1b602 100644 --- a/src/common/ai/restore_raw_linear.h +++ b/src/common/ai/restore_raw_linear.h @@ -88,14 +88,18 @@ int dt_restore_raw_linear(dt_restore_context_t *ctx, // Slow (full-image demosaic via darktable's pipeline). neural_restore // caches the result across multiple preview refreshes of the same image. // +// @param ctx loaded linear context (selects WB mode / colorspace +// to match the model the inference + undo paths will +// use). may be NULL only if the caller knows defaults +// match the downstream consumer. // @param imgid image id // @param out_rgb caller-frees with dt_free_align. 3ch interleaved -// (sensor_w * sensor_h * 3 floats), in lin_rec2020 with -// as-shot WB applied. +// (sensor_w * sensor_h * 3 floats), in lin_rec2020. 
// @param out_w out: sensor width // @param out_h out: sensor height // @return 0 on success -int dt_restore_raw_linear_prepare(const dt_imgid_t imgid, +int dt_restore_raw_linear_prepare(const dt_restore_context_t *ctx, + const dt_imgid_t imgid, float **out_rgb, int *out_w, int *out_h); diff --git a/src/libs/neural_restore.c b/src/libs/neural_restore.c index 844299be0b7d..925793c30620 100644 --- a/src/libs/neural_restore.c +++ b/src/libs/neural_restore.c @@ -2533,7 +2533,7 @@ static gpointer _preview_thread_raw(gpointer data) } else if(is_xtrans) { - if(dt_restore_raw_linear_prepare(pd->imgid, &take_full_lin, + if(dt_restore_raw_linear_prepare(ctx, pd->imgid, &take_full_lin, &full_w, &full_h) != 0 || !take_full_lin) { From 4d5e1c971135767c1885f432396013c1ce0a53b6 Mon Sep 17 00:00:00 2001 From: Andrii Ryzhkov Date: Tue, 28 Apr 2026 10:33:37 +0200 Subject: [PATCH 9/9] Embed source JPEG preview in denoised DNGs --- src/imageio/imageio_dng.c | 271 ++++++++++++++++++++------------------ src/imageio/imageio_dng.h | 19 ++- src/libs/neural_restore.c | 107 ++++++++++++++- 3 files changed, 268 insertions(+), 129 deletions(-) diff --git a/src/imageio/imageio_dng.c b/src/imageio/imageio_dng.c index 255b377f9539..8d1c65addf70 100644 --- a/src/imageio/imageio_dng.c +++ b/src/imageio/imageio_dng.c @@ -49,13 +49,113 @@ static void _cfa_bytes_from_filters(uint32_t filters, uint8_t out[4]) out[3] = FC(1, 1, filters); } +// shared DNG metadata block: written on whichever IFD readers consult +// first for camera/colour information. 
for single-IFD layouts that's +// the raw IFD; for the canonical preview-leading layout (IFD0 = JPEG +// preview, SubIFD0 = raw) it's IFD0 +static void _set_dng_shared_metadata(TIFF *tif, const dt_image_t *img) +{ + TIFFSetField(tif, TIFFTAG_XRESOLUTION, 300.0); + TIFFSetField(tif, TIFFTAG_YRESOLUTION, 300.0); + TIFFSetField(tif, TIFFTAG_RESOLUTIONUNIT, RESUNIT_INCH); + + gchar *software = g_strdup_printf("darktable %s", darktable_package_version); + TIFFSetField(tif, TIFFTAG_SOFTWARE, software); + g_free(software); + + if(img->camera_maker[0]) + TIFFSetField(tif, TIFFTAG_MAKE, img->camera_maker); + if(img->camera_model[0]) + TIFFSetField(tif, TIFFTAG_MODEL, img->camera_model); + if(img->camera_makermodel[0]) + TIFFSetField(tif, TIFFTAG_UNIQUECAMERAMODEL, img->camera_makermodel); + + const uint8_t dng_version[4] = { 1, 4, 0, 0 }; + const uint8_t dng_backward[4] = { 1, 2, 0, 0 }; + TIFFSetField(tif, TIFFTAG_DNGVERSION, dng_version); + TIFFSetField(tif, TIFFTAG_DNGBACKWARDVERSION, dng_backward); + + // AsShotNeutral: inverse of wb_coeffs, normalized so max=1. fallback + // to neutral [1,1,1] when wb_coeffs missing so the tag is always set + float neutral[3] = { 1.0f, 1.0f, 1.0f }; + if(img->wb_coeffs[0] > 0.0f + && img->wb_coeffs[1] > 0.0f + && img->wb_coeffs[2] > 0.0f) + { + for(int i = 0; i < 3; i++) neutral[i] = 1.0f / img->wb_coeffs[i]; + const float m = fmaxf(neutral[0], fmaxf(neutral[1], neutral[2])); + if(m > 0.0f) for(int i = 0; i < 3; i++) neutral[i] /= m; + } + TIFFSetField(tif, TIFFTAG_ASSHOTNEUTRAL, 3, neutral); + + // ColorMatrix1 (XYZ D50 -> cameraRGB, 3x3). 
row-major [camRGB][XYZ] + // matches darktable's adobe_XYZ_to_CAM layout exactly + float non_zero = 0.0f; + for(int k = 0; k < 3; k++) + for(int i = 0; i < 3; i++) + non_zero += fabsf(img->adobe_XYZ_to_CAM[k][i]); + if(non_zero > 0.0f) + { + float color_matrix[9]; + for(int k = 0; k < 3; k++) + for(int i = 0; i < 3; i++) + color_matrix[k * 3 + i] = img->adobe_XYZ_to_CAM[k][i]; + TIFFSetField(tif, TIFFTAG_COLORMATRIX1, 9, color_matrix); + } +} + +// write IFD0 as the canonical Adobe-layout JPEG preview: small YCbCr +// thumbnail + shared DNG metadata + SubIFD pointer to the raw payload +// that the caller will write next. caller must follow with +// TIFFCreateDirectory + raw-IFD population + TIFFWriteDirectory +static int _write_jpeg_preview_ifd(TIFF *tif, + const dt_image_t *img, + const dt_imageio_dng_preview_t *p) +{ + TIFFSetField(tif, TIFFTAG_SUBFILETYPE, FILETYPE_REDUCEDIMAGE); + TIFFSetField(tif, TIFFTAG_IMAGEWIDTH, (uint32_t)p->width); + TIFFSetField(tif, TIFFTAG_IMAGELENGTH, (uint32_t)p->height); + TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, (uint16_t)8); + TIFFSetField(tif, TIFFTAG_SAMPLESPERPIXEL, (uint16_t)3); + TIFFSetField(tif, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG); + TIFFSetField(tif, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_YCBCR); + TIFFSetField(tif, TIFFTAG_COMPRESSION, COMPRESSION_JPEG); + TIFFSetField(tif, TIFFTAG_ORIENTATION, ORIENTATION_TOPLEFT); + TIFFSetField(tif, TIFFTAG_ROWSPERSTRIP, (uint32_t)p->height); + + _set_dng_shared_metadata(tif, img); + + // SubIFD pointer with one slot. libtiff fills the actual offset + // when the SubIFD is later written via TIFFCreateDirectory + ... 
+ toff_t sub_offsets[1] = { 0 }; + TIFFSetField(tif, TIFFTAG_SUBIFD, (uint16_t)1, sub_offsets); + + // pre-encoded JPEG written as a single raw strip (libtiff does not + // re-encode when COMPRESSION_JPEG is paired with TIFFWriteRawStrip) + if(TIFFWriteRawStrip(tif, 0, (void *)p->data, (tmsize_t)p->len) < 0) + { + dt_print(DT_DEBUG_ALWAYS, + "[imageio_dng] TIFFWriteRawStrip failed for JPEG preview " + "(%d bytes, %dx%d)", p->len, p->width, p->height); + return 1; + } + if(!TIFFWriteDirectory(tif)) + { + dt_print(DT_DEBUG_ALWAYS, + "[imageio_dng] TIFFWriteDirectory failed for JPEG preview IFD0"); + return 1; + } + return 0; +} + int dt_imageio_dng_write_cfa_bayer(const char *filename, const uint16_t *cfa, int width, int height, const dt_image_t *img, const void *exif_blob, - int exif_len) + int exif_len, + const dt_imageio_dng_preview_t *preview) { if(!filename || !cfa || !img || width <= 0 || height <= 0) return 1; @@ -69,7 +169,28 @@ int dt_imageio_dng_write_cfa_bayer(const char *filename, #endif if(!tif) return 1; - // required baseline TIFF tags for a single-plane raw image + // canonical Adobe layout when a preview is provided: IFD0 holds the + // JPEG thumbnail + DNG identification metadata, the raw payload + // moves into SubIFD0 + const gboolean canonical = (preview && preview->data && preview->len > 0 + && preview->width > 0 && preview->height > 0); + if(canonical) + { + if(_write_jpeg_preview_ifd(tif, img, preview) != 0) + { + dt_print(DT_DEBUG_ALWAYS, + "[imageio_dng] write_cfa_bayer: preview IFD0 failed, aborting"); + TIFFClose(tif); + g_unlink(filename); + return 1; + } + // libtiff entered INSUBIFD mode when the IFD0 carrying TIFFTAG_SUBIFD + // was written; subsequent TIFFSetField + scanline writes populate + // the SubIFD without an explicit TIFFCreateDirectory call (whose + // return-value convention changed between libtiff versions) + } + + // raw payload IFD: single IFD when no preview, otherwise SubIFD0 TIFFSetField(tif, TIFFTAG_SUBFILETYPE, 0); 
TIFFSetField(tif, TIFFTAG_IMAGEWIDTH, (uint32_t)width); TIFFSetField(tif, TIFFTAG_IMAGELENGTH, (uint32_t)height); @@ -81,29 +202,9 @@ int dt_imageio_dng_write_cfa_bayer(const char *filename, TIFFSetField(tif, TIFFTAG_COMPRESSION, COMPRESSION_NONE); TIFFSetField(tif, TIFFTAG_ORIENTATION, ORIENTATION_TOPLEFT); TIFFSetField(tif, TIFFTAG_ROWSPERSTRIP, TIFFDefaultStripSize(tif, 0)); - TIFFSetField(tif, TIFFTAG_XRESOLUTION, 300.0); - TIFFSetField(tif, TIFFTAG_YRESOLUTION, 300.0); - TIFFSetField(tif, TIFFTAG_RESOLUTIONUNIT, RESUNIT_INCH); - { - gchar *software = g_strdup_printf("darktable %s", - darktable_package_version); - TIFFSetField(tif, TIFFTAG_SOFTWARE, software); - g_free(software); - } - - // camera identification - if(img->camera_maker[0]) - TIFFSetField(tif, TIFFTAG_MAKE, img->camera_maker); - if(img->camera_model[0]) - TIFFSetField(tif, TIFFTAG_MODEL, img->camera_model); - if(img->camera_makermodel[0]) - TIFFSetField(tif, TIFFTAG_UNIQUECAMERAMODEL, img->camera_makermodel); - - // DNG identification - const uint8_t dng_version[4] = { 1, 4, 0, 0 }; - const uint8_t dng_backward[4] = { 1, 2, 0, 0 }; - TIFFSetField(tif, TIFFTAG_DNGVERSION, dng_version); - TIFFSetField(tif, TIFFTAG_DNGBACKWARDVERSION, dng_backward); + // shared metadata only on single-IFD layout — canonical has it on IFD0 + if(!canonical) + _set_dng_shared_metadata(tif, img); // CFA description const uint16_t cfa_repeat_dim[2] = { 2, 2 }; @@ -142,45 +243,6 @@ int dt_imageio_dng_write_cfa_bayer(const char *filename, ? img->raw_white_point : 65535u; TIFFSetField(tif, TIFFTAG_WHITELEVEL, 1, &white); - // AsShotNeutral (derived from wb_coeffs) - // DNG AsShotNeutral encodes the neutral white balance as a - // cameraRGB triple where smaller values mean more amplification. 
- // darktable's wb_coeffs are raw-to-white multipliers; AsShotNeutral - // is their inverse, normalized so the maximum element is 1 - if(img->wb_coeffs[0] > 0.0f - && img->wb_coeffs[1] > 0.0f - && img->wb_coeffs[2] > 0.0f) - { - float inv[3]; - for(int i = 0; i < 3; i++) - inv[i] = 1.0f / img->wb_coeffs[i]; - const float m = fmaxf(inv[0], fmaxf(inv[1], inv[2])); - if(m > 0.0f) - for(int i = 0; i < 3; i++) inv[i] /= m; - TIFFSetField(tif, TIFFTAG_ASSHOTNEUTRAL, 3, inv); - } - - // ColorMatrix1 (XYZ D50 -> cameraRGB, 3x3 for trichromatic) - // darktable's adobe_XYZ_to_CAM is populated from the rawspeed - // cameras.xml matrix in row-major [camRGB][XYZ] layout, which - // matches the DNG ColorMatrix1 layout exactly (row = camera axis, - // column = XYZ axis) - { - float non_zero = 0.0f; - for(int k = 0; k < 3; k++) - for(int i = 0; i < 3; i++) - non_zero += fabsf(img->adobe_XYZ_to_CAM[k][i]); - - if(non_zero > 0.0f) - { - float color_matrix[9]; - for(int k = 0; k < 3; k++) - for(int i = 0; i < 3; i++) - color_matrix[k * 3 + i] = img->adobe_XYZ_to_CAM[k][i]; - TIFFSetField(tif, TIFFTAG_COLORMATRIX1, 9, color_matrix); - } - } - // advertise the visible region inside the full raw buffer; without // these tags the importer renders the optical-black margins too const int crop_x = (img->crop_x > 0) ? 
img->crop_x : 0; @@ -231,7 +293,8 @@ int dt_imageio_dng_write_linear(const char *filename, int height, const dt_image_t *img, const void *exif_blob, - int exif_len) + int exif_len, + const dt_imageio_dng_preview_t *preview) { if(!filename || !rgb || !img || width <= 0 || height <= 0) return 1; @@ -245,6 +308,22 @@ int dt_imageio_dng_write_linear(const char *filename, #endif if(!tif) return 1; + // canonical layout when a preview is provided (see write_cfa_bayer) + const gboolean canonical = (preview && preview->data && preview->len > 0 + && preview->width > 0 && preview->height > 0); + if(canonical) + { + if(_write_jpeg_preview_ifd(tif, img, preview) != 0) + { + dt_print(DT_DEBUG_ALWAYS, + "[imageio_dng] write_linear: preview IFD0 failed, aborting"); + TIFFClose(tif); + g_unlink(filename); + return 1; + } + // libtiff is in INSUBIFD mode after IFD0 was written with TIFFTAG_SUBIFD + } + // baseline TIFF tags, 3 samples per pixel (demosaicked) TIFFSetField(tif, TIFFTAG_SUBFILETYPE, 0); TIFFSetField(tif, TIFFTAG_IMAGEWIDTH, (uint32_t)width); @@ -257,29 +336,8 @@ int dt_imageio_dng_write_linear(const char *filename, TIFFSetField(tif, TIFFTAG_COMPRESSION, COMPRESSION_NONE); TIFFSetField(tif, TIFFTAG_ORIENTATION, ORIENTATION_TOPLEFT); TIFFSetField(tif, TIFFTAG_ROWSPERSTRIP, TIFFDefaultStripSize(tif, 0)); - TIFFSetField(tif, TIFFTAG_XRESOLUTION, 300.0); - TIFFSetField(tif, TIFFTAG_YRESOLUTION, 300.0); - TIFFSetField(tif, TIFFTAG_RESOLUTIONUNIT, RESUNIT_INCH); - { - gchar *software = g_strdup_printf("darktable %s", - darktable_package_version); - TIFFSetField(tif, TIFFTAG_SOFTWARE, software); - g_free(software); - } - - // camera identification - if(img->camera_maker[0]) - TIFFSetField(tif, TIFFTAG_MAKE, img->camera_maker); - if(img->camera_model[0]) - TIFFSetField(tif, TIFFTAG_MODEL, img->camera_model); - if(img->camera_makermodel[0]) - TIFFSetField(tif, TIFFTAG_UNIQUECAMERAMODEL, img->camera_makermodel); - - // DNG identification - const uint8_t dng_version[4] = { 1, 4, 
0, 0 }; - const uint8_t dng_backward[4] = { 1, 2, 0, 0 }; - TIFFSetField(tif, TIFFTAG_DNGVERSION, dng_version); - TIFFSetField(tif, TIFFTAG_DNGBACKWARDVERSION, dng_backward); + if(!canonical) + _set_dng_shared_metadata(tif, img); // NO CFA tags: this is demosaicked data. // encode as normalized: BlackLevel=0, WhiteLevel=65535. the @@ -293,43 +351,6 @@ int dt_imageio_dng_write_linear(const char *filename, TIFFSetField(tif, TIFFTAG_BLACKLEVEL, 3, black3); TIFFSetField(tif, TIFFTAG_WHITELEVEL, 1, &white_norm); - // AsShotNeutral = inverse of WB multipliers, normalized so max=1. - // on re-import, darktable reads this and derives WB coeffs via - // wb[c] = 1/AsShotNeutral[c] / wb[G-normalized]. the temperature - // iop then applies this WB to our un-WB'd data, giving the standard - // raw-pipeline result - if(img->wb_coeffs[0] > 0.0f - && img->wb_coeffs[1] > 0.0f - && img->wb_coeffs[2] > 0.0f) - { - float inv[3]; - for(int i = 0; i < 3; i++) inv[i] = 1.0f / img->wb_coeffs[i]; - const float m = fmaxf(inv[0], fmaxf(inv[1], inv[2])); - if(m > 0.0f) for(int i = 0; i < 3; i++) inv[i] /= m; - TIFFSetField(tif, TIFFTAG_ASSHOTNEUTRAL, 3, inv); - } - else - { - const float neutral[3] = { 1.0f, 1.0f, 1.0f }; - TIFFSetField(tif, TIFFTAG_ASSHOTNEUTRAL, 3, neutral); - } - - // ColorMatrix1 from camera's XYZ->CAM (3x3 portion) - { - float non_zero = 0.0f; - for(int k = 0; k < 3; k++) - for(int i = 0; i < 3; i++) - non_zero += fabsf(img->adobe_XYZ_to_CAM[k][i]); - if(non_zero > 0.0f) - { - float color_matrix[9]; - for(int k = 0; k < 3; k++) - for(int i = 0; i < 3; i++) - color_matrix[k * 3 + i] = img->adobe_XYZ_to_CAM[k][i]; - TIFFSetField(tif, TIFFTAG_COLORMATRIX1, 9, color_matrix); - } - } - // linear DNG: buffer is already at visible dims (post-demosaic); // ACTIVEAREA covers the full buffer, no margin to crop const uint32_t active_area[4] = { diff --git a/src/imageio/imageio_dng.h b/src/imageio/imageio_dng.h index e72ca801ede8..1ea2ef3e97cf 100644 --- a/src/imageio/imageio_dng.h 
+++ b/src/imageio/imageio_dng.h @@ -35,6 +35,19 @@ struct dt_image_t; +// optional embedded JPEG preview for the uint16 DNG writers. when +// non-NULL, the writer uses the canonical Adobe layout (IFD0 = JPEG +// preview, SubIFD0 = raw payload) so library browsers (Finder, +// Photomator, Lightroom) can render thumbnails without decoding the +// raw. when NULL, falls back to the historical single-IFD layout +typedef struct dt_imageio_dng_preview_t +{ + const uint8_t *data; // pre-encoded JPEG bytes, 8-bit YCbCr + int len; // length of @p data in bytes + int width; // declared image width + int height; // declared image height +} dt_imageio_dng_preview_t; + // @brief Write a 32-bit float CFA DNG (Bayer or X-Trans). // // Used by HDR merge: pixel data is float pre-normalized to @@ -91,7 +104,8 @@ int dt_imageio_dng_write_cfa_bayer(const char *filename, int height, const struct dt_image_t *img, const void *exif_blob, - int exif_len); + int exif_len, + const dt_imageio_dng_preview_t *preview); // @brief Write a demosaicked 3-channel linear DNG. // @@ -122,7 +136,8 @@ int dt_imageio_dng_write_linear(const char *filename, int height, const struct dt_image_t *img, const void *exif_blob, - int exif_len); + int exif_len, + const dt_imageio_dng_preview_t *preview); // clang-format off // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.py diff --git a/src/libs/neural_restore.c b/src/libs/neural_restore.c index 925793c30620..7447f7896394 100644 --- a/src/libs/neural_restore.c +++ b/src/libs/neural_restore.c @@ -196,6 +196,7 @@ #include "common/variables.h" #include "common/colorspaces.h" #include "imageio/imageio_dng.h" +#include "imageio/imageio_jpeg.h" #include "common/exif.h" #include "common/film.h" #include "common/grouping.h" @@ -501,6 +502,62 @@ static inline float _linear_to_srgb(float v) return (v <= 0.0031308f) ? 
12.92f * v : 1.055f * powf(v, 1.0f / 2.4f) - 0.055f; } +// pull the camera's embedded JPEG preview from the source raw, to embed +// as the DNG thumbnail; output buffer is g_malloc'd (caller frees) +static int _extract_source_jpeg_preview(const char *src_path, + uint8_t **out_jpeg, + int *out_jpeg_len, + int *out_w, + int *out_h) +{ + *out_jpeg = NULL; + *out_jpeg_len = 0; + *out_w = 0; + *out_h = 0; + + uint8_t *raw_buf = NULL; + size_t raw_size = 0; + char *mime = NULL; + // dt_exif_get_thumbnail returns FALSE on success, allocates via malloc/strdup + if(dt_exif_get_thumbnail(src_path, &raw_buf, &raw_size, &mime) || !raw_buf) + { + free(raw_buf); + free(mime); + return 1; + } + const gboolean is_jpeg = mime && (strcmp(mime, "image/jpeg") == 0); + free(mime); + if(!is_jpeg || raw_size == 0 || raw_size > (size_t)INT_MAX) + { + free(raw_buf); + return 1; + } + + dt_imageio_jpeg_t jpg; + if(dt_imageio_jpeg_decompress_header(raw_buf, raw_size, &jpg) != 0 + || jpg.width <= 0 || jpg.height <= 0) + { + free(raw_buf); + return 1; + } + + // re-allocate via glib so caller can g_free uniformly + uint8_t *jpeg = g_try_malloc(raw_size); + if(!jpeg) + { + free(raw_buf); + return 1; + } + memcpy(jpeg, raw_buf, raw_size); + free(raw_buf); + + *out_jpeg = jpeg; + *out_jpeg_len = (int)raw_size; + *out_w = jpg.width; + *out_h = jpg.height; + return 0; +} + // convert float RGB (3ch interleaved, linear) to cairo RGB24 surface data static void _float_rgb_to_cairo(const float *const restrict src, unsigned char *const restrict dst, @@ -1055,9 +1112,32 @@ static int _process_raw_denoise_bayer(dt_neural_job_t *j, uint8_t *exif_blob = NULL; const int exif_len = dt_exif_read_blob(&exif_blob, src_path, imgid, FALSE, width, height, TRUE); + uint8_t *jpeg_buf = NULL; + int jpeg_len = 0, jpeg_w = 0, jpeg_h = 0; + dt_imageio_dng_preview_t preview = {0}; + const int prev_rc = _extract_source_jpeg_preview(src_path, &jpeg_buf, + &jpeg_len, &jpeg_w, &jpeg_h); + if(prev_rc == 0) + { + preview.data = 
jpeg_buf; + preview.len = jpeg_len; + preview.width = jpeg_w; + preview.height = jpeg_h; + dt_print(DT_DEBUG_AI, + "[neural_restore] embedded JPEG preview from source %dx%d (%d bytes)", + jpeg_w, jpeg_h, jpeg_len); + } + else + { + dt_print(DT_DEBUG_AI, + "[neural_restore] no embedded preview in source (rc=%d) — " + "writing DNG without thumbnail", prev_rc); + } res = dt_imageio_dng_write_cfa_bayer(out_filename, cfa_out, width, height, img_meta, - exif_blob, exif_len); + exif_blob, exif_len, + jpeg_buf ? &preview : NULL); + g_free(jpeg_buf); g_free(exif_blob); g_free(cfa_out); return res; @@ -1084,8 +1164,31 @@ static int _process_raw_denoise_linear(dt_neural_job_t *j, uint8_t *exif_blob = NULL; const int exif_len = dt_exif_read_blob(&exif_blob, src_path, imgid, FALSE, w, h, TRUE); + uint8_t *jpeg_buf = NULL; + int jpeg_len = 0, jpeg_w = 0, jpeg_h = 0; + dt_imageio_dng_preview_t preview = {0}; + const int prev_rc = _extract_source_jpeg_preview(src_path, &jpeg_buf, + &jpeg_len, &jpeg_w, &jpeg_h); + if(prev_rc == 0) + { + preview.data = jpeg_buf; + preview.len = jpeg_len; + preview.width = jpeg_w; + preview.height = jpeg_h; + dt_print(DT_DEBUG_AI, + "[neural_restore] embedded JPEG preview from source %dx%d (%d bytes)", + jpeg_w, jpeg_h, jpeg_len); + } + else + { + dt_print(DT_DEBUG_AI, + "[neural_restore] no embedded preview in source (rc=%d) — " + "writing DNG without thumbnail", prev_rc); + } res = dt_imageio_dng_write_linear(out_filename, rgb, w, h, img_meta, - exif_blob, exif_len); + exif_blob, exif_len, + jpeg_buf ? &preview : NULL); + g_free(jpeg_buf); g_free(exif_blob); dt_free_align(rgb); return res;