From 47744e375f79d1f313ca0b4622add284cd605031 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 13:05:50 +0000
Subject: [PATCH 01/23] feat(nodes): add HW video codec backends (Vulkan Video
 H.264, VA-API AV1, NVENC/NVDEC AV1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement hardware-accelerated video encoding and decoding for StreamKit,
targeting Linux with Intel and NVIDIA GPUs (issue #217).

Three backends behind optional feature flags:

  vulkan_video — H.264 encode/decode via Vulkan Video (vk-video v0.3).
    Cross-vendor (Intel ANV, NVIDIA, AMD RADV). Includes lazy encoder
    creation on first frame for resolution detection, NV12/I420 input
    support, and configurable bitrate/framerate/keyframe interval.

  vaapi — AV1 encode/decode via VA-API (cros-codecs v0.0.6).
    Primarily Intel (intel-media-driver), also AMD. Uses GBM surfaces
    for zero-copy VA-API buffer management. Includes stride-aware
    NV12 plane read/write helpers with odd-width correctness.

  nvcodec — AV1 encode/decode via NVENC/NVDEC (shiguredo_nvcodec v2025.2).
    NVIDIA only (RTX 30xx+ decode, RTX 40xx+ AV1 encode). Dynamic CUDA
    loading — no build-time CUDA Toolkit required for the host binary.

All backends share:
- HwAccelMode enum (auto/force_hw/force_cpu) for graceful fallback
- ProcessorNode trait integration with health reporting
- Consistent config structs with serde deny_unknown_fields validation
- Comprehensive unit tests (mock-based, no GPU required)

Closes #217

Signed-off-by: Devin AI <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 Cargo.lock                             |  552 +++++++-
 crates/nodes/Cargo.toml                |   14 +
 crates/nodes/src/video/mod.rs          |   43 +-
 crates/nodes/src/video/nv_av1.rs       | 1184 ++++++++++++++++
 crates/nodes/src/video/vaapi_av1.rs    | 1807 ++++++++++++++++++++++++
 crates/nodes/src/video/vulkan_video.rs | 1461 +++++++++++++++++++
 justfile                               |    3 +-
 7 files changed, 4998 insertions(+), 66 deletions(-)
 create mode 100644 crates/nodes/src/video/nv_av1.rs
 create mode 100644 crates/nodes/src/video/vaapi_av1.rs
 create mode 100644 crates/nodes/src/video/vulkan_video.rs

diff --git a/Cargo.lock b/Cargo.lock
index 800ab5c8..5390b6dc 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -174,7 +174,7 @@ checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -237,7 +237,7 @@ checksum = "3109e49b1e4909e9db6515a30c633684d68cdeaa252f215214cb4fa1a5bfee2c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
  "synstructure",
 ]
 
@@ -249,7 +249,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -288,7 +288,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -299,7 +299,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -334,7 +334,7 @@ checksum = "49c98dba06b920588de7d63f6acc23f1e6a9fade5fd6198e564506334fb5a4f5"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -564,6 +564,46 @@ version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
 
+[[package]]
+name = "bindgen"
+version = "0.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
+dependencies = [
+ "bitflags 2.11.0",
+ "cexpr",
+ "clang-sys",
+ "itertools 0.12.1",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash 1.1.0",
+ "shlex",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "bindgen"
+version = "0.72.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
+dependencies = [
+ "bitflags 2.11.0",
+ "cexpr",
+ "clang-sys",
+ "itertools 0.12.1",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash 2.1.1",
+ "shlex",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "bit-set"
 version = "0.9.1"
@@ -600,6 +640,12 @@ dependencies = [
  "typenum",
 ]
 
+[[package]]
+name = "bitstream-io"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6099cdc01846bc367c4e7dd630dc5966dccf36b652fae7a74e17b640411a91b2"
+
 [[package]]
 name = "bitstream-io"
 version = "4.9.0"
@@ -677,7 +723,7 @@ checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -809,12 +855,27 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
 
+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom 7.1.3",
+]
+
 [[package]]
 name = "cfg-if"
 version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
 
+[[package]]
+name = "cfg_aliases"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
+
 [[package]]
 name = "cfg_aliases"
 version = "0.2.1"
@@ -873,6 +934,17 @@ dependencies = [
  "half",
 ]
 
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading 0.8.9",
+]
+
 [[package]]
 name = "clap"
 version = "4.6.0"
@@ -904,7 +976,7 @@ dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -946,6 +1018,8 @@ version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681"
 dependencies = [
+ "serde",
+ "termcolor",
  "unicode-width",
 ]
 
@@ -1322,6 +1396,40 @@ dependencies = [
  "itertools 0.10.5",
 ]
 
+[[package]]
+name = "cros-codecs"
+version = "0.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "80f7441b4f31c17b6b6b7f57f6c202944aad11d0ab23739a9ff88d8d34dec621"
+dependencies = [
+ "anyhow",
+ "byteorder",
+ "crc32fast",
+ "cros-libva",
+ "drm",
+ "drm-fourcc",
+ "gbm",
+ "gbm-sys",
+ "log",
+ "nix 0.28.0",
+ "thiserror 1.0.69",
+ "zerocopy 0.8.47",
+]
+
+[[package]]
+name = "cros-libva"
+version = "0.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "902c9726e953b678595456bd38f95f31aaf1947c56dd9f4a2290f3f1eca4d228"
+dependencies = [
+ "bindgen 0.70.1",
+ "bitflags 2.11.0",
+ "log",
+ "pkg-config",
+ "regex",
+ "thiserror 1.0.69",
+]
+
 [[package]]
 name = "crossbeam-channel"
 version = "0.5.15"
@@ -1392,7 +1500,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "strsim",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1403,7 +1511,7 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d"
 dependencies = [
  "darling_core",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1451,6 +1559,17 @@ dependencies = [
  "serde_core",
 ]
 
+[[package]]
+name = "derivative"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
 [[package]]
 name = "derive_more"
 version = "2.1.1"
@@ -1470,7 +1589,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rustc_version",
- "syn",
+ "syn 2.0.117",
  "unicode-xid",
 ]
 
@@ -1541,7 +1660,16 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "dlib"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab8ecd87370524b461f8557c119c405552c396ed91fc0a8eec68679eab26f94a"
+dependencies = [
+ "libloading 0.8.9",
 ]
 
 [[package]]
@@ -1562,6 +1690,45 @@ dependencies = [
  "litrs",
 ]
 
+[[package]]
+name = "drm"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "98888c4bbd601524c11a7ed63f814b8825f420514f78e96f752c437ae9cbb5d1"
+dependencies = [
+ "bitflags 2.11.0",
+ "bytemuck",
+ "drm-ffi",
+ "drm-fourcc",
+ "rustix 0.38.44",
+]
+
+[[package]]
+name = "drm-ffi"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97c98727e48b7ccb4f4aea8cfe881e5b07f702d17b7875991881b41af7278d53"
+dependencies = [
+ "drm-sys",
+ "rustix 0.38.44",
+]
+
+[[package]]
+name = "drm-fourcc"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0aafbcdb8afc29c1a7ee5fbe53b5d62f4565b35a042a662ca9fecd0b54dae6f4"
+
+[[package]]
+name = "drm-sys"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd39dde40b6e196c2e8763f23d119ddb1a8714534bf7d77fa97a65b0feda3986"
+dependencies = [
+ "libc",
+ "linux-raw-sys 0.6.5",
+]
+
 [[package]]
 name = "dunce"
 version = "1.0.5"
@@ -1642,7 +1809,7 @@ checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1863,6 +2030,12 @@ dependencies = [
  "percent-encoding",
 ]
 
+[[package]]
+name = "four-cc"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "795cbfc56d419a7ce47ccbb7504dd9a5b7c484c083c356e797de08bd988d9629"
+
 [[package]]
 name = "fs-err"
 version = "3.3.0"
@@ -1946,7 +2119,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1992,6 +2165,28 @@ dependencies = [
  "serde_json",
 ]
 
+[[package]]
+name = "gbm"
+version = "0.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "45bf55ba6dd53ad0ac115046ff999c5324c283444ee6e0be82454c4e8eb2f36a"
+dependencies = [
+ "bitflags 2.11.0",
+ "drm",
+ "drm-fourcc",
+ "gbm-sys",
+ "libc",
+]
+
+[[package]]
+name = "gbm-sys"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a9cc2f64de9fa707b5c6b2d2f10d7a7e49e845018a9f5685891eb40d3bab2538"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "generic-array"
 version = "0.14.7"
@@ -2064,6 +2259,17 @@ dependencies = [
  "stable_deref_trait",
 ]
 
+[[package]]
+name = "gl_generator"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a95dfc23a2b4a9a2f5ab41d194f8bfda3cabec42af4e39f08c339eb2a0c124d"
+dependencies = [
+ "khronos_api",
+ "log",
+ "xml-rs",
+]
+
 [[package]]
 name = "glob"
 version = "0.3.3"
@@ -2082,6 +2288,27 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "glow"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29038e1c483364cc6bb3cf78feee1816002e127c331a1eec55a4d202b9e1adb5"
+dependencies = [
+ "js-sys",
+ "slotmap",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "glutin_wgl_sys"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c4ee00b289aba7a9e5306d57c2d05499b2e5dc427f84ac708bd2c090212cf3e"
+dependencies = [
+ "gl_generator",
+]
+
 [[package]]
 name = "gpu-allocator"
 version = "0.28.0"
@@ -2135,6 +2362,19 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "h264-reader"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "036a78b2620d92f0ec57690bc792b3bb87348632ee5225302ba2e66a48021c6c"
+dependencies = [
+ "bitstream-io 2.6.0",
+ "hex-slice",
+ "log",
+ "memchr",
+ "rfc6381-codec",
+]
+
 [[package]]
 name = "half"
 version = "2.7.1"
@@ -2236,6 +2476,12 @@ version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
 
+[[package]]
+name = "hex-slice"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5491a308e0214554f07a81d8944abe45f552871c12e3c3c6e7e5d354039a6c4c"
+
 [[package]]
 name = "hexf-parse"
 version = "0.2.1"
@@ -2645,7 +2891,7 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -2790,7 +3036,7 @@ checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -2865,6 +3111,23 @@ dependencies = [
  "signature",
 ]
 
+[[package]]
+name = "khronos-egl"
+version = "6.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6aae1df220ece3c0ada96b8153459b67eebe9ae9212258bb0134ae60416fdf76"
+dependencies = [
+ "libc",
+ "libloading 0.8.9",
+ "pkg-config",
+]
+
+[[package]]
+name = "khronos_api"
+version = "3.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2db585e1d738fc771bf08a151420d3ed193d9d895a36df7f6f8a9456b911ddc"
+
 [[package]]
 name = "kurbo"
 version = "0.13.0"
@@ -2960,6 +3223,12 @@ version = "0.4.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
 
+[[package]]
+name = "linux-raw-sys"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a385b1be4e5c3e362ad2ffa73c392e53f031eaa5b7d648e64cd87f27f6063d7"
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.12.1"
@@ -3197,6 +3466,21 @@ dependencies = [
  "pxfm",
 ]
 
+[[package]]
+name = "mp4ra-rust"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdbc3d3867085d66ac6270482e66f3dd2c5a18451a3dc9ad7269e94844a536b7"
+dependencies = [
+ "four-cc",
+]
+
+[[package]]
+name = "mpeg4-audio-const"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96a1fe2275b68991faded2c80aa4a33dba398b77d276038b8f50701a22e55918"
+
 [[package]]
 name = "multer"
 version = "3.1.0"
@@ -3230,7 +3514,7 @@ dependencies = [
  "bit-set",
  "bitflags 2.11.0",
  "cfg-if",
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
  "codespan-reporting",
  "half",
  "hashbrown 0.16.1",
@@ -3273,6 +3557,15 @@ dependencies = [
  "tempfile",
 ]
 
+[[package]]
+name = "ndk-sys"
+version = "0.6.0+11769913"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee6cda3051665f1fb8d9e08fc35c96d5a244fb1be711a03b71118828afc9a873"
+dependencies = [
+ "jni-sys",
+]
+
 [[package]]
 name = "new_debug_unreachable"
 version = "1.0.6"
@@ -3299,6 +3592,18 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "nix"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4"
+dependencies = [
+ "bitflags 2.11.0",
+ "cfg-if",
+ "cfg_aliases 0.1.1",
+ "libc",
+]
+
 [[package]]
 name = "nix"
 version = "0.30.1"
@@ -3307,7 +3612,7 @@ checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6"
 dependencies = [
  "bitflags 2.11.0",
  "cfg-if",
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
  "libc",
 ]
 
@@ -3407,7 +3712,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3480,7 +3785,7 @@ dependencies = [
  "proc-macro-crate",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3682,7 +3987,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3864,7 +4169,7 @@ dependencies = [
  "proc-macro2",
  "proc-macro2-diagnostics",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3906,7 +4211,7 @@ checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4056,7 +4361,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
 dependencies = [
  "proc-macro2",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4094,7 +4399,7 @@ checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
  "version_check",
  "yansi",
 ]
@@ -4115,7 +4420,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "52717f9a02b6965224f95ca2a81e2e0c5c43baacd28ca057577988930b6c3d5b"
 dependencies = [
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4155,7 +4460,7 @@ dependencies = [
  "prost 0.12.6",
  "prost-types 0.12.6",
  "regex",
- "syn",
+ "syn 2.0.117",
  "tempfile",
 ]
 
@@ -4169,7 +4474,7 @@ dependencies = [
  "itertools 0.12.1",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4182,7 +4487,7 @@ dependencies = [
  "itertools 0.14.0",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4223,7 +4528,7 @@ checksum = "56000349b6896e3d44286eb9c330891237f40b27fd43c1ccc84547d0b463cb40"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4292,7 +4597,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
 dependencies = [
  "bytes",
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
  "pin-project-lite",
  "quinn-proto",
  "quinn-udp",
@@ -4335,7 +4640,7 @@ version = "0.5.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
 dependencies = [
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
  "libc",
  "once_cell",
  "socket2",
@@ -4498,7 +4803,7 @@ dependencies = [
  "arrayvec",
  "av-scenechange",
  "av1-grain",
- "bitstream-io",
+ "bitstream-io 4.9.0",
  "built",
  "cc",
  "cfg-if",
@@ -4637,7 +4942,7 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4821,6 +5126,16 @@ dependencies = [
  "usvg",
 ]
 
+[[package]]
+name = "rfc6381-codec"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed54c20f5c3ec82eab6d998b313dc75ec5d5650d4f57675e61d72489040297fd"
+dependencies = [
+ "mp4ra-rust",
+ "mpeg4-audio-const",
+]
+
 [[package]]
 name = "rgb"
 version = "0.8.53"
@@ -4919,7 +5234,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rust-embed-utils",
- "syn",
+ "syn 2.0.117",
  "walkdir",
 ]
 
@@ -5221,7 +5536,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "serde_derive_internals",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -5310,7 +5625,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -5321,7 +5636,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -5406,7 +5721,7 @@ dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -5470,6 +5785,17 @@ version = "2026.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b135058874815f8f13edae644ceedb659f7238fe4a9e2b1bdceecc72dc659b35"
 
+[[package]]
+name = "shiguredo_nvcodec"
+version = "2025.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7abdb7e695a3fe6f37ea08a6366c6848ea1d4491dafbf793fe5d2691928087c8"
+dependencies = [
+ "bindgen 0.72.1",
+ "libloading 0.8.9",
+ "toml 0.9.12+spec-1.1.0",
+]
+
 [[package]]
 name = "shlex"
 version = "1.3.0"
@@ -5541,6 +5867,15 @@ version = "0.4.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
 
+[[package]]
+name = "slotmap"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bdd58c3c93c3d278ca835519292445cb4b0d4dc59ccfdf7ceadaab3f8aeb4038"
+dependencies = [
+ "version_check",
+]
+
 [[package]]
 name = "smallvec"
 version = "1.15.1"
@@ -5692,6 +6027,7 @@ dependencies = [
  "bytes",
  "cc",
  "cmake",
+ "cros-codecs",
  "env-libvpx-sys",
  "fontdue",
  "futures",
@@ -5721,6 +6057,7 @@ dependencies = [
  "serde-saphyr",
  "serde_json",
  "shiguredo_mp4",
+ "shiguredo_nvcodec",
  "smallvec",
  "streamkit-core",
  "symphonia",
@@ -5734,6 +6071,7 @@ dependencies = [
  "ts-rs",
  "url",
  "uuid",
+ "vk-video",
  "webm",
  "wgpu",
  "wildmatch",
@@ -5910,7 +6248,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rustversion",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -6061,6 +6399,17 @@ dependencies = [
  "symphonia-metadata",
 ]
 
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
 [[package]]
 name = "syn"
 version = "2.0.117"
@@ -6089,7 +6438,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -6208,7 +6557,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -6219,7 +6568,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -6401,7 +6750,7 @@ checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -6751,7 +7100,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -6857,7 +7206,7 @@ checksum = "38d90eea51bc7988ef9e674bf80a85ba6804739e535e9cab48e4bb34a8b652aa"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
  "termcolor",
 ]
 
@@ -7074,7 +7423,38 @@ checksum = "d674d135b4a8c1d7e813e2f8d1c9a58308aee4a680323066025e53132218bd91"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "vk-mem"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0cb12b79bcec57a3334d0284f1364c1846f378bb47df9779c6dbfcfc245c9404"
+dependencies = [
+ "ash",
+ "bitflags 2.11.0",
+ "cc",
+]
+
+[[package]]
+name = "vk-video"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6accac84fee2e209165c93dfc9e44ae37391b4e0b812aba92660bfc0ca77c440"
+dependencies = [
+ "ash",
+ "bytemuck",
+ "bytes",
+ "cfg_aliases 0.2.1",
+ "derivative",
+ "h264-reader",
+ "memchr",
+ "rustc-hash 2.1.1",
+ "thiserror 1.0.69",
+ "tracing",
+ "vk-mem",
+ "wgpu",
 ]
 
 [[package]]
@@ -7166,7 +7546,7 @@ dependencies = [
  "bumpalo",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
  "wasm-bindgen-shared",
 ]
 
@@ -7428,7 +7808,7 @@ dependencies = [
  "anyhow",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
  "wasmtime-internal-component-util",
  "wasmtime-internal-wit-bindgen",
  "wit-parser 0.243.0",
@@ -7542,7 +7922,7 @@ checksum = "70f8b9796a3f0451a7b702508b303d654de640271ac80287176de222f187a237"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -7650,6 +8030,18 @@ dependencies = [
  "wast 245.0.1",
 ]
 
+[[package]]
+name = "wayland-sys"
+version = "0.31.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d8eab23fefc9e41f8e841df4a9c707e8a8c4ed26e944ef69297184de2785e3be"
+dependencies = [
+ "dlib",
+ "log",
+ "once_cell",
+ "pkg-config",
+]
+
 [[package]]
 name = "web-async"
 version = "0.1.3"
@@ -7767,15 +8159,21 @@ dependencies = [
  "bitflags 2.11.0",
  "bytemuck",
  "cfg-if",
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
  "document-features",
  "hashbrown 0.16.1",
+ "js-sys",
  "log",
+ "naga",
+ "parking_lot",
  "portable-atomic",
  "profiling",
  "raw-window-handle",
  "smallvec",
  "static_assertions",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
  "wgpu-core",
  "wgpu-hal",
  "wgpu-types",
@@ -7792,7 +8190,7 @@ dependencies = [
  "bit-vec",
  "bitflags 2.11.0",
  "bytemuck",
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
  "document-features",
  "hashbrown 0.16.1",
  "indexmap 2.13.0",
@@ -7807,6 +8205,7 @@ dependencies = [
  "smallvec",
  "thiserror 2.0.18",
  "wgpu-core-deps-apple",
+ "wgpu-core-deps-emscripten",
  "wgpu-core-deps-windows-linux-android",
  "wgpu-hal",
  "wgpu-naga-bridge",
@@ -7822,6 +8221,15 @@ dependencies = [
  "wgpu-hal",
 ]
 
+[[package]]
+name = "wgpu-core-deps-emscripten"
+version = "29.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef043bf135cc68b6f667c55ff4e345ce2b5924d75bad36a47921b0287ca4b24a"
+dependencies = [
+ "wgpu-hal",
+]
+
 [[package]]
 name = "wgpu-core-deps-windows-linux-android"
 version = "29.0.0"
@@ -7845,14 +8253,19 @@ dependencies = [
  "block2",
  "bytemuck",
  "cfg-if",
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
+ "glow",
+ "glutin_wgl_sys",
  "gpu-allocator",
  "gpu-descriptor",
  "hashbrown 0.16.1",
+ "js-sys",
+ "khronos-egl",
  "libc",
  "libloading 0.8.9",
  "log",
  "naga",
+ "ndk-sys",
  "objc2",
  "objc2-core-foundation",
  "objc2-foundation",
@@ -7870,6 +8283,9 @@ dependencies = [
  "renderdoc-sys",
  "smallvec",
  "thiserror 2.0.18",
+ "wasm-bindgen",
+ "wayland-sys",
+ "web-sys",
  "wgpu-naga-bridge",
  "wgpu-types",
  "windows",
@@ -7894,8 +8310,10 @@ checksum = "ec2675540fb1a5cfa5ef122d3d5f390e2c75711a0b946410f2d6ac3a0f77d1f6"
 dependencies = [
  "bitflags 2.11.0",
  "bytemuck",
+ "js-sys",
  "log",
  "raw-window-handle",
+ "web-sys",
 ]
 
 [[package]]
@@ -7932,7 +8350,7 @@ dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
  "witx",
 ]
 
@@ -7944,7 +8362,7 @@ checksum = "fea2aea744eded58ae092bf57110c27517dab7d5a300513ff13897325c5c5021"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
  "wiggle-generate",
 ]
 
@@ -8067,7 +8485,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -8078,7 +8496,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -8433,7 +8851,7 @@ dependencies = [
  "heck",
  "indexmap 2.13.0",
  "prettyplease",
- "syn",
+ "syn 2.0.117",
  "wasm-metadata",
  "wit-bindgen-core",
  "wit-component",
@@ -8449,7 +8867,7 @@ dependencies = [
  "prettyplease",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
  "wit-bindgen-core",
  "wit-bindgen-rust",
 ]
@@ -8555,6 +8973,12 @@ dependencies = [
  "rustix 1.1.4",
 ]
 
+[[package]]
+name = "xml-rs"
+version = "0.8.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ae8337f8a065cfc972643663ea4279e04e7256de865aa66fe25cec5fb912d3f"
+
 [[package]]
 name = "xmlwriter"
 version = "0.1.0"
@@ -8601,7 +9025,7 @@ checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
  "synstructure",
 ]
 
@@ -8632,7 +9056,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -8643,7 +9067,7 @@ checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -8663,7 +9087,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
  "synstructure",
 ]
 
@@ -8703,7 +9127,7 @@ checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.117",
 ]
 
 [[package]]
diff --git a/crates/nodes/Cargo.toml b/crates/nodes/Cargo.toml
index a949afc2..adba5046 100644
--- a/crates/nodes/Cargo.toml
+++ b/crates/nodes/Cargo.toml
@@ -106,6 +106,11 @@ wgpu = { version = "29", optional = true, default-features = false, features = [
 pollster = { version = "0.4", optional = true }
 bytemuck = { version = "1.22", optional = true, features = ["derive"] }
 
+# HW-accelerated video codecs (optional, behind respective features)
+vk-video = { version = "0.3", optional = true }            # vulkan_video feature — Vulkan Video H.264 HW codec
+cros-codecs = { version = "0.0.6", optional = true, features = ["vaapi"] }  # vaapi feature — requires libva-dev system package
+shiguredo_nvcodec = { version = "2025.2", optional = true }
+
 futures-util = "0.3"
 
 [features]
@@ -176,6 +181,15 @@ object_store = ["dep:opendal", "dep:schemars"]
 codegen = ["dep:ts-rs"]
 video = ["vp9", "av1", "openh264", "colorbars", "compositor"]
 
+# HW-accelerated video codecs — not in `default`; each requires vendor-specific
+# system libraries or drivers at runtime.
+# vulkan_video: H.264 encode/decode via Vulkan Video (vk-video crate). Cross-vendor (Intel/NVIDIA/AMD).
+vulkan_video = ["dep:schemars", "dep:vk-video", "dep:serde_json"]
+# vaapi: AV1 encode/decode via VA-API (cros-codecs crate). Primarily Intel, also AMD.
+vaapi = ["dep:schemars", "dep:cros-codecs", "dep:serde_json"]
+# nvcodec: AV1 encode/decode via NVENC/NVDEC (shiguredo_nvcodec crate). NVIDIA only.
+nvcodec = ["dep:schemars", "dep:shiguredo_nvcodec", "dep:serde_json"]
+
 [[bin]]
 name = "generate-compositor-types"
 path = "src/bin/generate_compositor_types.rs"
diff --git a/crates/nodes/src/video/mod.rs b/crates/nodes/src/video/mod.rs
index 3e855914..6230541f 100644
--- a/crates/nodes/src/video/mod.rs
+++ b/crates/nodes/src/video/mod.rs
@@ -70,6 +70,29 @@ pub const AV1_CONTENT_TYPE: &str = "video/av1";
 /// MIME-style content type for H.264-encoded video packets.
 pub const H264_CONTENT_TYPE: &str = "video/h264";
 
+// ── Hardware acceleration mode ───────────────────────────────────────────────
+//
+// Shared across all HW-accelerated codec modules (Vulkan Video, VA-API, NVENC).
+
+/// Hardware acceleration mode for GPU codec nodes.
+///
+/// Mirrors the compositor's `gpu_mode` pattern: auto-detect by default,
+/// with explicit force options for testing and deployment.
+#[cfg(any(feature = "vulkan_video", feature = "vaapi", feature = "nvcodec"))]
+#[derive(
+    Debug, Clone, Copy, Default, serde::Serialize, serde::Deserialize, schemars::JsonSchema,
+)]
+#[serde(rename_all = "lowercase")]
+pub enum HwAccelMode {
+    /// Auto-detect: use HW if available, fall back to CPU otherwise.
+    #[default]
+    Auto,
+    /// Force HW acceleration — fail if unavailable.
+    ForceHw,
+    /// Force CPU path — ignore available HW.
+    ForceCpu,
+}
+
 /// Parse a pixel format string into a [`PixelFormat`].
 ///
 /// Accepts `"i420"`, `"nv12"`, `"rgba8"`, or `"rgba"` (case-insensitive).
@@ -100,9 +123,27 @@ pub mod pixel_ops;
 #[cfg(feature = "compositor")]
 pub mod pixel_convert;
 
-#[cfg(any(feature = "vp9", feature = "av1", feature = "svt_av1", feature = "openh264"))]
+#[cfg(any(
+    feature = "vp9",
+    feature = "av1",
+    feature = "svt_av1",
+    feature = "openh264",
+    feature = "nvcodec",
+    feature = "vaapi"
+))]
 pub(crate) mod encoder_trait;
 
+// ── HW-accelerated codec modules ─────────────────────────────────────────────
+
+#[cfg(feature = "vulkan_video")]
+pub mod vulkan_video;
+
+#[cfg(feature = "vaapi")]
+pub mod vaapi_av1;
+
+#[cfg(feature = "nvcodec")]
+pub mod nv_av1;
+
 // ── Shared I420→NV12 conversion helpers ──────────────────────────────────────
 //
 // Used by both the rav1d decoder (av1.rs) and the C dav1d decoder (dav1d.rs).
diff --git a/crates/nodes/src/video/nv_av1.rs b/crates/nodes/src/video/nv_av1.rs
new file mode 100644
index 00000000..2de6c8b7
--- /dev/null
+++ b/crates/nodes/src/video/nv_av1.rs
@@ -0,0 +1,1184 @@
+// SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+//
+// SPDX-License-Identifier: MPL-2.0
+
+//! NVIDIA NVENC/NVDEC HW-accelerated AV1 encoder and decoder nodes.
+//!
+//! Uses the [`shiguredo_nvcodec`](https://crates.io/crates/shiguredo_nvcodec)
+//! crate which provides Rust bindings for the NVIDIA Video Codec SDK.  CUDA
+//! driver API is loaded dynamically at runtime (`dlopen`) — no build-time
+//! CUDA Toolkit dependency.
+//!
+//! This module provides:
+//! - `NvAv1DecoderNode` — decodes AV1 packets to NV12 `VideoFrame`s via NVDEC
+//! - `NvAv1EncoderNode` — encodes NV12 `VideoFrame`s to AV1 packets via NVENC
+//!
+//! Both nodes perform runtime capability detection: if no NVIDIA GPU with
+//! AV1 support is found, node creation returns an error so the pipeline can
+//! fall back to a CPU codec (rav1e/dav1d/SVT-AV1).
+//!
+//! # Feature gate
+//!
+//! Requires `nvcodec` feature.
+//!
+//! # GPU requirements
+//!
+//! - **AV1 decode**: NVIDIA RTX 30xx (Ampere) or newer.
+//! - **AV1 encode**: NVIDIA RTX 40xx (Ada Lovelace) or newer.
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use opentelemetry::global;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+use std::borrow::Cow;
+use std::sync::Arc;
+use std::time::Instant;
+use streamkit_core::stats::NodeStatsTracker;
+use streamkit_core::types::{
+    EncodedVideoFormat, Packet, PacketMetadata, PacketType, PixelFormat, RawVideoFormat,
+    VideoCodec, VideoFrame, VideoLayout,
+};
+use streamkit_core::{
+    config_helpers, get_codec_channel_capacity, packet_helpers, state_helpers, InputPin,
+    NodeContext, NodeRegistry, OutputPin, PinCardinality, PooledVideoData, ProcessorNode,
+    StreamKitError, VideoFramePool,
+};
+use tokio::sync::mpsc;
+
+use super::encoder_trait::{self, EncodedPacket, EncoderNodeRunner, StandardVideoEncoder};
+use super::HwAccelMode;
+use super::AV1_CONTENT_TYPE;
+
+// ---------------------------------------------------------------------------
+// Decoder
+// ---------------------------------------------------------------------------
+
+/// Configuration for the NVIDIA AV1 decoder node.
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(default, deny_unknown_fields)]
+pub struct NvAv1DecoderConfig {
+    /// Hardware acceleration mode.
+    pub hw_accel: HwAccelMode,
+    /// CUDA device index (0-based). If `None`, use device 0.
+    pub cuda_device: Option<u32>,
+}
+
+impl Default for NvAv1DecoderConfig {
+    fn default() -> Self {
+        Self { hw_accel: HwAccelMode::Auto, cuda_device: None }
+    }
+}
+
+/// NVIDIA NVDEC AV1 decoder node.
+///
+/// Accepts AV1 encoded `Binary` packets on its `"in"` pin and emits
+/// decoded NV12 `VideoFrame`s on its `"out"` pin.
+pub struct NvAv1DecoderNode {
+    config: NvAv1DecoderConfig,
+}
+
+impl NvAv1DecoderNode {
+    /// Create a new decoder node with the given configuration.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `hw_accel` is `ForceCpu` (this node only does HW).
+    pub fn new(config: NvAv1DecoderConfig) -> Result<Self, StreamKitError> {
+        if matches!(config.hw_accel, HwAccelMode::ForceCpu) {
+            return Err(StreamKitError::Configuration(
+                "NvAv1DecoderNode only supports hardware decoding; \
+                 use the CPU AV1 decoder (video::av1::decoder) for ForceCpu mode"
+                    .to_string(),
+            ));
+        }
+        Ok(Self { config })
+    }
+}
+
+#[async_trait]
+impl ProcessorNode for NvAv1DecoderNode {
+    fn input_pins(&self) -> Vec<InputPin> {
+        vec![InputPin {
+            name: "in".to_string(),
+            accepts_types: vec![PacketType::EncodedVideo(EncodedVideoFormat {
+                codec: VideoCodec::Av1,
+                bitstream_format: None,
+                codec_private: None,
+                profile: None,
+                level: None,
+            })],
+            cardinality: PinCardinality::One,
+        }]
+    }
+
+    fn output_pins(&self) -> Vec<OutputPin> {
+        vec![OutputPin {
+            name: "out".to_string(),
+            produces_type: PacketType::RawVideo(RawVideoFormat {
+                width: None,
+                height: None,
+                pixel_format: PixelFormat::Nv12,
+            }),
+            cardinality: PinCardinality::Broadcast,
+        }]
+    }
+
+    async fn run(self: Box<Self>, mut context: NodeContext) -> Result<(), StreamKitError> {
+        let node_name = context.output_sender.node_name().to_string();
+        state_helpers::emit_initializing(&context.state_tx, &node_name);
+
+        tracing::info!("NvAv1DecoderNode starting");
+        let mut input_rx = context.take_input("in")?;
+        let video_pool = context.video_pool.clone();
+
+        let meter = global::meter("skit_nodes");
+        let packets_processed_counter =
+            meter.u64_counter("nv_av1_decoder_packets_processed").build();
+        let decode_duration_histogram = meter
+            .f64_histogram("nv_av1_decode_duration")
+            .with_boundaries(streamkit_core::metrics::HISTOGRAM_BOUNDARIES_CODEC_PACKET.to_vec())
+            .build();
+
+        let (decode_tx, mut decode_rx) =
+            mpsc::channel::<(Bytes, Option<PacketMetadata>)>(get_codec_channel_capacity());
+        let (result_tx, mut result_rx) =
+            mpsc::channel::<Result<VideoFrame, String>>(get_codec_channel_capacity());
+
+        let cuda_device = self.config.cuda_device.unwrap_or(0);
+        let decode_task = tokio::task::spawn_blocking(move || {
+            let nv_config = shiguredo_nvcodec::DecoderConfig {
+                #[allow(clippy::cast_possible_wrap)]
+                device_id: cuda_device as i32,
+                max_display_delay: 0, // low-latency
+                ..shiguredo_nvcodec::DecoderConfig::default()
+            };
+
+            let mut decoder = match shiguredo_nvcodec::Decoder::new_av1(nv_config) {
+                Ok(d) => d,
+                Err(err) => {
+                    let _ = result_tx.blocking_send(Err(format!(
+                        "NVDEC: failed to create AV1 decoder on GPU {cuda_device}: {err}"
+                    )));
+                    return;
+                },
+            };
+
+            tracing::info!("NVDEC AV1 decoder created on GPU {cuda_device}");
+
+            while let Some((data, metadata)) = decode_rx.blocking_recv() {
+                if result_tx.is_closed() {
+                    return;
+                }
+
+                if data.is_empty() {
+                    continue;
+                }
+
+                let decode_start_time = Instant::now();
+
+                if let Err(err) = decoder.decode(&data) {
+                    tracing::warn!("NVDEC AV1 decode error: {err}");
+                    let _ =
+                        result_tx.blocking_send(Err(format!("NVDEC: AV1 decode failed: {err}")));
+                    continue;
+                }
+
+                // Drain all decoded frames produced by this input packet.
+                loop {
+                    match decoder.next_frame() {
+                        Ok(Some(nv_frame)) => {
+                            match copy_nvdec_frame(&nv_frame, metadata.clone(), video_pool.as_ref())
+                            {
+                                Ok(frame) => {
+                                    if result_tx.blocking_send(Ok(frame)).is_err() {
+                                        return;
+                                    }
+                                },
+                                Err(err) => {
+                                    let _ = result_tx.blocking_send(Err(err));
+                                },
+                            }
+                        },
+                        Ok(None) => break,
+                        Err(err) => {
+                            tracing::warn!("NVDEC next_frame error: {err}");
+                            let _ = result_tx
+                                .blocking_send(Err(format!("NVDEC: next_frame failed: {err}")));
+                            break;
+                        },
+                    }
+                }
+
+                // Record decode duration once per input packet (after the
+                // entire decode + drain cycle), matching the AV1 CPU decoder
+                // pattern in av1.rs.
+                decode_duration_histogram.record(decode_start_time.elapsed().as_secs_f64(), &[]);
+            }
+
+            // Flush remaining frames.
+            if result_tx.is_closed() {
+                return;
+            }
+            if let Err(err) = decoder.finish() {
+                tracing::warn!("NVDEC finish error: {err}");
+                return;
+            }
+            loop {
+                match decoder.next_frame() {
+                    Ok(Some(nv_frame)) => {
+                        match copy_nvdec_frame(&nv_frame, None, video_pool.as_ref()) {
+                            Ok(frame) => {
+                                if result_tx.blocking_send(Ok(frame)).is_err() {
+                                    return;
+                                }
+                            },
+                            Err(err) => {
+                                let _ = result_tx.blocking_send(Err(err));
+                            },
+                        }
+                    },
+                    Ok(None) => break,
+                    Err(err) => {
+                        tracing::warn!("NVDEC flush next_frame error: {err}");
+                        break;
+                    },
+                }
+            }
+        });
+
+        state_helpers::emit_running(&context.state_tx, &node_name);
+
+        let mut stats_tracker = NodeStatsTracker::new(node_name.clone(), context.stats_tx.clone());
+        let batch_size = context.batch_size;
+
+        let decode_tx_clone = decode_tx.clone();
+        let mut input_task = tokio::spawn(async move {
+            loop {
+                let Some(first_packet) = input_rx.recv().await else {
+                    break;
+                };
+
+                let packet_batch =
+                    packet_helpers::batch_packets_greedy(first_packet, &mut input_rx, batch_size);
+
+                for packet in packet_batch {
+                    if let Packet::Binary { data, metadata, .. } = packet {
+                        if decode_tx_clone.send((data, metadata)).await.is_err() {
+                            tracing::error!(
+                                "NvAv1DecoderNode decode task has shut down unexpectedly"
+                            );
+                            return;
+                        }
+                    }
+                }
+            }
+            tracing::info!("NvAv1DecoderNode input stream closed");
+        });
+
+        crate::codec_utils::codec_forward_loop(
+            &mut context,
+            &mut result_rx,
+            &mut input_task,
+            decode_task,
+            decode_tx,
+            &packets_processed_counter,
+            &mut stats_tracker,
+            Packet::Video,
+            "NvAv1DecoderNode",
+        )
+        .await;
+
+        state_helpers::emit_stopped(&context.state_tx, &node_name, "input_closed");
+        tracing::info!("NvAv1DecoderNode finished");
+        Ok(())
+    }
+}
+
+/// Copy a decoded NV12 frame from `shiguredo_nvcodec` into a `VideoFrame`.
+///
+/// The `DecodedFrame` already provides NV12 data (separate Y and interleaved
+/// UV planes), so we copy them into a contiguous buffer with the canonical
+/// packed NV12 layout.
+fn copy_nvdec_frame(
+    decoded: &shiguredo_nvcodec::DecodedFrame,
+    metadata: Option<PacketMetadata>,
+    video_pool: Option<&Arc<VideoFramePool>>,
+) -> Result<VideoFrame, String> {
+    #[allow(clippy::cast_possible_truncation)]
+    let width = decoded.width() as u32;
+    #[allow(clippy::cast_possible_truncation)]
+    let height = decoded.height() as u32;
+
+    if width == 0 || height == 0 {
+        return Err("NVDEC produced empty frame".to_string());
+    }
+
+    let nv12_layout = VideoLayout::packed(width, height, PixelFormat::Nv12);
+    let mut data = video_pool.map_or_else(
+        || PooledVideoData::from_vec(vec![0u8; nv12_layout.total_bytes()]),
+        |pool| pool.get(nv12_layout.total_bytes()),
+    );
+    let data_slice = data.as_mut_slice();
+
+    let nv12_planes = nv12_layout.planes();
+    let y_plane = nv12_planes[0];
+    let uv_plane = nv12_planes[1];
+
+    // Copy Y plane.
+    let y_src = decoded.y_plane();
+    let y_src_stride = decoded.y_stride();
+    let width_usize = width as usize;
+    let height_usize = height as usize;
+
+    for row in 0..height_usize {
+        let src_start = row * y_src_stride;
+        let src_end = src_start + width_usize;
+        if src_end > y_src.len() {
+            return Err(format!("NVDEC Y plane too small: need {src_end}, have {}", y_src.len()));
+        }
+        let dst_start = y_plane.offset + row * y_plane.stride;
+        let dst_end = dst_start + width_usize;
+        if dst_end > data_slice.len() {
+            return Err("NVDEC Y plane copy overflow".to_string());
+        }
+        data_slice[dst_start..dst_end].copy_from_slice(&y_src[src_start..src_end]);
+    }
+
+    // Copy UV plane (already interleaved NV12 format from NVDEC).
+    let uv_src = decoded.uv_plane();
+    let uv_src_stride = decoded.uv_stride();
+    let chroma_h = uv_plane.height as usize;
+    let uv_row_bytes = uv_plane.width as usize; // NV12: ceil(width/2) interleaved UV pairs
+
+    for row in 0..chroma_h {
+        let src_start = row * uv_src_stride;
+        let src_end = src_start + uv_row_bytes;
+        if src_end > uv_src.len() {
+            return Err(format!("NVDEC UV plane too small: need {src_end}, have {}", uv_src.len()));
+        }
+        let dst_start = uv_plane.offset + row * uv_plane.stride;
+        let dst_end = dst_start + uv_row_bytes;
+        if dst_end > data_slice.len() {
+            return Err("NVDEC UV plane copy overflow".to_string());
+        }
+        data_slice[dst_start..dst_end].copy_from_slice(&uv_src[src_start..src_end]);
+    }
+
+    VideoFrame::from_pooled(width, height, PixelFormat::Nv12, data, metadata)
+        .map_err(|e| e.to_string())
+}
+
+// ---------------------------------------------------------------------------
+// Encoder
+// ---------------------------------------------------------------------------
+
+/// Configuration for the NVIDIA AV1 encoder node.
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(default, deny_unknown_fields)]
+pub struct NvAv1EncoderConfig {
+    /// Hardware acceleration mode.
+    pub hw_accel: HwAccelMode,
+    /// CUDA device index (0-based). If `None`, use device 0.
+    pub cuda_device: Option<u32>,
+    /// Target bitrate in bits per second.
+    pub bitrate: u32,
+    /// Target framerate in frames per second.
+    pub framerate: u32,
+    /// Keyframe interval (GOP length). `None` uses the NVENC default
+    /// (infinite GOP).
+    pub keyframe_interval: Option<u32>,
+}
+
+impl Default for NvAv1EncoderConfig {
+    fn default() -> Self {
+        Self {
+            hw_accel: HwAccelMode::Auto,
+            cuda_device: None,
+            bitrate: 2_000_000,
+            framerate: 30,
+            keyframe_interval: None,
+        }
+    }
+}
+
+/// NVIDIA NVENC AV1 encoder node.
+///
+/// Accepts NV12/I420 `VideoFrame`s on its `"in"` pin and emits AV1
+/// encoded `Binary` packets on its `"out"` pin.
+pub struct NvAv1EncoderNode {
+    config: NvAv1EncoderConfig,
+}
+
+impl NvAv1EncoderNode {
+    /// Create a new encoder node with the given configuration.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `hw_accel` is `ForceCpu` (this node only does HW).
+    pub fn new(config: NvAv1EncoderConfig) -> Result<Self, StreamKitError> {
+        if matches!(config.hw_accel, HwAccelMode::ForceCpu) {
+            return Err(StreamKitError::Configuration(
+                "NvAv1EncoderNode only supports hardware encoding; \
+                 use the CPU AV1 encoder (video::av1::encoder) for ForceCpu mode"
+                    .to_string(),
+            ));
+        }
+        Ok(Self { config })
+    }
+}
+
+#[async_trait]
+impl ProcessorNode for NvAv1EncoderNode {
+    fn input_pins(&self) -> Vec<InputPin> {
+        vec![InputPin {
+            name: "in".to_string(),
+            accepts_types: vec![
+                PacketType::RawVideo(RawVideoFormat {
+                    width: None,
+                    height: None,
+                    pixel_format: PixelFormat::I420,
+                }),
+                PacketType::RawVideo(RawVideoFormat {
+                    width: None,
+                    height: None,
+                    pixel_format: PixelFormat::Nv12,
+                }),
+            ],
+            cardinality: PinCardinality::One,
+        }]
+    }
+
+    fn output_pins(&self) -> Vec<OutputPin> {
+        vec![OutputPin {
+            name: "out".to_string(),
+            produces_type: PacketType::EncodedVideo(EncodedVideoFormat {
+                codec: VideoCodec::Av1,
+                bitstream_format: None,
+                codec_private: None,
+                profile: None,
+                level: None,
+            }),
+            cardinality: PinCardinality::Broadcast,
+        }]
+    }
+
+    fn content_type(&self) -> Option<String> {
+        Some(AV1_CONTENT_TYPE.to_string())
+    }
+
+    async fn run(self: Box<Self>, context: NodeContext) -> Result<(), StreamKitError> {
+        encoder_trait::run_encoder(*self, context).await
+    }
+}
+
+impl EncoderNodeRunner for NvAv1EncoderNode {
+    const CONTENT_TYPE: &'static str = AV1_CONTENT_TYPE;
+    const NODE_LABEL: &'static str = "NvAv1EncoderNode";
+    const PACKETS_COUNTER_NAME: &'static str = "nv_av1_encoder_packets_processed";
+    const DURATION_HISTOGRAM_NAME: &'static str = "nv_av1_encode_duration";
+
+    fn spawn_codec_task(
+        self,
+        encode_rx: mpsc::Receiver<(VideoFrame, Option<PacketMetadata>)>,
+        result_tx: mpsc::Sender<Result<EncodedPacket, String>>,
+        duration_histogram: opentelemetry::metrics::Histogram<f64>,
+    ) -> tokio::task::JoinHandle<()> {
+        encoder_trait::spawn_standard_encode_task::<NvAv1Encoder>(
+            self.config,
+            encode_rx,
+            result_tx,
+            duration_histogram,
+        )
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Internal NVENC wrapper implementing StandardVideoEncoder
+// ---------------------------------------------------------------------------
+
+struct NvAv1Encoder {
+    encoder: shiguredo_nvcodec::Encoder,
+    next_pts: i64,
+}
+
+impl StandardVideoEncoder for NvAv1Encoder {
+    type Config = NvAv1EncoderConfig;
+    const CODEC_NAME: &'static str = "NV-AV1";
+
+    fn new_encoder(width: u32, height: u32, config: &Self::Config) -> Result<Self, String>
+    where
+        Self: Sized,
+    {
+        let cuda_device = config.cuda_device.unwrap_or(0);
+
+        let nv_config = shiguredo_nvcodec::EncoderConfig {
+            width,
+            height,
+            fps_numerator: config.framerate,
+            fps_denominator: 1,
+            target_bitrate: Some(config.bitrate),
+            preset: shiguredo_nvcodec::Preset::P1, // fastest for real-time
+            tuning_info: shiguredo_nvcodec::TuningInfo::LOW_LATENCY,
+            rate_control_mode: shiguredo_nvcodec::RateControlMode::Cbr,
+            gop_length: config.keyframe_interval,
+            idr_period: config.keyframe_interval,
+            frame_interval_p: 1, // no B-frames for low latency
+            profile: None,
+            #[allow(clippy::cast_possible_wrap)]
+            device_id: cuda_device as i32,
+            max_encode_width: None,
+            max_encode_height: None,
+        };
+
+        let encoder = shiguredo_nvcodec::Encoder::new_av1(nv_config).map_err(|err| {
+            format!("NVENC: failed to create AV1 encoder on GPU {cuda_device}: {err}")
+        })?;
+
+        tracing::info!(
+            width,
+            height,
+            bitrate = config.bitrate,
+            framerate = config.framerate,
+            gpu = cuda_device,
+            "NVENC AV1 encoder created"
+        );
+
+        Ok(Self { encoder, next_pts: 0 })
+    }
+
+    fn encode(
+        &mut self,
+        frame: &VideoFrame,
+        metadata: Option<PacketMetadata>,
+    ) -> Result<Vec<EncodedPacket>, String> {
+        let nv12_data = match frame.pixel_format {
+            PixelFormat::Nv12 => Cow::Borrowed(frame.data.as_slice()),
+            PixelFormat::I420 => Cow::Owned(i420_to_nv12_buffer(frame)),
+            other => {
+                return Err(format!("NV-AV1 encoder expects NV12 or I420 input, got {other:?}"));
+            },
+        };
+
+        self.encoder
+            .encode(&nv12_data)
+            .map_err(|err| format!("NVENC: AV1 encode failed: {err}"))?;
+
+        Ok(self.drain_packets(metadata))
+    }
+
+    fn flush_encoder(&mut self) -> Result<Vec<EncodedPacket>, String> {
+        self.encoder.finish().map_err(|err| format!("NVENC: AV1 finish failed: {err}"))?;
+
+        Ok(self.drain_packets(None))
+    }
+
+    fn flush_on_dimension_change() -> bool {
+        true
+    }
+}
+
+impl NvAv1Encoder {
+    /// Drain all available encoded frames from NVENC.
+    fn drain_packets(&mut self, metadata: Option<PacketMetadata>) -> Vec<EncodedPacket> {
+        let mut packets = Vec::new();
+        let mut remaining_metadata = metadata;
+
+        loop {
+            let Some(encoded) = self.encoder.next_frame() else {
+                break;
+            };
+
+            let is_keyframe = matches!(
+                encoded.picture_type(),
+                shiguredo_nvcodec::PictureType::I | shiguredo_nvcodec::PictureType::Idr
+            );
+            let data = Bytes::from(encoded.into_data());
+
+            let pts = self.next_pts;
+            self.next_pts += 1;
+
+            let meta = remaining_metadata.take();
+            let output_metadata = merge_keyframe_metadata(meta, is_keyframe, pts);
+
+            packets.push(EncodedPacket { data, metadata: Some(output_metadata) });
+        }
+
+        packets
+    }
+}
+
+/// Convert an I420 `VideoFrame` to a contiguous NV12 byte buffer suitable
+/// for `shiguredo_nvcodec::Encoder::encode()`.
+fn i420_to_nv12_buffer(frame: &VideoFrame) -> Vec<u8> {
+    let width = frame.width as usize;
+    let height = frame.height as usize;
+    let layout = frame.layout();
+    let planes = layout.planes();
+    let data = frame.data.as_slice();
+
+    // NV12 layout: Y plane (width * height) + UV plane (chroma_w*2 * chroma_h)
+    let chroma_w = width.div_ceil(2);
+    let chroma_h = height.div_ceil(2);
+    let uv_row_bytes = chroma_w * 2; // ceil(width/2) pairs of (U, V)
+    let nv12_size = width * height + uv_row_bytes * chroma_h;
+    let mut nv12 = vec![0u8; nv12_size];
+
+    // Copy Y plane.
+    let y_plane = &planes[0];
+    for row in 0..height {
+        let src_start = y_plane.offset + row * y_plane.stride;
+        let dst_start = row * width;
+        nv12[dst_start..dst_start + width].copy_from_slice(&data[src_start..src_start + width]);
+    }
+
+    // Interleave U + V into NV12 UV plane.
+    let u_plane = &planes[1];
+    let v_plane = &planes[2];
+    let uv_offset = width * height;
+
+    for row in 0..chroma_h {
+        let u_src_start = u_plane.offset + row * u_plane.stride;
+        let v_src_start = v_plane.offset + row * v_plane.stride;
+        let dst_start = uv_offset + row * uv_row_bytes;
+        for col in 0..chroma_w {
+            nv12[dst_start + col * 2] = data[u_src_start + col];
+            nv12[dst_start + col * 2 + 1] = data[v_src_start + col];
+        }
+    }
+
+    nv12
+}
+
+#[allow(clippy::missing_const_for_fn)] // map_or with closures is not yet stable in const fn
+fn merge_keyframe_metadata(
+    metadata: Option<PacketMetadata>,
+    keyframe: bool,
+    pts: i64,
+) -> PacketMetadata {
+    metadata.map_or(
+        PacketMetadata {
+            #[allow(clippy::cast_sign_loss)]
+            timestamp_us: if pts >= 0 { Some(pts as u64) } else { None },
+            duration_us: None,
+            sequence: None,
+            keyframe: Some(keyframe),
+        },
+        |meta| PacketMetadata {
+            timestamp_us: meta.timestamp_us,
+            duration_us: meta.duration_us,
+            sequence: meta.sequence,
+            keyframe: Some(keyframe),
+        },
+    )
+}
+
+// ---------------------------------------------------------------------------
+// Registration
+// ---------------------------------------------------------------------------
+
+use schemars::schema_for;
+use streamkit_core::registry::StaticPins;
+
+#[allow(clippy::expect_used, clippy::missing_panics_doc)]
+pub fn register_nv_av1_nodes(registry: &mut NodeRegistry) {
+    // Runtime capability check: verify that CUDA libraries are loadable.
+    // If not, log a warning but still register the nodes — they will fail
+    // at runtime with a clear error when the pipeline starts.
+    if !shiguredo_nvcodec::is_cuda_library_available() {
+        tracing::warn!(
+            "CUDA libraries not available — NV AV1 encoder/decoder nodes \
+             will fail at runtime if used"
+        );
+    }
+
+    let default_decoder = NvAv1DecoderNode::new(NvAv1DecoderConfig::default())
+        .expect("default NV AV1 decoder config should be valid");
+    registry.register_static_with_description(
+        "video::nv::av1_decoder",
+        |params| {
+            let config = config_helpers::parse_config_optional(params)?;
+            Ok(Box::new(NvAv1DecoderNode::new(config)?))
+        },
+        serde_json::to_value(schema_for!(NvAv1DecoderConfig))
+            .expect("NvAv1DecoderConfig schema should serialize to JSON"),
+        StaticPins { inputs: default_decoder.input_pins(), outputs: default_decoder.output_pins() },
+        vec![
+            "video".to_string(),
+            "codecs".to_string(),
+            "av1".to_string(),
+            "hw".to_string(),
+            "nvidia".to_string(),
+        ],
+        false,
+        "Decodes AV1-compressed packets into raw NV12 video frames using \
+         NVIDIA NVDEC hardware acceleration. Requires an NVIDIA RTX 30xx \
+         (Ampere) or newer GPU.",
+    );
+
+    let default_encoder = NvAv1EncoderNode::new(NvAv1EncoderConfig::default())
+        .expect("default NV AV1 encoder config should be valid");
+    registry.register_static_with_description(
+        "video::nv::av1_encoder",
+        |params| {
+            let config = config_helpers::parse_config_optional(params)?;
+            Ok(Box::new(NvAv1EncoderNode::new(config)?))
+        },
+        serde_json::to_value(schema_for!(NvAv1EncoderConfig))
+            .expect("NvAv1EncoderConfig schema should serialize to JSON"),
+        StaticPins { inputs: default_encoder.input_pins(), outputs: default_encoder.output_pins() },
+        vec![
+            "video".to_string(),
+            "codecs".to_string(),
+            "av1".to_string(),
+            "hw".to_string(),
+            "nvidia".to_string(),
+        ],
+        false,
+        "Encodes raw video frames (NV12 or I420) into AV1 packets using \
+         NVIDIA NVENC hardware acceleration. Requires an NVIDIA RTX 40xx \
+         (Ada Lovelace) or newer GPU. Insert a video::pixel_convert node \
+         upstream if the source outputs RGBA8.",
+    );
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+#[allow(clippy::unwrap_used, clippy::expect_used, clippy::disallowed_macros)]
+mod tests {
+    use super::*;
+    use crate::test_utils::{
+        assert_state_initializing, assert_state_running, assert_state_stopped, create_test_context,
+        create_test_video_frame,
+    };
+    use std::borrow::Cow;
+    use std::collections::HashMap;
+    use tokio::sync::mpsc;
+
+    // ── Helpers ─────────────────────────────────────────────────────────────
+
+    /// Returns `true` if CUDA libraries can be loaded AND a decoder can
+    /// actually be created on device 0.  This catches machines that have
+    /// `libcuda.so` but no physical GPU (or no AV1-capable GPU).
+    fn nvdec_av1_available() -> bool {
+        if !shiguredo_nvcodec::is_cuda_library_available() {
+            return false;
+        }
+        let config = shiguredo_nvcodec::DecoderConfig {
+            device_id: 0,
+            max_display_delay: 0,
+            ..shiguredo_nvcodec::DecoderConfig::default()
+        };
+        shiguredo_nvcodec::Decoder::new_av1(config).is_ok()
+    }
+
+    /// Returns `true` if NVENC AV1 encoding is available on device 0.
+    /// AV1 encode requires RTX 40xx (Ada Lovelace) or newer.
+    fn nvenc_av1_available() -> bool {
+        if !shiguredo_nvcodec::is_cuda_library_available() {
+            return false;
+        }
+        let config = shiguredo_nvcodec::EncoderConfig {
+            width: 64,
+            height: 64,
+            fps_numerator: 30,
+            fps_denominator: 1,
+            target_bitrate: Some(2_000_000),
+            preset: shiguredo_nvcodec::Preset::P1,
+            tuning_info: shiguredo_nvcodec::TuningInfo::LOW_LATENCY,
+            rate_control_mode: shiguredo_nvcodec::RateControlMode::Cbr,
+            gop_length: Some(1),
+            idr_period: Some(1),
+            frame_interval_p: 1,
+            profile: None,
+            device_id: 0,
+            max_encode_width: None,
+            max_encode_height: None,
+        };
+        shiguredo_nvcodec::Encoder::new_av1(config).is_ok()
+    }
+
+    // ── Unit tests (no GPU required) ────────────────────────────────────────
+
+    #[test]
+    fn force_cpu_decoder_rejected() {
+        let result = NvAv1DecoderNode::new(NvAv1DecoderConfig {
+            hw_accel: HwAccelMode::ForceCpu,
+            cuda_device: None,
+        });
+        assert!(result.is_err(), "ForceCpu should be rejected by NV decoder");
+    }
+
+    #[test]
+    fn force_cpu_encoder_rejected() {
+        let result = NvAv1EncoderNode::new(NvAv1EncoderConfig {
+            hw_accel: HwAccelMode::ForceCpu,
+            cuda_device: None,
+            bitrate: 2_000_000,
+            keyframe_interval: None,
+        });
+        assert!(result.is_err(), "ForceCpu should be rejected by NV encoder");
+    }
+
+    #[test]
+    fn default_configs_accepted() {
+        assert!(NvAv1DecoderNode::new(NvAv1DecoderConfig::default()).is_ok());
+        assert!(NvAv1EncoderNode::new(NvAv1EncoderConfig::default()).is_ok());
+    }
+
+    #[test]
+    fn decoder_pins_correct() {
+        let node = NvAv1DecoderNode::new(NvAv1DecoderConfig::default()).unwrap();
+        let inputs = node.input_pins();
+        let outputs = node.output_pins();
+        assert_eq!(inputs.len(), 1);
+        assert_eq!(outputs.len(), 1);
+        assert_eq!(inputs[0].name, "in");
+        assert_eq!(outputs[0].name, "out");
+        assert!(
+            matches!(&inputs[0].accepts_types[0], PacketType::EncodedVideo(fmt) if fmt.codec == VideoCodec::Av1),
+            "Decoder input should accept AV1"
+        );
+        assert!(
+            matches!(&outputs[0].produces_type, PacketType::RawVideo(fmt) if fmt.pixel_format == PixelFormat::Nv12),
+            "Decoder output should produce NV12"
+        );
+    }
+
+    #[test]
+    fn encoder_pins_correct() {
+        let node = NvAv1EncoderNode::new(NvAv1EncoderConfig::default()).unwrap();
+        let inputs = node.input_pins();
+        let outputs = node.output_pins();
+        assert_eq!(inputs.len(), 1);
+        assert_eq!(outputs.len(), 1);
+        assert_eq!(inputs[0].name, "in");
+        assert_eq!(outputs[0].name, "out");
+        // Encoder should accept both I420 and NV12.
+        assert_eq!(inputs[0].accepts_types.len(), 2);
+        assert!(
+            matches!(&outputs[0].produces_type, PacketType::EncodedVideo(fmt) if fmt.codec == VideoCodec::Av1),
+            "Encoder output should produce AV1"
+        );
+    }
+
+    #[test]
+    fn deny_unknown_fields_decoder() {
+        let json = r#"{"hw_accel":"Auto","cuda_device":null,"bogus_field":42}"#;
+        let result: Result<NvAv1DecoderConfig, _> = serde_json::from_str(json);
+        assert!(result.is_err(), "Unknown fields should be rejected");
+    }
+
+    #[test]
+    fn deny_unknown_fields_encoder() {
+        let json = r#"{"bitrate":1000000,"unknown_key":"oops"}"#;
+        let result: Result<NvAv1EncoderConfig, _> = serde_json::from_str(json);
+        assert!(result.is_err(), "Unknown fields should be rejected");
+    }
+
+    #[test]
+    fn i420_to_nv12_basic() {
+        // Build a minimal 4×4 I420 frame and convert.
+        let frame = create_test_video_frame(4, 4, PixelFormat::I420, 1);
+        let nv12 = i420_to_nv12_buffer(&frame);
+
+        // NV12 size: Y (4*4) + UV (ceil(4/2)*2 * ceil(4/2)) = 16 + 4*2 = 24
+        let expected_size = 4 * 4 + 4 * 2;
+        assert_eq!(nv12.len(), expected_size, "NV12 buffer size mismatch");
+    }
+
+    // ── GPU integration tests ───────────────────────────────────────────────
+
+    /// Encode several NV12 frames via NVENC, then decode them via NVDEC.
+    /// This is the full HW roundtrip test.
+    #[tokio::test]
+    async fn gpu_tests_nv_av1_encode_decode_roundtrip() {
+        if !nvenc_av1_available() {
+            eprintln!("Skipping NV AV1 encode/decode roundtrip: NVENC AV1 not available");
+            return;
+        }
+        if !nvdec_av1_available() {
+            eprintln!("Skipping NV AV1 encode/decode roundtrip: NVDEC AV1 not available");
+            return;
+        }
+
+        // --- Encode ---
+        let (enc_input_tx, enc_input_rx) = mpsc::channel(10);
+        let mut enc_inputs = HashMap::new();
+        enc_inputs.insert("in".to_string(), enc_input_rx);
+
+        let (enc_context, enc_sender, mut enc_state_rx) = create_test_context(enc_inputs, 10);
+        let encoder_config = NvAv1EncoderConfig {
+            bitrate: 2_000_000,
+            keyframe_interval: Some(1),
+            ..Default::default()
+        };
+        let encoder = NvAv1EncoderNode::new(encoder_config).unwrap();
+
+        let enc_handle = tokio::spawn(async move { Box::new(encoder).run(enc_context).await });
+
+        assert_state_initializing(&mut enc_state_rx).await;
+        assert_state_running(&mut enc_state_rx).await;
+
+        for index in 0_u64..5 {
+            let timestamp = 1_000 + 33_333_u64 * index;
+            let duration: u64 = 33_333;
+
+            let mut frame = create_test_video_frame(64, 64, PixelFormat::Nv12, 16);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(timestamp),
+                duration_us: Some(duration),
+                sequence: Some(index),
+                keyframe: Some(true),
+            });
+            enc_input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(enc_input_tx);
+
+        assert_state_stopped(&mut enc_state_rx).await;
+        enc_handle.await.unwrap().unwrap();
+
+        let encoded_packets = enc_sender.get_packets_for_pin("out").await;
+        assert!(!encoded_packets.is_empty(), "NVENC AV1 encoder produced no packets");
+
+        // --- Decode ---
+        let (dec_input_tx, dec_input_rx) = mpsc::channel(10);
+        let mut dec_inputs = HashMap::new();
+        dec_inputs.insert("in".to_string(), dec_input_rx);
+
+        let (dec_context, dec_sender, mut dec_state_rx) = create_test_context(dec_inputs, 10);
+        let decoder = NvAv1DecoderNode::new(NvAv1DecoderConfig::default()).unwrap();
+        let dec_handle = tokio::spawn(async move { Box::new(decoder).run(dec_context).await });
+
+        assert_state_initializing(&mut dec_state_rx).await;
+        assert_state_running(&mut dec_state_rx).await;
+
+        for packet in encoded_packets {
+            if let Packet::Binary { data, metadata, .. } = packet {
+                dec_input_tx
+                    .send(Packet::Binary {
+                        data,
+                        content_type: Some(Cow::Borrowed(AV1_CONTENT_TYPE)),
+                        metadata,
+                    })
+                    .await
+                    .unwrap();
+            }
+        }
+        drop(dec_input_tx);
+
+        assert_state_stopped(&mut dec_state_rx).await;
+        dec_handle.await.unwrap().unwrap();
+
+        let decoded_packets = dec_sender.get_packets_for_pin("out").await;
+        assert!(!decoded_packets.is_empty(), "NVDEC AV1 decoder produced no frames");
+
+        for packet in decoded_packets {
+            match packet {
+                Packet::Video(frame) => {
+                    assert_eq!(frame.width, 64);
+                    assert_eq!(frame.height, 64);
+                    assert_eq!(frame.pixel_format, PixelFormat::Nv12);
+                    assert!(!frame.data().is_empty(), "Decoded frame should have data");
+                },
+                _ => panic!("Expected Video packet from NV AV1 decoder"),
+            }
+        }
+    }
+
+    /// Encode-only test: verify that the encoder produces output packets
+    /// and that the first packet is marked as a keyframe.
+    #[tokio::test]
+    async fn gpu_tests_nv_av1_encoder_produces_keyframes() {
+        if !nvenc_av1_available() {
+            eprintln!("Skipping NV AV1 encoder keyframe test: NVENC AV1 not available");
+            return;
+        }
+
+        let (enc_input_tx, enc_input_rx) = mpsc::channel(10);
+        let mut enc_inputs = HashMap::new();
+        enc_inputs.insert("in".to_string(), enc_input_rx);
+
+        let (enc_context, enc_sender, mut enc_state_rx) = create_test_context(enc_inputs, 10);
+        let encoder_config = NvAv1EncoderConfig {
+            bitrate: 2_000_000,
+            keyframe_interval: Some(1),
+            ..Default::default()
+        };
+        let encoder = NvAv1EncoderNode::new(encoder_config).unwrap();
+
+        let enc_handle = tokio::spawn(async move { Box::new(encoder).run(enc_context).await });
+
+        assert_state_initializing(&mut enc_state_rx).await;
+        assert_state_running(&mut enc_state_rx).await;
+
+        for index in 0_u64..3 {
+            let mut frame = create_test_video_frame(64, 64, PixelFormat::Nv12, 16);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(33_333 * index),
+                duration_us: Some(33_333),
+                sequence: Some(index),
+                keyframe: None,
+            });
+            enc_input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(enc_input_tx);
+
+        assert_state_stopped(&mut enc_state_rx).await;
+        enc_handle.await.unwrap().unwrap();
+
+        let encoded_packets = enc_sender.get_packets_for_pin("out").await;
+        assert!(!encoded_packets.is_empty(), "NVENC AV1 encoder produced no packets");
+
+        // With keyframe_interval=1, every packet should be a keyframe.
+        for (i, packet) in encoded_packets.iter().enumerate() {
+            if let Packet::Binary { metadata, .. } = packet {
+                let meta = metadata.as_ref().expect("Encoded packet should have metadata");
+                assert_eq!(
+                    meta.keyframe,
+                    Some(true),
+                    "Packet {i} should be a keyframe with keyframe_interval=1"
+                );
+            }
+        }
+    }
+
+    /// Encode from I420 input — verifies the I420→NV12 conversion path.
+    #[tokio::test]
+    async fn gpu_tests_nv_av1_encoder_i420_input() {
+        if !nvenc_av1_available() {
+            eprintln!("Skipping NV AV1 I420 input test: NVENC AV1 not available");
+            return;
+        }
+
+        let (enc_input_tx, enc_input_rx) = mpsc::channel(10);
+        let mut enc_inputs = HashMap::new();
+        enc_inputs.insert("in".to_string(), enc_input_rx);
+
+        let (enc_context, enc_sender, mut enc_state_rx) = create_test_context(enc_inputs, 10);
+        let encoder_config = NvAv1EncoderConfig {
+            bitrate: 2_000_000,
+            keyframe_interval: Some(1),
+            ..Default::default()
+        };
+        let encoder = NvAv1EncoderNode::new(encoder_config).unwrap();
+
+        let enc_handle = tokio::spawn(async move { Box::new(encoder).run(enc_context).await });
+
+        assert_state_initializing(&mut enc_state_rx).await;
+        assert_state_running(&mut enc_state_rx).await;
+
+        // Send I420 frames instead of NV12.
+        for index in 0_u64..3 {
+            let mut frame = create_test_video_frame(64, 64, PixelFormat::I420, 1);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(33_333 * index),
+                duration_us: Some(33_333),
+                sequence: Some(index),
+                keyframe: Some(true),
+            });
+            enc_input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(enc_input_tx);
+
+        assert_state_stopped(&mut enc_state_rx).await;
+        enc_handle.await.unwrap().unwrap();
+
+        let encoded_packets = enc_sender.get_packets_for_pin("out").await;
+        assert!(
+            !encoded_packets.is_empty(),
+            "NVENC AV1 encoder produced no packets from I420 input"
+        );
+    }
+
+    /// Metadata propagation: timestamps from input frames should be
+    /// preserved through the encode→decode roundtrip.
+    #[tokio::test]
+    async fn gpu_tests_nv_av1_metadata_propagation() {
+        if !nvenc_av1_available() || !nvdec_av1_available() {
+            eprintln!("Skipping NV AV1 metadata test: NVENC/NVDEC AV1 not available");
+            return;
+        }
+
+        // --- Encode ---
+        let (enc_input_tx, enc_input_rx) = mpsc::channel(10);
+        let mut enc_inputs = HashMap::new();
+        enc_inputs.insert("in".to_string(), enc_input_rx);
+
+        let (enc_context, enc_sender, mut enc_state_rx) = create_test_context(enc_inputs, 10);
+        let encoder_config = NvAv1EncoderConfig {
+            bitrate: 2_000_000,
+            keyframe_interval: Some(1),
+            ..Default::default()
+        };
+        let encoder = NvAv1EncoderNode::new(encoder_config).unwrap();
+        let enc_handle = tokio::spawn(async move { Box::new(encoder).run(enc_context).await });
+
+        assert_state_initializing(&mut enc_state_rx).await;
+        assert_state_running(&mut enc_state_rx).await;
+
+        let timestamps: Vec<u64> = vec![1_000, 34_333, 67_666];
+        for (i, &ts) in timestamps.iter().enumerate() {
+            let mut frame = create_test_video_frame(64, 64, PixelFormat::Nv12, 16);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(ts),
+                duration_us: Some(33_333),
+                sequence: Some(i as u64),
+                keyframe: Some(true),
+            });
+            enc_input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(enc_input_tx);
+
+        assert_state_stopped(&mut enc_state_rx).await;
+        enc_handle.await.unwrap().unwrap();
+
+        let encoded_packets = enc_sender.get_packets_for_pin("out").await;
+        assert!(!encoded_packets.is_empty());
+
+        // --- Decode and verify metadata ---
+        let (dec_input_tx, dec_input_rx) = mpsc::channel(10);
+        let mut dec_inputs = HashMap::new();
+        dec_inputs.insert("in".to_string(), dec_input_rx);
+
+        let (dec_context, dec_sender, mut dec_state_rx) = create_test_context(dec_inputs, 10);
+        let decoder = NvAv1DecoderNode::new(NvAv1DecoderConfig::default()).unwrap();
+        let dec_handle = tokio::spawn(async move { Box::new(decoder).run(dec_context).await });
+
+        assert_state_initializing(&mut dec_state_rx).await;
+        assert_state_running(&mut dec_state_rx).await;
+
+        for packet in encoded_packets {
+            if let Packet::Binary { data, metadata, .. } = packet {
+                dec_input_tx
+                    .send(Packet::Binary {
+                        data,
+                        content_type: Some(Cow::Borrowed(AV1_CONTENT_TYPE)),
+                        metadata,
+                    })
+                    .await
+                    .unwrap();
+            }
+        }
+        drop(dec_input_tx);
+
+        assert_state_stopped(&mut dec_state_rx).await;
+        dec_handle.await.unwrap().unwrap();
+
+        let decoded_packets = dec_sender.get_packets_for_pin("out").await;
+        assert!(!decoded_packets.is_empty(), "Decoder should produce at least one frame");
+
+        // Every decoded frame should have metadata preserved.
+        for (i, packet) in decoded_packets.iter().enumerate() {
+            match packet {
+                Packet::Video(frame) => {
+                    assert!(frame.metadata.is_some(), "Decoded frame {i} should have metadata");
+                },
+                _ => panic!("Expected Video packet from NV AV1 decoder"),
+            }
+        }
+    }
+}
diff --git a/crates/nodes/src/video/vaapi_av1.rs b/crates/nodes/src/video/vaapi_av1.rs
new file mode 100644
index 00000000..2d1be2bb
--- /dev/null
+++ b/crates/nodes/src/video/vaapi_av1.rs
@@ -0,0 +1,1807 @@
+// SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+//
+// SPDX-License-Identifier: MPL-2.0
+
+//! VA-API HW-accelerated AV1 encoder and decoder nodes.
+//!
+//! Uses the [`cros-codecs`](https://crates.io/crates/cros-codecs) crate which
+//! provides high-level VA-API AV1 codec abstractions on Linux.  The cros-codecs
+//! `StatelessDecoder` and `StatelessEncoder` handle all AV1 bitstream parsing
+//! and VA-API parameter buffer construction internally — this module manages
+//! frame I/O and integrates with StreamKit's pipeline architecture.
+//!
+//! # Nodes
+//!
+//! - [`VaapiAv1DecoderNode`] — decodes AV1 OBU packets to NV12 [`VideoFrame`]s
+//! - [`VaapiAv1EncoderNode`] — encodes NV12/I420 [`VideoFrame`]s to AV1 packets
+//!
+//! Both perform runtime capability detection: if no VA-API device is found (or
+//! AV1 is not supported), node creation returns an error so the pipeline can
+//! fall back to a CPU codec (rav1e/dav1d/SVT-AV1).
+//!
+//! # Feature gate
+//!
+//! Requires `vaapi` Cargo feature and `libva-dev` + `libgbm-dev` system packages.
+//!
+//! # Platform support
+//!
+//! - **Intel**: Full AV1 encode (Arc+) and decode via `intel-media-driver`.
+//! - **AMD**: AV1 encode + decode via Mesa RadeonSI VA-API.
+//! - **NVIDIA**: Decode only via community `nvidia-vaapi-driver` (no VA-API encoding).
+
+use std::rc::Rc;
+use std::sync::Arc;
+use std::time::Instant;
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use opentelemetry::global;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+use streamkit_core::stats::NodeStatsTracker;
+use streamkit_core::types::{
+    EncodedVideoFormat, Packet, PacketMetadata, PacketType, PixelFormat, RawVideoFormat,
+    VideoCodec, VideoFrame,
+};
+use streamkit_core::{
+    config_helpers, get_codec_channel_capacity, packet_helpers, state_helpers, InputPin,
+    NodeContext, NodeRegistry, OutputPin, PinCardinality, ProcessorNode, StreamKitError,
+};
+use tokio::sync::mpsc;
+
+// cros-codecs high-level APIs.
+use cros_codecs::backend::vaapi::decoder::VaapiBackend as VaapiDecBackend;
+use cros_codecs::codec::av1::parser::Profile as Av1Profile;
+use cros_codecs::decoder::stateless::av1::Av1;
+use cros_codecs::decoder::stateless::{DecodeError, StatelessDecoder, StatelessVideoDecoder};
+use cros_codecs::decoder::{BlockingMode, DecodedHandle, DecoderEvent};
+use cros_codecs::encoder::av1::EncoderConfig as CrosEncoderConfig;
+use cros_codecs::encoder::stateless::StatelessEncoder;
+use cros_codecs::encoder::{
+    FrameMetadata as CrosFrameMetadata, PredictionStructure, RateControl, Tunings, VideoEncoder,
+};
+use cros_codecs::libva;
+use cros_codecs::video_frame::gbm_video_frame::{
+    GbmDevice, GbmExternalBufferDescriptor, GbmUsage, GbmVideoFrame,
+};
+use cros_codecs::video_frame::{ReadMapping, VideoFrame as CrosVideoFrame, WriteMapping};
+use cros_codecs::{Fourcc as CrosFourcc, FrameLayout, PlaneLayout, Resolution as CrosResolution};
+
+use super::encoder_trait::{self, EncodedPacket, EncoderNodeRunner, StandardVideoEncoder};
+use super::HwAccelMode;
+use super::AV1_CONTENT_TYPE;
+
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+/// Default VA-API render device path.
+const DEFAULT_RENDER_DEVICE: &str = "/dev/dri/renderD128";
+
+/// AV1 superblock size — coded resolution must be aligned to this.
+const AV1_SB_SIZE: u32 = 64;
+
+/// Maximum number of consecutive retries when the decoder returns
+/// `CheckEvents` or `NotEnoughOutputBuffers` without making progress.
+/// Matches the established pattern in `av1.rs` and `dav1d.rs`.
+const MAX_EAGAIN_EMPTY_RETRIES: u32 = 1000;
+
+/// After this many retries, switch from `thread::yield_now()` to
+/// `thread::sleep(1ms)` to avoid a tight spin-loop.
+const EAGAIN_YIELD_THRESHOLD: u32 = 10;
+
+/// Default constant-quality parameter (0–255, lower = better quality).
+const DEFAULT_QUALITY: u32 = 128;
+
+/// Default framerate for rate-control hints.
+const DEFAULT_FRAMERATE: u32 = 30;
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/// NV12 fourcc code for GBM/VA-API surfaces.
+fn nv12_fourcc() -> CrosFourcc {
+    CrosFourcc::from(b"NV12")
+}
+
+/// Align `value` up to the next multiple of `alignment`.
+fn align_up_u32(value: u32, alignment: u32) -> u32 {
+    debug_assert!(alignment > 0);
+    value.div_ceil(alignment) * alignment
+}
+
+/// Auto-detect a VA-API render device by scanning `/dev/dri/renderD*`.
+///
+/// Returns the first device path that can be opened as a VA display, or `None`
+/// if no VA-API capable device is found.
+fn detect_render_device() -> Option<String> {
+    let mut entries: Vec<_> = std::fs::read_dir("/dev/dri")
+        .ok()?
+        .filter_map(std::result::Result::ok)
+        .filter(|e| e.file_name().to_str().is_some_and(|n| n.starts_with("renderD")))
+        .collect();
+    entries.sort_by_key(std::fs::DirEntry::file_name);
+
+    for entry in entries {
+        let path = entry.path();
+        if libva::Display::open_drm_display(&path).is_ok() {
+            return path.to_str().map(String::from);
+        }
+    }
+
+    None
+}
+
+/// Resolve the render device path from config, auto-detection, or default.
+fn resolve_render_device(configured: Option<&String>) -> String {
+    if let Some(path) = configured {
+        return path.clone();
+    }
+
+    if let Some(path) = detect_render_device() {
+        tracing::info!(device = %path, "auto-detected VA-API render device");
+        return path;
+    }
+
+    tracing::info!(
+        device = DEFAULT_RENDER_DEVICE,
+        "no VA-API device detected, falling back to default"
+    );
+    DEFAULT_RENDER_DEVICE.to_string()
+}
+
+/// Open a VA display and a GBM device on the same render node.
+fn open_va_and_gbm(
+    render_device: Option<&String>,
+) -> Result<(Rc<libva::Display>, Arc<GbmDevice>, String), String> {
+    let path = resolve_render_device(render_device);
+    let display = libva::Display::open_drm_display(&path)
+        .map_err(|e| format!("failed to open VA display on {path}: {e}"))?;
+    let gbm =
+        GbmDevice::open(&path).map_err(|e| format!("failed to open GBM device on {path}: {e}"))?;
+    Ok((display, gbm, path))
+}
+
+/// Copy NV12 plane data from a GBM read-mapping into a flat `Vec<u8>` suitable
+/// for a packed StreamKit [`VideoFrame`].
+///
+/// Handles stride != width by copying row-by-row.
+fn read_nv12_from_mapping(
+    mapping: &dyn ReadMapping<'_>,
+    width: u32,
+    height: u32,
+    plane_pitches: &[usize],
+) -> Vec<u8> {
+    let planes = mapping.get();
+    let w = width as usize;
+    let h = height as usize;
+    let y_size = w * h;
+    let uv_h = h.div_ceil(2);
+    // NV12 UV row width: interleaved U/V pairs, matching VideoLayout::packed.
+    let chroma_w = w.div_ceil(2) * 2;
+    let uv_size = chroma_w * uv_h;
+    let mut data = vec![0u8; y_size + uv_size];
+
+    // Y plane.
+    if !planes.is_empty() {
+        let y_stride = plane_pitches.first().copied().unwrap_or(w);
+        if y_stride == w {
+            let copy_len = y_size.min(planes[0].len());
+            data[..copy_len].copy_from_slice(&planes[0][..copy_len]);
+        } else {
+            for row in 0..h {
+                let dst_off = row * w;
+                let src_off = row * y_stride;
+                if src_off + w <= planes[0].len() && dst_off + w <= y_size {
+                    data[dst_off..dst_off + w].copy_from_slice(&planes[0][src_off..src_off + w]);
+                }
+            }
+        }
+    }
+
+    // UV plane (interleaved).
+    if planes.len() > 1 {
+        let uv_stride = plane_pitches.get(1).copied().unwrap_or(chroma_w);
+        if uv_stride == chroma_w {
+            let copy_len = uv_size.min(planes[1].len());
+            data[y_size..y_size + copy_len].copy_from_slice(&planes[1][..copy_len]);
+        } else {
+            for row in 0..uv_h {
+                let dst_off = y_size + row * chroma_w;
+                let src_off = row * uv_stride;
+                if src_off + chroma_w <= planes[1].len() && dst_off + chroma_w <= data.len() {
+                    data[dst_off..dst_off + chroma_w]
+                        .copy_from_slice(&planes[1][src_off..src_off + chroma_w]);
+                }
+            }
+        }
+    }
+
+    data
+}
+
+/// Write NV12 data from a StreamKit [`VideoFrame`] into a GBM frame's
+/// write-mapping.
+///
+/// If the source is I420, it is converted to NV12 on the fly (U/V planes
+/// are interleaved into a single UV plane).
+fn write_nv12_to_mapping(
+    mapping: &dyn WriteMapping<'_>,
+    frame: &VideoFrame,
+    plane_pitches: &[usize],
+) -> Result<(), String> {
+    let planes = mapping.get();
+    if planes.is_empty() {
+        return Err("GBM mapping returned no planes".into());
+    }
+
+    let w = frame.width as usize;
+    let h = frame.height as usize;
+    let src = frame.data.as_ref().as_ref();
+
+    match frame.pixel_format {
+        PixelFormat::Nv12 => {
+            let y_size = w * h;
+            // NV12 UV row width: interleaved U/V pairs, matching VideoLayout::packed.
+            let chroma_w = w.div_ceil(2) * 2;
+            let uv_h = h.div_ceil(2);
+            let uv_size = chroma_w * uv_h;
+
+            // Y plane.
+            let y_stride = plane_pitches.first().copied().unwrap_or(w);
+            {
+                let mut y_plane = planes[0].borrow_mut();
+                if y_stride == w {
+                    let n = y_size.min(y_plane.len()).min(src.len());
+                    y_plane[..n].copy_from_slice(&src[..n]);
+                } else {
+                    for row in 0..h {
+                        let s = row * w;
+                        let d = row * y_stride;
+                        if s + w <= src.len() && d + w <= y_plane.len() {
+                            y_plane[d..d + w].copy_from_slice(&src[s..s + w]);
+                        }
+                    }
+                }
+            }
+
+            // UV plane.
+            if planes.len() > 1 {
+                let uv_stride = plane_pitches.get(1).copied().unwrap_or(chroma_w);
+                let mut uv_plane = planes[1].borrow_mut();
+                let src_uv = &src[y_size..];
+                if uv_stride == chroma_w {
+                    let n = uv_size.min(uv_plane.len()).min(src_uv.len());
+                    uv_plane[..n].copy_from_slice(&src_uv[..n]);
+                } else {
+                    for row in 0..uv_h {
+                        let s = row * chroma_w;
+                        let d = row * uv_stride;
+                        if s + chroma_w <= src_uv.len() && d + chroma_w <= uv_plane.len() {
+                            uv_plane[d..d + chroma_w].copy_from_slice(&src_uv[s..s + chroma_w]);
+                        }
+                    }
+                }
+            }
+        },
+        PixelFormat::I420 => {
+            // Convert I420 → NV12: Y stays the same, U and V are interleaved.
+            let y_size = w * h;
+            let uv_w = w.div_ceil(2);
+            let uv_h = h.div_ceil(2);
+            let u_plane_size = uv_w * uv_h;
+
+            // Y plane.
+            let y_stride = plane_pitches.first().copied().unwrap_or(w);
+            {
+                let mut y_plane = planes[0].borrow_mut();
+                if y_stride == w {
+                    let n = y_size.min(y_plane.len()).min(src.len());
+                    y_plane[..n].copy_from_slice(&src[..n]);
+                } else {
+                    for row in 0..h {
+                        let s = row * w;
+                        let d = row * y_stride;
+                        if s + w <= src.len() && d + w <= y_plane.len() {
+                            y_plane[d..d + w].copy_from_slice(&src[s..s + w]);
+                        }
+                    }
+                }
+            }
+
+            // UV plane — interleave U and V from I420 into NV12 UV.
+            if planes.len() > 1 {
+                let uv_stride = plane_pitches.get(1).copied().unwrap_or(uv_w * 2);
+                let mut uv_plane = planes[1].borrow_mut();
+                for row in 0..uv_h {
+                    for col in 0..uv_w {
+                        let u_idx = y_size + row * uv_w + col;
+                        let v_idx = y_size + u_plane_size + row * uv_w + col;
+                        let dst_idx = row * uv_stride + col * 2;
+                        if u_idx < src.len() && v_idx < src.len() && dst_idx + 1 < uv_plane.len() {
+                            uv_plane[dst_idx] = src[u_idx];
+                            uv_plane[dst_idx + 1] = src[v_idx];
+                        }
+                    }
+                }
+            }
+        },
+        _ => {
+            return Err(format!(
+                "VA-API AV1 encoder requires NV12 or I420 input, got {:?}",
+                frame.pixel_format
+            ));
+        },
+    }
+
+    Ok(())
+}
+
+// ---------------------------------------------------------------------------
+// Decoder
+// ---------------------------------------------------------------------------
+
+/// Configuration for the VA-API AV1 hardware decoder node.
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(default, deny_unknown_fields)]
+pub struct VaapiAv1DecoderConfig {
+    /// Path to the DRM render device (e.g. `/dev/dri/renderD128`).
+    /// When `None`, auto-detects the first VA-API capable device.
+    pub render_device: Option<String>,
+
+    /// Hardware acceleration mode.
+    pub hw_accel: HwAccelMode,
+}
+
+impl Default for VaapiAv1DecoderConfig {
+    fn default() -> Self {
+        Self { render_device: None, hw_accel: HwAccelMode::Auto }
+    }
+}
+
+pub struct VaapiAv1DecoderNode {
+    config: VaapiAv1DecoderConfig,
+}
+
+impl VaapiAv1DecoderNode {
+    #[allow(clippy::missing_errors_doc)]
+    pub fn new(config: VaapiAv1DecoderConfig) -> Result<Self, StreamKitError> {
+        if matches!(config.hw_accel, HwAccelMode::ForceCpu) {
+            return Err(StreamKitError::Configuration(
+                "VaapiAv1DecoderNode only supports hardware decoding; \
+                 use video::av1::decoder for CPU decode"
+                    .into(),
+            ));
+        }
+        Ok(Self { config })
+    }
+}
+
+#[async_trait]
+impl ProcessorNode for VaapiAv1DecoderNode {
+    fn input_pins(&self) -> Vec<InputPin> {
+        vec![InputPin {
+            name: "in".to_string(),
+            accepts_types: vec![PacketType::EncodedVideo(EncodedVideoFormat {
+                codec: VideoCodec::Av1,
+                bitstream_format: None,
+                codec_private: None,
+                profile: None,
+                level: None,
+            })],
+            cardinality: PinCardinality::One,
+        }]
+    }
+
+    fn output_pins(&self) -> Vec<OutputPin> {
+        vec![OutputPin {
+            name: "out".to_string(),
+            produces_type: PacketType::RawVideo(RawVideoFormat {
+                width: None,
+                height: None,
+                pixel_format: PixelFormat::Nv12,
+            }),
+            cardinality: PinCardinality::Broadcast,
+        }]
+    }
+
+    async fn run(self: Box<Self>, mut context: NodeContext) -> Result<(), StreamKitError> {
+        let node_name = context.output_sender.node_name().to_string();
+        state_helpers::emit_initializing(&context.state_tx, &node_name);
+
+        tracing::info!("VaapiAv1DecoderNode starting");
+        let mut input_rx = context.take_input("in")?;
+
+        let meter = global::meter("skit_nodes");
+        let packets_processed_counter =
+            meter.u64_counter("vaapi_av1_decoder_packets_processed").build();
+        let decode_duration_histogram = meter
+            .f64_histogram("vaapi_av1_decode_duration")
+            .with_boundaries(streamkit_core::metrics::HISTOGRAM_BOUNDARIES_CODEC_PACKET.to_vec())
+            .build();
+
+        let (decode_tx, decode_rx) =
+            mpsc::channel::<(Bytes, Option<PacketMetadata>)>(get_codec_channel_capacity());
+        let (result_tx, mut result_rx) =
+            mpsc::channel::<Result<VideoFrame, String>>(get_codec_channel_capacity());
+
+        let render_device = self.config.render_device.clone();
+        let decode_task = tokio::task::spawn_blocking(move || {
+            vaapi_av1_decode_loop(
+                render_device.as_ref(),
+                decode_rx,
+                &result_tx,
+                &decode_duration_histogram,
+            );
+        });
+
+        state_helpers::emit_running(&context.state_tx, &node_name);
+
+        let mut stats_tracker = NodeStatsTracker::new(node_name.clone(), context.stats_tx.clone());
+        let batch_size = context.batch_size;
+
+        let decode_tx_clone = decode_tx.clone();
+        let mut input_task = tokio::spawn(async move {
+            loop {
+                let Some(first_packet) = input_rx.recv().await else {
+                    break;
+                };
+
+                let packet_batch =
+                    packet_helpers::batch_packets_greedy(first_packet, &mut input_rx, batch_size);
+
+                for packet in packet_batch {
+                    if let Packet::Binary { data, metadata, .. } = packet {
+                        if decode_tx_clone.send((data, metadata)).await.is_err() {
+                            tracing::error!(
+                                "VaapiAv1DecoderNode decode task has shut down unexpectedly"
+                            );
+                            return;
+                        }
+                    }
+                }
+            }
+            tracing::info!("VaapiAv1DecoderNode input stream closed");
+        });
+
+        crate::codec_utils::codec_forward_loop(
+            &mut context,
+            &mut result_rx,
+            &mut input_task,
+            decode_task,
+            decode_tx,
+            &packets_processed_counter,
+            &mut stats_tracker,
+            Packet::Video,
+            "VaapiAv1DecoderNode",
+        )
+        .await;
+
+        state_helpers::emit_stopped(&context.state_tx, &node_name, "input_closed");
+        tracing::info!("VaapiAv1DecoderNode finished");
+        Ok(())
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Decoder — blocking decode loop
+// ---------------------------------------------------------------------------
+
+/// Blocking decode loop running inside `spawn_blocking`.
+///
+/// Creates the VA-API display, GBM device, and cros-codecs `StatelessDecoder`,
+/// then processes input packets until the channel is closed.
+fn vaapi_av1_decode_loop(
+    render_device: Option<&String>,
+    mut decode_rx: mpsc::Receiver<(Bytes, Option<PacketMetadata>)>,
+    result_tx: &mpsc::Sender<Result<VideoFrame, String>>,
+    duration_histogram: &opentelemetry::metrics::Histogram<f64>,
+) {
+    // ── Open GBM device + VA display ──────────────────────────────────
+    let path = resolve_render_device(render_device);
+
+    let gbm = match GbmDevice::open(&path) {
+        Ok(g) => g,
+        Err(e) => {
+            let _ =
+                result_tx.blocking_send(Err(format!("failed to open GBM device on {path}: {e}")));
+            return;
+        },
+    };
+
+    let display = match libva::Display::open_drm_display(&path) {
+        Ok(d) => d,
+        Err(e) => {
+            let _ =
+                result_tx.blocking_send(Err(format!("failed to open VA display on {path}: {e}")));
+            return;
+        },
+    };
+    tracing::info!(device = %path, "VA-API AV1 decoder opened display");
+
+    // ── Create stateless decoder ─────────────────────────────────────────
+    let mut decoder = match StatelessDecoder::<Av1, VaapiDecBackend<GbmVideoFrame>>::new_vaapi(
+        display,
+        BlockingMode::Blocking,
+    ) {
+        Ok(d) => d,
+        Err(e) => {
+            let _ =
+                result_tx.blocking_send(Err(format!("failed to create VA-API AV1 decoder: {e}")));
+            return;
+        },
+    };
+
+    // Stream resolution — updated on FormatChanged events.
+    let mut coded_width: u32 = 0;
+    let mut coded_height: u32 = 0;
+
+    while let Some((data, metadata)) = decode_rx.blocking_recv() {
+        if result_tx.is_closed() {
+            return;
+        }
+
+        let decode_start = Instant::now();
+        let timestamp = metadata.as_ref().and_then(|m| m.timestamp_us).unwrap_or(0);
+
+        // Feed bitstream to the decoder.  The decoder may process it in
+        // multiple chunks and may require event handling between calls.
+        let mut offset = 0usize;
+        let bitstream = data.as_ref();
+        let mut eagain_empty_retries: u32 = 0;
+
+        while offset < bitstream.len() {
+            let gbm_ref = Arc::clone(&gbm);
+            let cw = coded_width;
+            let ch = coded_height;
+            let mut alloc_cb = move || {
+                gbm_ref
+                    .clone()
+                    .new_frame(
+                        nv12_fourcc(),
+                        CrosResolution { width: cw, height: ch },
+                        CrosResolution { width: cw, height: ch },
+                        GbmUsage::Decode,
+                    )
+                    .ok()
+            };
+
+            let mut made_progress = false;
+
+            match decoder.decode(timestamp, &bitstream[offset..], &mut alloc_cb) {
+                Ok(bytes_consumed) => {
+                    offset += bytes_consumed;
+                    made_progress = true;
+                },
+                Err(DecodeError::CheckEvents | DecodeError::NotEnoughOutputBuffers(_)) => {
+                    // Process pending events / drain ready frames, then retry.
+                },
+                Err(e) => {
+                    tracing::error!(error = %e, "VA-API AV1 decode error");
+                    let _ = result_tx.blocking_send(Err(format!("VA-API AV1 decode error: {e}")));
+                    break;
+                },
+            }
+
+            // Process all pending events (format changes + ready frames).
+            let (should_exit, had_events) = drain_decoder_events(
+                &mut decoder,
+                result_tx,
+                metadata.as_ref(),
+                &mut coded_width,
+                &mut coded_height,
+            );
+            if should_exit {
+                return;
+            }
+
+            if made_progress || had_events {
+                eagain_empty_retries = 0;
+            } else {
+                eagain_empty_retries += 1;
+                if eagain_empty_retries > MAX_EAGAIN_EMPTY_RETRIES {
+                    tracing::error!(
+                        "VA-API AV1 decoder stuck: no progress after {MAX_EAGAIN_EMPTY_RETRIES} retries"
+                    );
+                    let _ = result_tx.blocking_send(Err(
+                        "VA-API AV1 decoder stuck in CheckEvents/NotEnoughOutputBuffers loop"
+                            .to_string(),
+                    ));
+                    break;
+                }
+                // Progressive backoff to avoid a tight spin-loop.
+                if eagain_empty_retries <= EAGAIN_YIELD_THRESHOLD {
+                    std::thread::yield_now();
+                } else {
+                    std::thread::sleep(std::time::Duration::from_millis(1));
+                }
+            }
+        }
+
+        duration_histogram.record(decode_start.elapsed().as_secs_f64(), &[]);
+    }
+
+    // Flush remaining frames from the decoder.
+    if result_tx.is_closed() {
+        return;
+    }
+    if let Err(e) = decoder.flush() {
+        tracing::warn!(error = %e, "VA-API AV1 decoder flush failed");
+    }
+    drain_decoder_events(&mut decoder, result_tx, None, &mut coded_width, &mut coded_height);
+}
+
+/// Drain all pending events from the decoder.
+///
+/// Returns `(should_exit, had_events)`:
+/// - `should_exit`: the result channel is closed and the caller should return.
+/// - `had_events`: at least one event (format change or frame) was processed.
+fn drain_decoder_events(
+    decoder: &mut StatelessDecoder<Av1, VaapiDecBackend<GbmVideoFrame>>,
+    result_tx: &mpsc::Sender<Result<VideoFrame, String>>,
+    metadata: Option<&PacketMetadata>,
+    coded_width: &mut u32,
+    coded_height: &mut u32,
+) -> (bool, bool) {
+    let mut had_events = false;
+    while let Some(event) = decoder.next_event() {
+        had_events = true;
+        match event {
+            DecoderEvent::FormatChanged => {
+                if let Some(info) = decoder.stream_info() {
+                    let dw = info.display_resolution.width;
+                    let dh = info.display_resolution.height;
+                    *coded_width = info.coded_resolution.width;
+                    *coded_height = info.coded_resolution.height;
+                    tracing::info!(
+                        display_width = dw,
+                        display_height = dh,
+                        coded_width = *coded_width,
+                        coded_height = *coded_height,
+                        "VA-API AV1 decoder stream format changed"
+                    );
+                }
+            },
+            DecoderEvent::FrameReady(handle) => {
+                if let Err(e) = handle.sync() {
+                    tracing::error!(error = %e, "VA-API AV1 frame sync failed");
+                    continue;
+                }
+
+                let display_res = handle.display_resolution();
+                let frame_w = display_res.width;
+                let frame_h = display_res.height;
+
+                let gbm_frame = handle.video_frame();
+                let pitches = gbm_frame.get_plane_pitch();
+
+                // Extract NV12 data while the mapping is alive, then drop the
+                // mapping before `gbm_frame` to satisfy the borrow checker.
+                let nv12_data = {
+                    let mapping = match gbm_frame.map() {
+                        Ok(m) => m,
+                        Err(e) => {
+                            tracing::error!(error = %e, "failed to map decoded GBM frame");
+                            continue;
+                        },
+                    };
+                    read_nv12_from_mapping(mapping.as_ref(), frame_w, frame_h, &pitches)
+                };
+
+                match VideoFrame::with_metadata(
+                    frame_w,
+                    frame_h,
+                    PixelFormat::Nv12,
+                    nv12_data,
+                    metadata.cloned(),
+                ) {
+                    Ok(frame) => {
+                        if result_tx.blocking_send(Ok(frame)).is_err() {
+                            return (true, had_events);
+                        }
+                    },
+                    Err(e) => {
+                        tracing::error!(
+                            error = %e,
+                            "failed to construct VideoFrame from decoded data"
+                        );
+                    },
+                }
+            },
+        }
+    }
+    (false, had_events)
+}
+
+// ---------------------------------------------------------------------------
+// Encoder
+// ---------------------------------------------------------------------------
+
+/// Configuration for the VA-API AV1 hardware encoder node.
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(default, deny_unknown_fields)]
+pub struct VaapiAv1EncoderConfig {
+    /// Path to the DRM render device (e.g. `/dev/dri/renderD128`).
+    /// When `None`, auto-detects the first VA-API capable device.
+    pub render_device: Option<String>,
+
+    /// Constant quality parameter (QP).  Lower values produce higher quality
+    /// at the cost of larger bitstream.  Range depends on the driver; typical
+    /// range is 0–255, default 128.
+    ///
+    /// Note: VA-API AV1 encoding via cros-codecs currently supports only the
+    /// `ConstantQuality` rate control mode, not `ConstantBitrate`.
+    pub quality: u32,
+
+    /// Target framerate in frames per second (used for rate control hints).
+    pub framerate: u32,
+
+    /// Use low-power encoding mode if the driver supports it.
+    /// Low-power mode uses the GPU's fixed-function encoder (if available)
+    /// rather than shader-based encoding, typically offering lower latency
+    /// at reduced quality flexibility.
+    pub low_power: bool,
+
+    /// Hardware acceleration mode.
+    pub hw_accel: HwAccelMode,
+}
+
+const fn default_quality() -> u32 {
+    DEFAULT_QUALITY
+}
+
+const fn default_framerate() -> u32 {
+    DEFAULT_FRAMERATE
+}
+
+impl Default for VaapiAv1EncoderConfig {
+    fn default() -> Self {
+        Self {
+            render_device: None,
+            quality: DEFAULT_QUALITY,
+            framerate: DEFAULT_FRAMERATE,
+            low_power: false,
+            hw_accel: HwAccelMode::Auto,
+        }
+    }
+}
+
+pub struct VaapiAv1EncoderNode {
+    config: VaapiAv1EncoderConfig,
+}
+
+impl VaapiAv1EncoderNode {
+    #[allow(clippy::missing_errors_doc)]
+    pub fn new(config: VaapiAv1EncoderConfig) -> Result<Self, StreamKitError> {
+        if matches!(config.hw_accel, HwAccelMode::ForceCpu) {
+            return Err(StreamKitError::Configuration(
+                "VaapiAv1EncoderNode only supports hardware encoding; \
+                 use video::av1::encoder for CPU encode"
+                    .into(),
+            ));
+        }
+        Ok(Self { config })
+    }
+}
+
+#[async_trait]
+impl ProcessorNode for VaapiAv1EncoderNode {
+    fn input_pins(&self) -> Vec<InputPin> {
+        vec![InputPin {
+            name: "in".to_string(),
+            accepts_types: vec![
+                PacketType::RawVideo(RawVideoFormat {
+                    width: None,
+                    height: None,
+                    pixel_format: PixelFormat::I420,
+                }),
+                PacketType::RawVideo(RawVideoFormat {
+                    width: None,
+                    height: None,
+                    pixel_format: PixelFormat::Nv12,
+                }),
+            ],
+            cardinality: PinCardinality::One,
+        }]
+    }
+
+    fn output_pins(&self) -> Vec<OutputPin> {
+        vec![OutputPin {
+            name: "out".to_string(),
+            produces_type: PacketType::EncodedVideo(EncodedVideoFormat {
+                codec: VideoCodec::Av1,
+                bitstream_format: None,
+                codec_private: None,
+                profile: None,
+                level: None,
+            }),
+            cardinality: PinCardinality::Broadcast,
+        }]
+    }
+
+    fn content_type(&self) -> Option<String> {
+        Some(AV1_CONTENT_TYPE.to_string())
+    }
+
+    async fn run(self: Box<Self>, context: NodeContext) -> Result<(), StreamKitError> {
+        encoder_trait::run_encoder(*self, context).await
+    }
+}
+
+impl EncoderNodeRunner for VaapiAv1EncoderNode {
+    const CONTENT_TYPE: &'static str = AV1_CONTENT_TYPE;
+    const NODE_LABEL: &'static str = "VaapiAv1EncoderNode";
+    const PACKETS_COUNTER_NAME: &'static str = "vaapi_av1_encoder_packets_processed";
+    const DURATION_HISTOGRAM_NAME: &'static str = "vaapi_av1_encode_duration";
+
+    fn spawn_codec_task(
+        self,
+        encode_rx: mpsc::Receiver<(VideoFrame, Option<PacketMetadata>)>,
+        result_tx: mpsc::Sender<Result<EncodedPacket, String>>,
+        duration_histogram: opentelemetry::metrics::Histogram<f64>,
+    ) -> tokio::task::JoinHandle<()> {
+        encoder_trait::spawn_standard_encode_task::<VaapiAv1Encoder>(
+            self.config,
+            encode_rx,
+            result_tx,
+            duration_histogram,
+        )
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Encoder — internal codec wrapper
+// ---------------------------------------------------------------------------
+
+/// Type alias for the full VA-API AV1 encoder with GBM-backed frames.
+type CrosVaapiAv1Encoder = StatelessEncoder<
+    cros_codecs::encoder::av1::AV1,
+    GbmVideoFrame,
+    cros_codecs::backend::vaapi::encoder::VaapiBackend<
+        GbmExternalBufferDescriptor,
+        libva::Surface<GbmExternalBufferDescriptor>,
+    >,
+>;
+
+/// Internal encoder state wrapping the cros-codecs `StatelessEncoder`.
+///
+/// `!Send` due to internal `Rc<libva::Display>` — lives entirely inside
+/// a `spawn_blocking` thread, matching the pattern in `av1.rs`.
+struct VaapiAv1Encoder {
+    encoder: CrosVaapiAv1Encoder,
+    gbm: Arc<GbmDevice>,
+    width: u32,
+    height: u32,
+    coded_width: u32,
+    coded_height: u32,
+    frame_count: u64,
+}
+
+impl StandardVideoEncoder for VaapiAv1Encoder {
+    type Config = VaapiAv1EncoderConfig;
+    const CODEC_NAME: &'static str = "VA-API AV1";
+
+    fn new_encoder(width: u32, height: u32, config: &Self::Config) -> Result<Self, String> {
+        let (display, gbm, path) = open_va_and_gbm(config.render_device.as_ref())?;
+        tracing::info!(device = %path, width, height, "VA-API AV1 encoder opening");
+
+        let coded_width = align_up_u32(width, AV1_SB_SIZE);
+        let coded_height = align_up_u32(height, AV1_SB_SIZE);
+
+        let cros_config = CrosEncoderConfig {
+            profile: Av1Profile::Profile0,
+            bit_depth: cros_codecs::codec::av1::parser::BitDepth::Depth8,
+            resolution: CrosResolution { width: coded_width, height: coded_height },
+            pred_structure: PredictionStructure::LowDelay { limit: 1024 },
+            initial_tunings: Tunings {
+                rate_control: RateControl::ConstantQuality(config.quality),
+                framerate: config.framerate,
+                min_quality: 0,
+                max_quality: 255,
+            },
+        };
+
+        let encoder = CrosVaapiAv1Encoder::new_vaapi(
+            display,
+            cros_config,
+            nv12_fourcc(),
+            CrosResolution { width: coded_width, height: coded_height },
+            config.low_power,
+            BlockingMode::Blocking,
+        )
+        .map_err(|e| format!("failed to create VA-API AV1 encoder: {e}"))?;
+
+        tracing::info!(
+            device = %path,
+            width,
+            height,
+            coded_width,
+            coded_height,
+            quality = config.quality,
+            "VA-API AV1 encoder created"
+        );
+
+        Ok(Self { encoder, gbm, width, height, coded_width, coded_height, frame_count: 0 })
+    }
+
+    fn encode(
+        &mut self,
+        frame: &VideoFrame,
+        metadata: Option<PacketMetadata>,
+    ) -> Result<Vec<EncodedPacket>, String> {
+        if frame.pixel_format == PixelFormat::Rgba8 {
+            return Err("VA-API AV1 encoder requires NV12 or I420 input; \
+                 insert a video::pixel_convert node upstream"
+                .into());
+        }
+
+        // Create a GBM frame and upload the raw video data.
+        let mut gbm_frame = Arc::clone(&self.gbm)
+            .new_frame(
+                nv12_fourcc(),
+                CrosResolution { width: self.width, height: self.height },
+                CrosResolution { width: self.coded_width, height: self.coded_height },
+                GbmUsage::Encode,
+            )
+            .map_err(|e| format!("failed to allocate GBM frame for encoding: {e}"))?;
+
+        // Write frame data into the GBM buffer.
+        let pitches = gbm_frame.get_plane_pitch();
+        {
+            let mapping = gbm_frame
+                .map_mut()
+                .map_err(|e| format!("failed to map GBM frame for writing: {e}"))?;
+            write_nv12_to_mapping(mapping.as_ref(), frame, &pitches)?;
+        }
+
+        let is_keyframe = metadata.as_ref().and_then(|m| m.keyframe).unwrap_or(false);
+        let timestamp = metadata.as_ref().and_then(|m| m.timestamp_us).unwrap_or(self.frame_count);
+
+        let frame_layout = FrameLayout {
+            format: (nv12_fourcc(), 0), // DRM_FORMAT_MOD_LINEAR
+            size: CrosResolution { width: self.coded_width, height: self.coded_height },
+            planes: vec![
+                PlaneLayout {
+                    buffer_index: 0,
+                    offset: 0,
+                    stride: pitches.first().copied().unwrap_or(self.width as usize),
+                },
+                PlaneLayout {
+                    buffer_index: 0,
+                    offset: pitches.first().copied().unwrap_or(self.width as usize)
+                        * self.coded_height as usize,
+                    stride: pitches.get(1).copied().unwrap_or(self.width as usize),
+                },
+            ],
+        };
+
+        let cros_meta =
+            CrosFrameMetadata { timestamp, layout: frame_layout, force_keyframe: is_keyframe };
+
+        self.encoder
+            .encode(cros_meta, gbm_frame)
+            .map_err(|e| format!("VA-API AV1 encode error: {e}"))?;
+
+        self.frame_count += 1;
+
+        // Poll for all available encoded output.
+        let mut packets = Vec::new();
+        loop {
+            match self.encoder.poll() {
+                Ok(Some(coded)) => {
+                    packets.push(EncodedPacket {
+                        data: Bytes::from(coded.bitstream),
+                        metadata: metadata.clone(),
+                    });
+                },
+                Ok(None) => break,
+                Err(e) => return Err(format!("VA-API AV1 encoder poll error: {e}")),
+            }
+        }
+
+        Ok(packets)
+    }
+
+    fn flush_encoder(&mut self) -> Result<Vec<EncodedPacket>, String> {
+        self.encoder.drain().map_err(|e| format!("VA-API AV1 encoder drain error: {e}"))?;
+
+        let mut packets = Vec::new();
+        loop {
+            match self.encoder.poll() {
+                Ok(Some(coded)) => {
+                    packets
+                        .push(EncodedPacket { data: Bytes::from(coded.bitstream), metadata: None });
+                },
+                Ok(None) => break,
+                Err(e) => return Err(format!("VA-API AV1 encoder poll error: {e}")),
+            }
+        }
+
+        Ok(packets)
+    }
+
+    fn flush_on_dimension_change() -> bool {
+        true
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Registration
+// ---------------------------------------------------------------------------
+
+use schemars::schema_for;
+use streamkit_core::registry::StaticPins;
+
+#[allow(clippy::expect_used, clippy::missing_panics_doc)]
+pub fn register_vaapi_av1_nodes(registry: &mut NodeRegistry) {
+    let default_decoder = VaapiAv1DecoderNode::new(VaapiAv1DecoderConfig::default())
+        .expect("default VA-API AV1 decoder config should be valid");
+    registry.register_static_with_description(
+        "video::vaapi::av1_decoder",
+        |params| {
+            let config = config_helpers::parse_config_optional(params)?;
+            Ok(Box::new(VaapiAv1DecoderNode::new(config)?))
+        },
+        serde_json::to_value(schema_for!(VaapiAv1DecoderConfig))
+            .expect("VaapiAv1DecoderConfig schema should serialize to JSON"),
+        StaticPins { inputs: default_decoder.input_pins(), outputs: default_decoder.output_pins() },
+        vec![
+            "video".to_string(),
+            "codecs".to_string(),
+            "av1".to_string(),
+            "hw".to_string(),
+            "vaapi".to_string(),
+        ],
+        false,
+        "Decodes AV1-compressed packets into raw NV12 video frames using VA-API \
+         hardware acceleration. Requires a VA-API capable GPU (Intel Arc+, AMD, \
+         or NVIDIA with nvidia-vaapi-driver).",
+    );
+
+    let default_encoder = VaapiAv1EncoderNode::new(VaapiAv1EncoderConfig::default())
+        .expect("default VA-API AV1 encoder config should be valid");
+    registry.register_static_with_description(
+        "video::vaapi::av1_encoder",
+        |params| {
+            let config = config_helpers::parse_config_optional(params)?;
+            Ok(Box::new(VaapiAv1EncoderNode::new(config)?))
+        },
+        serde_json::to_value(schema_for!(VaapiAv1EncoderConfig))
+            .expect("VaapiAv1EncoderConfig schema should serialize to JSON"),
+        StaticPins { inputs: default_encoder.input_pins(), outputs: default_encoder.output_pins() },
+        vec![
+            "video".to_string(),
+            "codecs".to_string(),
+            "av1".to_string(),
+            "hw".to_string(),
+            "vaapi".to_string(),
+        ],
+        false,
+        "Encodes raw NV12/I420 video frames into AV1-compressed packets using VA-API \
+         hardware acceleration. Uses constant-quality (CQP) rate control. Requires a \
+         VA-API capable GPU with AV1 encode support (Intel Arc+, AMD).",
+    );
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+#[allow(clippy::unwrap_used, clippy::expect_used, clippy::disallowed_macros)]
+mod tests {
+    use super::*;
+    use std::cell::RefCell;
+
+    // -----------------------------------------------------------------------
+    // Mock mapping types for unit-testing read/write helpers without a GPU.
+    // -----------------------------------------------------------------------
+
+    struct MockReadMapping<'a> {
+        planes: Vec<&'a [u8]>,
+    }
+
+    impl<'a> ReadMapping<'a> for MockReadMapping<'a> {
+        fn get(&self) -> Vec<&[u8]> {
+            self.planes.clone()
+        }
+    }
+
+    struct MockWriteMapping<'a> {
+        planes: Vec<RefCell<&'a mut [u8]>>,
+    }
+
+    impl<'a> WriteMapping<'a> for MockWriteMapping<'a> {
+        fn get(&self) -> Vec<RefCell<&'a mut [u8]>> {
+            // Re-borrow each plane to return fresh RefCells.
+            // SAFETY: this is only used in single-threaded tests where
+            // the returned RefCells do not outlive `self`.
+            self.planes
+                .iter()
+                .map(|cell| {
+                    let ptr = cell.borrow_mut().as_mut_ptr();
+                    let len = cell.borrow().len();
+                    RefCell::new(unsafe { std::slice::from_raw_parts_mut(ptr, len) })
+                })
+                .collect()
+        }
+    }
+
+    // -----------------------------------------------------------------------
+    // align_up_u32
+    // -----------------------------------------------------------------------
+
+    #[test]
+    fn test_align_up_u32_already_aligned() {
+        assert_eq!(align_up_u32(64, 64), 64);
+        assert_eq!(align_up_u32(128, 64), 128);
+    }
+
+    #[test]
+    fn test_align_up_u32_needs_alignment() {
+        assert_eq!(align_up_u32(65, 64), 128);
+        assert_eq!(align_up_u32(1, 64), 64);
+        assert_eq!(align_up_u32(100, 64), 128);
+    }
+
+    #[test]
+    fn test_align_up_u32_alignment_one() {
+        assert_eq!(align_up_u32(42, 1), 42);
+    }
+
+    // -----------------------------------------------------------------------
+    // read_nv12_from_mapping — buffer size and content
+    // -----------------------------------------------------------------------
+
+    #[test]
+    fn test_read_nv12_even_dimensions() {
+        let w: u32 = 64;
+        let h: u32 = 48;
+        let y_size = (w * h) as usize;
+        let uv_h = h as usize / 2;
+        let chroma_w = w as usize; // even width: chroma_w == w
+        let uv_size = chroma_w * uv_h;
+
+        let y_plane = vec![0xAA_u8; y_size];
+        let uv_plane = vec![0x80_u8; uv_size];
+        let mapping = MockReadMapping { planes: vec![&y_plane, &uv_plane] };
+        let pitches = [w as usize, chroma_w];
+
+        let data = read_nv12_from_mapping(&mapping, w, h, &pitches);
+
+        let layout = streamkit_core::types::VideoLayout::packed(w, h, PixelFormat::Nv12);
+        assert_eq!(
+            data.len(),
+            layout.total_bytes(),
+            "output buffer size must match VideoLayout::packed"
+        );
+        assert!(data[..y_size].iter().all(|&b| b == 0xAA), "Y plane data mismatch");
+        assert!(data[y_size..].iter().all(|&b| b == 0x80), "UV plane data mismatch");
+    }
+
+    #[test]
+    fn test_read_nv12_odd_width() {
+        // Odd width exercises the chroma_w = (w+1)/2*2 formula.
+        let w: u32 = 641;
+        let h: u32 = 480;
+        let y_size = (w * h) as usize;
+        let chroma_w = (w as usize + 1) / 2 * 2; // 642
+        let uv_h = h as usize / 2;
+        let uv_size = chroma_w * uv_h;
+
+        let y_plane = vec![0x10_u8; y_size];
+        let uv_plane = vec![0x80_u8; uv_size];
+        let mapping = MockReadMapping { planes: vec![&y_plane, &uv_plane] };
+        let pitches = [w as usize, chroma_w];
+
+        let data = read_nv12_from_mapping(&mapping, w, h, &pitches);
+
+        let layout = streamkit_core::types::VideoLayout::packed(w, h, PixelFormat::Nv12);
+        assert_eq!(
+            data.len(),
+            layout.total_bytes(),
+            "odd-width output buffer must match VideoLayout::packed (chroma_w={chroma_w})"
+        );
+    }
+
+    #[test]
+    fn test_read_nv12_odd_height() {
+        let w: u32 = 64;
+        let h: u32 = 49; // odd height
+        let y_size = (w * h) as usize;
+        let chroma_w = w as usize;
+        let uv_h = (h as usize + 1) / 2; // 25
+        let uv_size = chroma_w * uv_h;
+
+        let y_plane = vec![0x10_u8; y_size];
+        let uv_plane = vec![0x80_u8; uv_size];
+        let mapping = MockReadMapping { planes: vec![&y_plane, &uv_plane] };
+        let pitches = [w as usize, chroma_w];
+
+        let data = read_nv12_from_mapping(&mapping, w, h, &pitches);
+
+        let layout = streamkit_core::types::VideoLayout::packed(w, h, PixelFormat::Nv12);
+        assert_eq!(
+            data.len(),
+            layout.total_bytes(),
+            "odd-height output buffer must match VideoLayout::packed"
+        );
+    }
+
+    #[test]
+    fn test_read_nv12_with_stride() {
+        // Simulate a GBM surface with stride > width (e.g. 128-byte aligned).
+        let w: u32 = 100;
+        let h: u32 = 4;
+        let y_stride = 128_usize; // padded stride
+        let uv_stride = 128_usize;
+        let uv_h = 2_usize;
+        let chroma_w = (w as usize + 1) / 2 * 2; // 100
+
+        // Build Y plane with stride padding.
+        let mut y_plane = vec![0u8; y_stride * h as usize];
+        for row in 0..h as usize {
+            for col in 0..w as usize {
+                y_plane[row * y_stride + col] = 0xAA;
+            }
+        }
+
+        // Build UV plane with stride padding.
+        let mut uv_plane = vec![0u8; uv_stride * uv_h];
+        for row in 0..uv_h {
+            for col in 0..chroma_w {
+                uv_plane[row * uv_stride + col] = 0x80;
+            }
+        }
+
+        let mapping = MockReadMapping { planes: vec![&y_plane, &uv_plane] };
+        let pitches = [y_stride, uv_stride];
+
+        let data = read_nv12_from_mapping(&mapping, w, h, &pitches);
+
+        let layout = streamkit_core::types::VideoLayout::packed(w, h, PixelFormat::Nv12);
+        assert_eq!(data.len(), layout.total_bytes());
+
+        // Verify Y data is correctly de-strided.
+        let y_size = w as usize * h as usize;
+        assert!(data[..y_size].iter().all(|&b| b == 0xAA));
+        // Verify UV data is correctly de-strided.
+        assert!(data[y_size..].iter().all(|&b| b == 0x80));
+    }
+
+    // -----------------------------------------------------------------------
+    // read → VideoFrame::with_metadata roundtrip
+    // -----------------------------------------------------------------------
+
+    #[test]
+    fn test_read_nv12_produces_valid_video_frame() {
+        // The key invariant: read_nv12_from_mapping output must be accepted by
+        // VideoFrame::with_metadata, which validates against VideoLayout::packed.
+        for &(w, h) in &[(64, 48), (641, 480), (1920, 1080), (1921, 1081)] {
+            let y_size = (w * h) as usize;
+            let chroma_w = (w as usize + 1) / 2 * 2;
+            let uv_h = (h as usize + 1) / 2;
+            let uv_size = chroma_w * uv_h;
+
+            let y_plane = vec![0x10_u8; y_size];
+            let uv_plane = vec![0x80_u8; uv_size];
+            let mapping = MockReadMapping { planes: vec![&y_plane, &uv_plane] };
+            let pitches = [w as usize, chroma_w];
+
+            let data = read_nv12_from_mapping(&mapping, w, h, &pitches);
+            let result = VideoFrame::with_metadata(w, h, PixelFormat::Nv12, data, None);
+            assert!(
+                result.is_ok(),
+                "VideoFrame::with_metadata failed for {w}x{h}: {:?}",
+                result.err()
+            );
+        }
+    }
+
+    // -----------------------------------------------------------------------
+    // write_nv12_to_mapping — NV12 source
+    // -----------------------------------------------------------------------
+
+    #[test]
+    fn test_write_nv12_even_dimensions() {
+        let w: u32 = 64;
+        let h: u32 = 48;
+        let frame = crate::test_utils::create_test_video_frame(w, h, PixelFormat::Nv12, 0xAA);
+
+        let y_size = (w * h) as usize;
+        let chroma_w = (w as usize + 1) / 2 * 2;
+        let uv_h = (h as usize + 1) / 2;
+
+        let mut y_buf = vec![0u8; y_size];
+        let mut uv_buf = vec![0u8; chroma_w * uv_h];
+
+        let mapping =
+            MockWriteMapping { planes: vec![RefCell::new(&mut y_buf), RefCell::new(&mut uv_buf)] };
+        let pitches = [w as usize, chroma_w];
+
+        let result = write_nv12_to_mapping(&mapping, &frame, &pitches);
+        assert!(result.is_ok(), "write_nv12_to_mapping failed: {:?}", result.err());
+
+        // Y plane should be filled with 0xAA.
+        assert!(y_buf.iter().all(|&b| b == 0xAA), "Y plane should contain frame data");
+    }
+
+    #[test]
+    fn test_write_nv12_odd_width() {
+        let w: u32 = 641;
+        let h: u32 = 480;
+        let frame = crate::test_utils::create_test_video_frame(w, h, PixelFormat::Nv12, 0x10);
+
+        let y_size = (w * h) as usize;
+        let chroma_w = (w as usize + 1) / 2 * 2; // 642
+        let uv_h = (h as usize + 1) / 2;
+
+        let mut y_buf = vec![0u8; y_size];
+        let mut uv_buf = vec![0u8; chroma_w * uv_h];
+
+        let mapping =
+            MockWriteMapping { planes: vec![RefCell::new(&mut y_buf), RefCell::new(&mut uv_buf)] };
+        let pitches = [w as usize, chroma_w];
+
+        let result = write_nv12_to_mapping(&mapping, &frame, &pitches);
+        assert!(
+            result.is_ok(),
+            "write_nv12_to_mapping should handle odd width {w}: {:?}",
+            result.err()
+        );
+    }
+
+    // -----------------------------------------------------------------------
+    // write_nv12_to_mapping — I420 → NV12 conversion
+    // -----------------------------------------------------------------------
+
+    #[test]
+    fn test_write_i420_to_nv12_conversion() {
+        let w: u32 = 64;
+        let h: u32 = 48;
+        let frame = crate::test_utils::create_test_video_frame(w, h, PixelFormat::I420, 0x10);
+
+        let y_size = (w * h) as usize;
+        let chroma_w = (w as usize + 1) / 2 * 2;
+        let uv_h = (h as usize + 1) / 2;
+
+        let mut y_buf = vec![0u8; y_size];
+        let mut uv_buf = vec![0u8; chroma_w * uv_h];
+
+        let mapping =
+            MockWriteMapping { planes: vec![RefCell::new(&mut y_buf), RefCell::new(&mut uv_buf)] };
+        let pitches = [w as usize, chroma_w];
+
+        let result = write_nv12_to_mapping(&mapping, &frame, &pitches);
+        assert!(result.is_ok(), "I420→NV12 conversion failed: {:?}", result.err());
+
+        // Y plane should have the fill value.
+        assert!(y_buf.iter().all(|&b| b == 0x10), "Y plane should contain I420 luma data");
+
+        // UV plane should have interleaved U/V values (128 for neutral chroma
+        // from create_test_video_frame).
+        let uv_w = w.div_ceil(2) as usize;
+        for row in 0..uv_h {
+            for col in 0..uv_w {
+                let idx = row * chroma_w + col * 2;
+                assert_eq!(uv_buf[idx], 128, "U value at row={row} col={col}");
+                assert_eq!(uv_buf[idx + 1], 128, "V value at row={row} col={col}");
+            }
+        }
+    }
+
+    #[test]
+    fn test_write_i420_to_nv12_odd_width() {
+        // Odd width exercises the UV stride fallback path — the fix ensures
+        // the fallback uses `uv_w * 2` instead of `w` so rows don't misalign.
+        let w: u32 = 641;
+        let h: u32 = 480;
+        let frame = crate::test_utils::create_test_video_frame(w, h, PixelFormat::I420, 0x10);
+
+        let y_size = (w * h) as usize;
+        let uv_w = w.div_ceil(2) as usize; // 321
+        let chroma_w = uv_w * 2; // 642
+        let uv_h = (h as usize + 1) / 2;
+
+        let mut y_buf = vec![0u8; y_size];
+        let mut uv_buf = vec![0u8; chroma_w * uv_h];
+
+        let mapping =
+            MockWriteMapping { planes: vec![RefCell::new(&mut y_buf), RefCell::new(&mut uv_buf)] };
+        // Deliberately omit pitches to exercise the fallback.
+        let pitches: [usize; 0] = [];
+
+        let result = write_nv12_to_mapping(&mapping, &frame, &pitches);
+        assert!(result.is_ok(), "I420→NV12 odd-width conversion failed: {:?}", result.err());
+
+        // Verify UV interleaving on the last row to catch misalignment.
+        let last_row = uv_h - 1;
+        for col in 0..uv_w {
+            let idx = last_row * chroma_w + col * 2;
+            assert_eq!(uv_buf[idx], 128, "U at last row col={col}");
+            assert_eq!(uv_buf[idx + 1], 128, "V at last row col={col}");
+        }
+    }
+
+    // -----------------------------------------------------------------------
+    // write_nv12_to_mapping — unsupported pixel format
+    // -----------------------------------------------------------------------
+
+    #[test]
+    fn test_write_unsupported_format_returns_error() {
+        let w: u32 = 64;
+        let h: u32 = 48;
+        let frame = crate::test_utils::create_test_video_frame(w, h, PixelFormat::Rgba8, 0xFF);
+
+        let mut y_buf = vec![0u8; (w * h) as usize];
+        let mut uv_buf = vec![0u8; (w as usize) * (h as usize / 2)];
+
+        let mapping =
+            MockWriteMapping { planes: vec![RefCell::new(&mut y_buf), RefCell::new(&mut uv_buf)] };
+        let pitches = [w as usize, w as usize];
+
+        let result = write_nv12_to_mapping(&mapping, &frame, &pitches);
+        assert!(result.is_err(), "RGBA8 input should be rejected");
+        assert!(
+            result.unwrap_err().contains("requires NV12 or I420"),
+            "error message should mention supported formats"
+        );
+    }
+
+    // -----------------------------------------------------------------------
+    // NV12 read→write roundtrip
+    // -----------------------------------------------------------------------
+
+    #[test]
+    fn test_nv12_read_write_roundtrip() {
+        // Verify that data read from a mapping can be written back and
+        // produces identical plane content.
+        for &(w, h) in &[(64, 48), (640, 480), (641, 481)] {
+            let y_size = (w * h) as usize;
+            let chroma_w = (w as usize + 1) / 2 * 2;
+            let uv_h = (h as usize + 1) / 2;
+            let uv_size = chroma_w * uv_h;
+
+            // Create source planes with deterministic data.
+            let y_src: Vec<u8> = (0..y_size).map(|i| (i % 256) as u8).collect();
+            let uv_src: Vec<u8> = (0..uv_size).map(|i| ((i + 128) % 256) as u8).collect();
+
+            // Read from mapping.
+            let read_mapping = MockReadMapping { planes: vec![&y_src, &uv_src] };
+            let pitches = [w as usize, chroma_w];
+            let data = read_nv12_from_mapping(&read_mapping, w, h, &pitches);
+
+            // Create a VideoFrame from the read data.
+            let frame = VideoFrame::with_metadata(w, h, PixelFormat::Nv12, data, None).unwrap();
+
+            // Write back to a new mapping.
+            let mut y_dst = vec![0u8; y_size];
+            let mut uv_dst = vec![0u8; uv_size];
+            let write_mapping = MockWriteMapping {
+                planes: vec![RefCell::new(&mut y_dst), RefCell::new(&mut uv_dst)],
+            };
+            write_nv12_to_mapping(&write_mapping, &frame, &pitches).unwrap();
+
+            assert_eq!(y_dst, y_src, "Y plane roundtrip failed for {w}x{h}");
+            assert_eq!(uv_dst, uv_src, "UV plane roundtrip failed for {w}x{h}");
+        }
+    }
+
+    // -----------------------------------------------------------------------
+    // resolve_render_device
+    // -----------------------------------------------------------------------
+
+    #[test]
+    fn test_resolve_render_device_with_configured() {
+        let configured = "/dev/dri/renderD129".to_string();
+        let result = resolve_render_device(Some(&configured));
+        assert_eq!(result, "/dev/dri/renderD129");
+    }
+
+    #[test]
+    fn test_resolve_render_device_fallback() {
+        // Without a configured device and without real hardware, falls back
+        // to default or auto-detected device.
+        let result = resolve_render_device(None);
+        assert!(!result.is_empty(), "should return a non-empty device path");
+    }
+
+    // -----------------------------------------------------------------------
+    // GPU integration tests — encode/decode roundtrip
+    //
+    // These require a VA-API capable GPU. They are compiled with the `vaapi`
+    // feature but skip at runtime if no VA-API device is available.
+    // -----------------------------------------------------------------------
+
+    /// Check whether a usable VA-API display can be opened.
+    fn vaapi_available() -> bool {
+        let path = resolve_render_device(None);
+        libva::Display::open_drm_display(std::path::Path::new(&path)).is_ok()
+    }
+
+    /// Encoder + Decoder roundtrip: encode 5 NV12 frames, decode them back,
+    /// verify dimensions and pixel format.
+    #[tokio::test]
+    async fn test_vaapi_av1_encode_decode_roundtrip() {
+        if !vaapi_available() {
+            eprintln!("SKIP: no VA-API device available");
+            return;
+        }
+
+        use crate::test_utils::{
+            assert_state_initializing, assert_state_running, assert_state_stopped,
+            create_test_context, create_test_video_frame,
+        };
+        use std::borrow::Cow;
+        use std::collections::HashMap;
+
+        // --- Encode ---
+        let (enc_input_tx, enc_input_rx) = mpsc::channel(10);
+        let mut enc_inputs = HashMap::new();
+        enc_inputs.insert("in".to_string(), enc_input_rx);
+
+        let (enc_context, enc_sender, mut enc_state_rx) = create_test_context(enc_inputs, 10);
+        let encoder_config = VaapiAv1EncoderConfig {
+            render_device: None,
+            hw_accel: HwAccelMode::Auto,
+            quality: 200, // fast, lower quality for test speed
+            framerate: 30,
+            low_power: false,
+        };
+        let encoder = VaapiAv1EncoderNode::new(encoder_config).unwrap();
+        let enc_handle = tokio::spawn(async move { Box::new(encoder).run(enc_context).await });
+
+        assert_state_initializing(&mut enc_state_rx).await;
+        assert_state_running(&mut enc_state_rx).await;
+
+        for index in 0_u64..5 {
+            let mut frame = create_test_video_frame(64, 64, PixelFormat::Nv12, 16);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(1_000 + 33_333 * index),
+                duration_us: Some(33_333),
+                sequence: Some(index),
+                keyframe: Some(true),
+            });
+            enc_input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(enc_input_tx);
+
+        assert_state_stopped(&mut enc_state_rx).await;
+        enc_handle.await.unwrap().unwrap();
+
+        let encoded_packets = enc_sender.get_packets_for_pin("out").await;
+        assert!(!encoded_packets.is_empty(), "VA-API AV1 encoder produced no packets");
+
+        // --- Decode ---
+        let (dec_input_tx, dec_input_rx) = mpsc::channel(10);
+        let mut dec_inputs = HashMap::new();
+        dec_inputs.insert("in".to_string(), dec_input_rx);
+
+        let (dec_context, dec_sender, mut dec_state_rx) = create_test_context(dec_inputs, 10);
+        let decoder = VaapiAv1DecoderNode::new(VaapiAv1DecoderConfig::default()).unwrap();
+        let dec_handle = tokio::spawn(async move { Box::new(decoder).run(dec_context).await });
+
+        assert_state_initializing(&mut dec_state_rx).await;
+        assert_state_running(&mut dec_state_rx).await;
+
+        for packet in encoded_packets {
+            if let Packet::Binary { data, metadata, .. } = packet {
+                dec_input_tx
+                    .send(Packet::Binary {
+                        data,
+                        content_type: Some(Cow::Borrowed(AV1_CONTENT_TYPE)),
+                        metadata,
+                    })
+                    .await
+                    .unwrap();
+            }
+        }
+        drop(dec_input_tx);
+
+        assert_state_stopped(&mut dec_state_rx).await;
+        dec_handle.await.unwrap().unwrap();
+
+        let decoded_packets = dec_sender.get_packets_for_pin("out").await;
+        assert!(!decoded_packets.is_empty(), "VA-API AV1 decoder produced no frames");
+
+        for packet in decoded_packets {
+            match packet {
+                Packet::Video(frame) => {
+                    assert_eq!(frame.width, 64);
+                    assert_eq!(frame.height, 64);
+                    assert_eq!(frame.pixel_format, PixelFormat::Nv12);
+                    assert!(!frame.data().is_empty(), "Decoded frame should have data");
+                },
+                _ => panic!("Expected Video packet from VA-API AV1 decoder"),
+            }
+        }
+    }
+
+    /// Verify decoded frames preserve metadata from input packets.
+    #[tokio::test]
+    async fn test_vaapi_av1_metadata_propagation() {
+        if !vaapi_available() {
+            eprintln!("SKIP: no VA-API device available");
+            return;
+        }
+
+        use crate::test_utils::{
+            assert_state_initializing, assert_state_running, assert_state_stopped,
+            create_test_context, create_test_video_frame,
+        };
+        use std::borrow::Cow;
+        use std::collections::HashMap;
+
+        // --- Encode ---
+        let (enc_input_tx, enc_input_rx) = mpsc::channel(10);
+        let mut enc_inputs = HashMap::new();
+        enc_inputs.insert("in".to_string(), enc_input_rx);
+
+        let (enc_context, enc_sender, mut enc_state_rx) = create_test_context(enc_inputs, 10);
+        let encoder = VaapiAv1EncoderNode::new(VaapiAv1EncoderConfig {
+            render_device: None,
+            hw_accel: HwAccelMode::Auto,
+            quality: 200,
+            framerate: 30,
+            low_power: false,
+        })
+        .unwrap();
+        let enc_handle = tokio::spawn(async move { Box::new(encoder).run(enc_context).await });
+
+        assert_state_initializing(&mut enc_state_rx).await;
+        assert_state_running(&mut enc_state_rx).await;
+
+        let timestamps: Vec<u64> = vec![1_000, 34_333, 67_666];
+        for (i, &ts) in timestamps.iter().enumerate() {
+            let mut frame = create_test_video_frame(64, 64, PixelFormat::Nv12, 16);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(ts),
+                duration_us: Some(33_333),
+                sequence: Some(i as u64),
+                keyframe: Some(true),
+            });
+            enc_input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(enc_input_tx);
+
+        assert_state_stopped(&mut enc_state_rx).await;
+        enc_handle.await.unwrap().unwrap();
+
+        let encoded_packets = enc_sender.get_packets_for_pin("out").await;
+        assert!(!encoded_packets.is_empty());
+
+        // --- Decode and verify metadata ---
+        let (dec_input_tx, dec_input_rx) = mpsc::channel(10);
+        let mut dec_inputs = HashMap::new();
+        dec_inputs.insert("in".to_string(), dec_input_rx);
+
+        let (dec_context, dec_sender, mut dec_state_rx) = create_test_context(dec_inputs, 10);
+        let decoder = VaapiAv1DecoderNode::new(VaapiAv1DecoderConfig::default()).unwrap();
+        let dec_handle = tokio::spawn(async move { Box::new(decoder).run(dec_context).await });
+
+        assert_state_initializing(&mut dec_state_rx).await;
+        assert_state_running(&mut dec_state_rx).await;
+
+        for packet in encoded_packets {
+            if let Packet::Binary { data, metadata, .. } = packet {
+                dec_input_tx
+                    .send(Packet::Binary {
+                        data,
+                        content_type: Some(Cow::Borrowed(AV1_CONTENT_TYPE)),
+                        metadata,
+                    })
+                    .await
+                    .unwrap();
+            }
+        }
+        drop(dec_input_tx);
+
+        assert_state_stopped(&mut dec_state_rx).await;
+        dec_handle.await.unwrap().unwrap();
+
+        let decoded_packets = dec_sender.get_packets_for_pin("out").await;
+        assert!(!decoded_packets.is_empty(), "Decoder should produce at least one frame");
+
+        for (i, packet) in decoded_packets.iter().enumerate() {
+            match packet {
+                Packet::Video(frame) => {
+                    assert!(frame.metadata.is_some(), "Decoded frame {i} should have metadata");
+                },
+                _ => panic!("Expected Video packet from VA-API AV1 decoder"),
+            }
+        }
+    }
+
+    /// Encode I420 input frames and verify the encoder accepts them
+    /// (exercises the I420→NV12 conversion path).
+    #[tokio::test]
+    async fn test_vaapi_av1_encode_i420_input() {
+        if !vaapi_available() {
+            eprintln!("SKIP: no VA-API device available");
+            return;
+        }
+
+        use crate::test_utils::{
+            assert_state_initializing, assert_state_running, assert_state_stopped,
+            create_test_context, create_test_video_frame,
+        };
+        use std::collections::HashMap;
+
+        let (enc_input_tx, enc_input_rx) = mpsc::channel(10);
+        let mut enc_inputs = HashMap::new();
+        enc_inputs.insert("in".to_string(), enc_input_rx);
+
+        let (enc_context, enc_sender, mut enc_state_rx) = create_test_context(enc_inputs, 10);
+        let encoder = VaapiAv1EncoderNode::new(VaapiAv1EncoderConfig {
+            render_device: None,
+            hw_accel: HwAccelMode::Auto,
+            quality: 200,
+            framerate: 30,
+            low_power: false,
+        })
+        .unwrap();
+        let enc_handle = tokio::spawn(async move { Box::new(encoder).run(enc_context).await });
+
+        assert_state_initializing(&mut enc_state_rx).await;
+        assert_state_running(&mut enc_state_rx).await;
+
+        for index in 0_u64..3 {
+            let mut frame = create_test_video_frame(64, 64, PixelFormat::I420, 16);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(33_333 * index),
+                duration_us: Some(33_333),
+                sequence: Some(index),
+                keyframe: Some(true),
+            });
+            enc_input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(enc_input_tx);
+
+        assert_state_stopped(&mut enc_state_rx).await;
+        enc_handle.await.unwrap().unwrap();
+
+        let encoded_packets = enc_sender.get_packets_for_pin("out").await;
+        assert!(
+            !encoded_packets.is_empty(),
+            "VA-API AV1 encoder should accept I420 input and produce packets"
+        );
+    }
+
+    /// Verify ForceCpu mode returns an error (VA-API is HW-only).
+    #[test]
+    fn test_vaapi_force_cpu_returns_error() {
+        let decoder_config =
+            VaapiAv1DecoderConfig { render_device: None, hw_accel: HwAccelMode::ForceCpu };
+        let result = VaapiAv1DecoderNode::new(decoder_config);
+        assert!(result.is_err(), "ForceCpu should be rejected for VA-API decoder");
+
+        let encoder_config = VaapiAv1EncoderConfig {
+            render_device: None,
+            hw_accel: HwAccelMode::ForceCpu,
+            quality: DEFAULT_QUALITY,
+            framerate: DEFAULT_FRAMERATE,
+            low_power: false,
+        };
+        let result = VaapiAv1EncoderNode::new(encoder_config);
+        assert!(result.is_err(), "ForceCpu should be rejected for VA-API encoder");
+    }
+
+    // -----------------------------------------------------------------------
+    // deny_unknown_fields
+    // -----------------------------------------------------------------------
+
+    #[test]
+    fn test_deny_unknown_fields_decoder() {
+        let json = r#"{"render_device":null,"hw_accel":"auto","bogus":1}"#;
+        let result: Result<VaapiAv1DecoderConfig, _> = serde_json::from_str(json);
+        assert!(result.is_err(), "Unknown fields should be rejected");
+    }
+
+    #[test]
+    fn test_deny_unknown_fields_encoder() {
+        let json = r#"{"quality":128,"unknown_key":"oops"}"#;
+        let result: Result<VaapiAv1EncoderConfig, _> = serde_json::from_str(json);
+        assert!(result.is_err(), "Unknown fields should be rejected");
+    }
+}
diff --git a/crates/nodes/src/video/vulkan_video.rs b/crates/nodes/src/video/vulkan_video.rs
new file mode 100644
index 00000000..e0321da6
--- /dev/null
+++ b/crates/nodes/src/video/vulkan_video.rs
@@ -0,0 +1,1461 @@
+// SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+//
+// SPDX-License-Identifier: MPL-2.0
+
+//! Vulkan Video HW-accelerated H.264 encoder and decoder nodes.
+//!
+//! Uses the [`vk-video`](https://crates.io/crates/vk-video) crate which wraps
+//! the Vulkan Video extensions and integrates natively with `wgpu`.  Decoded
+//! frames are `wgpu::Texture`s — enabling a zero-copy path with the GPU
+//! compositor in the future.
+//!
+//! This module provides:
+//! - `VulkanVideoH264DecoderNode` — decodes H.264 packets to NV12 `VideoFrame`s
+//! - `VulkanVideoH264EncoderNode` — encodes NV12 `VideoFrame`s to H.264 packets
+//!
+//! Both nodes perform runtime capability detection: if no Vulkan Video capable
+//! GPU is found, node creation returns an error so the pipeline can fall back
+//! to a CPU codec.
+//!
+//! # Feature gate
+//!
+//! Requires `vulkan_video` feature.
+
+use std::borrow::Cow;
+use std::num::NonZeroU32;
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use opentelemetry::global;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+use streamkit_core::stats::NodeStatsTracker;
+use streamkit_core::types::{
+    EncodedVideoFormat, Packet, PacketMetadata, PacketType, PixelFormat, RawVideoFormat,
+    VideoCodec, VideoFrame, VideoLayout,
+};
+use streamkit_core::{
+    config_helpers, get_codec_channel_capacity, packet_helpers, state_helpers, InputPin,
+    NodeContext, NodeRegistry, OutputPin, PinCardinality, PooledVideoData, ProcessorNode,
+    StreamKitError, VideoFramePool,
+};
+use tokio::sync::mpsc;
+
+use super::HwAccelMode;
+use super::H264_CONTENT_TYPE;
+
+// ---------------------------------------------------------------------------
+// Decoder
+// ---------------------------------------------------------------------------
+
+/// Configuration for the Vulkan Video H.264 decoder node.
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(default, deny_unknown_fields)]
+pub struct VulkanVideoH264DecoderConfig {
+    /// Hardware acceleration mode.
+    pub hw_accel: HwAccelMode,
+}
+
+impl Default for VulkanVideoH264DecoderConfig {
+    fn default() -> Self {
+        Self { hw_accel: HwAccelMode::Auto }
+    }
+}
+
+/// Vulkan Video H.264 decoder node.
+///
+/// Accepts H.264 encoded `Binary` packets on its `"in"` pin and emits
+/// decoded NV12 `VideoFrame`s on its `"out"` pin.
+///
+/// Internally uses `vk-video::BytesDecoder` for GPU-accelerated decoding,
+/// which returns raw NV12 pixel data directly — avoiding explicit GPU
+/// texture readback while still leveraging the Vulkan Video decode engine.
+pub struct VulkanVideoH264DecoderNode {
+    config: VulkanVideoH264DecoderConfig,
+}
+
+impl VulkanVideoH264DecoderNode {
+    /// Create a new decoder node with the given configuration.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `hw_accel` is `ForceCpu` — this node only
+    /// supports hardware decoding.  Capability probing is deferred to
+    /// `run()`.
+    pub fn new(config: VulkanVideoH264DecoderConfig) -> Result<Self, StreamKitError> {
+        if matches!(config.hw_accel, HwAccelMode::ForceCpu) {
+            return Err(StreamKitError::Configuration(
+                "VulkanVideoH264DecoderNode only supports hardware decoding; \
+                 use an OpenH264 decoder for CPU-only mode"
+                    .to_string(),
+            ));
+        }
+        Ok(Self { config })
+    }
+}
+
+#[async_trait]
+impl ProcessorNode for VulkanVideoH264DecoderNode {
+    fn input_pins(&self) -> Vec<InputPin> {
+        vec![InputPin {
+            name: "in".to_string(),
+            accepts_types: vec![PacketType::EncodedVideo(EncodedVideoFormat {
+                codec: VideoCodec::H264,
+                bitstream_format: None,
+                codec_private: None,
+                profile: None,
+                level: None,
+            })],
+            cardinality: PinCardinality::One,
+        }]
+    }
+
+    fn output_pins(&self) -> Vec<OutputPin> {
+        vec![OutputPin {
+            name: "out".to_string(),
+            produces_type: PacketType::RawVideo(RawVideoFormat {
+                width: None,
+                height: None,
+                pixel_format: PixelFormat::Nv12,
+            }),
+            cardinality: PinCardinality::Broadcast,
+        }]
+    }
+
+    async fn run(self: Box<Self>, mut context: NodeContext) -> Result<(), StreamKitError> {
+        let node_name = context.output_sender.node_name().to_string();
+        state_helpers::emit_initializing(&context.state_tx, &node_name);
+
+        tracing::info!("VulkanVideoH264DecoderNode starting (hw_accel={:?})", self.config.hw_accel);
+        let mut input_rx = context.take_input("in")?;
+        let video_pool = context.video_pool.clone();
+
+        // ── Metrics ──────────────────────────────────────────────────────
+        let meter = global::meter("skit_nodes");
+        let packets_processed_counter =
+            meter.u64_counter("vulkan_video_h264_decoder_packets_processed").build();
+        let decode_duration_histogram = meter
+            .f64_histogram("vulkan_video_h264_decode_duration")
+            .with_boundaries(streamkit_core::metrics::HISTOGRAM_BOUNDARIES_CODEC_PACKET.to_vec())
+            .build();
+
+        // ── Channels ─────────────────────────────────────────────────────
+        let (decode_tx, mut decode_rx) =
+            mpsc::channel::<(Bytes, Option<PacketMetadata>)>(get_codec_channel_capacity());
+        let (result_tx, mut result_rx) =
+            mpsc::channel::<Result<VideoFrame, String>>(get_codec_channel_capacity());
+
+        // ── Blocking decode task ─────────────────────────────────────────
+        let decode_task = tokio::task::spawn_blocking(move || {
+            let instance = match vk_video::VulkanInstance::new() {
+                Ok(inst) => inst,
+                Err(err) => {
+                    let _ = result_tx
+                        .blocking_send(Err(format!("failed to create VulkanInstance: {err}")));
+                    return;
+                },
+            };
+
+            let adapter = match instance
+                .create_adapter(&vk_video::parameters::VulkanAdapterDescriptor::default())
+            {
+                Ok(a) => a,
+                Err(err) => {
+                    let _ = result_tx
+                        .blocking_send(Err(format!("failed to create VulkanAdapter: {err}")));
+                    return;
+                },
+            };
+
+            let device = match adapter
+                .create_device(&vk_video::parameters::VulkanDeviceDescriptor::default())
+            {
+                Ok(d) => d,
+                Err(err) => {
+                    let _ = result_tx
+                        .blocking_send(Err(format!("failed to create VulkanDevice: {err}")));
+                    return;
+                },
+            };
+
+            if !device.supports_decoding() {
+                let _ = result_tx.blocking_send(Err(
+                    "Vulkan device does not support video decoding".to_string(),
+                ));
+                return;
+            }
+
+            let mut decoder = match device
+                .create_bytes_decoder(vk_video::parameters::DecoderParameters::default())
+            {
+                Ok(dec) => dec,
+                Err(err) => {
+                    let _ = result_tx
+                        .blocking_send(Err(format!("failed to create BytesDecoder: {err}")));
+                    return;
+                },
+            };
+
+            tracing::info!("Vulkan Video H.264 decoder initialised successfully");
+
+            while let Some((data, metadata)) = decode_rx.blocking_recv() {
+                if result_tx.is_closed() {
+                    return;
+                }
+
+                let pts = metadata.as_ref().and_then(|m| m.timestamp_us);
+
+                let decode_start = Instant::now();
+                let decode_result =
+                    decoder.decode(vk_video::EncodedInputChunk { data: &data, pts });
+                decode_duration_histogram.record(decode_start.elapsed().as_secs_f64(), &[]);
+
+                match decode_result {
+                    Ok(frames) => {
+                        for output_frame in frames {
+                            match raw_frame_to_video_frame(
+                                &output_frame,
+                                metadata.clone(),
+                                video_pool.as_ref(),
+                            ) {
+                                Ok(vf) => {
+                                    if result_tx.blocking_send(Ok(vf)).is_err() {
+                                        return;
+                                    }
+                                },
+                                Err(err) => {
+                                    let _ = result_tx.blocking_send(Err(err));
+                                },
+                            }
+                        }
+                    },
+                    Err(err) => {
+                        let _ = result_tx
+                            .blocking_send(Err(format!("Vulkan Video H.264 decode error: {err}")));
+                    },
+                }
+            }
+
+            // Flush remaining buffered frames.
+            if result_tx.is_closed() {
+                return;
+            }
+            match decoder.flush() {
+                Ok(frames) => {
+                    for output_frame in frames {
+                        match raw_frame_to_video_frame(&output_frame, None, video_pool.as_ref()) {
+                            Ok(vf) => {
+                                if result_tx.blocking_send(Ok(vf)).is_err() {
+                                    return;
+                                }
+                            },
+                            Err(err) => {
+                                let _ = result_tx.blocking_send(Err(err));
+                            },
+                        }
+                    }
+                },
+                Err(err) => {
+                    let _ = result_tx
+                        .blocking_send(Err(format!("Vulkan Video H.264 flush error: {err}")));
+                },
+            }
+        });
+
+        // ── State transition ─────────────────────────────────────────────
+        state_helpers::emit_running(&context.state_tx, &node_name);
+        let mut stats_tracker = NodeStatsTracker::new(node_name.clone(), context.stats_tx.clone());
+        let batch_size = context.batch_size;
+
+        // ── Input task ───────────────────────────────────────────────────
+        let decode_tx_clone = decode_tx.clone();
+        let mut input_task = tokio::spawn(async move {
+            loop {
+                let Some(first_packet) = input_rx.recv().await else {
+                    break;
+                };
+
+                let packet_batch =
+                    packet_helpers::batch_packets_greedy(first_packet, &mut input_rx, batch_size);
+
+                for packet in packet_batch {
+                    if let Packet::Binary { data, metadata, .. } = packet {
+                        if decode_tx_clone.send((data, metadata)).await.is_err() {
+                            tracing::error!(
+                                "VulkanVideoH264DecoderNode decode task has shut down unexpectedly"
+                            );
+                            return;
+                        }
+                    }
+                }
+            }
+            tracing::info!("VulkanVideoH264DecoderNode input stream closed");
+        });
+
+        // ── Forward loop ─────────────────────────────────────────────────
+        crate::codec_utils::codec_forward_loop(
+            &mut context,
+            &mut result_rx,
+            &mut input_task,
+            decode_task,
+            decode_tx,
+            &packets_processed_counter,
+            &mut stats_tracker,
+            Packet::Video,
+            "VulkanVideoH264DecoderNode",
+        )
+        .await;
+
+        state_helpers::emit_stopped(&context.state_tx, &node_name, "input_closed");
+        tracing::info!("VulkanVideoH264DecoderNode finished");
+        Ok(())
+    }
+}
+
+/// Convert a vk-video `OutputFrame<RawFrameData>` into a StreamKit `VideoFrame`.
+fn raw_frame_to_video_frame(
+    output_frame: &vk_video::OutputFrame<vk_video::RawFrameData>,
+    metadata: Option<PacketMetadata>,
+    video_pool: Option<&Arc<VideoFramePool>>,
+) -> Result<VideoFrame, String> {
+    let raw = &output_frame.data;
+    let nv12_bytes = &raw.frame;
+    let width = raw.width;
+    let height = raw.height;
+
+    let layout = VideoLayout::packed(width, height, PixelFormat::Nv12);
+    let expected_bytes = layout.total_bytes();
+
+    if nv12_bytes.len() < expected_bytes {
+        return Err(format!(
+            "Vulkan Video decoder returned {len} bytes but NV12 {width}×{height} needs {expected_bytes}",
+            len = nv12_bytes.len(),
+        ));
+    }
+
+    let mut data = video_pool.map_or_else(
+        || PooledVideoData::from_vec(vec![0u8; expected_bytes]),
+        |pool| pool.get(expected_bytes),
+    );
+    data.as_mut_slice()[..expected_bytes].copy_from_slice(&nv12_bytes[..expected_bytes]);
+
+    let frame_metadata = metadata.map(|mut m| {
+        // Propagate PTS from vk-video if the incoming metadata had none.
+        if m.timestamp_us.is_none() {
+            m.timestamp_us = output_frame.metadata.pts;
+        }
+        m
+    });
+
+    Ok(VideoFrame {
+        data: Arc::new(data),
+        pixel_format: PixelFormat::Nv12,
+        width,
+        height,
+        layout,
+        metadata: frame_metadata,
+    })
+}
+
+// ---------------------------------------------------------------------------
+// Encoder
+// ---------------------------------------------------------------------------
+
+/// Configuration for the Vulkan Video H.264 encoder node.
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(default, deny_unknown_fields)]
+pub struct VulkanVideoH264EncoderConfig {
+    /// Hardware acceleration mode.
+    pub hw_accel: HwAccelMode,
+    /// Target bitrate in bits per second.
+    pub bitrate: u32,
+    /// Maximum bitrate in bits per second (VBR mode).
+    /// Defaults to 4× the target bitrate.
+    pub max_bitrate: Option<u32>,
+    /// Target framerate (frames per second).
+    pub framerate: u32,
+}
+
+impl Default for VulkanVideoH264EncoderConfig {
+    fn default() -> Self {
+        Self { hw_accel: HwAccelMode::Auto, bitrate: 2_000_000, max_bitrate: None, framerate: 30 }
+    }
+}
+
+/// Vulkan Video H.264 encoder node.
+///
+/// Accepts NV12/I420 `VideoFrame`s on its `"in"` pin and emits H.264
+/// encoded `Binary` packets on its `"out"` pin.
+///
+/// Internally uses `vk-video::BytesEncoder` for GPU-accelerated encoding.
+/// I420 input is converted to NV12 before encoding since Vulkan Video
+/// operates on NV12.
+pub struct VulkanVideoH264EncoderNode {
+    config: VulkanVideoH264EncoderConfig,
+}
+
+impl VulkanVideoH264EncoderNode {
+    /// Create a new encoder node with the given configuration.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `hw_accel` is `ForceCpu` — this node only
+    /// supports hardware encoding.  Also rejects zero bitrate or
+    /// framerate to avoid confusing hardware-level errors later.
+    pub fn new(config: VulkanVideoH264EncoderConfig) -> Result<Self, StreamKitError> {
+        if matches!(config.hw_accel, HwAccelMode::ForceCpu) {
+            return Err(StreamKitError::Configuration(
+                "VulkanVideoH264EncoderNode only supports hardware encoding; \
+                 use an OpenH264 encoder for CPU-only mode"
+                    .to_string(),
+            ));
+        }
+        if config.bitrate == 0 {
+            return Err(StreamKitError::Configuration(
+                "VulkanVideoH264EncoderNode: bitrate must be > 0".to_string(),
+            ));
+        }
+        if config.framerate == 0 {
+            return Err(StreamKitError::Configuration(
+                "VulkanVideoH264EncoderNode: framerate must be > 0".to_string(),
+            ));
+        }
+        Ok(Self { config })
+    }
+}
+
+#[async_trait]
+impl ProcessorNode for VulkanVideoH264EncoderNode {
+    fn input_pins(&self) -> Vec<InputPin> {
+        vec![InputPin {
+            name: "in".to_string(),
+            accepts_types: vec![
+                PacketType::RawVideo(RawVideoFormat {
+                    width: None,
+                    height: None,
+                    pixel_format: PixelFormat::Nv12,
+                }),
+                PacketType::RawVideo(RawVideoFormat {
+                    width: None,
+                    height: None,
+                    pixel_format: PixelFormat::I420,
+                }),
+            ],
+            cardinality: PinCardinality::One,
+        }]
+    }
+
+    fn output_pins(&self) -> Vec<OutputPin> {
+        vec![OutputPin {
+            name: "out".to_string(),
+            produces_type: PacketType::EncodedVideo(EncodedVideoFormat {
+                codec: VideoCodec::H264,
+                bitstream_format: None,
+                codec_private: None,
+                profile: None,
+                level: None,
+            }),
+            cardinality: PinCardinality::Broadcast,
+        }]
+    }
+
+    fn content_type(&self) -> Option<String> {
+        Some(H264_CONTENT_TYPE.to_string())
+    }
+
+    async fn run(self: Box<Self>, mut context: NodeContext) -> Result<(), StreamKitError> {
+        let node_name = context.output_sender.node_name().to_string();
+        state_helpers::emit_initializing(&context.state_tx, &node_name);
+
+        tracing::info!(
+            "VulkanVideoH264EncoderNode starting (hw_accel={:?}, bitrate={})",
+            self.config.hw_accel,
+            self.config.bitrate,
+        );
+        let mut input_rx = context.take_input("in")?;
+
+        // ── Metrics ──────────────────────────────────────────────────────
+        let meter = global::meter("skit_nodes");
+        let packets_processed_counter =
+            meter.u64_counter("vulkan_video_h264_encoder_packets_processed").build();
+        let encode_duration_histogram = meter
+            .f64_histogram("vulkan_video_h264_encode_duration")
+            .with_boundaries(streamkit_core::metrics::HISTOGRAM_BOUNDARIES_CODEC_PACKET.to_vec())
+            .build();
+
+        // ── Channels ─────────────────────────────────────────────────────
+        let (encode_tx, mut encode_rx) =
+            mpsc::channel::<(VideoFrame, Option<PacketMetadata>)>(get_codec_channel_capacity());
+        let (result_tx, mut result_rx) =
+            mpsc::channel::<Result<EncoderOutput, String>>(get_codec_channel_capacity());
+
+        // ── Blocking encode task ─────────────────────────────────────────
+        let config = self.config.clone();
+        let encode_task = tokio::task::spawn_blocking(move || {
+            // Encoder and device are lazily initialised on the first frame
+            // so we know the actual resolution.
+            let mut encoder: Option<vk_video::BytesEncoder> = None;
+            let mut device: Option<Arc<vk_video::VulkanDevice>> = None;
+            let mut current_dimensions: Option<(u32, u32)> = None;
+
+            while let Some((frame, metadata)) = encode_rx.blocking_recv() {
+                if result_tx.is_closed() {
+                    return;
+                }
+
+                let dims = (frame.width, frame.height);
+
+                // (Re-)create encoder when dimensions change.
+                if current_dimensions != Some(dims) {
+                    tracing::info!(
+                        "VulkanVideoH264EncoderNode: (re)creating encoder for {}×{}",
+                        dims.0,
+                        dims.1,
+                    );
+
+                    let dev = match init_vulkan_encode_device(device.as_ref()) {
+                        Ok(d) => d,
+                        Err(err) => {
+                            let _ = result_tx.blocking_send(Err(err));
+                            return;
+                        },
+                    };
+
+                    let max_bitrate = u64::from(
+                        config.max_bitrate.unwrap_or_else(|| config.bitrate.saturating_mul(4)),
+                    );
+
+                    let output_params = match dev.encoder_output_parameters_high_quality(
+                        vk_video::parameters::RateControl::VariableBitrate {
+                            average_bitrate: u64::from(config.bitrate),
+                            max_bitrate,
+                            virtual_buffer_size: Duration::from_secs(2),
+                        },
+                    ) {
+                        Ok(p) => p,
+                        Err(err) => {
+                            let _ = result_tx.blocking_send(Err(format!(
+                                "failed to get encoder output parameters: {err}"
+                            )));
+                            return;
+                        },
+                    };
+
+                    let width = NonZeroU32::new(dims.0).unwrap_or(NonZeroU32::MIN);
+                    let height = NonZeroU32::new(dims.1).unwrap_or(NonZeroU32::MIN);
+
+                    let enc =
+                        match dev.create_bytes_encoder(vk_video::parameters::EncoderParameters {
+                            input_parameters: vk_video::parameters::VideoParameters {
+                                width,
+                                height,
+                                target_framerate: config.framerate.into(),
+                            },
+                            output_parameters: output_params,
+                        }) {
+                            Ok(e) => e,
+                            Err(err) => {
+                                let _ = result_tx.blocking_send(Err(format!(
+                                    "failed to create BytesEncoder: {err}"
+                                )));
+                                return;
+                            },
+                        };
+
+                    device = Some(dev);
+                    encoder = Some(enc);
+                    current_dimensions = Some(dims);
+                }
+
+                let Some(enc) = encoder.as_mut() else {
+                    let _ = result_tx.blocking_send(Err("encoder not initialised".to_string()));
+                    return;
+                };
+
+                // Convert I420 → NV12 if necessary.
+                let nv12_data = match frame.pixel_format {
+                    PixelFormat::Nv12 => frame.data.as_slice().to_vec(),
+                    PixelFormat::I420 => i420_to_nv12(&frame),
+                    other => {
+                        let _ = result_tx.blocking_send(Err(format!(
+                            "VulkanVideoH264EncoderNode: unsupported pixel format {other:?}, \
+                             expected NV12 or I420"
+                        )));
+                        continue;
+                    },
+                };
+
+                let force_keyframe = metadata.as_ref().and_then(|m| m.keyframe).unwrap_or(false);
+
+                let input_frame = vk_video::InputFrame {
+                    data: vk_video::RawFrameData {
+                        frame: nv12_data,
+                        width: frame.width,
+                        height: frame.height,
+                    },
+                    pts: metadata.as_ref().and_then(|m| m.timestamp_us),
+                };
+
+                let encode_start = Instant::now();
+                let result = enc.encode(&input_frame, force_keyframe);
+                encode_duration_histogram.record(encode_start.elapsed().as_secs_f64(), &[]);
+
+                match result {
+                    Ok(encoded_chunk) => {
+                        // Always propagate the keyframe flag, even when
+                        // the input had no metadata.  Without this,
+                        // downstream RTMP/MoQ transport cannot detect
+                        // keyframes for stream initialisation.
+                        let out_meta = match metadata {
+                            Some(mut m) => {
+                                m.keyframe = Some(encoded_chunk.is_keyframe);
+                                Some(m)
+                            },
+                            None => Some(PacketMetadata {
+                                timestamp_us: None,
+                                duration_us: None,
+                                sequence: None,
+                                keyframe: Some(encoded_chunk.is_keyframe),
+                            }),
+                        };
+
+                        let output = EncoderOutput {
+                            data: Bytes::from(encoded_chunk.data),
+                            metadata: out_meta,
+                        };
+                        if result_tx.blocking_send(Ok(output)).is_err() {
+                            return;
+                        }
+                    },
+                    Err(err) => {
+                        let _ = result_tx
+                            .blocking_send(Err(format!("Vulkan Video H.264 encode error: {err}")));
+                    },
+                }
+            }
+        });
+
+        // ── State transition ─────────────────────────────────────────────
+        state_helpers::emit_running(&context.state_tx, &node_name);
+        let mut stats_tracker = NodeStatsTracker::new(node_name.clone(), context.stats_tx.clone());
+        let batch_size = context.batch_size;
+
+        // ── Input task ───────────────────────────────────────────────────
+        let encode_tx_clone = encode_tx.clone();
+        let node_label = "VulkanVideoH264EncoderNode";
+        let mut input_task = tokio::spawn(async move {
+            loop {
+                let Some(first_packet) = input_rx.recv().await else {
+                    break;
+                };
+
+                let packet_batch =
+                    packet_helpers::batch_packets_greedy(first_packet, &mut input_rx, batch_size);
+
+                for packet in packet_batch {
+                    if let Packet::Video(mut frame) = packet {
+                        let metadata = frame.metadata.take();
+                        if encode_tx_clone.send((frame, metadata)).await.is_err() {
+                            tracing::error!("{node_label} encode task has shut down unexpectedly");
+                            return;
+                        }
+                    }
+                }
+            }
+            tracing::info!("{node_label} input stream closed");
+        });
+
+        // ── Forward loop ─────────────────────────────────────────────────
+        crate::codec_utils::codec_forward_loop(
+            &mut context,
+            &mut result_rx,
+            &mut input_task,
+            encode_task,
+            encode_tx,
+            &packets_processed_counter,
+            &mut stats_tracker,
+            |encoded: EncoderOutput| Packet::Binary {
+                data: encoded.data,
+                content_type: Some(Cow::Borrowed(H264_CONTENT_TYPE)),
+                metadata: encoded.metadata,
+            },
+            node_label,
+        )
+        .await;
+
+        state_helpers::emit_stopped(&context.state_tx, &node_name, "input_closed");
+        tracing::info!("VulkanVideoH264EncoderNode finished");
+        Ok(())
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Encoder helpers
+// ---------------------------------------------------------------------------
+
+/// Internal encoded output type for the encoder channel.
+struct EncoderOutput {
+    data: Bytes,
+    metadata: Option<PacketMetadata>,
+}
+
+/// Initialise (or reuse) the Vulkan device for encoding.
+fn init_vulkan_encode_device(
+    existing: Option<&Arc<vk_video::VulkanDevice>>,
+) -> Result<Arc<vk_video::VulkanDevice>, String> {
+    if let Some(dev) = existing {
+        return Ok(Arc::clone(dev));
+    }
+
+    let instance = vk_video::VulkanInstance::new()
+        .map_err(|e| format!("failed to create VulkanInstance: {e}"))?;
+
+    let adapter = instance
+        .create_adapter(&vk_video::parameters::VulkanAdapterDescriptor::default())
+        .map_err(|e| format!("failed to create VulkanAdapter: {e}"))?;
+
+    let device = adapter
+        .create_device(&vk_video::parameters::VulkanDeviceDescriptor::default())
+        .map_err(|e| format!("failed to create VulkanDevice: {e}"))?;
+
+    if !device.supports_encoding() {
+        return Err("Vulkan device does not support video encoding".to_string());
+    }
+
+    tracing::info!("Vulkan Video encode device initialised successfully");
+    Ok(device)
+}
+
+/// Convert an I420 `VideoFrame` to NV12 byte layout.
+///
+/// NV12 layout: Y plane (width × height) followed by interleaved UV plane
+/// (width × height/2).
+fn i420_to_nv12(frame: &VideoFrame) -> Vec<u8> {
+    let w = frame.width as usize;
+    let h = frame.height as usize;
+    let layout = frame.layout();
+
+    let y_size = w * h;
+    let uv_size = w * (h / 2);
+    let mut nv12 = vec![0u8; y_size + uv_size];
+
+    let src = frame.data.as_slice();
+    let planes = layout.planes();
+
+    let y_plane = &planes[0];
+    let u_plane = &planes[1];
+    let v_plane = &planes[2];
+
+    // Copy Y plane.
+    for row in 0..h {
+        let src_start = y_plane.offset + row * y_plane.stride;
+        let dst_start = row * w;
+        nv12[dst_start..dst_start + w].copy_from_slice(&src[src_start..src_start + w]);
+    }
+
+    // Interleave U and V into NV12 UV plane.
+    let chroma_h = h / 2;
+    let chroma_w = w / 2;
+    for row in 0..chroma_h {
+        let u_src_start = u_plane.offset + row * u_plane.stride;
+        let v_src_start = v_plane.offset + row * v_plane.stride;
+        let dst_start = y_size + row * w;
+        for col in 0..chroma_w {
+            nv12[dst_start + col * 2] = src[u_src_start + col];
+            nv12[dst_start + col * 2 + 1] = src[v_src_start + col];
+        }
+    }
+
+    nv12
+}
+
+// ---------------------------------------------------------------------------
+// Registration
+// ---------------------------------------------------------------------------
+
+use schemars::schema_for;
+use streamkit_core::registry::StaticPins;
+
+#[allow(clippy::expect_used, clippy::missing_panics_doc)]
+pub fn register_vulkan_video_nodes(registry: &mut NodeRegistry) {
+    let default_decoder = VulkanVideoH264DecoderNode::new(VulkanVideoH264DecoderConfig::default())
+        .expect("default VulkanVideoH264 decoder config should be valid");
+    registry.register_static_with_description(
+        "video::vulkan_video::h264_decoder",
+        |params| {
+            let config = config_helpers::parse_config_optional(params)?;
+            Ok(Box::new(VulkanVideoH264DecoderNode::new(config)?))
+        },
+        serde_json::to_value(schema_for!(VulkanVideoH264DecoderConfig))
+            .expect("VulkanVideoH264DecoderConfig schema should serialize to JSON"),
+        StaticPins { inputs: default_decoder.input_pins(), outputs: default_decoder.output_pins() },
+        vec!["video".to_string(), "codecs".to_string(), "h264".to_string(), "hw".to_string()],
+        false,
+        "Decodes H.264 Annex B packets into raw NV12 video frames using Vulkan Video \
+         hardware acceleration. Requires a GPU with Vulkan Video decode support \
+         (NVIDIA, AMD, or Intel with recent Mesa drivers). Use video::openh264::decoder \
+         for CPU-only fallback.",
+    );
+
+    let default_encoder = VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig::default())
+        .expect("default VulkanVideoH264 encoder config should be valid");
+    registry.register_static_with_description(
+        "video::vulkan_video::h264_encoder",
+        |params| {
+            let config = config_helpers::parse_config_optional(params)?;
+            Ok(Box::new(VulkanVideoH264EncoderNode::new(config)?))
+        },
+        serde_json::to_value(schema_for!(VulkanVideoH264EncoderConfig))
+            .expect("VulkanVideoH264EncoderConfig schema should serialize to JSON"),
+        StaticPins { inputs: default_encoder.input_pins(), outputs: default_encoder.output_pins() },
+        vec!["video".to_string(), "codecs".to_string(), "h264".to_string(), "hw".to_string()],
+        false,
+        "Encodes raw video frames (NV12 or I420) into H.264 Annex B packets using \
+         Vulkan Video hardware acceleration. Supports VBR rate control with configurable \
+         bitrate. Requires a GPU with Vulkan Video encode support. Use \
+         video::openh264::encoder for CPU-only fallback.",
+    );
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+#[allow(clippy::unwrap_used, clippy::expect_used, clippy::disallowed_macros)]
+mod tests {
+    use super::*;
+    use crate::test_utils::{
+        assert_state_initializing, assert_state_running, assert_state_stopped, create_test_context,
+        create_test_video_frame,
+    };
+    use std::collections::HashMap;
+    use streamkit_core::types::Packet;
+    use tokio::sync::mpsc;
+
+    // ── Vulkan Video availability helper ────────────────────────────────
+    //
+    // Integration tests that require a Vulkan Video capable GPU use this
+    // helper.  On machines without the right hardware/drivers the tests
+    // print a message and pass (skip) instead of failing.
+
+    /// Try to create a Vulkan Video device.  Returns `true` if both encode
+    /// and decode are available.
+    fn vulkan_video_available() -> bool {
+        let Ok(instance) = vk_video::VulkanInstance::new() else {
+            return false;
+        };
+        let Ok(adapter) =
+            instance.create_adapter(&vk_video::parameters::VulkanAdapterDescriptor::default())
+        else {
+            return false;
+        };
+        let Ok(device) =
+            adapter.create_device(&vk_video::parameters::VulkanDeviceDescriptor::default())
+        else {
+            return false;
+        };
+        device.supports_decoding() && device.supports_encoding()
+    }
+
+    /// Like [`vulkan_video_available`] but only checks for decode support.
+    fn vulkan_decode_available() -> bool {
+        let Ok(instance) = vk_video::VulkanInstance::new() else {
+            return false;
+        };
+        let Ok(adapter) =
+            instance.create_adapter(&vk_video::parameters::VulkanAdapterDescriptor::default())
+        else {
+            return false;
+        };
+        let Ok(device) =
+            adapter.create_device(&vk_video::parameters::VulkanDeviceDescriptor::default())
+        else {
+            return false;
+        };
+        device.supports_decoding()
+    }
+
+    /// Like [`vulkan_video_available`] but only checks for encode support.
+    fn vulkan_encode_available() -> bool {
+        let Ok(instance) = vk_video::VulkanInstance::new() else {
+            return false;
+        };
+        let Ok(adapter) =
+            instance.create_adapter(&vk_video::parameters::VulkanAdapterDescriptor::default())
+        else {
+            return false;
+        };
+        let Ok(device) =
+            adapter.create_device(&vk_video::parameters::VulkanDeviceDescriptor::default())
+        else {
+            return false;
+        };
+        device.supports_encoding()
+    }
+
+    macro_rules! skip_without_vulkan_encode {
+        () => {
+            if !vulkan_encode_available() {
+                eprintln!("SKIPPED: no Vulkan Video encode support on this machine");
+                return;
+            }
+        };
+    }
+
+    macro_rules! skip_without_vulkan_decode {
+        () => {
+            if !vulkan_decode_available() {
+                eprintln!("SKIPPED: no Vulkan Video decode support on this machine");
+                return;
+            }
+        };
+    }
+
+    macro_rules! skip_without_vulkan_video {
+        () => {
+            if !vulkan_video_available() {
+                eprintln!("SKIPPED: no Vulkan Video encode+decode support on this machine");
+                return;
+            }
+        };
+    }
+
+    // ── Config validation tests (no GPU needed) ─────────────────────────
+
+    #[test]
+    fn test_decoder_rejects_force_cpu() {
+        let result = VulkanVideoH264DecoderNode::new(VulkanVideoH264DecoderConfig {
+            hw_accel: HwAccelMode::ForceCpu,
+        });
+        assert!(result.is_err(), "ForceCpu should be rejected for HW-only decoder");
+    }
+
+    #[test]
+    fn test_decoder_accepts_auto() {
+        let result = VulkanVideoH264DecoderNode::new(VulkanVideoH264DecoderConfig {
+            hw_accel: HwAccelMode::Auto,
+        });
+        assert!(result.is_ok(), "Auto should be accepted");
+    }
+
+    #[test]
+    fn test_decoder_accepts_force_hw() {
+        let result = VulkanVideoH264DecoderNode::new(VulkanVideoH264DecoderConfig {
+            hw_accel: HwAccelMode::ForceHw,
+        });
+        assert!(result.is_ok(), "ForceHw should be accepted");
+    }
+
+    #[test]
+    fn test_encoder_rejects_force_cpu() {
+        let result = VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig {
+            hw_accel: HwAccelMode::ForceCpu,
+            ..Default::default()
+        });
+        assert!(result.is_err(), "ForceCpu should be rejected for HW-only encoder");
+    }
+
+    #[test]
+    fn test_encoder_rejects_zero_bitrate() {
+        let result = VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig {
+            bitrate: 0,
+            ..Default::default()
+        });
+        assert!(result.is_err(), "bitrate=0 should be rejected");
+    }
+
+    #[test]
+    fn test_encoder_rejects_zero_framerate() {
+        let result = VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig {
+            framerate: 0,
+            ..Default::default()
+        });
+        assert!(result.is_err(), "framerate=0 should be rejected");
+    }
+
+    #[test]
+    fn test_encoder_accepts_valid_config() {
+        let result = VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig {
+            hw_accel: HwAccelMode::Auto,
+            bitrate: 2_000_000,
+            max_bitrate: None,
+            framerate: 30,
+        });
+        assert!(result.is_ok(), "valid config should be accepted");
+    }
+
+    #[test]
+    fn test_encoder_accepts_custom_max_bitrate() {
+        let result = VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig {
+            hw_accel: HwAccelMode::Auto,
+            bitrate: 2_000_000,
+            max_bitrate: Some(8_000_000),
+            framerate: 60,
+        });
+        assert!(result.is_ok(), "custom max_bitrate config should be accepted");
+    }
+
+    // ── deny_unknown_fields tests ─────────────────────────────────────
+
+    #[test]
+    fn test_deny_unknown_fields_decoder() {
+        let json = r#"{"hw_accel":"auto","bogus_field":42}"#;
+        let result: Result<VulkanVideoH264DecoderConfig, _> = serde_json::from_str(json);
+        assert!(result.is_err(), "Unknown fields should be rejected");
+    }
+
+    #[test]
+    fn test_deny_unknown_fields_encoder() {
+        let json = r#"{"bitrate":1000000,"unknown_key":"oops"}"#;
+        let result: Result<VulkanVideoH264EncoderConfig, _> = serde_json::from_str(json);
+        assert!(result.is_err(), "Unknown fields should be rejected");
+    }
+
+    // ── Pin configuration tests ─────────────────────────────────────────
+
+    #[test]
+    fn test_decoder_pin_config() {
+        let node =
+            VulkanVideoH264DecoderNode::new(VulkanVideoH264DecoderConfig::default()).unwrap();
+
+        let inputs = node.input_pins();
+        assert_eq!(inputs.len(), 1);
+        assert_eq!(inputs[0].name, "in");
+        assert!(matches!(inputs[0].cardinality, PinCardinality::One));
+        assert!(matches!(
+            &inputs[0].accepts_types[0],
+            PacketType::EncodedVideo(fmt) if fmt.codec == VideoCodec::H264
+        ));
+
+        let outputs = node.output_pins();
+        assert_eq!(outputs.len(), 1);
+        assert_eq!(outputs[0].name, "out");
+        assert!(matches!(outputs[0].cardinality, PinCardinality::Broadcast));
+        assert!(matches!(
+            &outputs[0].produces_type,
+            PacketType::RawVideo(fmt) if fmt.pixel_format == PixelFormat::Nv12
+        ));
+    }
+
+    #[test]
+    fn test_encoder_pin_config() {
+        let node =
+            VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig::default()).unwrap();
+
+        let inputs = node.input_pins();
+        assert_eq!(inputs.len(), 1);
+        assert_eq!(inputs[0].name, "in");
+        assert_eq!(inputs[0].accepts_types.len(), 2, "should accept NV12 and I420");
+
+        let outputs = node.output_pins();
+        assert_eq!(outputs.len(), 1);
+        assert_eq!(outputs[0].name, "out");
+        assert!(matches!(
+            &outputs[0].produces_type,
+            PacketType::EncodedVideo(fmt) if fmt.codec == VideoCodec::H264
+        ));
+    }
+
+    #[test]
+    fn test_encoder_content_type() {
+        let node =
+            VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig::default()).unwrap();
+        assert_eq!(
+            node.content_type().as_deref(),
+            Some(H264_CONTENT_TYPE),
+            "Encoder should report video/h264 content type"
+        );
+    }
+
+    // ── Integration tests (require Vulkan Video GPU) ────────────────────
+
+    #[tokio::test]
+    async fn test_vulkan_video_encode_nv12() {
+        skip_without_vulkan_encode!();
+
+        let (input_tx, input_rx) = mpsc::channel(10);
+        let mut inputs = HashMap::new();
+        inputs.insert("in".to_string(), input_rx);
+
+        let (context, sender, mut state_rx) = create_test_context(inputs, 10);
+        let encoder =
+            VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig::default()).unwrap();
+
+        let handle = tokio::spawn(async move { Box::new(encoder).run(context).await });
+
+        assert_state_initializing(&mut state_rx).await;
+        assert_state_running(&mut state_rx).await;
+
+        for i in 0_u64..5 {
+            let mut frame = create_test_video_frame(64, 64, PixelFormat::Nv12, 16);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(33_333 * i),
+                duration_us: Some(33_333),
+                sequence: Some(i),
+                keyframe: Some(i == 0),
+            });
+            input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(input_tx);
+
+        assert_state_stopped(&mut state_rx).await;
+        handle.await.unwrap().unwrap();
+
+        let packets = sender.get_packets_for_pin("out").await;
+        assert!(!packets.is_empty(), "Vulkan Video encoder should produce packets");
+
+        for (i, packet) in packets.iter().enumerate() {
+            match packet {
+                Packet::Binary { data, content_type, metadata, .. } => {
+                    assert!(!data.is_empty(), "Encoded packet {i} should have data");
+                    assert_eq!(
+                        content_type.as_deref(),
+                        Some(H264_CONTENT_TYPE),
+                        "Content type should be video/h264"
+                    );
+                    assert!(metadata.is_some(), "Encoded packet {i} should have metadata");
+                    let meta = metadata.as_ref().unwrap();
+                    assert!(
+                        meta.keyframe.is_some(),
+                        "Encoded packet {i} should have keyframe flag"
+                    );
+                },
+                _ => panic!("Expected Binary packet from Vulkan Video encoder, got {packet:?}"),
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn test_vulkan_video_encode_i420() {
+        skip_without_vulkan_encode!();
+
+        let (input_tx, input_rx) = mpsc::channel(10);
+        let mut inputs = HashMap::new();
+        inputs.insert("in".to_string(), input_rx);
+
+        let (context, sender, mut state_rx) = create_test_context(inputs, 10);
+        let encoder =
+            VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig::default()).unwrap();
+
+        let handle = tokio::spawn(async move { Box::new(encoder).run(context).await });
+
+        assert_state_initializing(&mut state_rx).await;
+        assert_state_running(&mut state_rx).await;
+
+        for i in 0_u64..3 {
+            let mut frame = create_test_video_frame(64, 64, PixelFormat::I420, 16);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(33_333 * i),
+                duration_us: Some(33_333),
+                sequence: Some(i),
+                keyframe: Some(true),
+            });
+            input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(input_tx);
+
+        assert_state_stopped(&mut state_rx).await;
+        handle.await.unwrap().unwrap();
+
+        let packets = sender.get_packets_for_pin("out").await;
+        assert!(!packets.is_empty(), "Vulkan Video encoder should produce packets from I420 input");
+    }
+
+    #[tokio::test]
+    async fn test_vulkan_video_encode_metadata_without_input_metadata() {
+        skip_without_vulkan_encode!();
+
+        let (input_tx, input_rx) = mpsc::channel(10);
+        let mut inputs = HashMap::new();
+        inputs.insert("in".to_string(), input_rx);
+
+        let (context, sender, mut state_rx) = create_test_context(inputs, 10);
+        let encoder =
+            VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig::default()).unwrap();
+
+        let handle = tokio::spawn(async move { Box::new(encoder).run(context).await });
+
+        assert_state_initializing(&mut state_rx).await;
+        assert_state_running(&mut state_rx).await;
+
+        // Send frames with NO metadata to verify keyframe flag is still propagated.
+        for _ in 0..3 {
+            let frame = create_test_video_frame(64, 64, PixelFormat::Nv12, 16);
+            // frame.metadata is None by default from create_test_video_frame
+            input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(input_tx);
+
+        assert_state_stopped(&mut state_rx).await;
+        handle.await.unwrap().unwrap();
+
+        let packets = sender.get_packets_for_pin("out").await;
+        assert!(!packets.is_empty(), "Encoder should produce packets even without input metadata");
+
+        for (i, packet) in packets.iter().enumerate() {
+            match packet {
+                Packet::Binary { metadata, .. } => {
+                    assert!(
+                        metadata.is_some(),
+                        "Packet {i} should have metadata even when input had None"
+                    );
+                    let meta = metadata.as_ref().unwrap();
+                    assert!(
+                        meta.keyframe.is_some(),
+                        "Packet {i} should always have keyframe flag set"
+                    );
+                },
+                _ => panic!("Expected Binary packet"),
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn test_vulkan_video_roundtrip_encode_decode() {
+        skip_without_vulkan_video!();
+
+        // ── Step 1: Encode NV12 frames to H.264 ─────────────────────────
+        let (enc_input_tx, enc_input_rx) = mpsc::channel(10);
+        let mut enc_inputs = HashMap::new();
+        enc_inputs.insert("in".to_string(), enc_input_rx);
+
+        let (enc_context, enc_sender, mut enc_state_rx) = create_test_context(enc_inputs, 10);
+        let encoder =
+            VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig::default()).unwrap();
+
+        let enc_handle = tokio::spawn(async move { Box::new(encoder).run(enc_context).await });
+
+        assert_state_initializing(&mut enc_state_rx).await;
+        assert_state_running(&mut enc_state_rx).await;
+
+        let frame_count = 5_u64;
+        let width = 64_u32;
+        let height = 64_u32;
+
+        for i in 0..frame_count {
+            let mut frame = create_test_video_frame(width, height, PixelFormat::Nv12, 16);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(33_333 * i),
+                duration_us: Some(33_333),
+                sequence: Some(i),
+                keyframe: Some(i == 0),
+            });
+            enc_input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(enc_input_tx);
+
+        assert_state_stopped(&mut enc_state_rx).await;
+        enc_handle.await.unwrap().unwrap();
+
+        let encoded_packets = enc_sender.get_packets_for_pin("out").await;
+        assert!(!encoded_packets.is_empty(), "Encoder should produce packets");
+
+        // ── Step 2: Decode the H.264 packets back to NV12 ───────────────
+        let (dec_input_tx, dec_input_rx) = mpsc::channel(10);
+        let mut dec_inputs = HashMap::new();
+        dec_inputs.insert("in".to_string(), dec_input_rx);
+
+        let (dec_context, dec_sender, mut dec_state_rx) = create_test_context(dec_inputs, 10);
+        let decoder =
+            VulkanVideoH264DecoderNode::new(VulkanVideoH264DecoderConfig::default()).unwrap();
+
+        let dec_handle = tokio::spawn(async move { Box::new(decoder).run(dec_context).await });
+
+        assert_state_initializing(&mut dec_state_rx).await;
+        assert_state_running(&mut dec_state_rx).await;
+
+        // Feed encoded packets to the decoder.
+        for packet in encoded_packets {
+            dec_input_tx.send(packet).await.unwrap();
+        }
+        drop(dec_input_tx);
+
+        assert_state_stopped(&mut dec_state_rx).await;
+        dec_handle.await.unwrap().unwrap();
+
+        let decoded_packets = dec_sender.get_packets_for_pin("out").await;
+        assert!(!decoded_packets.is_empty(), "Decoder should produce frames from roundtrip data");
+
+        // Verify decoded frames are NV12 with the right dimensions.
+        for (i, packet) in decoded_packets.iter().enumerate() {
+            match packet {
+                Packet::Video(frame) => {
+                    assert_eq!(
+                        frame.pixel_format,
+                        PixelFormat::Nv12,
+                        "Decoded frame {i} should be NV12"
+                    );
+                    assert_eq!(frame.width, width, "Decoded frame {i} width mismatch");
+                    assert_eq!(frame.height, height, "Decoded frame {i} height mismatch");
+                    assert!(
+                        !frame.data.as_slice().is_empty(),
+                        "Decoded frame {i} should have data"
+                    );
+                },
+                _ => panic!("Expected Video packet from decoder, got {packet:?}"),
+            }
+        }
+    }
+
+    // ── I420→NV12 conversion unit test ──────────────────────────────────
+
+    #[test]
+    fn test_i420_to_nv12_conversion() {
+        let width = 4_u32;
+        let height = 4_u32;
+        let frame = create_test_video_frame(width, height, PixelFormat::I420, 0);
+
+        // Manually fill planes with known values for verification.
+        let layout = frame.layout();
+        let planes = layout.planes();
+
+        // Build a frame with identifiable plane content.
+        let mut data = vec![0u8; layout.total_bytes()];
+        // Y plane: fill with 100
+        for row in 0..height as usize {
+            for col in 0..width as usize {
+                data[planes[0].offset + row * planes[0].stride + col] = 100;
+            }
+        }
+        // U plane: fill with 50
+        let chroma_w = width as usize / 2;
+        let chroma_h = height as usize / 2;
+        for row in 0..chroma_h {
+            for col in 0..chroma_w {
+                data[planes[1].offset + row * planes[1].stride + col] = 50;
+            }
+        }
+        // V plane: fill with 200
+        for row in 0..chroma_h {
+            for col in 0..chroma_w {
+                data[planes[2].offset + row * planes[2].stride + col] = 200;
+            }
+        }
+
+        let test_frame = VideoFrame::new(width, height, PixelFormat::I420, data)
+            .expect("test frame should be valid");
+
+        let nv12 = i420_to_nv12(&test_frame);
+
+        let y_size = (width * height) as usize;
+        let uv_size = width as usize * (height as usize / 2);
+        assert_eq!(nv12.len(), y_size + uv_size, "NV12 buffer size mismatch");
+
+        // Verify Y plane was copied correctly.
+        for (i, &byte) in nv12.iter().enumerate().take(y_size) {
+            assert_eq!(byte, 100, "Y plane byte {i} mismatch");
+        }
+
+        // Verify UV plane has interleaved U and V values.
+        for row in 0..chroma_h {
+            for col in 0..chroma_w {
+                let uv_offset = y_size + row * width as usize + col * 2;
+                assert_eq!(nv12[uv_offset], 50, "U value at row={row} col={col} mismatch");
+                assert_eq!(nv12[uv_offset + 1], 200, "V value at row={row} col={col} mismatch");
+            }
+        }
+    }
+
+    // ── Standalone decode test (requires encode+decode to produce input) ─
+
+    #[tokio::test]
+    async fn test_vulkan_video_decode_produces_frames() {
+        // We need both encode (to generate H.264 data) and decode capabilities.
+        // Use skip_without_vulkan_decode for the decode-specific skip message,
+        // but we also need encode to produce test data.
+        skip_without_vulkan_decode!();
+        skip_without_vulkan_encode!();
+
+        // First encode a few frames to get valid H.264 data.
+        let (enc_tx, enc_rx) = mpsc::channel(10);
+        let mut enc_inputs = HashMap::new();
+        enc_inputs.insert("in".to_string(), enc_rx);
+
+        let (enc_ctx, enc_sender, mut enc_state_rx) = create_test_context(enc_inputs, 10);
+        let encoder =
+            VulkanVideoH264EncoderNode::new(VulkanVideoH264EncoderConfig::default()).unwrap();
+        let enc_handle = tokio::spawn(async move { Box::new(encoder).run(enc_ctx).await });
+
+        assert_state_initializing(&mut enc_state_rx).await;
+        assert_state_running(&mut enc_state_rx).await;
+
+        for i in 0_u64..5 {
+            let mut frame = create_test_video_frame(64, 64, PixelFormat::Nv12, 16);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(33_333 * i),
+                duration_us: Some(33_333),
+                sequence: Some(i),
+                keyframe: Some(i == 0),
+            });
+            enc_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(enc_tx);
+
+        assert_state_stopped(&mut enc_state_rx).await;
+        enc_handle.await.unwrap().unwrap();
+
+        let encoded_packets = enc_sender.get_packets_for_pin("out").await;
+        assert!(!encoded_packets.is_empty(), "Need encoded data to test decoder");
+
+        // Now decode.
+        let (dec_tx, dec_rx) = mpsc::channel(10);
+        let mut dec_inputs = HashMap::new();
+        dec_inputs.insert("in".to_string(), dec_rx);
+
+        let (dec_ctx, dec_sender, mut dec_state_rx) = create_test_context(dec_inputs, 10);
+        let decoder =
+            VulkanVideoH264DecoderNode::new(VulkanVideoH264DecoderConfig::default()).unwrap();
+        let dec_handle = tokio::spawn(async move { Box::new(decoder).run(dec_ctx).await });
+
+        assert_state_initializing(&mut dec_state_rx).await;
+        assert_state_running(&mut dec_state_rx).await;
+
+        for packet in encoded_packets {
+            dec_tx.send(packet).await.unwrap();
+        }
+        drop(dec_tx);
+
+        assert_state_stopped(&mut dec_state_rx).await;
+        dec_handle.await.unwrap().unwrap();
+
+        let decoded_packets = dec_sender.get_packets_for_pin("out").await;
+        assert!(!decoded_packets.is_empty(), "Decoder should produce NV12 frames");
+
+        for (i, packet) in decoded_packets.iter().enumerate() {
+            match packet {
+                Packet::Video(frame) => {
+                    assert_eq!(
+                        frame.pixel_format,
+                        PixelFormat::Nv12,
+                        "Decoded frame {i} should be NV12"
+                    );
+                    assert_eq!(frame.width, 64, "Decoded frame {i} width mismatch");
+                    assert_eq!(frame.height, 64, "Decoded frame {i} height mismatch");
+                },
+                _ => panic!("Expected Video packet from decoder"),
+            }
+        }
+    }
+
+    // ── Registration test ───────────────────────────────────────────────
+
+    #[test]
+    fn test_node_registration() {
+        let mut registry = NodeRegistry::new();
+        register_vulkan_video_nodes(&mut registry);
+
+        // Verify both nodes are registered by trying to create them with
+        // default config.
+        assert!(
+            registry.create_node("video::vulkan_video::h264_decoder", None).is_ok(),
+            "decoder should be registered"
+        );
+        assert!(
+            registry.create_node("video::vulkan_video::h264_encoder", None).is_ok(),
+            "encoder should be registered"
+        );
+    }
+}
diff --git a/justfile b/justfile
index 71854446..7dff3d5a 100644
--- a/justfile
+++ b/justfile
@@ -201,11 +201,12 @@ test-skit:
     @cargo test --workspace -- --skip gpu_tests::
     @cargo test -p streamkit-server --features "moq"
 
-# Run GPU compositor tests (requires a machine with a GPU)
+# Run GPU tests (requires a machine with a GPU)
 test-skit-gpu:
     @echo "Testing skit (GPU)..."
     @cargo test -p streamkit-nodes --features gpu
     @cargo test -p streamkit-engine --features gpu
+    @cargo test -p streamkit-nodes --features nvcodec
 
 # Lint and format check the skit code
 # Note: We exclude dhat-heap since it's mutually exclusive with profiling (both define global allocators)

From ac337c5439e2f1b474ea264a59727af3383c4a22 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 13:46:33 +0000
Subject: [PATCH 02/23] ci: run nvcodec tests on GPU runner

The self-hosted GPU runner (skit-demo-eu-gpu) has an NVIDIA GPU but the
CI workflow wasn't exercising the nvcodec feature tests. Add the missing
cargo test invocation so NVENC/NVDEC AV1 tests run alongside the
existing GPU compositor tests.

Signed-off-by: Devin AI <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 .github/workflows/skit.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/skit.yml b/.github/workflows/skit.yml
index da6bdf34..2c3bcab6 100644
--- a/.github/workflows/skit.yml
+++ b/.github/workflows/skit.yml
@@ -143,6 +143,7 @@ jobs:
         run: |
           cargo test --locked -p streamkit-nodes --features gpu
           cargo test --locked -p streamkit-engine --features gpu
+          cargo test --locked -p streamkit-nodes --features nvcodec
 
   build:
     name: Build

From a3a5b2a2c7606e07a8bdf93d850c78a8de5bf06c Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 13:51:07 +0000
Subject: [PATCH 03/23] ci: install CUDA headers on GPU runner for nvcodec
 tests

The shiguredo_nvcodec build script requires cuda.h at compile time.
Install nvidia-cuda-toolkit on the self-hosted GPU runner if CUDA
headers aren't already present.

Signed-off-by: Devin AI <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 .github/workflows/skit.yml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/skit.yml b/.github/workflows/skit.yml
index 2c3bcab6..2a3502b0 100644
--- a/.github/workflows/skit.yml
+++ b/.github/workflows/skit.yml
@@ -120,7 +120,14 @@ jobs:
       - uses: actions/checkout@v5
 
       - name: Install system dependencies
-        run: sudo apt-get update && sudo apt-get install -y libvpx-dev nasm cmake pkg-config libopus-dev
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libvpx-dev nasm cmake pkg-config libopus-dev
+          # CUDA headers needed by shiguredo_nvcodec (nvcodec feature).
+          # Only install if not already present.
+          if [ ! -f /usr/local/cuda/include/cuda.h ]; then
+            sudo apt-get install -y nvidia-cuda-toolkit
+          fi
 
       - name: Install Rust toolchain
         uses: dtolnay/rust-toolchain@master

From 30d79f92a57821d86bd72b2552be1534ba26a413 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 13:55:30 +0000
Subject: [PATCH 04/23] ci: set CUDA_INCLUDE_PATH for nvcodec build on GPU
 runner

Ubuntu's nvidia-cuda-toolkit installs cuda.h to /usr/include, but
shiguredo_nvcodec's build script defaults to /usr/local/cuda/include.
Set CUDA_INCLUDE_PATH=/usr/include so the build finds the headers.

Signed-off-by: Devin AI <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 .github/workflows/skit.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/skit.yml b/.github/workflows/skit.yml
index 2a3502b0..848fcb2b 100644
--- a/.github/workflows/skit.yml
+++ b/.github/workflows/skit.yml
@@ -147,6 +147,9 @@ jobs:
           VPX_LIB_DIR: /usr/lib/x86_64-linux-gnu
           VPX_INCLUDE_DIR: /usr/include
           VPX_VERSION: "1.13.0"
+          # Ubuntu's nvidia-cuda-toolkit installs headers to /usr/include, not
+          # /usr/local/cuda/include.  Tell shiguredo_nvcodec where to find them.
+          CUDA_INCLUDE_PATH: /usr/include
         run: |
           cargo test --locked -p streamkit-nodes --features gpu
           cargo test --locked -p streamkit-engine --features gpu

From def0db30538770cd02e4ce6c9e2720c7ad499ae4 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 14:35:24 +0000
Subject: [PATCH 05/23] ci: fix nvcodec build on GPU runner
 (BINDGEN_EXTRA_CLANG_ARGS)

Remove conditional nvidia-cuda-toolkit install (already pre-installed
on the self-hosted runner) and add BINDGEN_EXTRA_CLANG_ARGS to point
bindgen at the LLVM 18 clang builtin includes so stddef.h is found.

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 .github/workflows/skit.yml | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/skit.yml b/.github/workflows/skit.yml
index 848fcb2b..a0e00159 100644
--- a/.github/workflows/skit.yml
+++ b/.github/workflows/skit.yml
@@ -123,11 +123,6 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y libvpx-dev nasm cmake pkg-config libopus-dev
-          # CUDA headers needed by shiguredo_nvcodec (nvcodec feature).
-          # Only install if not already present.
-          if [ ! -f /usr/local/cuda/include/cuda.h ]; then
-            sudo apt-get install -y nvidia-cuda-toolkit
-          fi
 
       - name: Install Rust toolchain
         uses: dtolnay/rust-toolchain@master
@@ -150,6 +145,9 @@ jobs:
           # Ubuntu's nvidia-cuda-toolkit installs headers to /usr/include, not
           # /usr/local/cuda/include.  Tell shiguredo_nvcodec where to find them.
           CUDA_INCLUDE_PATH: /usr/include
+          # bindgen (used by shiguredo_nvcodec) needs the clang builtin include
+          # path so it can find stddef.h and other compiler-provided headers.
+          BINDGEN_EXTRA_CLANG_ARGS: "-I/usr/lib/llvm-18/lib/clang/18/include"
         run: |
           cargo test --locked -p streamkit-nodes --features gpu
           cargo test --locked -p streamkit-engine --features gpu

From de36ad93c09e8da7b2d59fc34ce1e823efa1f6c6 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 14:38:44 +0000
Subject: [PATCH 06/23] ci: reorder GPU tests so nvcodec runs before engine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The streamkit-engine GPU test binary segfaults (SIGSEGV) during
cleanup after all 25 tests pass — this is a pre-existing issue
likely related to wgpu/Vulkan teardown.  Move the nvcodec node
tests before the engine GPU tests so they are not blocked by
the crash.

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 .github/workflows/skit.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/skit.yml b/.github/workflows/skit.yml
index a0e00159..a241eb9e 100644
--- a/.github/workflows/skit.yml
+++ b/.github/workflows/skit.yml
@@ -150,8 +150,8 @@ jobs:
           BINDGEN_EXTRA_CLANG_ARGS: "-I/usr/lib/llvm-18/lib/clang/18/include"
         run: |
           cargo test --locked -p streamkit-nodes --features gpu
-          cargo test --locked -p streamkit-engine --features gpu
           cargo test --locked -p streamkit-nodes --features nvcodec
+          cargo test --locked -p streamkit-engine --features gpu
 
   build:
     name: Build

From 62341bd1bbe8f32e671cce18ec3ec5add7b8085e Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 14:43:10 +0000
Subject: [PATCH 07/23] fix(nodes): add missing framerate field in nvcodec test

The force_cpu_encoder_rejected test was constructing
NvAv1EncoderConfig with all fields explicitly but missed the
new framerate field added in the review-fix round.

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/nv_av1.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/nodes/src/video/nv_av1.rs b/crates/nodes/src/video/nv_av1.rs
index 2de6c8b7..a5cc9177 100644
--- a/crates/nodes/src/video/nv_av1.rs
+++ b/crates/nodes/src/video/nv_av1.rs
@@ -818,6 +818,7 @@ mod tests {
             hw_accel: HwAccelMode::ForceCpu,
             cuda_device: None,
             bitrate: 2_000_000,
+            framerate: 30,
             keyframe_interval: None,
         });
         assert!(result.is_err(), "ForceCpu should be rejected by NV encoder");

From b374ba19bc3374ecfed027aa675edecde49c64b4 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 15:53:55 +0000
Subject: [PATCH 08/23] fix(nodes): register HW codec nodes, fix i420_to_nv12
 truncation, remove dead code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add cfg-gated registration calls for vulkan_video, vaapi, and nvcodec
  nodes in register_video_nodes() — without these, users enabling the
  features would get 'node not found' errors at runtime.
- Fix i420_to_nv12 in vulkan_video.rs to use div_ceil(2) for chroma
  dimensions instead of truncating integer division (h/2, w/2), matching
  the correct implementation in nv_av1.rs.
- Update HwAccelMode::Auto doc comment to accurately reflect that
  HW-only nodes do not implement CPU fallback — Auto and ForceHw
  behave identically; CPU fallback is achieved by selecting a different
  (software) node at the pipeline level.
- Remove dead default_quality() and default_framerate() functions in
  vaapi_av1.rs (unused — the struct uses a manual Default impl).
- Add registration regression tests to nv_av1 and vaapi_av1 modules.

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/mod.rs          | 16 +++++++++++++++-
 crates/nodes/src/video/nv_av1.rs       | 17 +++++++++++++++++
 crates/nodes/src/video/vaapi_av1.rs    | 25 +++++++++++++++++--------
 crates/nodes/src/video/vulkan_video.rs | 10 +++++-----
 4 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/crates/nodes/src/video/mod.rs b/crates/nodes/src/video/mod.rs
index 6230541f..43f51323 100644
--- a/crates/nodes/src/video/mod.rs
+++ b/crates/nodes/src/video/mod.rs
@@ -84,7 +84,12 @@ pub const H264_CONTENT_TYPE: &str = "video/h264";
 )]
 #[serde(rename_all = "lowercase")]
 pub enum HwAccelMode {
-    /// Auto-detect: use HW if available, fall back to CPU otherwise.
+    /// Auto-detect: attempt hardware acceleration.
+    ///
+    /// For HW-only nodes (Vulkan Video, VA-API, NVENC/NVDEC) this behaves
+    /// identically to `ForceHw` — the node will fail if the required
+    /// hardware is unavailable.  CPU fallback is achieved by selecting a
+    /// different (software) node at the pipeline level.
     #[default]
     Auto,
     /// Force HW acceleration — fail if unavailable.
@@ -631,4 +636,13 @@ pub fn register_video_nodes(registry: &mut NodeRegistry, constraints: &GlobalNod
 
     #[cfg(feature = "dav1d")]
     dav1d::register_dav1d_nodes(registry);
+
+    #[cfg(feature = "vulkan_video")]
+    vulkan_video::register_vulkan_video_nodes(registry);
+
+    #[cfg(feature = "vaapi")]
+    vaapi_av1::register_vaapi_av1_nodes(registry);
+
+    #[cfg(feature = "nvcodec")]
+    nv_av1::register_nv_av1_nodes(registry);
 }
diff --git a/crates/nodes/src/video/nv_av1.rs b/crates/nodes/src/video/nv_av1.rs
index a5cc9177..d26e316e 100644
--- a/crates/nodes/src/video/nv_av1.rs
+++ b/crates/nodes/src/video/nv_av1.rs
@@ -1182,4 +1182,21 @@ mod tests {
             }
         }
     }
+
+    // ── Registration test ────────────────────────────────────────────────
+
+    #[test]
+    fn test_node_registration() {
+        let mut registry = NodeRegistry::new();
+        register_nv_av1_nodes(&mut registry);
+
+        assert!(
+            registry.create_node("video::nv::av1_decoder", None).is_ok(),
+            "NV AV1 decoder should be registered"
+        );
+        assert!(
+            registry.create_node("video::nv::av1_encoder", None).is_ok(),
+            "NV AV1 encoder should be registered"
+        );
+    }
 }
diff --git a/crates/nodes/src/video/vaapi_av1.rs b/crates/nodes/src/video/vaapi_av1.rs
index 2d1be2bb..9d8e1093 100644
--- a/crates/nodes/src/video/vaapi_av1.rs
+++ b/crates/nodes/src/video/vaapi_av1.rs
@@ -747,14 +747,6 @@ pub struct VaapiAv1EncoderConfig {
     pub hw_accel: HwAccelMode,
 }
 
-const fn default_quality() -> u32 {
-    DEFAULT_QUALITY
-}
-
-const fn default_framerate() -> u32 {
-    DEFAULT_FRAMERATE
-}
-
 impl Default for VaapiAv1EncoderConfig {
     fn default() -> Self {
         Self {
@@ -1804,4 +1796,21 @@ mod tests {
         let result: Result<VaapiAv1EncoderConfig, _> = serde_json::from_str(json);
         assert!(result.is_err(), "Unknown fields should be rejected");
     }
+
+    // ── Registration test ────────────────────────────────────────────────
+
+    #[test]
+    fn test_node_registration() {
+        let mut registry = NodeRegistry::new();
+        register_vaapi_av1_nodes(&mut registry);
+
+        assert!(
+            registry.create_node("video::vaapi::av1_decoder", None).is_ok(),
+            "VA-API AV1 decoder should be registered"
+        );
+        assert!(
+            registry.create_node("video::vaapi::av1_encoder", None).is_ok(),
+            "VA-API AV1 encoder should be registered"
+        );
+    }
 }
diff --git a/crates/nodes/src/video/vulkan_video.rs b/crates/nodes/src/video/vulkan_video.rs
index e0321da6..59390cb6 100644
--- a/crates/nodes/src/video/vulkan_video.rs
+++ b/crates/nodes/src/video/vulkan_video.rs
@@ -737,9 +737,11 @@ fn i420_to_nv12(frame: &VideoFrame) -> Vec<u8> {
     let h = frame.height as usize;
     let layout = frame.layout();
 
+    let chroma_w = w.div_ceil(2);
+    let chroma_h = h.div_ceil(2);
+    let uv_row_bytes = chroma_w * 2;
     let y_size = w * h;
-    let uv_size = w * (h / 2);
-    let mut nv12 = vec![0u8; y_size + uv_size];
+    let mut nv12 = vec![0u8; y_size + uv_row_bytes * chroma_h];
 
     let src = frame.data.as_slice();
     let planes = layout.planes();
@@ -756,12 +758,10 @@ fn i420_to_nv12(frame: &VideoFrame) -> Vec<u8> {
     }
 
     // Interleave U and V into NV12 UV plane.
-    let chroma_h = h / 2;
-    let chroma_w = w / 2;
     for row in 0..chroma_h {
         let u_src_start = u_plane.offset + row * u_plane.stride;
         let v_src_start = v_plane.offset + row * v_plane.stride;
-        let dst_start = y_size + row * w;
+        let dst_start = y_size + row * uv_row_bytes;
         for col in 0..chroma_w {
             nv12[dst_start + col * 2] = src[u_src_start + col];
             nv12[dst_start + col * 2 + 1] = src[v_src_start + col];

From 5359ec7eb5fb781732668d847b4e416225d528e4 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 16:52:43 +0000
Subject: [PATCH 09/23] fix(nodes): add encoder flush comment, validate
 cuda_device, use GBM plane offsets

- vulkan_video.rs: document that vk-video 0.3.0 BytesEncoder has no
  flush() method (unlike BytesDecoder); frame-at-a-time, no B-frames
- nv_av1.rs: reject cuda_device > i32::MAX at construction time
  instead of silently wrapping via 'as i32' cast
- vaapi_av1.rs: use gbm_frame.get_plane_offset() for FrameLayout
  instead of manually computing y_stride * coded_height; also fix
  stride fallback to use coded_width instead of display width

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/nv_av1.rs       | 30 ++++++++++++++++++++++++++
 crates/nodes/src/video/vaapi_av1.rs    | 17 ++++++++++-----
 crates/nodes/src/video/vulkan_video.rs |  8 +++++++
 3 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/crates/nodes/src/video/nv_av1.rs b/crates/nodes/src/video/nv_av1.rs
index d26e316e..bff41404 100644
--- a/crates/nodes/src/video/nv_av1.rs
+++ b/crates/nodes/src/video/nv_av1.rs
@@ -92,6 +92,13 @@ impl NvAv1DecoderNode {
                     .to_string(),
             ));
         }
+        if config.cuda_device.is_some_and(|d| d > i32::MAX as u32) {
+            return Err(StreamKitError::Configuration(format!(
+                "cuda_device {} exceeds maximum CUDA device index ({})",
+                config.cuda_device.unwrap_or(0),
+                i32::MAX,
+            )));
+        }
         Ok(Self { config })
     }
 }
@@ -424,6 +431,13 @@ impl NvAv1EncoderNode {
                     .to_string(),
             ));
         }
+        if config.cuda_device.is_some_and(|d| d > i32::MAX as u32) {
+            return Err(StreamKitError::Configuration(format!(
+                "cuda_device {} exceeds maximum CUDA device index ({})",
+                config.cuda_device.unwrap_or(0),
+                i32::MAX,
+            )));
+        }
         Ok(Self { config })
     }
 }
@@ -830,6 +844,22 @@ mod tests {
         assert!(NvAv1EncoderNode::new(NvAv1EncoderConfig::default()).is_ok());
     }
 
+    #[test]
+    fn rejects_cuda_device_exceeding_i32_max() {
+        let bad_device = i32::MAX as u32 + 1;
+        let dec_result = NvAv1DecoderNode::new(NvAv1DecoderConfig {
+            cuda_device: Some(bad_device),
+            ..NvAv1DecoderConfig::default()
+        });
+        assert!(dec_result.is_err(), "cuda_device > i32::MAX should be rejected by decoder");
+
+        let enc_result = NvAv1EncoderNode::new(NvAv1EncoderConfig {
+            cuda_device: Some(bad_device),
+            ..NvAv1EncoderConfig::default()
+        });
+        assert!(enc_result.is_err(), "cuda_device > i32::MAX should be rejected by encoder");
+    }
+
     #[test]
     fn decoder_pins_correct() {
         let node = NvAv1DecoderNode::new(NvAv1DecoderConfig::default()).unwrap();
diff --git a/crates/nodes/src/video/vaapi_av1.rs b/crates/nodes/src/video/vaapi_av1.rs
index 9d8e1093..adc6129c 100644
--- a/crates/nodes/src/video/vaapi_av1.rs
+++ b/crates/nodes/src/video/vaapi_av1.rs
@@ -950,20 +950,27 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
         let is_keyframe = metadata.as_ref().and_then(|m| m.keyframe).unwrap_or(false);
         let timestamp = metadata.as_ref().and_then(|m| m.timestamp_us).unwrap_or(self.frame_count);
 
+        // Use actual GBM plane offsets instead of computing them manually.
+        // Different drivers may place the UV plane at an offset that differs
+        // from `y_stride * coded_height` (e.g. with extra padding rows).
+        let offsets = gbm_frame.get_plane_offset();
+
         let frame_layout = FrameLayout {
             format: (nv12_fourcc(), 0), // DRM_FORMAT_MOD_LINEAR
             size: CrosResolution { width: self.coded_width, height: self.coded_height },
             planes: vec![
                 PlaneLayout {
                     buffer_index: 0,
-                    offset: 0,
-                    stride: pitches.first().copied().unwrap_or(self.width as usize),
+                    offset: offsets.first().copied().unwrap_or(0),
+                    stride: pitches.first().copied().unwrap_or(self.coded_width as usize),
                 },
                 PlaneLayout {
                     buffer_index: 0,
-                    offset: pitches.first().copied().unwrap_or(self.width as usize)
-                        * self.coded_height as usize,
-                    stride: pitches.get(1).copied().unwrap_or(self.width as usize),
+                    offset: offsets.get(1).copied().unwrap_or(
+                        pitches.first().copied().unwrap_or(self.coded_width as usize)
+                            * self.coded_height as usize,
+                    ),
+                    stride: pitches.get(1).copied().unwrap_or(self.coded_width as usize),
                 },
             ],
         };
diff --git a/crates/nodes/src/video/vulkan_video.rs b/crates/nodes/src/video/vulkan_video.rs
index 59390cb6..df516eff 100644
--- a/crates/nodes/src/video/vulkan_video.rs
+++ b/crates/nodes/src/video/vulkan_video.rs
@@ -635,6 +635,14 @@ impl ProcessorNode for VulkanVideoH264EncoderNode {
                     },
                 }
             }
+
+            // Note: vk-video 0.3.0's BytesEncoder has no flush() method
+            // (unlike BytesDecoder which does).  The encoder operates
+            // frame-at-a-time without B-frame reordering, so no frames
+            // should be buffered internally.  If a future vk-video version
+            // adds flush(), it should be called here — matching the
+            // decoder's flush at line ~245 and the pattern in
+            // encoder_trait::spawn_standard_encode_task.
         });
 
         // ── State transition ─────────────────────────────────────────────

From 4366550b2aabdc5d25c429d892edd45f5caa7273 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 17:00:23 +0000
Subject: [PATCH 10/23] fix(skit): forward HW codec feature flags from
 streamkit-server to streamkit-nodes

Without these forwarding features, `just extra_features="--features vulkan_video" skit`
would silently ignore the feature since streamkit-server didn't know about it.

Adds vulkan_video, vaapi, and nvcodec feature forwarding, matching the
existing pattern for svt_av1 and dav1d.

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 apps/skit/Cargo.toml | 3 +++
 justfile             | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/apps/skit/Cargo.toml b/apps/skit/Cargo.toml
index 81efe5c0..98b540c1 100644
--- a/apps/skit/Cargo.toml
+++ b/apps/skit/Cargo.toml
@@ -152,6 +152,9 @@ svt_av1 = ["streamkit-nodes/svt_av1"]
 svt_av1_static = ["streamkit-nodes/svt_av1_static"]
 dav1d = ["streamkit-nodes/dav1d"]
 dav1d_static = ["streamkit-nodes/dav1d_static"]
+vulkan_video = ["streamkit-nodes/vulkan_video"]
+vaapi = ["streamkit-nodes/vaapi"]
+nvcodec = ["streamkit-nodes/nvcodec"]
 
 [dev-dependencies]
 tokio-test = "0.4.5"
diff --git a/justfile b/justfile
index 7dff3d5a..1b02b972 100644
--- a/justfile
+++ b/justfile
@@ -11,6 +11,8 @@ tokio_console_features := "--features tokio-console"
 # Optional extra features to enable in skit builds (e.g. "svt_av1").
 # Usage: just extra_features="--features svt_av1" skit
 #    or: just extra_features="--features svt_av1" build-skit
+# HW codecs: vulkan_video (H.264 Vulkan Video), vaapi (AV1 VA-API), nvcodec (AV1 NVENC/NVDEC)
+#    e.g.: just extra_features="--features vulkan_video,nvcodec" skit
 extra_features := ""
 
 # sherpa-onnx version for Kokoro TTS plugin (must match sherpa-rs version)

From 4188051031d52f4a961f007c0d05920aa20f3631 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 17:12:52 +0000
Subject: [PATCH 11/23] docs(samples): add HW video codec sample pipelines

Add oneshot and dynamic (MoQ) sample pipelines for each HW video codec
backend:

- Vulkan Video H.264: video_vulkan_video_h264_colorbars (oneshot + MoQ)
- VA-API AV1: video_vaapi_av1_colorbars (oneshot + MoQ)
- NVENC AV1: video_nv_av1_colorbars (oneshot + MoQ)

Each oneshot pipeline generates SMPTE color bars, HW-encodes, muxes into
a container (MP4 for H.264, WebM for AV1), and outputs via HTTP.

Each dynamic pipeline generates color bars, HW-encodes, and streams via
MoQ for live playback in the browser.

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 .../dynamic/video_moq_nv_av1_colorbars.yml    | 47 +++++++++++++++
 .../dynamic/video_moq_vaapi_av1_colorbars.yml | 46 +++++++++++++++
 .../video_moq_vulkan_video_h264_colorbars.yml | 46 +++++++++++++++
 .../oneshot/video_nv_av1_colorbars.yml        | 58 +++++++++++++++++++
 .../oneshot/video_vaapi_av1_colorbars.yml     | 58 +++++++++++++++++++
 .../video_vulkan_video_h264_colorbars.yml     | 53 +++++++++++++++++
 6 files changed, 308 insertions(+)
 create mode 100644 samples/pipelines/dynamic/video_moq_nv_av1_colorbars.yml
 create mode 100644 samples/pipelines/dynamic/video_moq_vaapi_av1_colorbars.yml
 create mode 100644 samples/pipelines/dynamic/video_moq_vulkan_video_h264_colorbars.yml
 create mode 100644 samples/pipelines/oneshot/video_nv_av1_colorbars.yml
 create mode 100644 samples/pipelines/oneshot/video_vaapi_av1_colorbars.yml
 create mode 100644 samples/pipelines/oneshot/video_vulkan_video_h264_colorbars.yml

diff --git a/samples/pipelines/dynamic/video_moq_nv_av1_colorbars.yml b/samples/pipelines/dynamic/video_moq_nv_av1_colorbars.yml
new file mode 100644
index 00000000..fb6572e8
--- /dev/null
+++ b/samples/pipelines/dynamic/video_moq_nv_av1_colorbars.yml
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# Streams SMPTE color bars encoded with NVIDIA NVENC AV1 (GPU-accelerated)
+# over MoQ.
+#
+# Requires: skit built with --features nvcodec
+#           NVIDIA GPU with NVENC AV1 support (Ada Lovelace / RTX 40+)
+#           System packages: nvidia-cuda-toolkit, libclang-dev
+
+name: NVENC AV1 Color Bars (MoQ Stream)
+description: Continuously generates SMPTE color bars and streams via MoQ using NVIDIA NVENC AV1 HW encoder
+mode: dynamic
+client:
+  gateway_path: /moq/video
+  watch:
+    broadcast: output
+    audio: false
+    video: true
+
+nodes:
+  colorbars:
+    kind: video::colorbars
+    params:
+      width: 1280
+      height: 720
+      fps: 30
+      pixel_format: nv12
+      draw_time: true
+
+  nv_av1_encoder:
+    kind: video::nv::av1_encoder
+    params:
+      bitrate: 2000000
+      framerate: 30
+    needs: colorbars
+
+  moq_peer:
+    kind: transport::moq::peer
+    params:
+      gateway_path: /moq/video
+      output_broadcast: output
+      allow_reconnect: true
+      video_codec: av1
+    needs:
+      in: nv_av1_encoder
diff --git a/samples/pipelines/dynamic/video_moq_vaapi_av1_colorbars.yml b/samples/pipelines/dynamic/video_moq_vaapi_av1_colorbars.yml
new file mode 100644
index 00000000..112f2345
--- /dev/null
+++ b/samples/pipelines/dynamic/video_moq_vaapi_av1_colorbars.yml
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# Streams SMPTE color bars encoded with VA-API AV1 (GPU-accelerated) over MoQ.
+#
+# Requires: skit built with --features vaapi
+#           VA-API capable GPU with AV1 encode support (Intel Arc+, AMD)
+#           System packages: libva-dev, libgbm-dev
+
+name: VA-API AV1 Color Bars (MoQ Stream)
+description: Continuously generates SMPTE color bars and streams via MoQ using VA-API AV1 HW encoder
+mode: dynamic
+client:
+  gateway_path: /moq/video
+  watch:
+    broadcast: output
+    audio: false
+    video: true
+
+nodes:
+  colorbars:
+    kind: video::colorbars
+    params:
+      width: 1280
+      height: 720
+      fps: 30
+      pixel_format: nv12
+      draw_time: true
+
+  vaapi_av1_encoder:
+    kind: video::vaapi::av1_encoder
+    params:
+      quality: 128
+      framerate: 30
+    needs: colorbars
+
+  moq_peer:
+    kind: transport::moq::peer
+    params:
+      gateway_path: /moq/video
+      output_broadcast: output
+      allow_reconnect: true
+      video_codec: av1
+    needs:
+      in: vaapi_av1_encoder
diff --git a/samples/pipelines/dynamic/video_moq_vulkan_video_h264_colorbars.yml b/samples/pipelines/dynamic/video_moq_vulkan_video_h264_colorbars.yml
new file mode 100644
index 00000000..be381fdc
--- /dev/null
+++ b/samples/pipelines/dynamic/video_moq_vulkan_video_h264_colorbars.yml
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# Streams SMPTE color bars encoded with Vulkan Video H.264 (GPU-accelerated)
+# over MoQ.
+#
+# Requires: skit built with --features vulkan_video
+#           Vulkan-capable GPU with H.264 encode support
+
+name: Vulkan Video H.264 Color Bars (MoQ Stream)
+description: Continuously generates SMPTE color bars and streams via MoQ using Vulkan Video H.264 HW encoder
+mode: dynamic
+client:
+  gateway_path: /moq/video
+  watch:
+    broadcast: output
+    audio: false
+    video: true
+
+nodes:
+  colorbars:
+    kind: video::colorbars
+    params:
+      width: 1280
+      height: 720
+      fps: 30
+      pixel_format: nv12
+      draw_time: true
+
+  vk_h264_encoder:
+    kind: video::vulkan_video::h264_encoder
+    params:
+      bitrate: 2000000
+      framerate: 30
+    needs: colorbars
+
+  moq_peer:
+    kind: transport::moq::peer
+    params:
+      gateway_path: /moq/video
+      output_broadcast: output
+      allow_reconnect: true
+      video_codec: h264
+    needs:
+      in: vk_h264_encoder
diff --git a/samples/pipelines/oneshot/video_nv_av1_colorbars.yml b/samples/pipelines/oneshot/video_nv_av1_colorbars.yml
new file mode 100644
index 00000000..dddddc21
--- /dev/null
+++ b/samples/pipelines/oneshot/video_nv_av1_colorbars.yml
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# Demonstrates the NVIDIA NVENC AV1 HW encoder:
+#   Generates SMPTE color bars (NV12), encodes to AV1 via NVENC
+#   (GPU-accelerated), muxes into a WebM container, and writes the result
+#   to HTTP output.
+#
+# Requires: skit built with --features nvcodec
+#           NVIDIA GPU with NVENC AV1 support (Ada Lovelace / RTX 40+)
+#           System packages: nvidia-cuda-toolkit, libclang-dev
+
+name: NVENC AV1 Encode (WebM Oneshot)
+description: Generates color bars, encodes to AV1 using NVIDIA NVENC HW encoder, and muxes into WebM (30 seconds)
+mode: oneshot
+client:
+  input:
+    type: none
+  output:
+    type: video
+
+nodes:
+  colorbars:
+    kind: video::colorbars
+    params:
+      width: 1280
+      height: 720
+      fps: 30
+      frame_count: 900  # 30 seconds at 30fps
+      pixel_format: nv12
+      draw_time: true
+      draw_time_use_pts: true
+
+  nv_av1_encoder:
+    kind: video::nv::av1_encoder
+    params:
+      bitrate: 2000000
+      framerate: 30
+    needs: colorbars
+
+  webm_muxer:
+    kind: containers::webm::muxer
+    params:
+      video_width: 1280
+      video_height: 720
+      streaming_mode: live
+    needs: nv_av1_encoder
+
+  pacer:
+    kind: core::pacer
+    needs: webm_muxer
+
+  http_output:
+    kind: streamkit::http_output
+    params:
+      content_type: 'video/webm; codecs="av1"'
+    needs: pacer
diff --git a/samples/pipelines/oneshot/video_vaapi_av1_colorbars.yml b/samples/pipelines/oneshot/video_vaapi_av1_colorbars.yml
new file mode 100644
index 00000000..9ac5d81c
--- /dev/null
+++ b/samples/pipelines/oneshot/video_vaapi_av1_colorbars.yml
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# Demonstrates the VA-API AV1 HW encoder:
+#   Generates SMPTE color bars (NV12), encodes to AV1 via VA-API
+#   (GPU-accelerated), muxes into a WebM container, and writes the result
+#   to HTTP output.
+#
+# Requires: skit built with --features vaapi
+#           VA-API capable GPU with AV1 encode support (Intel Arc+, AMD)
+#           System packages: libva-dev, libgbm-dev
+
+name: VA-API AV1 Encode (WebM Oneshot)
+description: Generates color bars, encodes to AV1 using VA-API HW encoder, and muxes into WebM (30 seconds)
+mode: oneshot
+client:
+  input:
+    type: none
+  output:
+    type: video
+
+nodes:
+  colorbars:
+    kind: video::colorbars
+    params:
+      width: 1280
+      height: 720
+      fps: 30
+      frame_count: 900  # 30 seconds at 30fps
+      pixel_format: nv12
+      draw_time: true
+      draw_time_use_pts: true
+
+  vaapi_av1_encoder:
+    kind: video::vaapi::av1_encoder
+    params:
+      quality: 128
+      framerate: 30
+    needs: colorbars
+
+  webm_muxer:
+    kind: containers::webm::muxer
+    params:
+      video_width: 1280
+      video_height: 720
+      streaming_mode: live
+    needs: vaapi_av1_encoder
+
+  pacer:
+    kind: core::pacer
+    needs: webm_muxer
+
+  http_output:
+    kind: streamkit::http_output
+    params:
+      content_type: 'video/webm; codecs="av1"'
+    needs: pacer
diff --git a/samples/pipelines/oneshot/video_vulkan_video_h264_colorbars.yml b/samples/pipelines/oneshot/video_vulkan_video_h264_colorbars.yml
new file mode 100644
index 00000000..acbe95ba
--- /dev/null
+++ b/samples/pipelines/oneshot/video_vulkan_video_h264_colorbars.yml
@@ -0,0 +1,53 @@
+# SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# Demonstrates the Vulkan Video H.264 HW encoder:
+#   Generates SMPTE color bars (NV12), encodes to H.264 via Vulkan Video
+#   (GPU-accelerated), muxes into an MP4 container, and writes the result
+#   to HTTP output.
+#
+# Requires: skit built with --features vulkan_video
+#           Vulkan-capable GPU with H.264 encode support
+
+name: Vulkan Video H.264 Encode (MP4 Oneshot)
+description: Generates color bars, encodes to H.264 using Vulkan Video HW encoder, and muxes into MP4 (30 seconds)
+mode: oneshot
+client:
+  input:
+    type: none
+  output:
+    type: video
+
+nodes:
+  colorbars:
+    kind: video::colorbars
+    params:
+      width: 1280
+      height: 720
+      fps: 30
+      frame_count: 900  # 30 seconds at 30fps
+      pixel_format: nv12
+      draw_time: true
+      draw_time_use_pts: true
+
+  vk_h264_encoder:
+    kind: video::vulkan_video::h264_encoder
+    params:
+      bitrate: 2000000
+      framerate: 30
+    needs: colorbars
+
+  mp4_muxer:
+    kind: containers::mp4::muxer
+    params:
+      mode: stream
+      video_width: 1280
+      video_height: 720
+    needs: vk_h264_encoder
+
+  http_output:
+    kind: streamkit::http_output
+    params:
+      content_type: 'video/mp4; codecs="avc1.42c01f"'
+    needs: mp4_muxer

From 3866ea6640cb855ce24c0f4a60b8df4d095055d2 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 17:59:17 +0000
Subject: [PATCH 12/23] fix(nodes): revert get_plane_offset to computed
 fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

get_plane_offset() is private in cros-codecs 0.0.6. Fall back to
computing the UV plane offset from pitch × coded_height, which is
correct for linear NV12 allocations used by VA-API encode surfaces.

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/vaapi_av1.rs | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/crates/nodes/src/video/vaapi_av1.rs b/crates/nodes/src/video/vaapi_av1.rs
index adc6129c..93069bd7 100644
--- a/crates/nodes/src/video/vaapi_av1.rs
+++ b/crates/nodes/src/video/vaapi_av1.rs
@@ -950,10 +950,13 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
         let is_keyframe = metadata.as_ref().and_then(|m| m.keyframe).unwrap_or(false);
         let timestamp = metadata.as_ref().and_then(|m| m.timestamp_us).unwrap_or(self.frame_count);
 
-        // Use actual GBM plane offsets instead of computing them manually.
-        // Different drivers may place the UV plane at an offset that differs
-        // from `y_stride * coded_height` (e.g. with extra padding rows).
-        let offsets = gbm_frame.get_plane_offset();
+        // Ideally we'd use `gbm_frame.get_plane_offset()` to get the real UV
+        // plane offset from the GBM allocator, but that method is private in
+        // cros-codecs 0.0.6.  Fall back to computing it from pitch × coded_height,
+        // which is correct for linear (non-tiled) NV12 allocations — the common
+        // case for VA-API encode surfaces.
+        let y_stride = pitches.first().copied().unwrap_or(self.coded_width as usize);
+        let uv_offset = y_stride * self.coded_height as usize;
 
         let frame_layout = FrameLayout {
             format: (nv12_fourcc(), 0), // DRM_FORMAT_MOD_LINEAR
@@ -961,15 +964,12 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
             planes: vec![
                 PlaneLayout {
                     buffer_index: 0,
-                    offset: offsets.first().copied().unwrap_or(0),
-                    stride: pitches.first().copied().unwrap_or(self.coded_width as usize),
+                    offset: 0,
+                    stride: y_stride,
                 },
                 PlaneLayout {
                     buffer_index: 0,
-                    offset: offsets.get(1).copied().unwrap_or(
-                        pitches.first().copied().unwrap_or(self.coded_width as usize)
-                            * self.coded_height as usize,
-                    ),
+                    offset: uv_offset,
                     stride: pitches.get(1).copied().unwrap_or(self.coded_width as usize),
                 },
             ],

From 0450619d217f90010c2982d590ff3a056caebeff Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 17:59:27 +0000
Subject: [PATCH 13/23] style: format vaapi_av1.rs

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/vaapi_av1.rs | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/crates/nodes/src/video/vaapi_av1.rs b/crates/nodes/src/video/vaapi_av1.rs
index 93069bd7..4f67d924 100644
--- a/crates/nodes/src/video/vaapi_av1.rs
+++ b/crates/nodes/src/video/vaapi_av1.rs
@@ -962,11 +962,7 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
             format: (nv12_fourcc(), 0), // DRM_FORMAT_MOD_LINEAR
             size: CrosResolution { width: self.coded_width, height: self.coded_height },
             planes: vec![
-                PlaneLayout {
-                    buffer_index: 0,
-                    offset: 0,
-                    stride: y_stride,
-                },
+                PlaneLayout { buffer_index: 0, offset: 0, stride: y_stride },
                 PlaneLayout {
                     buffer_index: 0,
                     offset: uv_offset,

From 8354c1f8221035a947fbe8a46a3ce7548c1a2966 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 18:03:05 +0000
Subject: [PATCH 14/23] feat(nodes): add VA-API H.264 encoder and decoder nodes

Add vaapi_h264 module with VaapiH264EncoderNode and VaapiH264DecoderNode
using cros-codecs StatelessEncoder/StatelessDecoder for H.264 via VA-API.

- Encoder: CQP rate control, Main profile, macroblock-aligned coding
- Decoder: stateless H.264 decode with format-change handling
- Reuses shared helpers from vaapi_av1 (GBM/NV12 I/O, device detection)
- Registration: video::vaapi::h264_encoder, video::vaapi::h264_decoder
- Sample pipelines: oneshot MP4 + dynamic MoQ for VA-API H.264

Supported on Intel (Sandy Bridge+), AMD, and NVIDIA (decode only).

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/mod.rs                 |    6 +
 crates/nodes/src/video/vaapi_av1.rs           |   12 +-
 crates/nodes/src/video/vaapi_h264.rs          | 1062 +++++++++++++++++
 .../video_moq_vaapi_h264_colorbars.yml        |   46 +
 .../oneshot/video_vaapi_h264_colorbars.yml    |   54 +
 5 files changed, 1174 insertions(+), 6 deletions(-)
 create mode 100644 crates/nodes/src/video/vaapi_h264.rs
 create mode 100644 samples/pipelines/dynamic/video_moq_vaapi_h264_colorbars.yml
 create mode 100644 samples/pipelines/oneshot/video_vaapi_h264_colorbars.yml

diff --git a/crates/nodes/src/video/mod.rs b/crates/nodes/src/video/mod.rs
index 43f51323..f3b2b6f3 100644
--- a/crates/nodes/src/video/mod.rs
+++ b/crates/nodes/src/video/mod.rs
@@ -146,6 +146,9 @@ pub mod vulkan_video;
 #[cfg(feature = "vaapi")]
 pub mod vaapi_av1;
 
+#[cfg(feature = "vaapi")]
+pub mod vaapi_h264;
+
 #[cfg(feature = "nvcodec")]
 pub mod nv_av1;
 
@@ -643,6 +646,9 @@ pub fn register_video_nodes(registry: &mut NodeRegistry, constraints: &GlobalNod
     #[cfg(feature = "vaapi")]
     vaapi_av1::register_vaapi_av1_nodes(registry);
 
+    #[cfg(feature = "vaapi")]
+    vaapi_h264::register_vaapi_h264_nodes(registry);
+
     #[cfg(feature = "nvcodec")]
     nv_av1::register_nv_av1_nodes(registry);
 }
diff --git a/crates/nodes/src/video/vaapi_av1.rs b/crates/nodes/src/video/vaapi_av1.rs
index 4f67d924..f69af6e9 100644
--- a/crates/nodes/src/video/vaapi_av1.rs
+++ b/crates/nodes/src/video/vaapi_av1.rs
@@ -101,12 +101,12 @@ const DEFAULT_FRAMERATE: u32 = 30;
 // ---------------------------------------------------------------------------
 
 /// NV12 fourcc code for GBM/VA-API surfaces.
-fn nv12_fourcc() -> CrosFourcc {
+pub(super) fn nv12_fourcc() -> CrosFourcc {
     CrosFourcc::from(b"NV12")
 }
 
 /// Align `value` up to the next multiple of `alignment`.
-fn align_up_u32(value: u32, alignment: u32) -> u32 {
+pub(super) fn align_up_u32(value: u32, alignment: u32) -> u32 {
     debug_assert!(alignment > 0);
     value.div_ceil(alignment) * alignment
 }
@@ -134,7 +134,7 @@ fn detect_render_device() -> Option<String> {
 }
 
 /// Resolve the render device path from config, auto-detection, or default.
-fn resolve_render_device(configured: Option<&String>) -> String {
+pub(super) fn resolve_render_device(configured: Option<&String>) -> String {
     if let Some(path) = configured {
         return path.clone();
     }
@@ -152,7 +152,7 @@ fn resolve_render_device(configured: Option<&String>) -> String {
 }
 
 /// Open a VA display and a GBM device on the same render node.
-fn open_va_and_gbm(
+pub(super) fn open_va_and_gbm(
     render_device: Option<&String>,
 ) -> Result<(Rc<libva::Display>, Arc<GbmDevice>, String), String> {
     let path = resolve_render_device(render_device);
@@ -167,7 +167,7 @@ fn open_va_and_gbm(
 /// for a packed StreamKit [`VideoFrame`].
 ///
 /// Handles stride != width by copying row-by-row.
-fn read_nv12_from_mapping(
+pub(super) fn read_nv12_from_mapping(
     mapping: &dyn ReadMapping<'_>,
     width: u32,
     height: u32,
@@ -226,7 +226,7 @@ fn read_nv12_from_mapping(
 ///
 /// If the source is I420, it is converted to NV12 on the fly (U/V planes
 /// are interleaved into a single UV plane).
-fn write_nv12_to_mapping(
+pub(super) fn write_nv12_to_mapping(
     mapping: &dyn WriteMapping<'_>,
     frame: &VideoFrame,
     plane_pitches: &[usize],
diff --git a/crates/nodes/src/video/vaapi_h264.rs b/crates/nodes/src/video/vaapi_h264.rs
new file mode 100644
index 00000000..f4373538
--- /dev/null
+++ b/crates/nodes/src/video/vaapi_h264.rs
@@ -0,0 +1,1062 @@
+// SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+//
+// SPDX-License-Identifier: MPL-2.0
+
+//! VA-API HW-accelerated H.264 encoder and decoder nodes.
+//!
+//! Uses the [`cros-codecs`](https://crates.io/crates/cros-codecs) crate which
+//! provides high-level VA-API H.264 codec abstractions on Linux.  The cros-codecs
+//! `StatelessDecoder` and `StatelessEncoder` handle all H.264 bitstream parsing
+//! and VA-API parameter buffer construction internally — this module manages
+//! frame I/O and integrates with StreamKit's pipeline architecture.
+//!
+//! # Nodes
+//!
+//! - [`VaapiH264DecoderNode`] — decodes H.264 NAL packets to NV12 [`VideoFrame`]s
+//! - [`VaapiH264EncoderNode`] — encodes NV12/I420 [`VideoFrame`]s to H.264 packets
+//!
+//! Both perform runtime capability detection: if no VA-API device is found (or
+//! H.264 is not supported), the codec task returns an error so the pipeline can
+//! fall back to a CPU codec (OpenH264).
+//!
+//! # Feature gate
+//!
+//! Requires `vaapi` Cargo feature and `libva-dev` + `libgbm-dev` system packages.
+//!
+//! # Platform support
+//!
+//! - **Intel**: H.264 encode + decode on all modern Intel GPUs (Sandy Bridge+).
+//! - **AMD**: H.264 encode + decode via Mesa RadeonSI VA-API.
+//! - **NVIDIA**: Decode only via community `nvidia-vaapi-driver` (no VA-API encoding).
+
+use std::rc::Rc;
+use std::sync::Arc;
+use std::time::Instant;
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use opentelemetry::global;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+use streamkit_core::stats::NodeStatsTracker;
+use streamkit_core::types::{
+    EncodedVideoFormat, Packet, PacketMetadata, PacketType, PixelFormat, RawVideoFormat,
+    VideoCodec, VideoFrame,
+};
+use streamkit_core::{
+    config_helpers, get_codec_channel_capacity, packet_helpers, state_helpers, InputPin,
+    NodeContext, NodeRegistry, OutputPin, PinCardinality, ProcessorNode, StreamKitError,
+};
+use tokio::sync::mpsc;
+
+// cros-codecs high-level APIs.
+use cros_codecs::backend::vaapi::decoder::VaapiBackend as VaapiDecBackend;
+use cros_codecs::codec::h264::parser::Level as H264Level;
+use cros_codecs::codec::h264::parser::Profile as H264Profile;
+use cros_codecs::decoder::stateless::h264::H264;
+use cros_codecs::decoder::stateless::{DecodeError, StatelessDecoder, StatelessVideoDecoder};
+use cros_codecs::decoder::{BlockingMode, DecodedHandle, DecoderEvent};
+use cros_codecs::encoder::h264::EncoderConfig as CrosH264EncoderConfig;
+use cros_codecs::encoder::stateless::StatelessEncoder;
+use cros_codecs::encoder::{
+    FrameMetadata as CrosFrameMetadata, PredictionStructure, RateControl, Tunings, VideoEncoder,
+};
+use cros_codecs::libva;
+use cros_codecs::video_frame::gbm_video_frame::{
+    GbmDevice, GbmExternalBufferDescriptor, GbmUsage, GbmVideoFrame,
+};
+use cros_codecs::video_frame::{ReadMapping, VideoFrame as CrosVideoFrame, WriteMapping};
+use cros_codecs::{Fourcc as CrosFourcc, FrameLayout, PlaneLayout, Resolution as CrosResolution};
+
+use super::encoder_trait::{self, EncodedPacket, EncoderNodeRunner, StandardVideoEncoder};
+use super::HwAccelMode;
+use super::H264_CONTENT_TYPE;
+
+// Re-use helpers from the VA-API AV1 module — they are codec-agnostic NV12
+// I/O routines (GBM mapping, render-device detection, etc.).
+use super::vaapi_av1::{
+    align_up_u32, nv12_fourcc, open_va_and_gbm, read_nv12_from_mapping, write_nv12_to_mapping,
+};
+
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+/// H.264 macroblock size — coded resolution must be aligned to this.
+const H264_MB_SIZE: u32 = 16;
+
+/// Maximum number of consecutive retries when the decoder returns
+/// `CheckEvents` or `NotEnoughOutputBuffers` without making progress.
+const MAX_EAGAIN_EMPTY_RETRIES: u32 = 1000;
+
+/// After this many retries, switch from `thread::yield_now()` to
+/// `thread::sleep(1ms)` to avoid a tight spin-loop.
+const EAGAIN_YIELD_THRESHOLD: u32 = 10;
+
+/// Default constant-quality parameter for H.264 (0–51 QP scale).
+const DEFAULT_QUALITY: u32 = 26;
+
+/// Default framerate for rate-control hints.
+const DEFAULT_FRAMERATE: u32 = 30;
+
+// ---------------------------------------------------------------------------
+// Decoder
+// ---------------------------------------------------------------------------
+
+/// Configuration for the VA-API H.264 hardware decoder node.
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(default, deny_unknown_fields)]
+pub struct VaapiH264DecoderConfig {
+    /// Path to the DRM render device (e.g. `/dev/dri/renderD128`).
+    /// When `None`, auto-detects the first VA-API capable device.
+    pub render_device: Option<String>,
+
+    /// Hardware acceleration mode.
+    pub hw_accel: HwAccelMode,
+}
+
+impl Default for VaapiH264DecoderConfig {
+    fn default() -> Self {
+        Self {
+            render_device: None,
+            hw_accel: HwAccelMode::Auto,
+        }
+    }
+}
+
+pub struct VaapiH264DecoderNode {
+    config: VaapiH264DecoderConfig,
+}
+
+impl VaapiH264DecoderNode {
+    #[allow(clippy::missing_errors_doc)]
+    pub fn new(config: VaapiH264DecoderConfig) -> Result<Self, StreamKitError> {
+        if matches!(config.hw_accel, HwAccelMode::ForceCpu) {
+            return Err(StreamKitError::Configuration(
+                "VaapiH264DecoderNode only supports hardware decoding; \
+                 use video::h264::decoder for CPU decode"
+                    .into(),
+            ));
+        }
+        Ok(Self { config })
+    }
+}
+
+#[async_trait]
+impl ProcessorNode for VaapiH264DecoderNode {
+    fn input_pins(&self) -> Vec<InputPin> {
+        vec![InputPin {
+            name: "in".to_string(),
+            accepts_types: vec![PacketType::EncodedVideo(EncodedVideoFormat {
+                codec: VideoCodec::H264,
+                bitstream_format: None,
+                codec_private: None,
+                profile: None,
+                level: None,
+            })],
+            cardinality: PinCardinality::One,
+        }]
+    }
+
+    fn output_pins(&self) -> Vec<OutputPin> {
+        vec![OutputPin {
+            name: "out".to_string(),
+            produces_type: PacketType::RawVideo(RawVideoFormat {
+                width: None,
+                height: None,
+                pixel_format: PixelFormat::Nv12,
+            }),
+            cardinality: PinCardinality::Broadcast,
+        }]
+    }
+
+    async fn run(self: Box<Self>, mut context: NodeContext) -> Result<(), StreamKitError> {
+        let node_name = context.output_sender.node_name().to_string();
+        state_helpers::emit_initializing(&context.state_tx, &node_name);
+
+        tracing::info!("VaapiH264DecoderNode starting");
+        let mut input_rx = context.take_input("in")?;
+
+        let meter = global::meter("skit_nodes");
+        let packets_processed_counter =
+            meter.u64_counter("vaapi_h264_decoder_packets_processed").build();
+        let decode_duration_histogram = meter
+            .f64_histogram("vaapi_h264_decode_duration")
+            .with_boundaries(streamkit_core::metrics::HISTOGRAM_BOUNDARIES_CODEC_PACKET.to_vec())
+            .build();
+
+        let (decode_tx, decode_rx) =
+            mpsc::channel::<(Bytes, Option<PacketMetadata>)>(get_codec_channel_capacity());
+        let (result_tx, mut result_rx) =
+            mpsc::channel::<Result<VideoFrame, String>>(get_codec_channel_capacity());
+
+        let render_device = self.config.render_device.clone();
+        let decode_task = tokio::task::spawn_blocking(move || {
+            vaapi_h264_decode_loop(
+                render_device.as_ref(),
+                decode_rx,
+                &result_tx,
+                &decode_duration_histogram,
+            );
+        });
+
+        state_helpers::emit_running(&context.state_tx, &node_name);
+
+        let mut stats_tracker = NodeStatsTracker::new(node_name.clone(), context.stats_tx.clone());
+        let batch_size = context.batch_size;
+
+        let decode_tx_clone = decode_tx.clone();
+        let mut input_task = tokio::spawn(async move {
+            loop {
+                let Some(first_packet) = input_rx.recv().await else {
+                    break;
+                };
+
+                let packet_batch =
+                    packet_helpers::batch_packets_greedy(first_packet, &mut input_rx, batch_size);
+
+                for packet in packet_batch {
+                    if let Packet::Binary { data, metadata, .. } = packet {
+                        if decode_tx_clone.send((data, metadata)).await.is_err() {
+                            tracing::error!(
+                                "VaapiH264DecoderNode decode task has shut down unexpectedly"
+                            );
+                            return;
+                        }
+                    }
+                }
+            }
+            tracing::info!("VaapiH264DecoderNode input stream closed");
+        });
+
+        crate::codec_utils::codec_forward_loop(
+            &mut context,
+            &mut result_rx,
+            &mut input_task,
+            decode_task,
+            decode_tx,
+            &packets_processed_counter,
+            &mut stats_tracker,
+            Packet::Video,
+            "VaapiH264DecoderNode",
+        )
+        .await;
+
+        state_helpers::emit_stopped(&context.state_tx, &node_name, "input_closed");
+        tracing::info!("VaapiH264DecoderNode finished");
+        Ok(())
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Decoder — blocking decode loop
+// ---------------------------------------------------------------------------
+
+/// Blocking decode loop running inside `spawn_blocking`.
+///
+/// Creates the VA-API display, GBM device, and cros-codecs `StatelessDecoder`,
+/// then processes input packets until the channel is closed.
+fn vaapi_h264_decode_loop(
+    render_device: Option<&String>,
+    mut decode_rx: mpsc::Receiver<(Bytes, Option<PacketMetadata>)>,
+    result_tx: &mpsc::Sender<Result<VideoFrame, String>>,
+    duration_histogram: &opentelemetry::metrics::Histogram<f64>,
+) {
+    // ── Open GBM device + VA display ──────────────────────────────────
+    let (display, gbm, path) = match open_va_and_gbm(render_device) {
+        Ok(v) => v,
+        Err(e) => {
+            let _ = result_tx.blocking_send(Err(e));
+            return;
+        }
+    };
+    tracing::info!(device = %path, "VA-API H.264 decoder opened display");
+
+    // ── Create stateless decoder ─────────────────────────────────────
+    let mut decoder = match StatelessDecoder::<H264, VaapiDecBackend<GbmVideoFrame>>::new_vaapi(
+        display,
+        BlockingMode::Blocking,
+    ) {
+        Ok(d) => d,
+        Err(e) => {
+            let _ =
+                result_tx.blocking_send(Err(format!("failed to create VA-API H.264 decoder: {e}")));
+            return;
+        }
+    };
+
+    // Stream resolution — updated on FormatChanged events.
+    let mut coded_width: u32 = 0;
+    let mut coded_height: u32 = 0;
+
+    while let Some((data, metadata)) = decode_rx.blocking_recv() {
+        if result_tx.is_closed() {
+            return;
+        }
+
+        let decode_start = Instant::now();
+        let timestamp = metadata.as_ref().and_then(|m| m.timestamp_us).unwrap_or(0);
+
+        // Feed bitstream to the decoder.
+        let mut offset = 0usize;
+        let bitstream = data.as_ref();
+        let mut eagain_empty_retries: u32 = 0;
+
+        while offset < bitstream.len() {
+            let gbm_ref = Arc::clone(&gbm);
+            let cw = coded_width;
+            let ch = coded_height;
+            let mut alloc_cb = move || {
+                gbm_ref
+                    .clone()
+                    .new_frame(
+                        nv12_fourcc(),
+                        CrosResolution { width: cw, height: ch },
+                        CrosResolution { width: cw, height: ch },
+                        GbmUsage::Decode,
+                    )
+                    .ok()
+            };
+
+            let mut made_progress = false;
+
+            match decoder.decode(timestamp, &bitstream[offset..], &mut alloc_cb) {
+                Ok(bytes_consumed) => {
+                    offset += bytes_consumed;
+                    made_progress = true;
+                }
+                Err(DecodeError::CheckEvents | DecodeError::NotEnoughOutputBuffers(_)) => {
+                    // Process pending events / drain ready frames, then retry.
+                }
+                Err(e) => {
+                    tracing::error!(error = %e, "VA-API H.264 decode error");
+                    let _ =
+                        result_tx.blocking_send(Err(format!("VA-API H.264 decode error: {e}")));
+                    break;
+                }
+            }
+
+            // Process all pending events (format changes + ready frames).
+            let (should_exit, had_events) = drain_decoder_events(
+                &mut decoder,
+                result_tx,
+                metadata.as_ref(),
+                &mut coded_width,
+                &mut coded_height,
+            );
+            if should_exit {
+                return;
+            }
+
+            if made_progress || had_events {
+                eagain_empty_retries = 0;
+            } else {
+                eagain_empty_retries += 1;
+                if eagain_empty_retries > MAX_EAGAIN_EMPTY_RETRIES {
+                    tracing::error!(
+                        "VA-API H.264 decoder stuck: no progress after {MAX_EAGAIN_EMPTY_RETRIES} retries"
+                    );
+                    let _ = result_tx.blocking_send(Err(
+                        "VA-API H.264 decoder stuck in CheckEvents/NotEnoughOutputBuffers loop"
+                            .to_string(),
+                    ));
+                    break;
+                }
+                // Progressive backoff to avoid a tight spin-loop.
+                if eagain_empty_retries <= EAGAIN_YIELD_THRESHOLD {
+                    std::thread::yield_now();
+                } else {
+                    std::thread::sleep(std::time::Duration::from_millis(1));
+                }
+            }
+        }
+
+        duration_histogram.record(decode_start.elapsed().as_secs_f64(), &[]);
+    }
+
+    // Flush remaining frames from the decoder.
+    if result_tx.is_closed() {
+        return;
+    }
+    if let Err(e) = decoder.flush() {
+        tracing::warn!(error = %e, "VA-API H.264 decoder flush failed");
+    }
+    drain_decoder_events(
+        &mut decoder,
+        result_tx,
+        None,
+        &mut coded_width,
+        &mut coded_height,
+    );
+}
+
+/// Drain all pending events from the decoder.
+///
+/// Returns `(should_exit, had_events)`:
+/// - `should_exit`: the result channel is closed and the caller should return.
+/// - `had_events`: at least one event (format change or frame) was processed.
+fn drain_decoder_events(
+    decoder: &mut StatelessDecoder<H264, VaapiDecBackend<GbmVideoFrame>>,
+    result_tx: &mpsc::Sender<Result<VideoFrame, String>>,
+    metadata: Option<&PacketMetadata>,
+    coded_width: &mut u32,
+    coded_height: &mut u32,
+) -> (bool, bool) {
+    let mut had_events = false;
+    while let Some(event) = decoder.next_event() {
+        had_events = true;
+        match event {
+            DecoderEvent::FormatChanged => {
+                if let Some(info) = decoder.stream_info() {
+                    let dw = info.display_resolution.width;
+                    let dh = info.display_resolution.height;
+                    *coded_width = info.coded_resolution.width;
+                    *coded_height = info.coded_resolution.height;
+                    tracing::info!(
+                        display_width = dw,
+                        display_height = dh,
+                        coded_width = *coded_width,
+                        coded_height = *coded_height,
+                        "VA-API H.264 decoder stream format changed"
+                    );
+                }
+            }
+            DecoderEvent::FrameReady(handle) => {
+                if let Err(e) = handle.sync() {
+                    tracing::error!(error = %e, "VA-API H.264 frame sync failed");
+                    continue;
+                }
+
+                let display_res = handle.display_resolution();
+                let frame_w = display_res.width;
+                let frame_h = display_res.height;
+
+                let gbm_frame = handle.video_frame();
+                let pitches = gbm_frame.get_plane_pitch();
+
+                // Extract NV12 data while the mapping is alive.
+                let nv12_data = {
+                    let mapping = match gbm_frame.map() {
+                        Ok(m) => m,
+                        Err(e) => {
+                            tracing::error!(error = %e, "failed to map decoded GBM frame");
+                            continue;
+                        }
+                    };
+                    read_nv12_from_mapping(mapping.as_ref(), frame_w, frame_h, &pitches)
+                };
+
+                match VideoFrame::with_metadata(
+                    frame_w,
+                    frame_h,
+                    PixelFormat::Nv12,
+                    nv12_data,
+                    metadata.cloned(),
+                ) {
+                    Ok(frame) => {
+                        if result_tx.blocking_send(Ok(frame)).is_err() {
+                            return (true, had_events);
+                        }
+                    }
+                    Err(e) => {
+                        tracing::error!(
+                            error = %e,
+                            "failed to construct VideoFrame from decoded data"
+                        );
+                    }
+                }
+            }
+        }
+    }
+    (false, had_events)
+}
+
+// ---------------------------------------------------------------------------
+// Encoder
+// ---------------------------------------------------------------------------
+
+/// Configuration for the VA-API H.264 hardware encoder node.
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(default, deny_unknown_fields)]
+pub struct VaapiH264EncoderConfig {
+    /// Path to the DRM render device (e.g. `/dev/dri/renderD128`).
+    /// When `None`, auto-detects the first VA-API capable device.
+    pub render_device: Option<String>,
+
+    /// Constant quality parameter (QP).  Lower values produce higher quality
+    /// at the cost of larger bitstream.  H.264 QP range is 0–51, default 26.
+    pub quality: u32,
+
+    /// Target framerate in frames per second (used for rate control hints).
+    pub framerate: u32,
+
+    /// Use low-power encoding mode if the driver supports it.
+    /// Low-power mode uses the GPU's fixed-function encoder (if available)
+    /// rather than shader-based encoding, typically offering lower latency
+    /// at reduced quality flexibility.
+    pub low_power: bool,
+
+    /// Hardware acceleration mode.
+    pub hw_accel: HwAccelMode,
+}
+
+impl Default for VaapiH264EncoderConfig {
+    fn default() -> Self {
+        Self {
+            render_device: None,
+            quality: DEFAULT_QUALITY,
+            framerate: DEFAULT_FRAMERATE,
+            low_power: false,
+            hw_accel: HwAccelMode::Auto,
+        }
+    }
+}
+
+pub struct VaapiH264EncoderNode {
+    config: VaapiH264EncoderConfig,
+}
+
+impl VaapiH264EncoderNode {
+    #[allow(clippy::missing_errors_doc)]
+    pub fn new(config: VaapiH264EncoderConfig) -> Result<Self, StreamKitError> {
+        if matches!(config.hw_accel, HwAccelMode::ForceCpu) {
+            return Err(StreamKitError::Configuration(
+                "VaapiH264EncoderNode only supports hardware encoding; \
+                 use video::h264::encoder for CPU encode"
+                    .into(),
+            ));
+        }
+        Ok(Self { config })
+    }
+}
+
+#[async_trait]
+impl ProcessorNode for VaapiH264EncoderNode {
+    fn input_pins(&self) -> Vec<InputPin> {
+        vec![InputPin {
+            name: "in".to_string(),
+            accepts_types: vec![
+                PacketType::RawVideo(RawVideoFormat {
+                    width: None,
+                    height: None,
+                    pixel_format: PixelFormat::I420,
+                }),
+                PacketType::RawVideo(RawVideoFormat {
+                    width: None,
+                    height: None,
+                    pixel_format: PixelFormat::Nv12,
+                }),
+            ],
+            cardinality: PinCardinality::One,
+        }]
+    }
+
+    fn output_pins(&self) -> Vec<OutputPin> {
+        vec![OutputPin {
+            name: "out".to_string(),
+            produces_type: PacketType::EncodedVideo(EncodedVideoFormat {
+                codec: VideoCodec::H264,
+                bitstream_format: None,
+                codec_private: None,
+                profile: None,
+                level: None,
+            }),
+            cardinality: PinCardinality::Broadcast,
+        }]
+    }
+
+    fn content_type(&self) -> Option<String> {
+        Some(H264_CONTENT_TYPE.to_string())
+    }
+
+    async fn run(self: Box<Self>, context: NodeContext) -> Result<(), StreamKitError> {
+        encoder_trait::run_encoder(*self, context).await
+    }
+}
+
+impl EncoderNodeRunner for VaapiH264EncoderNode {
+    const CONTENT_TYPE: &'static str = H264_CONTENT_TYPE;
+    const NODE_LABEL: &'static str = "VaapiH264EncoderNode";
+    const PACKETS_COUNTER_NAME: &'static str = "vaapi_h264_encoder_packets_processed";
+    const DURATION_HISTOGRAM_NAME: &'static str = "vaapi_h264_encode_duration";
+
+    fn spawn_codec_task(
+        self,
+        encode_rx: mpsc::Receiver<(VideoFrame, Option<PacketMetadata>)>,
+        result_tx: mpsc::Sender<Result<EncodedPacket, String>>,
+        duration_histogram: opentelemetry::metrics::Histogram<f64>,
+    ) -> tokio::task::JoinHandle<()> {
+        encoder_trait::spawn_standard_encode_task::<VaapiH264Encoder>(
+            self.config,
+            encode_rx,
+            result_tx,
+            duration_histogram,
+        )
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Encoder — internal codec wrapper
+// ---------------------------------------------------------------------------
+
+/// Type alias for the full VA-API H.264 encoder with GBM-backed frames.
+type CrosVaapiH264Encoder = StatelessEncoder<
+    cros_codecs::encoder::h264::H264,
+    GbmVideoFrame,
+    cros_codecs::backend::vaapi::encoder::VaapiBackend<
+        GbmExternalBufferDescriptor,
+        libva::Surface<GbmExternalBufferDescriptor>,
+    >,
+>;
+
+/// Internal encoder state wrapping the cros-codecs `StatelessEncoder`.
+///
+/// `!Send` due to internal `Rc<libva::Display>` — lives entirely inside
+/// a `spawn_blocking` thread.
+struct VaapiH264Encoder {
+    encoder: CrosVaapiH264Encoder,
+    gbm: Arc<GbmDevice>,
+    width: u32,
+    height: u32,
+    coded_width: u32,
+    coded_height: u32,
+    frame_count: u64,
+}
+
+impl StandardVideoEncoder for VaapiH264Encoder {
+    type Config = VaapiH264EncoderConfig;
+    const CODEC_NAME: &'static str = "VA-API H.264";
+
+    fn new_encoder(width: u32, height: u32, config: &Self::Config) -> Result<Self, String> {
+        let (display, gbm, path) = open_va_and_gbm(config.render_device.as_ref())?;
+        tracing::info!(device = %path, width, height, "VA-API H.264 encoder opening");
+
+        let coded_width = align_up_u32(width, H264_MB_SIZE);
+        let coded_height = align_up_u32(height, H264_MB_SIZE);
+
+        let cros_config = CrosH264EncoderConfig {
+            resolution: CrosResolution {
+                width: coded_width,
+                height: coded_height,
+            },
+            profile: H264Profile::Main,
+            level: H264Level::L4,
+            pred_structure: PredictionStructure::LowDelay { limit: 1024 },
+            initial_tunings: Tunings {
+                rate_control: RateControl::ConstantQuality(config.quality),
+                framerate: config.framerate,
+                min_quality: 0,
+                max_quality: 51,
+            },
+        };
+
+        let encoder = CrosVaapiH264Encoder::new_vaapi(
+            display,
+            cros_config,
+            nv12_fourcc(),
+            CrosResolution {
+                width: coded_width,
+                height: coded_height,
+            },
+            config.low_power,
+            BlockingMode::Blocking,
+        )
+        .map_err(|e| format!("failed to create VA-API H.264 encoder: {e}"))?;
+
+        tracing::info!(
+            device = %path,
+            width,
+            height,
+            coded_width,
+            coded_height,
+            quality = config.quality,
+            "VA-API H.264 encoder created"
+        );
+
+        Ok(Self {
+            encoder,
+            gbm,
+            width,
+            height,
+            coded_width,
+            coded_height,
+            frame_count: 0,
+        })
+    }
+
+    fn encode(
+        &mut self,
+        frame: &VideoFrame,
+        metadata: Option<PacketMetadata>,
+    ) -> Result<Vec<EncodedPacket>, String> {
+        if frame.pixel_format == PixelFormat::Rgba8 {
+            return Err(
+                "VA-API H.264 encoder requires NV12 or I420 input; \
+                 insert a video::pixel_convert node upstream"
+                    .into(),
+            );
+        }
+
+        // Create a GBM frame and upload the raw video data.
+        let mut gbm_frame = Arc::clone(&self.gbm)
+            .new_frame(
+                nv12_fourcc(),
+                CrosResolution {
+                    width: self.width,
+                    height: self.height,
+                },
+                CrosResolution {
+                    width: self.coded_width,
+                    height: self.coded_height,
+                },
+                GbmUsage::Encode,
+            )
+            .map_err(|e| format!("failed to allocate GBM frame for encoding: {e}"))?;
+
+        // Write frame data into the GBM buffer.
+        let pitches = gbm_frame.get_plane_pitch();
+        {
+            let mapping = gbm_frame
+                .map_mut()
+                .map_err(|e| format!("failed to map GBM frame for writing: {e}"))?;
+            write_nv12_to_mapping(mapping.as_ref(), frame, &pitches)?;
+        }
+
+        let is_keyframe = metadata.as_ref().and_then(|m| m.keyframe).unwrap_or(false);
+        let timestamp = metadata.as_ref().and_then(|m| m.timestamp_us).unwrap_or(self.frame_count);
+
+        // Compute UV plane offset from pitch × coded_height (same approach as
+        // the AV1 encoder — get_plane_offset() is private in cros-codecs 0.0.6).
+        let y_stride = pitches.first().copied().unwrap_or(self.coded_width as usize);
+        let uv_offset = y_stride * self.coded_height as usize;
+
+        let frame_layout = FrameLayout {
+            format: (nv12_fourcc(), 0), // DRM_FORMAT_MOD_LINEAR
+            size: CrosResolution {
+                width: self.coded_width,
+                height: self.coded_height,
+            },
+            planes: vec![
+                PlaneLayout {
+                    buffer_index: 0,
+                    offset: 0,
+                    stride: y_stride,
+                },
+                PlaneLayout {
+                    buffer_index: 0,
+                    offset: uv_offset,
+                    stride: pitches.get(1).copied().unwrap_or(self.coded_width as usize),
+                },
+            ],
+        };
+
+        let cros_meta =
+            CrosFrameMetadata { timestamp, layout: frame_layout, force_keyframe: is_keyframe };
+
+        self.encoder
+            .encode(cros_meta, gbm_frame)
+            .map_err(|e| format!("VA-API H.264 encode error: {e}"))?;
+
+        self.frame_count += 1;
+
+        // Poll for all available encoded output.
+        let mut packets = Vec::new();
+        loop {
+            match self.encoder.poll() {
+                Ok(Some(coded)) => {
+                    packets.push(EncodedPacket {
+                        data: Bytes::from(coded.bitstream),
+                        metadata: metadata.clone(),
+                    });
+                }
+                Ok(None) => break,
+                Err(e) => return Err(format!("VA-API H.264 encoder poll error: {e}")),
+            }
+        }
+
+        Ok(packets)
+    }
+
+    fn flush_encoder(&mut self) -> Result<Vec<EncodedPacket>, String> {
+        self.encoder
+            .drain()
+            .map_err(|e| format!("VA-API H.264 encoder drain error: {e}"))?;
+
+        let mut packets = Vec::new();
+        loop {
+            match self.encoder.poll() {
+                Ok(Some(coded)) => {
+                    packets
+                        .push(EncodedPacket { data: Bytes::from(coded.bitstream), metadata: None });
+                }
+                Ok(None) => break,
+                Err(e) => return Err(format!("VA-API H.264 encoder poll error: {e}")),
+            }
+        }
+
+        Ok(packets)
+    }
+
+    fn flush_on_dimension_change() -> bool {
+        true
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Registration
+// ---------------------------------------------------------------------------
+
+use schemars::schema_for;
+use streamkit_core::registry::StaticPins;
+
+#[allow(clippy::expect_used, clippy::missing_panics_doc)]
+pub fn register_vaapi_h264_nodes(registry: &mut NodeRegistry) {
+    let default_decoder = VaapiH264DecoderNode::new(VaapiH264DecoderConfig::default())
+        .expect("default VA-API H.264 decoder config should be valid");
+    registry.register_static_with_description(
+        "video::vaapi::h264_decoder",
+        |params| {
+            let config = config_helpers::parse_config_optional(params)?;
+            Ok(Box::new(VaapiH264DecoderNode::new(config)?))
+        },
+        serde_json::to_value(schema_for!(VaapiH264DecoderConfig))
+            .expect("VaapiH264DecoderConfig schema should serialize to JSON"),
+        StaticPins {
+            inputs: default_decoder.input_pins(),
+            outputs: default_decoder.output_pins(),
+        },
+        vec![
+            "video".to_string(),
+            "codecs".to_string(),
+            "h264".to_string(),
+            "hw".to_string(),
+            "vaapi".to_string(),
+        ],
+        false,
+        "Decodes H.264-compressed packets into raw NV12 video frames using VA-API \
+         hardware acceleration. Requires a VA-API capable GPU (Intel Sandy Bridge+, \
+         AMD, or NVIDIA with nvidia-vaapi-driver).",
+    );
+
+    let default_encoder = VaapiH264EncoderNode::new(VaapiH264EncoderConfig::default())
+        .expect("default VA-API H.264 encoder config should be valid");
+    registry.register_static_with_description(
+        "video::vaapi::h264_encoder",
+        |params| {
+            let config = config_helpers::parse_config_optional(params)?;
+            Ok(Box::new(VaapiH264EncoderNode::new(config)?))
+        },
+        serde_json::to_value(schema_for!(VaapiH264EncoderConfig))
+            .expect("VaapiH264EncoderConfig schema should serialize to JSON"),
+        StaticPins {
+            inputs: default_encoder.input_pins(),
+            outputs: default_encoder.output_pins(),
+        },
+        vec![
+            "video".to_string(),
+            "codecs".to_string(),
+            "h264".to_string(),
+            "hw".to_string(),
+            "vaapi".to_string(),
+        ],
+        false,
+        "Encodes raw NV12/I420 video frames into H.264-compressed packets using VA-API \
+         hardware acceleration. Uses constant-quality (CQP) rate control. Requires a \
+         VA-API capable GPU with H.264 encode support (Intel, AMD).",
+    );
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+#[allow(clippy::unwrap_used, clippy::expect_used, clippy::disallowed_macros)]
+mod tests {
+    use super::*;
+
+    // ── Unit tests (no GPU required) ─────────────────────────────────
+
+    #[test]
+    fn test_force_cpu_rejected_decoder() {
+        let config =
+            VaapiH264DecoderConfig { hw_accel: HwAccelMode::ForceCpu, ..Default::default() };
+        let result = VaapiH264DecoderNode::new(config);
+        assert!(result.is_err(), "ForceCpu should be rejected for VA-API H.264 decoder");
+    }
+
+    #[test]
+    fn test_force_cpu_rejected_encoder() {
+        let config =
+            VaapiH264EncoderConfig { hw_accel: HwAccelMode::ForceCpu, ..Default::default() };
+        let result = VaapiH264EncoderNode::new(config);
+        assert!(result.is_err(), "ForceCpu should be rejected for VA-API H.264 encoder");
+    }
+
+    #[test]
+    fn test_default_configs() {
+        let dec = VaapiH264DecoderConfig::default();
+        assert!(dec.render_device.is_none());
+        assert!(matches!(dec.hw_accel, HwAccelMode::Auto));
+
+        let enc = VaapiH264EncoderConfig::default();
+        assert!(enc.render_device.is_none());
+        assert_eq!(enc.quality, DEFAULT_QUALITY);
+        assert_eq!(enc.framerate, DEFAULT_FRAMERATE);
+        assert!(!enc.low_power);
+        assert!(matches!(enc.hw_accel, HwAccelMode::Auto));
+    }
+
+    #[test]
+    fn test_decoder_pins() {
+        let node = VaapiH264DecoderNode::new(VaapiH264DecoderConfig::default()).unwrap();
+        assert_eq!(node.input_pins().len(), 1);
+        assert_eq!(node.output_pins().len(), 1);
+        assert_eq!(node.input_pins()[0].name, "in");
+        assert_eq!(node.output_pins()[0].name, "out");
+    }
+
+    #[test]
+    fn test_encoder_pins() {
+        let node = VaapiH264EncoderNode::new(VaapiH264EncoderConfig::default()).unwrap();
+        assert_eq!(node.input_pins().len(), 1);
+        assert_eq!(node.output_pins().len(), 1);
+        assert_eq!(node.input_pins()[0].name, "in");
+        assert_eq!(node.output_pins()[0].name, "out");
+        // Encoder should accept both I420 and NV12 inputs.
+        assert_eq!(node.input_pins()[0].accepts_types.len(), 2);
+    }
+
+    #[test]
+    fn test_encoder_content_type() {
+        let node = VaapiH264EncoderNode::new(VaapiH264EncoderConfig::default()).unwrap();
+        assert_eq!(node.content_type(), Some(H264_CONTENT_TYPE.to_string()));
+    }
+
+    // ── Registration test ────────────────────────────────────────────
+
+    #[test]
+    fn test_registration() {
+        let mut registry = NodeRegistry::new();
+        register_vaapi_h264_nodes(&mut registry);
+        assert!(
+            registry.create_node("video::vaapi::h264_decoder", None).is_ok(),
+            "VA-API H.264 decoder should be registered"
+        );
+        assert!(
+            registry.create_node("video::vaapi::h264_encoder", None).is_ok(),
+            "VA-API H.264 encoder should be registered"
+        );
+    }
+
+    // ── GPU integration tests ────────────────────────────────────────
+    //
+    // These require a VA-API capable GPU with H.264 support.  They are
+    // compiled with the `vaapi` feature but skip at runtime if no VA-API
+    // device is available.
+
+    /// Check whether a usable VA-API display can be opened.
+    fn vaapi_available() -> bool {
+        use super::super::vaapi_av1::resolve_render_device;
+        let path = resolve_render_device(None);
+        libva::Display::open_drm_display(std::path::Path::new(&path)).is_ok()
+    }
+
+    /// Encoder + Decoder roundtrip: encode 5 NV12 frames, decode them back,
+    /// verify dimensions and pixel format.
+    #[tokio::test]
+    async fn test_vaapi_h264_encode_decode_roundtrip() {
+        if !vaapi_available() {
+            eprintln!("SKIP: no VA-API device available");
+            return;
+        }
+
+        use crate::test_utils::{
+            assert_state_initializing, assert_state_running, assert_state_stopped,
+            create_test_context, create_test_video_frame,
+        };
+        use std::borrow::Cow;
+        use std::collections::HashMap;
+
+        // --- Encode ---
+        let (enc_input_tx, enc_input_rx) = mpsc::channel(10);
+        let mut enc_inputs = HashMap::new();
+        enc_inputs.insert("in".to_string(), enc_input_rx);
+
+        let (enc_context, enc_sender, mut enc_state_rx) = create_test_context(enc_inputs, 10);
+        let encoder_config = VaapiH264EncoderConfig {
+            render_device: None,
+            hw_accel: HwAccelMode::Auto,
+            quality: 40, // fast, lower quality for test speed
+            framerate: 30,
+            low_power: false,
+        };
+        let encoder = VaapiH264EncoderNode::new(encoder_config).unwrap();
+        let enc_handle = tokio::spawn(async move { Box::new(encoder).run(enc_context).await });
+
+        assert_state_initializing(&mut enc_state_rx).await;
+        assert_state_running(&mut enc_state_rx).await;
+
+        for index in 0_u64..5 {
+            let mut frame = create_test_video_frame(64, 64, PixelFormat::Nv12, 16);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(1_000 + 33_333 * index),
+                duration_us: Some(33_333),
+                sequence: Some(index),
+                keyframe: Some(true),
+            });
+            enc_input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(enc_input_tx);
+
+        assert_state_stopped(&mut enc_state_rx).await;
+        enc_handle.await.unwrap().unwrap();
+
+        let encoded_packets = enc_sender.get_packets_for_pin("out").await;
+        assert!(!encoded_packets.is_empty(), "VA-API H.264 encoder produced no packets");
+
+        // --- Decode ---
+        let (dec_input_tx, dec_input_rx) = mpsc::channel(10);
+        let mut dec_inputs = HashMap::new();
+        dec_inputs.insert("in".to_string(), dec_input_rx);
+
+        let (dec_context, dec_sender, mut dec_state_rx) = create_test_context(dec_inputs, 10);
+        let decoder = VaapiH264DecoderNode::new(VaapiH264DecoderConfig::default()).unwrap();
+        let dec_handle = tokio::spawn(async move { Box::new(decoder).run(dec_context).await });
+
+        assert_state_initializing(&mut dec_state_rx).await;
+        assert_state_running(&mut dec_state_rx).await;
+
+        for packet in encoded_packets {
+            if let Packet::Binary { data, metadata, .. } = packet {
+                dec_input_tx
+                    .send(Packet::Binary {
+                        data,
+                        content_type: Some(Cow::Borrowed(H264_CONTENT_TYPE)),
+                        metadata,
+                    })
+                    .await
+                    .unwrap();
+            }
+        }
+        drop(dec_input_tx);
+
+        assert_state_stopped(&mut dec_state_rx).await;
+        dec_handle.await.unwrap().unwrap();
+
+        let decoded_packets = dec_sender.get_packets_for_pin("out").await;
+        assert!(!decoded_packets.is_empty(), "VA-API H.264 decoder produced no frames");
+
+        for packet in decoded_packets {
+            match packet {
+                Packet::Video(frame) => {
+                    assert_eq!(frame.width, 64);
+                    assert_eq!(frame.height, 64);
+                    assert_eq!(frame.pixel_format, PixelFormat::Nv12);
+                    assert!(!frame.data().is_empty(), "Decoded frame should have data");
+                }
+                _ => panic!("Expected Video packet from VA-API H.264 decoder"),
+            }
+        }
+    }
+}
diff --git a/samples/pipelines/dynamic/video_moq_vaapi_h264_colorbars.yml b/samples/pipelines/dynamic/video_moq_vaapi_h264_colorbars.yml
new file mode 100644
index 00000000..30a10638
--- /dev/null
+++ b/samples/pipelines/dynamic/video_moq_vaapi_h264_colorbars.yml
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# Streams SMPTE color bars encoded with VA-API H.264 (GPU-accelerated) over MoQ.
+#
+# Requires: skit built with --features vaapi
+#           VA-API capable GPU with H.264 encode support (Intel, AMD)
+#           System packages: libva-dev, libgbm-dev
+
+name: VA-API H.264 Color Bars (MoQ Stream)
+description: Continuously generates SMPTE color bars and streams via MoQ using VA-API H.264 HW encoder
+mode: dynamic
+client:
+  gateway_path: /moq/video
+  watch:
+    broadcast: output
+    audio: false
+    video: true
+
+nodes:
+  colorbars:
+    kind: video::colorbars
+    params:
+      width: 1280
+      height: 720
+      fps: 30
+      pixel_format: nv12
+      draw_time: true
+
+  vaapi_h264_encoder:
+    kind: video::vaapi::h264_encoder
+    params:
+      quality: 26
+      framerate: 30
+    needs: colorbars
+
+  moq_peer:
+    kind: transport::moq::peer
+    params:
+      gateway_path: /moq/video
+      output_broadcast: output
+      allow_reconnect: true
+      video_codec: h264
+    needs:
+      in: vaapi_h264_encoder
diff --git a/samples/pipelines/oneshot/video_vaapi_h264_colorbars.yml b/samples/pipelines/oneshot/video_vaapi_h264_colorbars.yml
new file mode 100644
index 00000000..ac6f513d
--- /dev/null
+++ b/samples/pipelines/oneshot/video_vaapi_h264_colorbars.yml
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# Demonstrates the VA-API H.264 HW encoder:
+#   Generates SMPTE color bars (NV12), encodes to H.264 via VA-API
+#   (GPU-accelerated), muxes into an MP4 container, and writes the result
+#   to HTTP output.
+#
+# Requires: skit built with --features vaapi
+#           VA-API capable GPU with H.264 encode support (Intel, AMD)
+#           System packages: libva-dev, libgbm-dev
+
+name: VA-API H.264 Encode (MP4 Oneshot)
+description: Generates color bars, encodes to H.264 using VA-API HW encoder, and muxes into MP4 (30 seconds)
+mode: oneshot
+client:
+  input:
+    type: none
+  output:
+    type: video
+
+nodes:
+  colorbars:
+    kind: video::colorbars
+    params:
+      width: 1280
+      height: 720
+      fps: 30
+      frame_count: 900  # 30 seconds at 30fps
+      pixel_format: nv12
+      draw_time: true
+      draw_time_use_pts: true
+
+  vaapi_h264_encoder:
+    kind: video::vaapi::h264_encoder
+    params:
+      quality: 26
+      framerate: 30
+    needs: colorbars
+
+  mp4_muxer:
+    kind: containers::mp4::muxer
+    params:
+      mode: stream
+      video_width: 1280
+      video_height: 720
+    needs: vaapi_h264_encoder
+
+  http_output:
+    kind: streamkit::http_output
+    params:
+      content_type: 'video/mp4; codecs="avc1.4d0028"'
+    needs: mp4_muxer

From 5f0f7bb83f791f2f191de6c9985df818ca62aa11 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 18:03:17 +0000
Subject: [PATCH 15/23] feat(nodes): add VA-API H.264 encoder and decoder nodes

Add vaapi_h264 module with VaapiH264EncoderNode and VaapiH264DecoderNode
using cros-codecs StatelessEncoder/StatelessDecoder for H.264 via VA-API.

- Encoder: CQP rate control, Main profile, macroblock-aligned coding
- Decoder: stateless H.264 decode with format-change handling
- Reuses shared helpers from vaapi_av1 (GBM/NV12 I/O, device detection)
- Registration: video::vaapi::h264_encoder, video::vaapi::h264_decoder
- Sample pipelines: oneshot MP4 + dynamic MoQ for VA-API H.264

Supported on Intel (Sandy Bridge+), AMD, and NVIDIA (decode only).

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/vaapi_h264.rs | 103 ++++++++-------------------
 1 file changed, 28 insertions(+), 75 deletions(-)

diff --git a/crates/nodes/src/video/vaapi_h264.rs b/crates/nodes/src/video/vaapi_h264.rs
index f4373538..b80fbdec 100644
--- a/crates/nodes/src/video/vaapi_h264.rs
+++ b/crates/nodes/src/video/vaapi_h264.rs
@@ -117,10 +117,7 @@ pub struct VaapiH264DecoderConfig {
 
 impl Default for VaapiH264DecoderConfig {
     fn default() -> Self {
-        Self {
-            render_device: None,
-            hw_accel: HwAccelMode::Auto,
-        }
+        Self { render_device: None, hw_accel: HwAccelMode::Auto }
     }
 }
 
@@ -268,7 +265,7 @@ fn vaapi_h264_decode_loop(
         Err(e) => {
             let _ = result_tx.blocking_send(Err(e));
             return;
-        }
+        },
     };
     tracing::info!(device = %path, "VA-API H.264 decoder opened display");
 
@@ -282,7 +279,7 @@ fn vaapi_h264_decode_loop(
             let _ =
                 result_tx.blocking_send(Err(format!("failed to create VA-API H.264 decoder: {e}")));
             return;
-        }
+        },
     };
 
     // Stream resolution — updated on FormatChanged events.
@@ -324,16 +321,15 @@ fn vaapi_h264_decode_loop(
                 Ok(bytes_consumed) => {
                     offset += bytes_consumed;
                     made_progress = true;
-                }
+                },
                 Err(DecodeError::CheckEvents | DecodeError::NotEnoughOutputBuffers(_)) => {
                     // Process pending events / drain ready frames, then retry.
-                }
+                },
                 Err(e) => {
                     tracing::error!(error = %e, "VA-API H.264 decode error");
-                    let _ =
-                        result_tx.blocking_send(Err(format!("VA-API H.264 decode error: {e}")));
+                    let _ = result_tx.blocking_send(Err(format!("VA-API H.264 decode error: {e}")));
                     break;
-                }
+                },
             }
 
             // Process all pending events (format changes + ready frames).
@@ -381,13 +377,7 @@ fn vaapi_h264_decode_loop(
     if let Err(e) = decoder.flush() {
         tracing::warn!(error = %e, "VA-API H.264 decoder flush failed");
     }
-    drain_decoder_events(
-        &mut decoder,
-        result_tx,
-        None,
-        &mut coded_width,
-        &mut coded_height,
-    );
+    drain_decoder_events(&mut decoder, result_tx, None, &mut coded_width, &mut coded_height);
 }
 
 /// Drain all pending events from the decoder.
@@ -420,7 +410,7 @@ fn drain_decoder_events(
                         "VA-API H.264 decoder stream format changed"
                     );
                 }
-            }
+            },
             DecoderEvent::FrameReady(handle) => {
                 if let Err(e) = handle.sync() {
                     tracing::error!(error = %e, "VA-API H.264 frame sync failed");
@@ -441,7 +431,7 @@ fn drain_decoder_events(
                         Err(e) => {
                             tracing::error!(error = %e, "failed to map decoded GBM frame");
                             continue;
-                        }
+                        },
                     };
                     read_nv12_from_mapping(mapping.as_ref(), frame_w, frame_h, &pitches)
                 };
@@ -457,15 +447,15 @@ fn drain_decoder_events(
                         if result_tx.blocking_send(Ok(frame)).is_err() {
                             return (true, had_events);
                         }
-                    }
+                    },
                     Err(e) => {
                         tracing::error!(
                             error = %e,
                             "failed to construct VideoFrame from decoded data"
                         );
-                    }
+                    },
                 }
-            }
+            },
         }
     }
     (false, had_events)
@@ -635,10 +625,7 @@ impl StandardVideoEncoder for VaapiH264Encoder {
         let coded_height = align_up_u32(height, H264_MB_SIZE);
 
         let cros_config = CrosH264EncoderConfig {
-            resolution: CrosResolution {
-                width: coded_width,
-                height: coded_height,
-            },
+            resolution: CrosResolution { width: coded_width, height: coded_height },
             profile: H264Profile::Main,
             level: H264Level::L4,
             pred_structure: PredictionStructure::LowDelay { limit: 1024 },
@@ -654,10 +641,7 @@ impl StandardVideoEncoder for VaapiH264Encoder {
             display,
             cros_config,
             nv12_fourcc(),
-            CrosResolution {
-                width: coded_width,
-                height: coded_height,
-            },
+            CrosResolution { width: coded_width, height: coded_height },
             config.low_power,
             BlockingMode::Blocking,
         )
@@ -673,15 +657,7 @@ impl StandardVideoEncoder for VaapiH264Encoder {
             "VA-API H.264 encoder created"
         );
 
-        Ok(Self {
-            encoder,
-            gbm,
-            width,
-            height,
-            coded_width,
-            coded_height,
-            frame_count: 0,
-        })
+        Ok(Self { encoder, gbm, width, height, coded_width, coded_height, frame_count: 0 })
     }
 
     fn encode(
@@ -690,25 +666,17 @@ impl StandardVideoEncoder for VaapiH264Encoder {
         metadata: Option<PacketMetadata>,
     ) -> Result<Vec<EncodedPacket>, String> {
         if frame.pixel_format == PixelFormat::Rgba8 {
-            return Err(
-                "VA-API H.264 encoder requires NV12 or I420 input; \
+            return Err("VA-API H.264 encoder requires NV12 or I420 input; \
                  insert a video::pixel_convert node upstream"
-                    .into(),
-            );
+                .into());
         }
 
         // Create a GBM frame and upload the raw video data.
         let mut gbm_frame = Arc::clone(&self.gbm)
             .new_frame(
                 nv12_fourcc(),
-                CrosResolution {
-                    width: self.width,
-                    height: self.height,
-                },
-                CrosResolution {
-                    width: self.coded_width,
-                    height: self.coded_height,
-                },
+                CrosResolution { width: self.width, height: self.height },
+                CrosResolution { width: self.coded_width, height: self.coded_height },
                 GbmUsage::Encode,
             )
             .map_err(|e| format!("failed to allocate GBM frame for encoding: {e}"))?;
@@ -732,16 +700,9 @@ impl StandardVideoEncoder for VaapiH264Encoder {
 
         let frame_layout = FrameLayout {
             format: (nv12_fourcc(), 0), // DRM_FORMAT_MOD_LINEAR
-            size: CrosResolution {
-                width: self.coded_width,
-                height: self.coded_height,
-            },
+            size: CrosResolution { width: self.coded_width, height: self.coded_height },
             planes: vec![
-                PlaneLayout {
-                    buffer_index: 0,
-                    offset: 0,
-                    stride: y_stride,
-                },
+                PlaneLayout { buffer_index: 0, offset: 0, stride: y_stride },
                 PlaneLayout {
                     buffer_index: 0,
                     offset: uv_offset,
@@ -768,7 +729,7 @@ impl StandardVideoEncoder for VaapiH264Encoder {
                         data: Bytes::from(coded.bitstream),
                         metadata: metadata.clone(),
                     });
-                }
+                },
                 Ok(None) => break,
                 Err(e) => return Err(format!("VA-API H.264 encoder poll error: {e}")),
             }
@@ -778,9 +739,7 @@ impl StandardVideoEncoder for VaapiH264Encoder {
     }
 
     fn flush_encoder(&mut self) -> Result<Vec<EncodedPacket>, String> {
-        self.encoder
-            .drain()
-            .map_err(|e| format!("VA-API H.264 encoder drain error: {e}"))?;
+        self.encoder.drain().map_err(|e| format!("VA-API H.264 encoder drain error: {e}"))?;
 
         let mut packets = Vec::new();
         loop {
@@ -788,7 +747,7 @@ impl StandardVideoEncoder for VaapiH264Encoder {
                 Ok(Some(coded)) => {
                     packets
                         .push(EncodedPacket { data: Bytes::from(coded.bitstream), metadata: None });
-                }
+                },
                 Ok(None) => break,
                 Err(e) => return Err(format!("VA-API H.264 encoder poll error: {e}")),
             }
@@ -821,10 +780,7 @@ pub fn register_vaapi_h264_nodes(registry: &mut NodeRegistry) {
         },
         serde_json::to_value(schema_for!(VaapiH264DecoderConfig))
             .expect("VaapiH264DecoderConfig schema should serialize to JSON"),
-        StaticPins {
-            inputs: default_decoder.input_pins(),
-            outputs: default_decoder.output_pins(),
-        },
+        StaticPins { inputs: default_decoder.input_pins(), outputs: default_decoder.output_pins() },
         vec![
             "video".to_string(),
             "codecs".to_string(),
@@ -848,10 +804,7 @@ pub fn register_vaapi_h264_nodes(registry: &mut NodeRegistry) {
         },
         serde_json::to_value(schema_for!(VaapiH264EncoderConfig))
             .expect("VaapiH264EncoderConfig schema should serialize to JSON"),
-        StaticPins {
-            inputs: default_encoder.input_pins(),
-            outputs: default_encoder.output_pins(),
-        },
+        StaticPins { inputs: default_encoder.input_pins(), outputs: default_encoder.output_pins() },
         vec![
             "video".to_string(),
             "codecs".to_string(),
@@ -1054,7 +1007,7 @@ mod tests {
                     assert_eq!(frame.height, 64);
                     assert_eq!(frame.pixel_format, PixelFormat::Nv12);
                     assert!(!frame.data().is_empty(), "Decoded frame should have data");
-                }
+                },
                 _ => panic!("Expected Video packet from VA-API H.264 decoder"),
             }
         }

From 5d8391de90f4b4d883f8bba012520d50416c8613 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 18:33:44 +0000
Subject: [PATCH 16/23] fix(nodes): auto-detect VA-API H.264 encoder entrypoint

Modern Intel GPUs (Gen 9+ / Skylake onwards) only expose the low-power
fixed-function encoder (VAEntrypointEncSliceLP), not the full encoder
(VAEntrypointEncSlice).  Query the driver for supported entrypoints and
auto-select the correct one instead of hardcoding low_power=false.

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/vaapi_h264.rs | 42 +++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/crates/nodes/src/video/vaapi_h264.rs b/crates/nodes/src/video/vaapi_h264.rs
index b80fbdec..921d93a9 100644
--- a/crates/nodes/src/video/vaapi_h264.rs
+++ b/crates/nodes/src/video/vaapi_h264.rs
@@ -624,6 +624,46 @@ impl StandardVideoEncoder for VaapiH264Encoder {
         let coded_width = align_up_u32(width, H264_MB_SIZE);
         let coded_height = align_up_u32(height, H264_MB_SIZE);
 
+        // Auto-detect the correct entrypoint.  Modern Intel GPUs (Gen 9+ /
+        // Skylake onwards) only expose the low-power fixed-function encoder
+        // (`VAEntrypointEncSliceLP`), while older hardware and some AMD
+        // drivers use `VAEntrypointEncSlice`.  Query the driver and pick
+        // whichever is available, preferring the config value when set.
+        let low_power = {
+            use libva::VAEntrypoint::{VAEntrypointEncSlice, VAEntrypointEncSliceLP};
+            use libva::VAProfile::VAProfileH264Main;
+
+            let entrypoints = display
+                .query_config_entrypoints(VAProfileH264Main)
+                .map_err(|e| format!("failed to query H.264 entrypoints: {e}"))?;
+
+            let has_lp = entrypoints.contains(&VAEntrypointEncSliceLP);
+            let has_full = entrypoints.contains(&VAEntrypointEncSlice);
+
+            if !has_lp && !has_full {
+                return Err(
+                    "VA-API driver does not support H.264 encoding (no EncSlice entrypoint)".into(),
+                );
+            }
+
+            // Prefer the user's explicit config; otherwise auto-detect.
+            if config.low_power {
+                if !has_lp {
+                    return Err(
+                        "low_power=true requested but VAEntrypointEncSliceLP is not supported"
+                            .into(),
+                    );
+                }
+                true
+            } else if has_lp && !has_full {
+                // Driver only supports low-power (common on modern Intel).
+                tracing::info!("auto-selecting low-power H.264 encoder (VAEntrypointEncSliceLP)");
+                true
+            } else {
+                false
+            }
+        };
+
         let cros_config = CrosH264EncoderConfig {
             resolution: CrosResolution { width: coded_width, height: coded_height },
             profile: H264Profile::Main,
@@ -642,7 +682,7 @@ impl StandardVideoEncoder for VaapiH264Encoder {
             cros_config,
             nv12_fourcc(),
             CrosResolution { width: coded_width, height: coded_height },
-            config.low_power,
+            low_power,
             BlockingMode::Blocking,
         )
         .map_err(|e| format!("failed to create VA-API H.264 encoder: {e}"))?;

From 81029c2f9a6ef6aaef920dbf3138a79f199d9cd8 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 18:54:55 +0000
Subject: [PATCH 17/23] fix(nodes): bypass GBM for VA-API encoders, use direct
 VA surfaces

Replace GBM-backed frame allocation with direct VA surface creation
and Image API uploads for both H.264 and AV1 VA-API encoders.

The cros-codecs GBM allocator uses GBM_BO_USE_HW_VIDEO_ENCODER, a flag
that Mesa's iris driver does not support for NV12 on some hardware
(e.g. Intel Tiger Lake with Mesa 23.x), causing 'Error allocating
contiguous buffer' failures.

By using libva Surface<()> handles instead:
- Surfaces are created via vaCreateSurfaces (no GBM needed)
- NV12 data is uploaded via the VA Image API (vaCreateImage + vaPutImage)
- The encoder's import_picture passthrough accepts Surface<()> directly
- Pitches/offsets come from the VA driver's VAImage, not GBM

This also adds two new shared helpers in vaapi_av1.rs:
- open_va_display(): opens VA display without GBM device
- write_nv12_to_va_surface(): uploads NV12/I420 frame data to a VA
  surface using the Image API, returning driver pitches/offsets

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 Cargo.toml                           |   1 +
 crates/nodes/src/video/vaapi_av1.rs  | 186 +++++++++++++++++++++------
 crates/nodes/src/video/vaapi_h264.rs |  79 ++++++------
 3 files changed, 183 insertions(+), 83 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 96863e78..ffd8745b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -78,6 +78,7 @@ opt-level = 3
 [profile.dev.package.maybe-rayon]
 opt-level = 3
 
+
 [workspace.lints.rust]
 unsafe_code = "forbid"
 # missing_debug_implementations = "warn"
diff --git a/crates/nodes/src/video/vaapi_av1.rs b/crates/nodes/src/video/vaapi_av1.rs
index f69af6e9..771d17cd 100644
--- a/crates/nodes/src/video/vaapi_av1.rs
+++ b/crates/nodes/src/video/vaapi_av1.rs
@@ -163,6 +163,118 @@ pub(super) fn open_va_and_gbm(
     Ok((display, gbm, path))
 }
 
+/// Open a VA display without a GBM device.
+///
+/// Used by encoder paths that pass VA surfaces directly to the encoder,
+/// bypassing GBM buffer allocation entirely.  This avoids the
+/// `GBM_BO_USE_HW_VIDEO_ENCODER` flag that Mesa's iris driver does not
+/// support for NV12 on some hardware (e.g. Intel Tiger Lake).
+pub(super) fn open_va_display(
+    render_device: Option<&String>,
+) -> Result<(Rc<libva::Display>, String), String> {
+    let path = resolve_render_device(render_device);
+    let display = libva::Display::open_drm_display(&path)
+        .map_err(|e| format!("failed to open VA display on {path}: {e}"))?;
+    Ok((display, path))
+}
+
+/// Write NV12 (or I420→NV12) data from a StreamKit [`VideoFrame`] into a VA
+/// surface using the VA-API Image API.
+///
+/// Uses `vaCreateImage` + `vaMapBuffer` to obtain a writable mapping, writes
+/// NV12 data respecting the driver's internal pitches/offsets, then drops the
+/// [`Image`] which flushes the data back via `vaPutImage`.
+///
+/// Returns `(pitches, offsets)` — the per-plane stride and byte-offset arrays
+/// from the `VAImage`, needed to build the [`FrameLayout`] for the encoder.
+pub(super) fn write_nv12_to_va_surface(
+    display: &Rc<libva::Display>,
+    surface: &libva::Surface<()>,
+    frame: &VideoFrame,
+) -> Result<([usize; 2], [usize; 2]), String> {
+    let nv12_fourcc_val: u32 = nv12_fourcc().into();
+    let image_fmts = display
+        .query_image_formats()
+        .map_err(|e| format!("failed to query VA image formats: {e}"))?;
+    let image_fmt = image_fmts
+        .into_iter()
+        .find(|f| f.fourcc == nv12_fourcc_val)
+        .ok_or("VA driver does not support NV12 image format")?;
+
+    let mut image = libva::Image::create_from(surface, image_fmt, surface.size(), surface.size())
+        .map_err(|e| format!("failed to create VA image for NV12 upload: {e}"))?;
+
+    let va_image = *image.image();
+    let y_pitch = va_image.pitches[0] as usize;
+    let uv_pitch = va_image.pitches[1] as usize;
+    let y_offset = va_image.offsets[0] as usize;
+    let uv_offset = va_image.offsets[1] as usize;
+
+    let dest = image.as_mut();
+    let src = frame.data.as_ref().as_ref();
+    let w = frame.width as usize;
+    let h = frame.height as usize;
+
+    match frame.pixel_format {
+        PixelFormat::Nv12 => {
+            // Y plane.
+            for row in 0..h {
+                let s = row * w;
+                let d = y_offset + row * y_pitch;
+                if s + w <= src.len() && d + w <= dest.len() {
+                    dest[d..d + w].copy_from_slice(&src[s..s + w]);
+                }
+            }
+            // UV plane (already interleaved in NV12).
+            let uv_h = h / 2;
+            let src_uv = &src[w * h..];
+            for row in 0..uv_h {
+                let s = row * w;
+                let d = uv_offset + row * uv_pitch;
+                if s + w <= src_uv.len() && d + w <= dest.len() {
+                    dest[d..d + w].copy_from_slice(&src_uv[s..s + w]);
+                }
+            }
+        },
+        PixelFormat::I420 => {
+            // Y plane — same as NV12.
+            for row in 0..h {
+                let s = row * w;
+                let d = y_offset + row * y_pitch;
+                if s + w <= src.len() && d + w <= dest.len() {
+                    dest[d..d + w].copy_from_slice(&src[s..s + w]);
+                }
+            }
+            // I420 → NV12: interleave U and V into a single UV plane.
+            let uv_w = w / 2;
+            let uv_h = h / 2;
+            let u_start = w * h;
+            let v_start = u_start + uv_w * uv_h;
+            for row in 0..uv_h {
+                for col in 0..uv_w {
+                    let u_idx = u_start + row * uv_w + col;
+                    let v_idx = v_start + row * uv_w + col;
+                    let d = uv_offset + row * uv_pitch + col * 2;
+                    if u_idx < src.len() && v_idx < src.len() && d + 1 < dest.len() {
+                        dest[d] = src[u_idx];
+                        dest[d + 1] = src[v_idx];
+                    }
+                }
+            }
+        },
+        other => {
+            drop(image);
+            return Err(format!("write_nv12_to_va_surface: unsupported pixel format {other:?}"));
+        },
+    }
+
+    // Sync the surface before dropping the image (which calls vaPutImage).
+    surface.sync().map_err(|e| format!("VA surface sync failed: {e}"))?;
+    drop(image);
+
+    Ok(([y_pitch, uv_pitch], [y_offset, uv_offset]))
+}
+
 /// Copy NV12 plane data from a GBM read-mapping into a flat `Vec<u8>` suitable
 /// for a packed StreamKit [`VideoFrame`].
 ///
@@ -846,14 +958,14 @@ impl EncoderNodeRunner for VaapiAv1EncoderNode {
 // Encoder — internal codec wrapper
 // ---------------------------------------------------------------------------
 
-/// Type alias for the full VA-API AV1 encoder with GBM-backed frames.
+/// Type alias for the VA-API AV1 encoder using direct VA surfaces.
+///
+/// Bypasses GBM buffer allocation entirely — see the H.264 encoder type alias
+/// in `vaapi_h264.rs` for the full rationale.
 type CrosVaapiAv1Encoder = StatelessEncoder<
     cros_codecs::encoder::av1::AV1,
-    GbmVideoFrame,
-    cros_codecs::backend::vaapi::encoder::VaapiBackend<
-        GbmExternalBufferDescriptor,
-        libva::Surface<GbmExternalBufferDescriptor>,
-    >,
+    libva::Surface<()>,
+    cros_codecs::backend::vaapi::encoder::VaapiBackend<(), libva::Surface<()>>,
 >;
 
 /// Internal encoder state wrapping the cros-codecs `StatelessEncoder`.
@@ -862,7 +974,7 @@ type CrosVaapiAv1Encoder = StatelessEncoder<
 /// a `spawn_blocking` thread, matching the pattern in `av1.rs`.
 struct VaapiAv1Encoder {
     encoder: CrosVaapiAv1Encoder,
-    gbm: Arc<GbmDevice>,
+    display: Rc<libva::Display>,
     width: u32,
     height: u32,
     coded_width: u32,
@@ -875,7 +987,7 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
     const CODEC_NAME: &'static str = "VA-API AV1";
 
     fn new_encoder(width: u32, height: u32, config: &Self::Config) -> Result<Self, String> {
-        let (display, gbm, path) = open_va_and_gbm(config.render_device.as_ref())?;
+        let (display, path) = open_va_display(config.render_device.as_ref())?;
         tracing::info!(device = %path, width, height, "VA-API AV1 encoder opening");
 
         let coded_width = align_up_u32(width, AV1_SB_SIZE);
@@ -895,7 +1007,7 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
         };
 
         let encoder = CrosVaapiAv1Encoder::new_vaapi(
-            display,
+            Rc::clone(&display),
             cros_config,
             nv12_fourcc(),
             CrosResolution { width: coded_width, height: coded_height },
@@ -914,7 +1026,7 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
             "VA-API AV1 encoder created"
         );
 
-        Ok(Self { encoder, gbm, width, height, coded_width, coded_height, frame_count: 0 })
+        Ok(Self { encoder, display, width, height, coded_width, coded_height, frame_count: 0 })
     }
 
     fn encode(
@@ -928,46 +1040,36 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
                 .into());
         }
 
-        // Create a GBM frame and upload the raw video data.
-        let mut gbm_frame = Arc::clone(&self.gbm)
-            .new_frame(
-                nv12_fourcc(),
-                CrosResolution { width: self.width, height: self.height },
-                CrosResolution { width: self.coded_width, height: self.coded_height },
-                GbmUsage::Encode,
+        // Create a VA surface and upload NV12 data via the Image API.
+        // This bypasses GBM buffer allocation (GBM_BO_USE_HW_VIDEO_ENCODER),
+        // which Mesa's iris driver does not support for NV12 on all hardware.
+        let nv12_fourcc_val: u32 = nv12_fourcc().into();
+        let mut surfaces = self
+            .display
+            .create_surfaces(
+                libva::VA_RT_FORMAT_YUV420,
+                Some(nv12_fourcc_val),
+                self.coded_width,
+                self.coded_height,
+                Some(libva::UsageHint::USAGE_HINT_ENCODER),
+                vec![()],
             )
-            .map_err(|e| format!("failed to allocate GBM frame for encoding: {e}"))?;
-
-        // Write frame data into the GBM buffer.
-        let pitches = gbm_frame.get_plane_pitch();
-        {
-            let mapping = gbm_frame
-                .map_mut()
-                .map_err(|e| format!("failed to map GBM frame for writing: {e}"))?;
-            write_nv12_to_mapping(mapping.as_ref(), frame, &pitches)?;
-        }
+            .map_err(|e| format!("failed to create VA surface for encoding: {e}"))?;
+        let surface =
+            surfaces.pop().ok_or_else(|| "create_surfaces returned empty vec".to_string())?;
+
+        // Write frame data into the VA surface.
+        let (pitches, offsets) = write_nv12_to_va_surface(&self.display, &surface, frame)?;
 
         let is_keyframe = metadata.as_ref().and_then(|m| m.keyframe).unwrap_or(false);
         let timestamp = metadata.as_ref().and_then(|m| m.timestamp_us).unwrap_or(self.frame_count);
 
-        // Ideally we'd use `gbm_frame.get_plane_offset()` to get the real UV
-        // plane offset from the GBM allocator, but that method is private in
-        // cros-codecs 0.0.6.  Fall back to computing it from pitch × coded_height,
-        // which is correct for linear (non-tiled) NV12 allocations — the common
-        // case for VA-API encode surfaces.
-        let y_stride = pitches.first().copied().unwrap_or(self.coded_width as usize);
-        let uv_offset = y_stride * self.coded_height as usize;
-
         let frame_layout = FrameLayout {
             format: (nv12_fourcc(), 0), // DRM_FORMAT_MOD_LINEAR
             size: CrosResolution { width: self.coded_width, height: self.coded_height },
             planes: vec![
-                PlaneLayout { buffer_index: 0, offset: 0, stride: y_stride },
-                PlaneLayout {
-                    buffer_index: 0,
-                    offset: uv_offset,
-                    stride: pitches.get(1).copied().unwrap_or(self.coded_width as usize),
-                },
+                PlaneLayout { buffer_index: 0, offset: offsets[0], stride: pitches[0] },
+                PlaneLayout { buffer_index: 0, offset: offsets[1], stride: pitches[1] },
             ],
         };
 
@@ -975,7 +1077,7 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
             CrosFrameMetadata { timestamp, layout: frame_layout, force_keyframe: is_keyframe };
 
         self.encoder
-            .encode(cros_meta, gbm_frame)
+            .encode(cros_meta, surface)
             .map_err(|e| format!("VA-API AV1 encode error: {e}"))?;
 
         self.frame_count += 1;
diff --git a/crates/nodes/src/video/vaapi_h264.rs b/crates/nodes/src/video/vaapi_h264.rs
index 921d93a9..7285fd2c 100644
--- a/crates/nodes/src/video/vaapi_h264.rs
+++ b/crates/nodes/src/video/vaapi_h264.rs
@@ -73,9 +73,10 @@ use super::HwAccelMode;
 use super::H264_CONTENT_TYPE;
 
 // Re-use helpers from the VA-API AV1 module — they are codec-agnostic NV12
-// I/O routines (GBM mapping, render-device detection, etc.).
+// I/O routines (VA surface upload, GBM mapping, render-device detection, etc.).
 use super::vaapi_av1::{
-    align_up_u32, nv12_fourcc, open_va_and_gbm, read_nv12_from_mapping, write_nv12_to_mapping,
+    align_up_u32, nv12_fourcc, open_va_and_gbm, open_va_display, read_nv12_from_mapping,
+    write_nv12_to_mapping, write_nv12_to_va_surface,
 };
 
 // ---------------------------------------------------------------------------
@@ -589,14 +590,17 @@ impl EncoderNodeRunner for VaapiH264EncoderNode {
 // Encoder — internal codec wrapper
 // ---------------------------------------------------------------------------
 
-/// Type alias for the full VA-API H.264 encoder with GBM-backed frames.
+/// Type alias for the VA-API H.264 encoder using direct VA surfaces.
+///
+/// Bypasses GBM buffer allocation entirely — input frames are uploaded to
+/// VA surfaces via the VA-API Image API and passed straight through to the
+/// encoder backend.  This avoids the `GBM_BO_USE_HW_VIDEO_ENCODER` flag
+/// which Mesa's iris driver does not support for NV12 on some hardware
+/// (e.g. Intel Tiger Lake with Mesa 23.x).
 type CrosVaapiH264Encoder = StatelessEncoder<
     cros_codecs::encoder::h264::H264,
-    GbmVideoFrame,
-    cros_codecs::backend::vaapi::encoder::VaapiBackend<
-        GbmExternalBufferDescriptor,
-        libva::Surface<GbmExternalBufferDescriptor>,
-    >,
+    libva::Surface<()>,
+    cros_codecs::backend::vaapi::encoder::VaapiBackend<(), libva::Surface<()>>,
 >;
 
 /// Internal encoder state wrapping the cros-codecs `StatelessEncoder`.
@@ -605,7 +609,7 @@ type CrosVaapiH264Encoder = StatelessEncoder<
 /// a `spawn_blocking` thread.
 struct VaapiH264Encoder {
     encoder: CrosVaapiH264Encoder,
-    gbm: Arc<GbmDevice>,
+    display: Rc<libva::Display>,
     width: u32,
     height: u32,
     coded_width: u32,
@@ -618,7 +622,7 @@ impl StandardVideoEncoder for VaapiH264Encoder {
     const CODEC_NAME: &'static str = "VA-API H.264";
 
     fn new_encoder(width: u32, height: u32, config: &Self::Config) -> Result<Self, String> {
-        let (display, gbm, path) = open_va_and_gbm(config.render_device.as_ref())?;
+        let (display, path) = open_va_display(config.render_device.as_ref())?;
         tracing::info!(device = %path, width, height, "VA-API H.264 encoder opening");
 
         let coded_width = align_up_u32(width, H264_MB_SIZE);
@@ -678,7 +682,7 @@ impl StandardVideoEncoder for VaapiH264Encoder {
         };
 
         let encoder = CrosVaapiH264Encoder::new_vaapi(
-            display,
+            Rc::clone(&display),
             cros_config,
             nv12_fourcc(),
             CrosResolution { width: coded_width, height: coded_height },
@@ -697,7 +701,7 @@ impl StandardVideoEncoder for VaapiH264Encoder {
             "VA-API H.264 encoder created"
         );
 
-        Ok(Self { encoder, gbm, width, height, coded_width, coded_height, frame_count: 0 })
+        Ok(Self { encoder, display, width, height, coded_width, coded_height, frame_count: 0 })
     }
 
     fn encode(
@@ -711,43 +715,36 @@ impl StandardVideoEncoder for VaapiH264Encoder {
                 .into());
         }
 
-        // Create a GBM frame and upload the raw video data.
-        let mut gbm_frame = Arc::clone(&self.gbm)
-            .new_frame(
-                nv12_fourcc(),
-                CrosResolution { width: self.width, height: self.height },
-                CrosResolution { width: self.coded_width, height: self.coded_height },
-                GbmUsage::Encode,
+        // Create a VA surface and upload NV12 data via the Image API.
+        // This bypasses GBM buffer allocation (GBM_BO_USE_HW_VIDEO_ENCODER),
+        // which Mesa's iris driver does not support for NV12 on all hardware.
+        let nv12_fourcc_val: u32 = nv12_fourcc().into();
+        let mut surfaces = self
+            .display
+            .create_surfaces(
+                libva::VA_RT_FORMAT_YUV420,
+                Some(nv12_fourcc_val),
+                self.coded_width,
+                self.coded_height,
+                Some(libva::UsageHint::USAGE_HINT_ENCODER),
+                vec![()],
             )
-            .map_err(|e| format!("failed to allocate GBM frame for encoding: {e}"))?;
-
-        // Write frame data into the GBM buffer.
-        let pitches = gbm_frame.get_plane_pitch();
-        {
-            let mapping = gbm_frame
-                .map_mut()
-                .map_err(|e| format!("failed to map GBM frame for writing: {e}"))?;
-            write_nv12_to_mapping(mapping.as_ref(), frame, &pitches)?;
-        }
+            .map_err(|e| format!("failed to create VA surface for encoding: {e}"))?;
+        let surface =
+            surfaces.pop().ok_or_else(|| "create_surfaces returned empty vec".to_string())?;
+
+        // Write frame data into the VA surface.
+        let (pitches, offsets) = write_nv12_to_va_surface(&self.display, &surface, frame)?;
 
         let is_keyframe = metadata.as_ref().and_then(|m| m.keyframe).unwrap_or(false);
         let timestamp = metadata.as_ref().and_then(|m| m.timestamp_us).unwrap_or(self.frame_count);
 
-        // Compute UV plane offset from pitch × coded_height (same approach as
-        // the AV1 encoder — get_plane_offset() is private in cros-codecs 0.0.6).
-        let y_stride = pitches.first().copied().unwrap_or(self.coded_width as usize);
-        let uv_offset = y_stride * self.coded_height as usize;
-
         let frame_layout = FrameLayout {
             format: (nv12_fourcc(), 0), // DRM_FORMAT_MOD_LINEAR
             size: CrosResolution { width: self.coded_width, height: self.coded_height },
             planes: vec![
-                PlaneLayout { buffer_index: 0, offset: 0, stride: y_stride },
-                PlaneLayout {
-                    buffer_index: 0,
-                    offset: uv_offset,
-                    stride: pitches.get(1).copied().unwrap_or(self.coded_width as usize),
-                },
+                PlaneLayout { buffer_index: 0, offset: offsets[0], stride: pitches[0] },
+                PlaneLayout { buffer_index: 0, offset: offsets[1], stride: pitches[1] },
             ],
         };
 
@@ -755,7 +752,7 @@ impl StandardVideoEncoder for VaapiH264Encoder {
             CrosFrameMetadata { timestamp, layout: frame_layout, force_keyframe: is_keyframe };
 
         self.encoder
-            .encode(cros_meta, gbm_frame)
+            .encode(cros_meta, surface)
             .map_err(|e| format!("VA-API H.264 encode error: {e}"))?;
 
         self.frame_count += 1;

From 3592856350c48426bb1e27e8ed182749a7936dec Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 19:03:50 +0000
Subject: [PATCH 18/23] fix(nodes): use ceiling division for chroma dimensions
 in VA surface upload

write_nv12_to_va_surface used truncating integer division (w / 2, h / 2)
for chroma plane dimensions, which would corrupt chroma data for frames
with odd width or height.  VideoLayout::packed uses (width + 1) / 2 for
chroma dimensions, so the upload function must match.

Changes:
- NV12 path: use (h+1)/2 for uv_h, ((w+1)/2)*2 for chroma row bytes
- I420 path: use (w+1)/2 for uv_w, (h+1)/2 for uv_h

This matches the existing write_nv12_to_mapping (which uses div_ceil)
and i420_to_nv12_buffer in nv_av1.rs.

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/vaapi_av1.rs | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/crates/nodes/src/video/vaapi_av1.rs b/crates/nodes/src/video/vaapi_av1.rs
index 771d17cd..84725951 100644
--- a/crates/nodes/src/video/vaapi_av1.rs
+++ b/crates/nodes/src/video/vaapi_av1.rs
@@ -226,13 +226,17 @@ pub(super) fn write_nv12_to_va_surface(
                 }
             }
             // UV plane (already interleaved in NV12).
-            let uv_h = h / 2;
+            // Use ceiling division to handle odd dimensions, matching
+            // VideoLayout::packed which uses `(width + 1) / 2`.
+            let uv_h = (h + 1) / 2;
+            let chroma_row_bytes = ((w + 1) / 2) * 2;
             let src_uv = &src[w * h..];
             for row in 0..uv_h {
-                let s = row * w;
+                let s = row * chroma_row_bytes;
+                let copy_w = chroma_row_bytes.min(w);
                 let d = uv_offset + row * uv_pitch;
-                if s + w <= src_uv.len() && d + w <= dest.len() {
-                    dest[d..d + w].copy_from_slice(&src_uv[s..s + w]);
+                if s + copy_w <= src_uv.len() && d + copy_w <= dest.len() {
+                    dest[d..d + copy_w].copy_from_slice(&src_uv[s..s + copy_w]);
                 }
             }
         },
@@ -246,8 +250,10 @@ pub(super) fn write_nv12_to_va_surface(
                 }
             }
             // I420 → NV12: interleave U and V into a single UV plane.
-            let uv_w = w / 2;
-            let uv_h = h / 2;
+            // Use ceiling division to handle odd dimensions correctly,
+            // matching VideoLayout::packed and i420_to_nv12_buffer.
+            let uv_w = (w + 1) / 2;
+            let uv_h = (h + 1) / 2;
             let u_start = w * h;
             let v_start = u_start + uv_w * uv_h;
             for row in 0..uv_h {

From 0bf779d3e6917d27d11eadc42c29a208aaa04af8 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 19:10:00 +0000
Subject: [PATCH 19/23] fix(nodes): remove incorrect .min(w) clamp on NV12 UV
 row copy

For odd-width frames, chroma_row_bytes (e.g. 642 for w=641) is the
correct number of bytes per UV row in VideoLayout::packed format.
Clamping to .min(w) would drop the last V sample on every UV row.

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/vaapi_av1.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/nodes/src/video/vaapi_av1.rs b/crates/nodes/src/video/vaapi_av1.rs
index 84725951..40c0e33c 100644
--- a/crates/nodes/src/video/vaapi_av1.rs
+++ b/crates/nodes/src/video/vaapi_av1.rs
@@ -233,10 +233,10 @@ pub(super) fn write_nv12_to_va_surface(
             let src_uv = &src[w * h..];
             for row in 0..uv_h {
                 let s = row * chroma_row_bytes;
-                let copy_w = chroma_row_bytes.min(w);
                 let d = uv_offset + row * uv_pitch;
-                if s + copy_w <= src_uv.len() && d + copy_w <= dest.len() {
-                    dest[d..d + copy_w].copy_from_slice(&src_uv[s..s + copy_w]);
+                if s + chroma_row_bytes <= src_uv.len() && d + chroma_row_bytes <= dest.len() {
+                    dest[d..d + chroma_row_bytes]
+                        .copy_from_slice(&src_uv[s..s + chroma_row_bytes]);
                 }
             }
         },

From fcf8d3d3b4a440c2413eb12b13335fbf5390ff1e Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Thu, 9 Apr 2026 19:10:12 +0000
Subject: [PATCH 20/23] style(nodes): fix rustfmt for VA surface UV copy

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/vaapi_av1.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/crates/nodes/src/video/vaapi_av1.rs b/crates/nodes/src/video/vaapi_av1.rs
index 40c0e33c..9eccd219 100644
--- a/crates/nodes/src/video/vaapi_av1.rs
+++ b/crates/nodes/src/video/vaapi_av1.rs
@@ -235,8 +235,7 @@ pub(super) fn write_nv12_to_va_surface(
                 let s = row * chroma_row_bytes;
                 let d = uv_offset + row * uv_pitch;
                 if s + chroma_row_bytes <= src_uv.len() && d + chroma_row_bytes <= dest.len() {
-                    dest[d..d + chroma_row_bytes]
-                        .copy_from_slice(&src_uv[s..s + chroma_row_bytes]);
+                    dest[d..d + chroma_row_bytes].copy_from_slice(&src_uv[s..s + chroma_row_bytes]);
                 }
             }
         },

From e455df6a22cf188e55baf9335668d50b3928d5ef Mon Sep 17 00:00:00 2001
From: streamer45 <cstcld91@gmail.com>
Date: Fri, 10 Apr 2026 12:49:48 +0200
Subject: [PATCH 21/23] fix h264 enc/dec, on my laptop anyway :/

---
 Cargo.lock                           |   1 +
 crates/nodes/Cargo.toml              |   3 +-
 crates/nodes/src/video/vaapi_av1.rs  | 478 ++++++++++++++++++++-------
 crates/nodes/src/video/vaapi_h264.rs | 229 ++++++++-----
 justfile                             |   5 +
 5 files changed, 506 insertions(+), 210 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5390b6dc..d6c06a47 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6032,6 +6032,7 @@ dependencies = [
  "fontdue",
  "futures",
  "futures-util",
+ "gbm-sys",
  "hang",
  "image",
  "moq-lite",
diff --git a/crates/nodes/Cargo.toml b/crates/nodes/Cargo.toml
index adba5046..689fdab4 100644
--- a/crates/nodes/Cargo.toml
+++ b/crates/nodes/Cargo.toml
@@ -109,6 +109,7 @@ bytemuck = { version = "1.22", optional = true, features = ["derive"] }
 # HW-accelerated video codecs (optional, behind respective features)
 vk-video = { version = "0.3", optional = true }            # vulkan_video feature — Vulkan Video H.264 HW codec
 cros-codecs = { version = "0.0.6", optional = true, features = ["vaapi"] }  # vaapi feature — requires libva-dev system package
+gbm-sys = { version = "0.3", optional = true }  # vaapi feature — raw GBM BO allocation for encoder frames
 shiguredo_nvcodec = { version = "2025.2", optional = true }
 
 futures-util = "0.3"
@@ -186,7 +187,7 @@ video = ["vp9", "av1", "openh264", "colorbars", "compositor"]
 # vulkan_video: H.264 encode/decode via Vulkan Video (vk-video crate). Cross-vendor (Intel/NVIDIA/AMD).
 vulkan_video = ["dep:schemars", "dep:vk-video", "dep:serde_json"]
 # vaapi: AV1 encode/decode via VA-API (cros-codecs crate). Primarily Intel, also AMD.
-vaapi = ["dep:schemars", "dep:cros-codecs", "dep:serde_json"]
+vaapi = ["dep:schemars", "dep:cros-codecs", "dep:gbm-sys", "dep:serde_json"]
 # nvcodec: AV1 encode/decode via NVENC/NVDEC (shiguredo_nvcodec crate). NVIDIA only.
 nvcodec = ["dep:schemars", "dep:shiguredo_nvcodec", "dep:serde_json"]
 
diff --git a/crates/nodes/src/video/vaapi_av1.rs b/crates/nodes/src/video/vaapi_av1.rs
index 9eccd219..e53809bb 100644
--- a/crates/nodes/src/video/vaapi_av1.rs
+++ b/crates/nodes/src/video/vaapi_av1.rs
@@ -61,9 +61,8 @@ use cros_codecs::encoder::{
     FrameMetadata as CrosFrameMetadata, PredictionStructure, RateControl, Tunings, VideoEncoder,
 };
 use cros_codecs::libva;
-use cros_codecs::video_frame::gbm_video_frame::{
-    GbmDevice, GbmExternalBufferDescriptor, GbmUsage, GbmVideoFrame,
-};
+use cros_codecs::video_frame::gbm_video_frame::{GbmDevice, GbmUsage, GbmVideoFrame};
+use cros_codecs::video_frame::generic_dma_video_frame::GenericDmaVideoFrame;
 use cros_codecs::video_frame::{ReadMapping, VideoFrame as CrosVideoFrame, WriteMapping};
 use cros_codecs::{Fourcc as CrosFourcc, FrameLayout, PlaneLayout, Resolution as CrosResolution};
 
@@ -96,6 +95,87 @@ const DEFAULT_QUALITY: u32 = 128;
 /// Default framerate for rate-control hints.
 const DEFAULT_FRAMERATE: u32 = 30;
 
+// ---------------------------------------------------------------------------
+// Frame upload strategy
+// ---------------------------------------------------------------------------
+
+/// Strategy for uploading NV12 frames to VA-API for encoding.
+///
+/// Detected at encoder creation time by probing GBM BO allocation.
+/// The fast GBM path is used when the driver supports `GBM_BO_USE_HW_VIDEO_ENCODER`
+/// for NV12; otherwise falls back to VA surface + Image API + DMA-BUF export.
+pub(super) enum FrameUploadStrategy {
+    /// Direct GBM allocation with `GBM_BO_USE_HW_VIDEO_ENCODER`.
+    /// Fastest path: GBM BO → mmap write → DMA-BUF FD → VA surface import.
+    Gbm(Arc<GbmDevice>),
+    /// VA surface + Image API upload + `vaExportSurfaceHandle`.
+    /// Compatible fallback for drivers that don't support GBM NV12 encoder BOs
+    /// (e.g. Mesa iris on Intel Tiger Lake).
+    VaSurface,
+}
+
+/// Probe whether GBM can allocate NV12 BOs with `GBM_BO_USE_HW_VIDEO_ENCODER`.
+///
+/// Tries a small test allocation and returns `Some(gbm_device)` on success.
+pub(super) fn probe_gbm_encode_support(render_device: &str) -> Option<Arc<GbmDevice>> {
+    let gbm = GbmDevice::open(render_device).ok()?;
+
+    // Try a small 64x64 NV12 BO with the encoder flag.
+    let test = Arc::clone(&gbm).new_frame(
+        nv12_fourcc(),
+        CrosResolution { width: 64, height: 64 },
+        CrosResolution { width: 64, height: 64 },
+        GbmUsage::Encode,
+    );
+
+    match test {
+        Ok(_) => {
+            tracing::info!("GBM NV12 encoder BO probe succeeded — using fast GBM path");
+            Some(gbm)
+        },
+        Err(_) => {
+            tracing::info!("GBM NV12 encoder BO probe failed — using VA surface fallback path");
+            None
+        },
+    }
+}
+
+/// Allocate a GBM NV12 frame, write pixel data, and convert to [`GenericDmaVideoFrame`].
+///
+/// This is the fast path: GBM BO → mmap → write NV12 → extract DMA-BUF FD.
+/// Avoids the VA Image API and export_prime round-trip.
+pub(super) fn upload_nv12_via_gbm(
+    gbm: &Arc<GbmDevice>,
+    frame: &VideoFrame,
+    coded_width: u32,
+    coded_height: u32,
+) -> Result<(GenericDmaVideoFrame, Vec<usize>), String> {
+    let mut gbm_frame = Arc::clone(gbm)
+        .new_frame(
+            nv12_fourcc(),
+            CrosResolution { width: frame.width, height: frame.height },
+            CrosResolution { width: coded_width, height: coded_height },
+            GbmUsage::Encode,
+        )
+        .map_err(|e| format!("failed to allocate GBM encode frame: {e}"))?;
+
+    let pitches = CrosVideoFrame::get_plane_pitch(&gbm_frame);
+    {
+        let mapping = CrosVideoFrame::map_mut(&mut gbm_frame)
+            .map_err(|e| format!("failed to map GBM frame for writing: {e}"))?;
+        write_nv12_to_mapping(mapping.as_ref(), frame, &pitches)?;
+    }
+
+    let dma_frame = gbm_frame
+        .to_generic_dma_video_frame()
+        .map_err(|e| format!("failed to convert GBM frame to DMA: {e}"))?;
+
+    // Get pitches from the DMA frame layout (matches what the GBM BO reported).
+    let dma_pitches = CrosVideoFrame::get_plane_pitch(&dma_frame);
+
+    Ok((dma_frame, dma_pitches))
+}
+
 // ---------------------------------------------------------------------------
 // Helpers
 // ---------------------------------------------------------------------------
@@ -163,36 +243,37 @@ pub(super) fn open_va_and_gbm(
     Ok((display, gbm, path))
 }
 
-/// Open a VA display without a GBM device.
-///
-/// Used by encoder paths that pass VA surfaces directly to the encoder,
-/// bypassing GBM buffer allocation entirely.  This avoids the
-/// `GBM_BO_USE_HW_VIDEO_ENCODER` flag that Mesa's iris driver does not
-/// support for NV12 on some hardware (e.g. Intel Tiger Lake).
-pub(super) fn open_va_display(
-    render_device: Option<&String>,
-) -> Result<(Rc<libva::Display>, String), String> {
-    let path = resolve_render_device(render_device);
-    let display = libva::Display::open_drm_display(&path)
-        .map_err(|e| format!("failed to open VA display on {path}: {e}"))?;
-    Ok((display, path))
-}
-
-/// Write NV12 (or I420→NV12) data from a StreamKit [`VideoFrame`] into a VA
-/// surface using the VA-API Image API.
+/// Allocate an NV12 frame as a [`GenericDmaVideoFrame`] using VA surfaces.
 ///
-/// Uses `vaCreateImage` + `vaMapBuffer` to obtain a writable mapping, writes
-/// NV12 data respecting the driver's internal pitches/offsets, then drops the
-/// [`Image`] which flushes the data back via `vaPutImage`.
+/// Bypasses GBM entirely — creates a plain VA surface, uploads NV12 pixel
+/// data via the VA Image API, then exports the surface as a DMA-BUF FD
+/// via `vaExportSurfaceHandle`.  This avoids all GBM usage flags
+/// (`HW_VIDEO_ENCODER`, `HW_VIDEO_DECODER`, `LINEAR`) that Mesa's iris
+/// driver may not support for NV12 on some Intel hardware.
 ///
-/// Returns `(pitches, offsets)` — the per-plane stride and byte-offset arrays
-/// from the `VAImage`, needed to build the [`FrameLayout`] for the encoder.
-pub(super) fn write_nv12_to_va_surface(
+/// Returns the DMA frame together with its per-plane pitches.
+pub(super) fn upload_nv12_to_dma_frame(
     display: &Rc<libva::Display>,
-    surface: &libva::Surface<()>,
     frame: &VideoFrame,
-) -> Result<([usize; 2], [usize; 2]), String> {
+    coded_width: u32,
+    coded_height: u32,
+) -> Result<(GenericDmaVideoFrame, Vec<usize>), String> {
     let nv12_fourcc_val: u32 = nv12_fourcc().into();
+
+    // Create a plain VA surface (no GBM, no external buffer).
+    let mut surfaces = display
+        .create_surfaces(
+            libva::VA_RT_FORMAT_YUV420,
+            Some(nv12_fourcc_val),
+            coded_width,
+            coded_height,
+            Some(libva::UsageHint::USAGE_HINT_ENCODER),
+            vec![()],
+        )
+        .map_err(|e| format!("failed to create VA surface: {e}"))?;
+    let surface = surfaces.pop().ok_or("create_surfaces returned empty vec")?;
+
+    // Upload NV12 data via VA Image API.
     let image_fmts = display
         .query_image_formats()
         .map_err(|e| format!("failed to query VA image formats: {e}"))?;
@@ -201,65 +282,58 @@ pub(super) fn write_nv12_to_va_surface(
         .find(|f| f.fourcc == nv12_fourcc_val)
         .ok_or("VA driver does not support NV12 image format")?;
 
-    let mut image = libva::Image::create_from(surface, image_fmt, surface.size(), surface.size())
+    let mut image = libva::Image::create_from(&surface, image_fmt, surface.size(), surface.size())
         .map_err(|e| format!("failed to create VA image for NV12 upload: {e}"))?;
 
     let va_image = *image.image();
     let y_pitch = va_image.pitches[0] as usize;
     let uv_pitch = va_image.pitches[1] as usize;
-    let y_offset = va_image.offsets[0] as usize;
-    let uv_offset = va_image.offsets[1] as usize;
 
+    // Write pixel data into the VA image buffer.
     let dest = image.as_mut();
     let src = frame.data.as_ref().as_ref();
     let w = frame.width as usize;
     let h = frame.height as usize;
+    let y_offset_img = va_image.offsets[0] as usize;
+    let uv_offset_img = va_image.offsets[1] as usize;
 
     match frame.pixel_format {
         PixelFormat::Nv12 => {
-            // Y plane.
             for row in 0..h {
                 let s = row * w;
-                let d = y_offset + row * y_pitch;
+                let d = y_offset_img + row * y_pitch;
                 if s + w <= src.len() && d + w <= dest.len() {
                     dest[d..d + w].copy_from_slice(&src[s..s + w]);
                 }
             }
-            // UV plane (already interleaved in NV12).
-            // Use ceiling division to handle odd dimensions, matching
-            // VideoLayout::packed which uses `(width + 1) / 2`.
-            let uv_h = (h + 1) / 2;
-            let chroma_row_bytes = ((w + 1) / 2) * 2;
+            let uv_h = h.div_ceil(2);
+            let chroma_row_bytes = w.div_ceil(2) * 2;
             let src_uv = &src[w * h..];
             for row in 0..uv_h {
                 let s = row * chroma_row_bytes;
-                let d = uv_offset + row * uv_pitch;
+                let d = uv_offset_img + row * uv_pitch;
                 if s + chroma_row_bytes <= src_uv.len() && d + chroma_row_bytes <= dest.len() {
                     dest[d..d + chroma_row_bytes].copy_from_slice(&src_uv[s..s + chroma_row_bytes]);
                 }
             }
         },
         PixelFormat::I420 => {
-            // Y plane — same as NV12.
             for row in 0..h {
                 let s = row * w;
-                let d = y_offset + row * y_pitch;
+                let d = y_offset_img + row * y_pitch;
                 if s + w <= src.len() && d + w <= dest.len() {
                     dest[d..d + w].copy_from_slice(&src[s..s + w]);
                 }
             }
-            // I420 → NV12: interleave U and V into a single UV plane.
-            // Use ceiling division to handle odd dimensions correctly,
-            // matching VideoLayout::packed and i420_to_nv12_buffer.
-            let uv_w = (w + 1) / 2;
-            let uv_h = (h + 1) / 2;
+            let uv_w = w.div_ceil(2);
+            let uv_h = h.div_ceil(2);
             let u_start = w * h;
             let v_start = u_start + uv_w * uv_h;
             for row in 0..uv_h {
                 for col in 0..uv_w {
                     let u_idx = u_start + row * uv_w + col;
                     let v_idx = v_start + row * uv_w + col;
-                    let d = uv_offset + row * uv_pitch + col * 2;
+                    let d = uv_offset_img + row * uv_pitch + col * 2;
                     if u_idx < src.len() && v_idx < src.len() && d + 1 < dest.len() {
                         dest[d] = src[u_idx];
                         dest[d + 1] = src[v_idx];
@@ -268,16 +342,119 @@ pub(super) fn write_nv12_to_va_surface(
             }
         },
         other => {
-            drop(image);
-            return Err(format!("write_nv12_to_va_surface: unsupported pixel format {other:?}"));
+            return Err(format!("unsupported pixel format for VA upload: {other:?}"));
         },
     }
 
-    // Sync the surface before dropping the image (which calls vaPutImage).
     surface.sync().map_err(|e| format!("VA surface sync failed: {e}"))?;
     drop(image);
 
-    Ok(([y_pitch, uv_pitch], [y_offset, uv_offset]))
+    // Export the VA surface as a DMA-BUF FD.
+    let prime_desc = surface
+        .export_prime()
+        .map_err(|e| format!("failed to export VA surface as DMA-BUF: {e}"))?;
+
+    let modifier = objects_modifier(&prime_desc);
+    let layers = prime_desc.layers;
+    let objects = prime_desc.objects;
+
+    if layers.is_empty() || objects.is_empty() {
+        return Err("export_prime returned empty layers/objects".into());
+    }
+
+    let layer = &layers[0];
+
+    // Build plane layouts from the PRIME descriptor.
+    let mut planes = Vec::new();
+    for plane_idx in 0..layer.num_planes as usize {
+        planes.push(PlaneLayout {
+            buffer_index: layer.object_index[plane_idx] as usize,
+            offset: layer.offset[plane_idx] as usize,
+            stride: layer.pitch[plane_idx] as usize,
+        });
+    }
+
+    let pitches: Vec<usize> = planes.iter().map(|p| p.stride).collect();
+
+    // Collect DMA-BUF file handles from the exported objects.
+    let dma_handles: Vec<std::fs::File> = objects.into_iter().map(|obj| obj.fd.into()).collect();
+
+    let dma_frame = GenericDmaVideoFrame::new(
+        dma_handles,
+        FrameLayout {
+            format: (nv12_fourcc(), modifier),
+            size: CrosResolution { width: coded_width, height: coded_height },
+            planes,
+        },
+    )
+    .map_err(|e| format!("failed to create NV12 DMA frame from VA export: {e}"))?;
+
+    Ok((dma_frame, pitches))
+}
+
+/// Extract the DRM format modifier from the first PRIME object.
+fn objects_modifier(desc: &libva::DrmPrimeSurfaceDescriptor) -> u64 {
+    desc.objects.first().map_or(0, |o| o.drm_format_modifier)
+}
+
+/// Allocate an empty NV12 [`GenericDmaVideoFrame`] for decoder output.
+///
+/// Creates a plain VA surface and exports it as a DMA-BUF FD.  The decoder
+/// will write decoded pixels into this surface via VA-API; the caller reads
+/// them back via `map()` after the frame is ready.
+///
+/// This avoids GBM allocation for decoder output frames — same rationale as
+/// the encoder path: `GBM_BO_USE_HW_VIDEO_DECODER` is not supported for
+/// contiguous NV12 on some Mesa/iris hardware.
+pub(super) fn allocate_decoder_dma_frame(
+    display: &Rc<libva::Display>,
+    width: u32,
+    height: u32,
+) -> Option<GenericDmaVideoFrame> {
+    let nv12_fourcc_val: u32 = nv12_fourcc().into();
+
+    let mut surfaces = display
+        .create_surfaces(
+            libva::VA_RT_FORMAT_YUV420,
+            Some(nv12_fourcc_val),
+            width,
+            height,
+            Some(libva::UsageHint::USAGE_HINT_DECODER),
+            vec![()],
+        )
+        .ok()?;
+    let surface = surfaces.pop()?;
+
+    let prime_desc = surface.export_prime().ok()?;
+    let modifier = objects_modifier(&prime_desc);
+    let layers = prime_desc.layers;
+    let objects = prime_desc.objects;
+
+    if layers.is_empty() || objects.is_empty() {
+        return None;
+    }
+
+    let layer = &layers[0];
+    let mut planes = Vec::new();
+    for plane_idx in 0..layer.num_planes as usize {
+        planes.push(PlaneLayout {
+            buffer_index: layer.object_index[plane_idx] as usize,
+            offset: layer.offset[plane_idx] as usize,
+            stride: layer.pitch[plane_idx] as usize,
+        });
+    }
+
+    let dma_handles: Vec<std::fs::File> = objects.into_iter().map(|obj| obj.fd.into()).collect();
+
+    GenericDmaVideoFrame::new(
+        dma_handles,
+        FrameLayout {
+            format: (nv12_fourcc(), modifier),
+            size: CrosResolution { width, height },
+            planes,
+        },
+    )
+    .ok()
 }
 
 /// Copy NV12 plane data from a GBM read-mapping into a flat `Vec<u8>` suitable
@@ -615,18 +792,9 @@ fn vaapi_av1_decode_loop(
     result_tx: &mpsc::Sender<Result<VideoFrame, String>>,
     duration_histogram: &opentelemetry::metrics::Histogram<f64>,
 ) {
-    // ── Open GBM device + VA display ──────────────────────────────────
+    // ── Open VA display ────────────────────────────────────────────────
     let path = resolve_render_device(render_device);
 
-    let gbm = match GbmDevice::open(&path) {
-        Ok(g) => g,
-        Err(e) => {
-            let _ =
-                result_tx.blocking_send(Err(format!("failed to open GBM device on {path}: {e}")));
-            return;
-        },
-    };
-
     let display = match libva::Display::open_drm_display(&path) {
         Ok(d) => d,
         Err(e) => {
@@ -638,17 +806,18 @@ fn vaapi_av1_decode_loop(
     tracing::info!(device = %path, "VA-API AV1 decoder opened display");
 
     // ── Create stateless decoder ─────────────────────────────────────────
-    let mut decoder = match StatelessDecoder::<Av1, VaapiDecBackend<GbmVideoFrame>>::new_vaapi(
-        display,
-        BlockingMode::Blocking,
-    ) {
-        Ok(d) => d,
-        Err(e) => {
-            let _ =
-                result_tx.blocking_send(Err(format!("failed to create VA-API AV1 decoder: {e}")));
-            return;
-        },
-    };
+    let mut decoder =
+        match StatelessDecoder::<Av1, VaapiDecBackend<GenericDmaVideoFrame>>::new_vaapi(
+            Rc::clone(&display),
+            BlockingMode::Blocking,
+        ) {
+            Ok(d) => d,
+            Err(e) => {
+                let _ = result_tx
+                    .blocking_send(Err(format!("failed to create VA-API AV1 decoder: {e}")));
+                return;
+            },
+        };
 
     // Stream resolution — updated on FormatChanged events.
     let mut coded_width: u32 = 0;
@@ -669,20 +838,10 @@ fn vaapi_av1_decode_loop(
         let mut eagain_empty_retries: u32 = 0;
 
         while offset < bitstream.len() {
-            let gbm_ref = Arc::clone(&gbm);
+            let display_ref = Rc::clone(&display);
             let cw = coded_width;
             let ch = coded_height;
-            let mut alloc_cb = move || {
-                gbm_ref
-                    .clone()
-                    .new_frame(
-                        nv12_fourcc(),
-                        CrosResolution { width: cw, height: ch },
-                        CrosResolution { width: cw, height: ch },
-                        GbmUsage::Decode,
-                    )
-                    .ok()
-            };
+            let mut alloc_cb = move || allocate_decoder_dma_frame(&display_ref, cw, ch);
 
             let mut made_progress = false;
 
@@ -755,7 +914,7 @@ fn vaapi_av1_decode_loop(
 /// - `should_exit`: the result channel is closed and the caller should return.
 /// - `had_events`: at least one event (format change or frame) was processed.
 fn drain_decoder_events(
-    decoder: &mut StatelessDecoder<Av1, VaapiDecBackend<GbmVideoFrame>>,
+    decoder: &mut StatelessDecoder<Av1, VaapiDecBackend<GenericDmaVideoFrame>>,
     result_tx: &mpsc::Sender<Result<VideoFrame, String>>,
     metadata: Option<&PacketMetadata>,
     coded_width: &mut u32,
@@ -963,14 +1122,18 @@ impl EncoderNodeRunner for VaapiAv1EncoderNode {
 // Encoder — internal codec wrapper
 // ---------------------------------------------------------------------------
 
-/// Type alias for the VA-API AV1 encoder using direct VA surfaces.
+/// Type alias for the VA-API AV1 encoder using `GenericDmaVideoFrame`.
 ///
-/// Bypasses GBM buffer allocation entirely — see the H.264 encoder type alias
-/// in `vaapi_h264.rs` for the full rationale.
+/// Uses DMA-BUF backed frames instead of GBM frames to avoid the
+/// `GBM_BO_USE_HW_VIDEO_ENCODER` flag which Mesa's iris driver does not
+/// support for NV12 on some Intel hardware (e.g. Tiger Lake).
 type CrosVaapiAv1Encoder = StatelessEncoder<
     cros_codecs::encoder::av1::AV1,
-    libva::Surface<()>,
-    cros_codecs::backend::vaapi::encoder::VaapiBackend<(), libva::Surface<()>>,
+    GenericDmaVideoFrame,
+    cros_codecs::backend::vaapi::encoder::VaapiBackend<
+        GenericDmaVideoFrame,
+        libva::Surface<GenericDmaVideoFrame>,
+    >,
 >;
 
 /// Internal encoder state wrapping the cros-codecs `StatelessEncoder`.
@@ -980,8 +1143,7 @@ type CrosVaapiAv1Encoder = StatelessEncoder<
 struct VaapiAv1Encoder {
     encoder: CrosVaapiAv1Encoder,
     display: Rc<libva::Display>,
-    width: u32,
-    height: u32,
+    upload_strategy: FrameUploadStrategy,
     coded_width: u32,
     coded_height: u32,
     frame_count: u64,
@@ -992,12 +1154,24 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
     const CODEC_NAME: &'static str = "VA-API AV1";
 
     fn new_encoder(width: u32, height: u32, config: &Self::Config) -> Result<Self, String> {
-        let (display, path) = open_va_display(config.render_device.as_ref())?;
-        tracing::info!(device = %path, width, height, "VA-API AV1 encoder opening");
+        let path = resolve_render_device(config.render_device.as_ref());
+        let display = libva::Display::open_drm_display(&path)
+            .map_err(|e| format!("failed to open VA display on {path}: {e}"))?;
 
         let coded_width = align_up_u32(width, AV1_SB_SIZE);
         let coded_height = align_up_u32(height, AV1_SB_SIZE);
 
+        // Probe GBM support for the fast path.
+        let upload_strategy = match probe_gbm_encode_support(&path) {
+            Some(gbm) => FrameUploadStrategy::Gbm(gbm),
+            None => FrameUploadStrategy::VaSurface,
+        };
+
+        let strategy_label = match &upload_strategy {
+            FrameUploadStrategy::Gbm(_) => "gbm",
+            FrameUploadStrategy::VaSurface => "va_surface",
+        };
+
         let cros_config = CrosEncoderConfig {
             profile: Av1Profile::Profile0,
             bit_depth: cros_codecs::codec::av1::parser::BitDepth::Depth8,
@@ -1028,10 +1202,11 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
             coded_width,
             coded_height,
             quality = config.quality,
+            upload_strategy = strategy_label,
             "VA-API AV1 encoder created"
         );
 
-        Ok(Self { encoder, display, width, height, coded_width, coded_height, frame_count: 0 })
+        Ok(Self { encoder, display, upload_strategy, coded_width, coded_height, frame_count: 0 })
     }
 
     fn encode(
@@ -1045,36 +1220,32 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
                 .into());
         }
 
-        // Create a VA surface and upload NV12 data via the Image API.
-        // This bypasses GBM buffer allocation (GBM_BO_USE_HW_VIDEO_ENCODER),
-        // which Mesa's iris driver does not support for NV12 on all hardware.
-        let nv12_fourcc_val: u32 = nv12_fourcc().into();
-        let mut surfaces = self
-            .display
-            .create_surfaces(
-                libva::VA_RT_FORMAT_YUV420,
-                Some(nv12_fourcc_val),
-                self.coded_width,
-                self.coded_height,
-                Some(libva::UsageHint::USAGE_HINT_ENCODER),
-                vec![()],
-            )
-            .map_err(|e| format!("failed to create VA surface for encoding: {e}"))?;
-        let surface =
-            surfaces.pop().ok_or_else(|| "create_surfaces returned empty vec".to_string())?;
-
-        // Write frame data into the VA surface.
-        let (pitches, offsets) = write_nv12_to_va_surface(&self.display, &surface, frame)?;
+        // Upload NV12 frame data — dispatch based on detected strategy.
+        let (dma_frame, pitches) = match &self.upload_strategy {
+            FrameUploadStrategy::Gbm(gbm) => {
+                upload_nv12_via_gbm(gbm, frame, self.coded_width, self.coded_height)?
+            },
+            FrameUploadStrategy::VaSurface => {
+                upload_nv12_to_dma_frame(&self.display, frame, self.coded_width, self.coded_height)?
+            },
+        };
 
         let is_keyframe = metadata.as_ref().and_then(|m| m.keyframe).unwrap_or(false);
         let timestamp = metadata.as_ref().and_then(|m| m.timestamp_us).unwrap_or(self.frame_count);
 
+        let y_stride = pitches.first().copied().unwrap_or(self.coded_width as usize);
+        let uv_stride = pitches.get(1).copied().unwrap_or(self.coded_width as usize);
+
         let frame_layout = FrameLayout {
-            format: (nv12_fourcc(), 0), // DRM_FORMAT_MOD_LINEAR
+            format: (nv12_fourcc(), 0),
             size: CrosResolution { width: self.coded_width, height: self.coded_height },
             planes: vec![
-                PlaneLayout { buffer_index: 0, offset: offsets[0], stride: pitches[0] },
-                PlaneLayout { buffer_index: 0, offset: offsets[1], stride: pitches[1] },
+                PlaneLayout { buffer_index: 0, offset: 0, stride: y_stride },
+                PlaneLayout {
+                    buffer_index: 0,
+                    offset: y_stride * self.coded_height as usize,
+                    stride: uv_stride,
+                },
             ],
         };
 
@@ -1082,7 +1253,7 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
             CrosFrameMetadata { timestamp, layout: frame_layout, force_keyframe: is_keyframe };
 
         self.encoder
-            .encode(cros_meta, surface)
+            .encode(cros_meta, dma_frame)
             .map_err(|e| format!("VA-API AV1 encode error: {e}"))?;
 
         self.frame_count += 1;
@@ -1622,12 +1793,45 @@ mod tests {
         libva::Display::open_drm_display(std::path::Path::new(&path)).is_ok()
     }
 
+    /// Check whether VA-API AV1 encoding is supported on this hardware.
+    /// AV1 encode requires Intel Arc (DG2) or newer — Tiger Lake and
+    /// older Intel GPUs do not support it.
+    fn vaapi_av1_encode_available() -> bool {
+        let path = resolve_render_device(None);
+        let Ok(display) = libva::Display::open_drm_display(std::path::Path::new(&path)) else {
+            return false;
+        };
+        // Try to create the encoder — if AV1 encode isn't supported
+        // the driver will reject the config.
+        let config = CrosEncoderConfig {
+            profile: Av1Profile::Profile0,
+            bit_depth: cros_codecs::codec::av1::parser::BitDepth::Depth8,
+            resolution: CrosResolution { width: 64, height: 64 },
+            pred_structure: PredictionStructure::LowDelay { limit: 1024 },
+            initial_tunings: Tunings {
+                rate_control: RateControl::ConstantQuality(128),
+                framerate: 30,
+                min_quality: 0,
+                max_quality: 255,
+            },
+        };
+        CrosVaapiAv1Encoder::new_vaapi(
+            display,
+            config,
+            nv12_fourcc(),
+            CrosResolution { width: 64, height: 64 },
+            false,
+            BlockingMode::Blocking,
+        )
+        .is_ok()
+    }
+
     /// Encoder + Decoder roundtrip: encode 5 NV12 frames, decode them back,
     /// verify dimensions and pixel format.
     #[tokio::test]
-    async fn test_vaapi_av1_encode_decode_roundtrip() {
-        if !vaapi_available() {
-            eprintln!("SKIP: no VA-API device available");
+    async fn gpu_tests_vaapi_av1_encode_decode_roundtrip() {
+        if !vaapi_av1_encode_available() {
+            eprintln!("SKIP: VA-API AV1 encoding not supported on this hardware");
             return;
         }
 
@@ -1722,9 +1926,9 @@ mod tests {
 
     /// Verify decoded frames preserve metadata from input packets.
     #[tokio::test]
-    async fn test_vaapi_av1_metadata_propagation() {
-        if !vaapi_available() {
-            eprintln!("SKIP: no VA-API device available");
+    async fn gpu_tests_vaapi_av1_metadata_propagation() {
+        if !vaapi_av1_encode_available() {
+            eprintln!("SKIP: VA-API AV1 encoding not supported on this hardware");
             return;
         }
 
@@ -1818,9 +2022,9 @@ mod tests {
     /// Encode I420 input frames and verify the encoder accepts them
     /// (exercises the I420→NV12 conversion path).
     #[tokio::test]
-    async fn test_vaapi_av1_encode_i420_input() {
-        if !vaapi_available() {
-            eprintln!("SKIP: no VA-API device available");
+    async fn gpu_tests_vaapi_av1_encode_i420_input() {
+        if !vaapi_av1_encode_available() {
+            eprintln!("SKIP: VA-API AV1 encoding not supported on this hardware");
             return;
         }
 
@@ -1923,4 +2127,22 @@ mod tests {
             "VA-API AV1 encoder should be registered"
         );
     }
+
+    /// Verify that the frame upload strategy probe runs without panicking
+    /// and reports a coherent result on whatever hardware is present.
+    #[test]
+    fn gpu_tests_vaapi_upload_strategy_probe() {
+        if !vaapi_available() {
+            eprintln!("SKIP: no VA-API device available");
+            return;
+        }
+
+        let path = resolve_render_device(None);
+        let result = probe_gbm_encode_support(&path);
+        match result {
+            Some(_) => eprintln!("  upload strategy: GBM (fast path)"),
+            None => eprintln!("  upload strategy: VA surface (fallback)"),
+        }
+        // Either path is valid — the important thing is no panic.
+    }
 }
diff --git a/crates/nodes/src/video/vaapi_h264.rs b/crates/nodes/src/video/vaapi_h264.rs
index 7285fd2c..cb236adf 100644
--- a/crates/nodes/src/video/vaapi_h264.rs
+++ b/crates/nodes/src/video/vaapi_h264.rs
@@ -62,11 +62,10 @@ use cros_codecs::encoder::{
     FrameMetadata as CrosFrameMetadata, PredictionStructure, RateControl, Tunings, VideoEncoder,
 };
 use cros_codecs::libva;
-use cros_codecs::video_frame::gbm_video_frame::{
-    GbmDevice, GbmExternalBufferDescriptor, GbmUsage, GbmVideoFrame,
-};
-use cros_codecs::video_frame::{ReadMapping, VideoFrame as CrosVideoFrame, WriteMapping};
-use cros_codecs::{Fourcc as CrosFourcc, FrameLayout, PlaneLayout, Resolution as CrosResolution};
+// GBM types are only needed transitively via vaapi_av1 helpers.
+use cros_codecs::video_frame::generic_dma_video_frame::GenericDmaVideoFrame;
+use cros_codecs::video_frame::VideoFrame as CrosVideoFrame;
+use cros_codecs::{FrameLayout, PlaneLayout, Resolution as CrosResolution};
 
 use super::encoder_trait::{self, EncodedPacket, EncoderNodeRunner, StandardVideoEncoder};
 use super::HwAccelMode;
@@ -75,8 +74,9 @@ use super::H264_CONTENT_TYPE;
 // Re-use helpers from the VA-API AV1 module — they are codec-agnostic NV12
 // I/O routines (VA surface upload, GBM mapping, render-device detection, etc.).
 use super::vaapi_av1::{
-    align_up_u32, nv12_fourcc, open_va_and_gbm, open_va_display, read_nv12_from_mapping,
-    write_nv12_to_mapping, write_nv12_to_va_surface,
+    align_up_u32, allocate_decoder_dma_frame, nv12_fourcc, open_va_and_gbm,
+    probe_gbm_encode_support, read_nv12_from_mapping, resolve_render_device,
+    upload_nv12_to_dma_frame, upload_nv12_via_gbm, FrameUploadStrategy,
 };
 
 // ---------------------------------------------------------------------------
@@ -252,7 +252,7 @@ impl ProcessorNode for VaapiH264DecoderNode {
 
 /// Blocking decode loop running inside `spawn_blocking`.
 ///
-/// Creates the VA-API display, GBM device, and cros-codecs `StatelessDecoder`,
+/// Creates the VA-API display and cros-codecs `StatelessDecoder`,
 /// then processes input packets until the channel is closed.
 fn vaapi_h264_decode_loop(
     render_device: Option<&String>,
@@ -260,28 +260,31 @@ fn vaapi_h264_decode_loop(
     result_tx: &mpsc::Sender<Result<VideoFrame, String>>,
     duration_histogram: &opentelemetry::metrics::Histogram<f64>,
 ) {
-    // ── Open GBM device + VA display ──────────────────────────────────
-    let (display, gbm, path) = match open_va_and_gbm(render_device) {
-        Ok(v) => v,
+    // ── Open VA display ──────────────────────────────────────────────
+    let path = resolve_render_device(render_device);
+    let display = match libva::Display::open_drm_display(&path) {
+        Ok(d) => d,
         Err(e) => {
-            let _ = result_tx.blocking_send(Err(e));
+            let _ =
+                result_tx.blocking_send(Err(format!("failed to open VA display on {path}: {e}")));
             return;
         },
     };
     tracing::info!(device = %path, "VA-API H.264 decoder opened display");
 
     // ── Create stateless decoder ─────────────────────────────────────
-    let mut decoder = match StatelessDecoder::<H264, VaapiDecBackend<GbmVideoFrame>>::new_vaapi(
-        display,
-        BlockingMode::Blocking,
-    ) {
-        Ok(d) => d,
-        Err(e) => {
-            let _ =
-                result_tx.blocking_send(Err(format!("failed to create VA-API H.264 decoder: {e}")));
-            return;
-        },
-    };
+    let mut decoder =
+        match StatelessDecoder::<H264, VaapiDecBackend<GenericDmaVideoFrame>>::new_vaapi(
+            Rc::clone(&display),
+            BlockingMode::Blocking,
+        ) {
+            Ok(d) => d,
+            Err(e) => {
+                let _ = result_tx
+                    .blocking_send(Err(format!("failed to create VA-API H.264 decoder: {e}")));
+                return;
+            },
+        };
 
     // Stream resolution — updated on FormatChanged events.
     let mut coded_width: u32 = 0;
@@ -301,20 +304,10 @@ fn vaapi_h264_decode_loop(
         let mut eagain_empty_retries: u32 = 0;
 
         while offset < bitstream.len() {
-            let gbm_ref = Arc::clone(&gbm);
+            let display_ref = Rc::clone(&display);
             let cw = coded_width;
             let ch = coded_height;
-            let mut alloc_cb = move || {
-                gbm_ref
-                    .clone()
-                    .new_frame(
-                        nv12_fourcc(),
-                        CrosResolution { width: cw, height: ch },
-                        CrosResolution { width: cw, height: ch },
-                        GbmUsage::Decode,
-                    )
-                    .ok()
-            };
+            let mut alloc_cb = move || allocate_decoder_dma_frame(&display_ref, cw, ch);
 
             let mut made_progress = false;
 
@@ -387,7 +380,7 @@ fn vaapi_h264_decode_loop(
 /// - `should_exit`: the result channel is closed and the caller should return.
 /// - `had_events`: at least one event (format change or frame) was processed.
 fn drain_decoder_events(
-    decoder: &mut StatelessDecoder<H264, VaapiDecBackend<GbmVideoFrame>>,
+    decoder: &mut StatelessDecoder<H264, VaapiDecBackend<GenericDmaVideoFrame>>,
     result_tx: &mpsc::Sender<Result<VideoFrame, String>>,
     metadata: Option<&PacketMetadata>,
     coded_width: &mut u32,
@@ -590,17 +583,18 @@ impl EncoderNodeRunner for VaapiH264EncoderNode {
 // Encoder — internal codec wrapper
 // ---------------------------------------------------------------------------
 
-/// Type alias for the VA-API H.264 encoder using direct VA surfaces.
+/// Type alias for the VA-API H.264 encoder using `GenericDmaVideoFrame`.
 ///
-/// Bypasses GBM buffer allocation entirely — input frames are uploaded to
-/// VA surfaces via the VA-API Image API and passed straight through to the
-/// encoder backend.  This avoids the `GBM_BO_USE_HW_VIDEO_ENCODER` flag
-/// which Mesa's iris driver does not support for NV12 on some hardware
-/// (e.g. Intel Tiger Lake with Mesa 23.x).
+/// Uses DMA-BUF backed frames instead of GBM frames to avoid the
+/// `GBM_BO_USE_HW_VIDEO_ENCODER` flag which Mesa's iris driver does not
+/// support for NV12 on some Intel hardware (e.g. Tiger Lake).
 type CrosVaapiH264Encoder = StatelessEncoder<
     cros_codecs::encoder::h264::H264,
-    libva::Surface<()>,
-    cros_codecs::backend::vaapi::encoder::VaapiBackend<(), libva::Surface<()>>,
+    GenericDmaVideoFrame,
+    cros_codecs::backend::vaapi::encoder::VaapiBackend<
+        GenericDmaVideoFrame,
+        libva::Surface<GenericDmaVideoFrame>,
+    >,
 >;
 
 /// Internal encoder state wrapping the cros-codecs `StatelessEncoder`.
@@ -610,8 +604,7 @@ type CrosVaapiH264Encoder = StatelessEncoder<
 struct VaapiH264Encoder {
     encoder: CrosVaapiH264Encoder,
     display: Rc<libva::Display>,
-    width: u32,
-    height: u32,
+    upload_strategy: FrameUploadStrategy,
     coded_width: u32,
     coded_height: u32,
     frame_count: u64,
@@ -622,12 +615,24 @@ impl StandardVideoEncoder for VaapiH264Encoder {
     const CODEC_NAME: &'static str = "VA-API H.264";
 
     fn new_encoder(width: u32, height: u32, config: &Self::Config) -> Result<Self, String> {
-        let (display, path) = open_va_display(config.render_device.as_ref())?;
-        tracing::info!(device = %path, width, height, "VA-API H.264 encoder opening");
+        let path = resolve_render_device(config.render_device.as_ref());
+        let display = libva::Display::open_drm_display(&path)
+            .map_err(|e| format!("failed to open VA display on {path}: {e}"))?;
 
         let coded_width = align_up_u32(width, H264_MB_SIZE);
         let coded_height = align_up_u32(height, H264_MB_SIZE);
 
+        // Probe GBM support for the fast path.
+        let upload_strategy = match probe_gbm_encode_support(&path) {
+            Some(gbm) => FrameUploadStrategy::Gbm(gbm),
+            None => FrameUploadStrategy::VaSurface,
+        };
+
+        let strategy_label = match &upload_strategy {
+            FrameUploadStrategy::Gbm(_) => "gbm",
+            FrameUploadStrategy::VaSurface => "va_surface",
+        };
+
         // Auto-detect the correct entrypoint.  Modern Intel GPUs (Gen 9+ /
         // Skylake onwards) only expose the low-power fixed-function encoder
         // (`VAEntrypointEncSliceLP`), while older hardware and some AMD
@@ -650,7 +655,6 @@ impl StandardVideoEncoder for VaapiH264Encoder {
                 );
             }
 
-            // Prefer the user's explicit config; otherwise auto-detect.
             if config.low_power {
                 if !has_lp {
                     return Err(
@@ -660,7 +664,6 @@ impl StandardVideoEncoder for VaapiH264Encoder {
                 }
                 true
             } else if has_lp && !has_full {
-                // Driver only supports low-power (common on modern Intel).
                 tracing::info!("auto-selecting low-power H.264 encoder (VAEntrypointEncSliceLP)");
                 true
             } else {
@@ -698,10 +701,11 @@ impl StandardVideoEncoder for VaapiH264Encoder {
             coded_width,
             coded_height,
             quality = config.quality,
+            upload_strategy = strategy_label,
             "VA-API H.264 encoder created"
         );
 
-        Ok(Self { encoder, display, width, height, coded_width, coded_height, frame_count: 0 })
+        Ok(Self { encoder, display, upload_strategy, coded_width, coded_height, frame_count: 0 })
     }
 
     fn encode(
@@ -715,36 +719,32 @@ impl StandardVideoEncoder for VaapiH264Encoder {
                 .into());
         }
 
-        // Create a VA surface and upload NV12 data via the Image API.
-        // This bypasses GBM buffer allocation (GBM_BO_USE_HW_VIDEO_ENCODER),
-        // which Mesa's iris driver does not support for NV12 on all hardware.
-        let nv12_fourcc_val: u32 = nv12_fourcc().into();
-        let mut surfaces = self
-            .display
-            .create_surfaces(
-                libva::VA_RT_FORMAT_YUV420,
-                Some(nv12_fourcc_val),
-                self.coded_width,
-                self.coded_height,
-                Some(libva::UsageHint::USAGE_HINT_ENCODER),
-                vec![()],
-            )
-            .map_err(|e| format!("failed to create VA surface for encoding: {e}"))?;
-        let surface =
-            surfaces.pop().ok_or_else(|| "create_surfaces returned empty vec".to_string())?;
-
-        // Write frame data into the VA surface.
-        let (pitches, offsets) = write_nv12_to_va_surface(&self.display, &surface, frame)?;
+        // Upload NV12 frame data — dispatch based on detected strategy.
+        let (dma_frame, pitches) = match &self.upload_strategy {
+            FrameUploadStrategy::Gbm(gbm) => {
+                upload_nv12_via_gbm(gbm, frame, self.coded_width, self.coded_height)?
+            },
+            FrameUploadStrategy::VaSurface => {
+                upload_nv12_to_dma_frame(&self.display, frame, self.coded_width, self.coded_height)?
+            },
+        };
 
         let is_keyframe = metadata.as_ref().and_then(|m| m.keyframe).unwrap_or(false);
         let timestamp = metadata.as_ref().and_then(|m| m.timestamp_us).unwrap_or(self.frame_count);
 
+        let y_stride = pitches.first().copied().unwrap_or(self.coded_width as usize);
+        let uv_stride = pitches.get(1).copied().unwrap_or(self.coded_width as usize);
+
         let frame_layout = FrameLayout {
-            format: (nv12_fourcc(), 0), // DRM_FORMAT_MOD_LINEAR
+            format: (nv12_fourcc(), 0),
             size: CrosResolution { width: self.coded_width, height: self.coded_height },
             planes: vec![
-                PlaneLayout { buffer_index: 0, offset: offsets[0], stride: pitches[0] },
-                PlaneLayout { buffer_index: 0, offset: offsets[1], stride: pitches[1] },
+                PlaneLayout { buffer_index: 0, offset: 0, stride: y_stride },
+                PlaneLayout {
+                    buffer_index: 0,
+                    offset: y_stride * self.coded_height as usize,
+                    stride: uv_stride,
+                },
             ],
         };
 
@@ -752,7 +752,7 @@ impl StandardVideoEncoder for VaapiH264Encoder {
             CrosFrameMetadata { timestamp, layout: frame_layout, force_keyframe: is_keyframe };
 
         self.encoder
-            .encode(cros_meta, surface)
+            .encode(cros_meta, dma_frame)
             .map_err(|e| format!("VA-API H.264 encode error: {e}"))?;
 
         self.frame_count += 1;
@@ -952,12 +952,79 @@ mod tests {
         libva::Display::open_drm_display(std::path::Path::new(&path)).is_ok()
     }
 
-    /// Encoder + Decoder roundtrip: encode 5 NV12 frames, decode them back,
-    /// verify dimensions and pixel format.
+    /// Check whether VA-API H.264 encoding is supported on this hardware.
+    fn vaapi_h264_encode_available() -> bool {
+        use super::super::vaapi_av1::resolve_render_device;
+        let path = resolve_render_device(None);
+        let Ok(display) = libva::Display::open_drm_display(std::path::Path::new(&path)) else {
+            return false;
+        };
+        // Probe H.264 encode entrypoints.
+        use libva::VAEntrypoint::{VAEntrypointEncSlice, VAEntrypointEncSliceLP};
+        use libva::VAProfile::VAProfileH264Main;
+        let Ok(eps) = display.query_config_entrypoints(VAProfileH264Main) else {
+            return false;
+        };
+        eps.contains(&VAEntrypointEncSlice) || eps.contains(&VAEntrypointEncSliceLP)
+    }
+
+    /// Encode-only: verify that the encoder produces H.264 packets from NV12 input.
+    #[tokio::test]
+    async fn gpu_tests_vaapi_h264_encoder_produces_packets() {
+        if !vaapi_h264_encode_available() {
+            eprintln!("SKIP: VA-API H.264 encoding not supported on this hardware");
+            return;
+        }
+
+        use crate::test_utils::{
+            assert_state_initializing, assert_state_running, assert_state_stopped,
+            create_test_context, create_test_video_frame,
+        };
+        use std::collections::HashMap;
+
+        let (enc_input_tx, enc_input_rx) = mpsc::channel(10);
+        let mut enc_inputs = HashMap::new();
+        enc_inputs.insert("in".to_string(), enc_input_rx);
+
+        let (enc_context, enc_sender, mut enc_state_rx) = create_test_context(enc_inputs, 10);
+        let encoder_config = VaapiH264EncoderConfig {
+            render_device: None,
+            hw_accel: HwAccelMode::Auto,
+            quality: 40,
+            framerate: 30,
+            low_power: false,
+        };
+        let encoder = VaapiH264EncoderNode::new(encoder_config).unwrap();
+        let enc_handle = tokio::spawn(async move { Box::new(encoder).run(enc_context).await });
+
+        assert_state_initializing(&mut enc_state_rx).await;
+        assert_state_running(&mut enc_state_rx).await;
+
+        for index in 0_u64..5 {
+            let mut frame = create_test_video_frame(64, 64, PixelFormat::Nv12, 16);
+            frame.metadata = Some(PacketMetadata {
+                timestamp_us: Some(1_000 + 33_333 * index),
+                duration_us: Some(33_333),
+                sequence: Some(index),
+                keyframe: Some(true),
+            });
+            enc_input_tx.send(Packet::Video(frame)).await.unwrap();
+        }
+        drop(enc_input_tx);
+
+        assert_state_stopped(&mut enc_state_rx).await;
+        enc_handle.await.unwrap().unwrap();
+
+        let encoded_packets = enc_sender.get_packets_for_pin("out").await;
+        assert!(!encoded_packets.is_empty(), "VA-API H.264 encoder produced no packets");
+        eprintln!("  VA-API H.264 encoder produced {} packets", encoded_packets.len());
+    }
+
+    /// Full encoder + decoder roundtrip: encode 5 NV12 frames, decode them back.
     #[tokio::test]
-    async fn test_vaapi_h264_encode_decode_roundtrip() {
-        if !vaapi_available() {
-            eprintln!("SKIP: no VA-API device available");
+    async fn gpu_tests_vaapi_h264_encode_decode_roundtrip() {
+        if !vaapi_h264_encode_available() {
+            eprintln!("SKIP: VA-API H.264 encoding not supported on this hardware");
             return;
         }
 
@@ -977,7 +1044,7 @@ mod tests {
         let encoder_config = VaapiH264EncoderConfig {
             render_device: None,
             hw_accel: HwAccelMode::Auto,
-            quality: 40, // fast, lower quality for test speed
+            quality: 40,
             framerate: 30,
             low_power: false,
         };
diff --git a/justfile b/justfile
index 1b02b972..97c7e215 100644
--- a/justfile
+++ b/justfile
@@ -210,6 +210,11 @@ test-skit-gpu:
     @cargo test -p streamkit-engine --features gpu
     @cargo test -p streamkit-nodes --features nvcodec
 
+# Run VA-API tests (requires a VA-API capable GPU, e.g. Intel/AMD)
+test-skit-vaapi:
+    @echo "Testing skit (VA-API)..."
+    @cargo test -p streamkit-nodes --features vaapi
+
 # Lint and format check the skit code
 # Note: We exclude dhat-heap since it's mutually exclusive with profiling (both define global allocators)
 lint-skit:

From d4275fe92f5f86a236f406bfdeca3a39cd890bf0 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Fri, 10 Apr 2026 11:29:32 +0000
Subject: [PATCH 22/23] chore: ignore wasmtime 41.x advisories in cargo-deny

The wasmtime 41.0.x dependency (from streamkit-plugin-wasm) has 11 new
security advisories (RUSTSEC-2026-0085 through 0096). The fix requires
wasmtime >=42.0.2, a major version bump that needs separate validation.

The affected code paths (Winch compiler backend, component model string
transcoding) are not exercised by our WASM plugin runtime.

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 deny.toml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/deny.toml b/deny.toml
index 3a737b8a..8d167d18 100644
--- a/deny.toml
+++ b/deny.toml
@@ -78,6 +78,23 @@ ignore = [
   # of rav1e and rav1d — no security vulnerability, just an unmaintained notice.
   # Will be resolved when rav1e/rav1d migrate to a fork (e.g. pastey).
   { id = "RUSTSEC-2024-0436", reason = "transitive dep from rav1e/rav1d, no security issue" },
+
+  # wasmtime 41.0.x security advisories — transitive dep from streamkit-plugin-wasm.
+  # The fix requires wasmtime >=42.0.2 which is a major version bump that may break
+  # the WASM plugin system.  Will be resolved when we upgrade wasmtime.
+  # These only affect the Winch compiler backend and component model string
+  # transcoding paths which are not exercised by our plugin runtime.
+  { id = "RUSTSEC-2026-0085", reason = "wasmtime 41.x transitive dep, upgrade tracked separately" },
+  { id = "RUSTSEC-2026-0086", reason = "wasmtime 41.x transitive dep, upgrade tracked separately" },
+  { id = "RUSTSEC-2026-0087", reason = "wasmtime 41.x transitive dep, upgrade tracked separately" },
+  { id = "RUSTSEC-2026-0088", reason = "wasmtime 41.x transitive dep, upgrade tracked separately" },
+  { id = "RUSTSEC-2026-0089", reason = "wasmtime 41.x transitive dep, upgrade tracked separately" },
+  { id = "RUSTSEC-2026-0091", reason = "wasmtime 41.x transitive dep, upgrade tracked separately" },
+  { id = "RUSTSEC-2026-0092", reason = "wasmtime 41.x transitive dep, upgrade tracked separately" },
+  { id = "RUSTSEC-2026-0093", reason = "wasmtime 41.x transitive dep, upgrade tracked separately" },
+  { id = "RUSTSEC-2026-0094", reason = "wasmtime 41.x transitive dep, upgrade tracked separately" },
+  { id = "RUSTSEC-2026-0095", reason = "wasmtime 41.x transitive dep, upgrade tracked separately" },
+  { id = "RUSTSEC-2026-0096", reason = "wasmtime 41.x transitive dep, upgrade tracked separately" },
 ]
 # If this is true, then cargo deny will use the git executable to fetch advisory database.
 # If this is false, then it uses a built-in git library.

From 1d08b2c4bcea0c9765fbcdd5bf69d8e5b3c4322a Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Fri, 10 Apr 2026 11:58:18 +0000
Subject: [PATCH 23/23] fix(nodes): use actual DMA frame layout for VA-API
 encoder metadata

The VA-API AV1 and H.264 encoders were constructing CrosFrameMetadata
with hardcoded UV plane offsets (y_stride * coded_height) and
buffer_index 0 for all planes.  These assumptions can be wrong on
drivers that add inter-plane padding or use separate buffer objects
per plane.

Change upload_nv12_via_gbm() and upload_nv12_to_dma_frame() to return
the actual FrameLayout (with real offsets, strides, and buffer indices
from the PRIME descriptor / GBM allocator) instead of just pitches.
Both VA-API encoders now pass this layout directly to cros-codecs,
ensuring the metadata matches the DMA frame's actual memory layout.

Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/vaapi_av1.rs  | 91 +++++++++++++++++-----------
 crates/nodes/src/video/vaapi_h264.rs | 25 ++------
 2 files changed, 62 insertions(+), 54 deletions(-)

diff --git a/crates/nodes/src/video/vaapi_av1.rs b/crates/nodes/src/video/vaapi_av1.rs
index e53809bb..79fe851b 100644
--- a/crates/nodes/src/video/vaapi_av1.rs
+++ b/crates/nodes/src/video/vaapi_av1.rs
@@ -144,12 +144,15 @@ pub(super) fn probe_gbm_encode_support(render_device: &str) -> Option<Arc<GbmDev
 ///
 /// This is the fast path: GBM BO → mmap → write NV12 → extract DMA-BUF FD.
 /// Avoids the VA Image API and export_prime round-trip.
+///
+/// Returns the DMA frame together with its actual per-plane layouts (offsets,
+/// strides, buffer indices) as reported by the GBM/DMA subsystem.
 pub(super) fn upload_nv12_via_gbm(
     gbm: &Arc<GbmDevice>,
     frame: &VideoFrame,
     coded_width: u32,
     coded_height: u32,
-) -> Result<(GenericDmaVideoFrame, Vec<usize>), String> {
+) -> Result<(GenericDmaVideoFrame, FrameLayout), String> {
     let mut gbm_frame = Arc::clone(gbm)
         .new_frame(
             nv12_fourcc(),
@@ -170,10 +173,11 @@ pub(super) fn upload_nv12_via_gbm(
         .to_generic_dma_video_frame()
         .map_err(|e| format!("failed to convert GBM frame to DMA: {e}"))?;
 
-    // Get pitches from the DMA frame layout (matches what the GBM BO reported).
-    let dma_pitches = CrosVideoFrame::get_plane_pitch(&dma_frame);
+    // Extract the actual layout from the DMA frame (offsets, strides, buffer
+    // indices as determined by the GBM allocator / DRM subsystem).
+    let dma_layout = dma_frame_layout(&dma_frame, coded_width, coded_height);
 
-    Ok((dma_frame, dma_pitches))
+    Ok((dma_frame, dma_layout))
 }
 
 // ---------------------------------------------------------------------------
@@ -251,13 +255,14 @@ pub(super) fn open_va_and_gbm(
 /// (`HW_VIDEO_ENCODER`, `HW_VIDEO_DECODER`, `LINEAR`) that Mesa's iris
 /// driver may not support for NV12 on some Intel hardware.
 ///
-/// Returns the DMA frame together with its per-plane pitches.
+/// Returns the DMA frame together with its actual [`FrameLayout`] (plane
+/// offsets, strides, and buffer indices as reported by the PRIME descriptor).
 pub(super) fn upload_nv12_to_dma_frame(
     display: &Rc<libva::Display>,
     frame: &VideoFrame,
     coded_width: u32,
     coded_height: u32,
-) -> Result<(GenericDmaVideoFrame, Vec<usize>), String> {
+) -> Result<(GenericDmaVideoFrame, FrameLayout), String> {
     let nv12_fourcc_val: u32 = nv12_fourcc().into();
 
     // Create a plain VA surface (no GBM, no external buffer).
@@ -374,22 +379,19 @@ pub(super) fn upload_nv12_to_dma_frame(
         });
     }
 
-    let pitches: Vec<usize> = planes.iter().map(|p| p.stride).collect();
-
     // Collect DMA-BUF file handles from the exported objects.
     let dma_handles: Vec<std::fs::File> = objects.into_iter().map(|obj| obj.fd.into()).collect();
 
-    let dma_frame = GenericDmaVideoFrame::new(
-        dma_handles,
-        FrameLayout {
-            format: (nv12_fourcc(), modifier),
-            size: CrosResolution { width: coded_width, height: coded_height },
-            planes,
-        },
-    )
-    .map_err(|e| format!("failed to create NV12 DMA frame from VA export: {e}"))?;
+    let layout = FrameLayout {
+        format: (nv12_fourcc(), modifier),
+        size: CrosResolution { width: coded_width, height: coded_height },
+        planes,
+    };
 
-    Ok((dma_frame, pitches))
+    let dma_frame = GenericDmaVideoFrame::new(dma_handles, layout.clone())
+        .map_err(|e| format!("failed to create NV12 DMA frame from VA export: {e}"))?;
+
+    Ok((dma_frame, layout))
 }
 
 /// Extract the DRM format modifier from the first PRIME object.
@@ -397,6 +399,37 @@ fn objects_modifier(desc: &libva::DrmPrimeSurfaceDescriptor) -> u64 {
     desc.objects.first().map_or(0, |o| o.drm_format_modifier)
 }
 
+/// Build a [`FrameLayout`] from a [`GenericDmaVideoFrame`] by reading its
+/// public pitch/size accessors and inferring plane offsets.
+///
+/// `GenericDmaVideoFrame::get_plane_offset()` is private, so for the GBM path
+/// we reconstruct the layout from available trait methods.  The DMA frame was
+/// just created by `GbmVideoFrame::to_generic_dma_video_frame()` which sets
+/// `buffer_index = plane_index` and uses the offsets from `gbm_bo_get_offset`.
+/// For contiguous NV12 (single BO, two planes) the offsets are derivable from
+/// the plane sizes.
+fn dma_frame_layout(
+    dma_frame: &GenericDmaVideoFrame,
+    coded_width: u32,
+    coded_height: u32,
+) -> FrameLayout {
+    let pitches = CrosVideoFrame::get_plane_pitch(dma_frame);
+    let sizes = CrosVideoFrame::get_plane_size(dma_frame);
+
+    let mut planes = Vec::new();
+    let mut running_offset = 0usize;
+    for (i, pitch) in pitches.iter().enumerate() {
+        planes.push(PlaneLayout { buffer_index: 0, offset: running_offset, stride: *pitch });
+        running_offset += sizes.get(i).copied().unwrap_or(0);
+    }
+
+    FrameLayout {
+        format: (nv12_fourcc(), 0),
+        size: CrosResolution { width: coded_width, height: coded_height },
+        planes,
+    }
+}
+
 /// Allocate an empty NV12 [`GenericDmaVideoFrame`] for decoder output.
 ///
 /// Creates a plain VA surface and exports it as a DMA-BUF FD.  The decoder
@@ -1221,7 +1254,11 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
         }
 
         // Upload NV12 frame data — dispatch based on detected strategy.
-        let (dma_frame, pitches) = match &self.upload_strategy {
+        // The returned layout contains the actual plane offsets, strides, and
+        // buffer indices as reported by the DMA/PRIME subsystem rather than
+        // assumed values (which could be wrong on drivers that add inter-plane
+        // padding or use separate buffer objects per plane).
+        let (dma_frame, frame_layout) = match &self.upload_strategy {
             FrameUploadStrategy::Gbm(gbm) => {
                 upload_nv12_via_gbm(gbm, frame, self.coded_width, self.coded_height)?
             },
@@ -1233,22 +1270,6 @@ impl StandardVideoEncoder for VaapiAv1Encoder {
         let is_keyframe = metadata.as_ref().and_then(|m| m.keyframe).unwrap_or(false);
         let timestamp = metadata.as_ref().and_then(|m| m.timestamp_us).unwrap_or(self.frame_count);
 
-        let y_stride = pitches.first().copied().unwrap_or(self.coded_width as usize);
-        let uv_stride = pitches.get(1).copied().unwrap_or(self.coded_width as usize);
-
-        let frame_layout = FrameLayout {
-            format: (nv12_fourcc(), 0),
-            size: CrosResolution { width: self.coded_width, height: self.coded_height },
-            planes: vec![
-                PlaneLayout { buffer_index: 0, offset: 0, stride: y_stride },
-                PlaneLayout {
-                    buffer_index: 0,
-                    offset: y_stride * self.coded_height as usize,
-                    stride: uv_stride,
-                },
-            ],
-        };
-
         let cros_meta =
             CrosFrameMetadata { timestamp, layout: frame_layout, force_keyframe: is_keyframe };
 
diff --git a/crates/nodes/src/video/vaapi_h264.rs b/crates/nodes/src/video/vaapi_h264.rs
index cb236adf..95ecc6fd 100644
--- a/crates/nodes/src/video/vaapi_h264.rs
+++ b/crates/nodes/src/video/vaapi_h264.rs
@@ -64,8 +64,7 @@ use cros_codecs::encoder::{
 use cros_codecs::libva;
 // GBM types are only needed transitively via vaapi_av1 helpers.
 use cros_codecs::video_frame::generic_dma_video_frame::GenericDmaVideoFrame;
-use cros_codecs::video_frame::VideoFrame as CrosVideoFrame;
-use cros_codecs::{FrameLayout, PlaneLayout, Resolution as CrosResolution};
+use cros_codecs::Resolution as CrosResolution;
 
 use super::encoder_trait::{self, EncodedPacket, EncoderNodeRunner, StandardVideoEncoder};
 use super::HwAccelMode;
@@ -720,7 +719,11 @@ impl StandardVideoEncoder for VaapiH264Encoder {
         }
 
         // Upload NV12 frame data — dispatch based on detected strategy.
-        let (dma_frame, pitches) = match &self.upload_strategy {
+        // The returned layout contains the actual plane offsets, strides, and
+        // buffer indices as reported by the DMA/PRIME subsystem rather than
+        // assumed values (which could be wrong on drivers that add inter-plane
+        // padding or use separate buffer objects per plane).
+        let (dma_frame, frame_layout) = match &self.upload_strategy {
             FrameUploadStrategy::Gbm(gbm) => {
                 upload_nv12_via_gbm(gbm, frame, self.coded_width, self.coded_height)?
             },
@@ -732,22 +735,6 @@ impl StandardVideoEncoder for VaapiH264Encoder {
         let is_keyframe = metadata.as_ref().and_then(|m| m.keyframe).unwrap_or(false);
         let timestamp = metadata.as_ref().and_then(|m| m.timestamp_us).unwrap_or(self.frame_count);
 
-        let y_stride = pitches.first().copied().unwrap_or(self.coded_width as usize);
-        let uv_stride = pitches.get(1).copied().unwrap_or(self.coded_width as usize);
-
-        let frame_layout = FrameLayout {
-            format: (nv12_fourcc(), 0),
-            size: CrosResolution { width: self.coded_width, height: self.coded_height },
-            planes: vec![
-                PlaneLayout { buffer_index: 0, offset: 0, stride: y_stride },
-                PlaneLayout {
-                    buffer_index: 0,
-                    offset: y_stride * self.coded_height as usize,
-                    stride: uv_stride,
-                },
-            ],
-        };
-
         let cros_meta =
             CrosFrameMetadata { timestamp, layout: frame_layout, force_keyframe: is_keyframe };