diff --git a/configure b/configure
index e2e9fc26d8c4d..69a6d75cc582f 100755
--- a/configure
+++ b/configure
@@ -3355,6 +3355,10 @@ scale_npp_filter_deps="ffnvcodec libnpp"
 scale2ref_npp_filter_deps="ffnvcodec libnpp"
 scale_cuda_filter_deps="ffnvcodec"
 scale_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+lut3d_cuda_filter_deps="ffnvcodec"
+lut3d_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+tonemap_cuda_filter_deps="ffnvcodec"
+tonemap_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 thumbnail_cuda_filter_deps="ffnvcodec"
 thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 97f8f1727203e..6a37adf15a416 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -35,6 +35,7 @@ OBJS-$(CONFIG_QSVVPP)                        += qsvvpp.o
 OBJS-$(CONFIG_SCENE_SAD)                     += scene_sad.o
 OBJS-$(CONFIG_DNN)                           += dnn_filter_common.o
 
+
 # audio filters
 OBJS-$(CONFIG_AAP_FILTER)                    += af_aap.o
 OBJS-$(CONFIG_ABENCH_FILTER)                 += f_bench.o
@@ -379,6 +380,8 @@ OBJS-$(CONFIG_LUT1D_FILTER)                  += vf_lut3d.o
 OBJS-$(CONFIG_LUT_FILTER)                    += vf_lut.o
 OBJS-$(CONFIG_LUT2_FILTER)                   += vf_lut2.o framesync.o
 OBJS-$(CONFIG_LUT3D_FILTER)                  += vf_lut3d.o framesync.o
+OBJS-$(CONFIG_LUT3D_CUDA_FILTER)             += vf_lut3d_cuda.o \
+                                                vf_lut3d_cuda.ptx.o cuda/load_helper.o
 OBJS-$(CONFIG_LUTRGB_FILTER)                 += vf_lut.o
 OBJS-$(CONFIG_LUTYUV_FILTER)                 += vf_lut.o
 OBJS-$(CONFIG_MASKEDCLAMP_FILTER)            += vf_maskedclamp.o framesync.o
@@ -530,6 +533,8 @@ OBJS-$(CONFIG_TMEDIAN_FILTER)                += vf_xmedian.o framesync.o
 OBJS-$(CONFIG_TMIDEQUALIZER_FILTER)          += vf_tmidequalizer.o
 OBJS-$(CONFIG_TMIX_FILTER)                   += vf_mix.o framesync.o
 OBJS-$(CONFIG_TONEMAP_FILTER)                += vf_tonemap.o
+OBJS-$(CONFIG_TONEMAP_CUDA_FILTER)           += vf_tonemap_cuda.o \
+                                                vf_tonemap_cuda.ptx.o cuda/load_helper.o
 OBJS-$(CONFIG_TONEMAP_OPENCL_FILTER)         += vf_tonemap_opencl.o opencl.o \
                                                 opencl/tonemap.o opencl/colorspace_common.o
 OBJS-$(CONFIG_TONEMAP_VAAPI_FILTER)          += vf_tonemap_vaapi.o vaapi_vpp.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 3bc045b28f552..4789bc2bb41ee 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -354,6 +354,7 @@ extern const FFFilter ff_vf_lut;
 extern const FFFilter ff_vf_lut1d;
 extern const FFFilter ff_vf_lut2;
 extern const FFFilter ff_vf_lut3d;
+extern const FFFilter ff_vf_lut3d_cuda;
 extern const FFFilter ff_vf_lutrgb;
 extern const FFFilter ff_vf_lutyuv;
 extern const FFFilter ff_vf_maskedclamp;
@@ -499,6 +500,7 @@ extern const FFFilter ff_vf_tmedian;
 extern const FFFilter ff_vf_tmidequalizer;
 extern const FFFilter ff_vf_tmix;
 extern const FFFilter ff_vf_tonemap;
+extern const FFFilter ff_vf_tonemap_cuda;
 extern const FFFilter ff_vf_tonemap_opencl;
 extern const FFFilter ff_vf_tonemap_vaapi;
 extern const FFFilter ff_vf_tpad;
diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c
index 2d6036df7423e..d07949fde2c88 100644
--- a/libavfilter/avfiltergraph.c
+++ b/libavfilter/avfiltergraph.c
@@ -448,6 +448,85 @@ static int formats_declared(AVFilterContext *f)
  *          was made and the negotiation is stuck;
  *          a negative error code if some other error happened
  */
+static void describe_link_formats_cfg(AVFilterLink *link, AVFilterFormatsConfig *cfg, AVBPrint *bp)
+{
+    if (!link || !cfg)
+        return;
+        
+    av_bprintf(bp, "formats=");
+    if (cfg->formats && cfg->formats->nb_formats > 0) {
+        av_bprintf(bp, "[");
+        for (unsigned i = 0; i < cfg->formats->nb_formats; i++) {
+            if (i > 0) av_bprintf(bp, ", ");
+            if (link->type == AVMEDIA_TYPE_VIDEO) {
+                av_bprintf(bp, "%s", av_get_pix_fmt_name(cfg->formats->formats[i]));
+            } else if (link->type == AVMEDIA_TYPE_AUDIO) {
+                av_bprintf(bp, "%s", av_get_sample_fmt_name(cfg->formats->formats[i]));
+            } else {
+                av_bprintf(bp, "%d", cfg->formats->formats[i]);
+            }
+        }
+        av_bprintf(bp, "]");
+    } else {
+        av_bprintf(bp, "none");
+    }
+    
+    if (link->type == AVMEDIA_TYPE_AUDIO) {
+        av_bprintf(bp, ", sample_rates=");
+        if (cfg->samplerates && cfg->samplerates->nb_formats > 0) {
+            av_bprintf(bp, "[");
+            for (unsigned i = 0; i < cfg->samplerates->nb_formats; i++) {
+                if (i > 0) av_bprintf(bp, ", ");
+                av_bprintf(bp, "%d", cfg->samplerates->formats[i]);
+            }
+            av_bprintf(bp, "]");
+        } else {
+            av_bprintf(bp, "none");
+        }
+        
+        av_bprintf(bp, ", channel_layouts=");
+        if (cfg->channel_layouts && cfg->channel_layouts->nb_channel_layouts > 0) {
+            av_bprintf(bp, "[");
+            for (int i = 0; i < cfg->channel_layouts->nb_channel_layouts; i++) {
+                if (i > 0) av_bprintf(bp, ", ");
+                char buf[256];
+                av_channel_layout_describe(&cfg->channel_layouts->channel_layouts[i], buf, sizeof(buf));
+                av_bprintf(bp, "%s", buf);
+            }
+            av_bprintf(bp, "]");
+        } else if (cfg->channel_layouts && cfg->channel_layouts->all_layouts) {
+            av_bprintf(bp, "all");
+        } else {
+            av_bprintf(bp, "none");
+        }
+    } else if (link->type == AVMEDIA_TYPE_VIDEO) {
+        av_bprintf(bp, ", color_spaces=");
+        if (cfg->color_spaces && cfg->color_spaces->nb_formats > 0) {
+            av_bprintf(bp, "[");
+            for (unsigned i = 0; i < cfg->color_spaces->nb_formats; i++) {
+                if (i > 0) av_bprintf(bp, ", ");
+                av_bprintf(bp, "%s", av_color_space_name(cfg->color_spaces->formats[i]));
+            }
+            av_bprintf(bp, "]");
+        } else {
+            av_bprintf(bp, "none");
+        }
+        
+        av_bprintf(bp, ", color_ranges=");
+        if (cfg->color_ranges && cfg->color_ranges->nb_formats > 0) {
+            av_bprintf(bp, "[");
+            for (unsigned i = 0; i < cfg->color_ranges->nb_formats; i++) {
+                if (i > 0) av_bprintf(bp, ", ");
+                av_bprintf(bp, "%s", av_color_range_name(cfg->color_ranges->formats[i]));
+            }
+            av_bprintf(bp, "]");
+        } else {
+            av_bprintf(bp, "none");
+        }
+    }
+}
+
+
 static int query_formats(AVFilterGraph *graph, void *log_ctx)
 {
     int i, j, ret;
@@ -526,6 +605,26 @@ static int query_formats(AVFilterGraph *graph, void *log_ctx)
                 }
 
                 /* couldn't merge format lists. auto-insert conversion filter */
+                {
+                    AVBPrint src_formats, dst_formats;
+                    av_bprint_init(&src_formats, 0, AV_BPRINT_SIZE_AUTOMATIC);
+                    av_bprint_init(&dst_formats, 0, AV_BPRINT_SIZE_AUTOMATIC);
+                    
+                    describe_link_formats_cfg(link, &link->outcfg, &src_formats);
+                    describe_link_formats_cfg(link, &link->incfg, &dst_formats);
+                    
+                    av_log(log_ctx, AV_LOG_INFO,
+                           "Auto-inserting filter '%s' between filters '%s' and '%s'\n"
+                           "Source link supports: %s\n"
+                           "Destination link supports: %s\n",
+                           neg->conversion_filter, link->src->name, link->dst->name,
+                           src_formats.str ? src_formats.str : "unknown",
+                           dst_formats.str ? dst_formats.str : "unknown");
+                    
+                    av_bprint_finalize(&src_formats, NULL);
+                    av_bprint_finalize(&dst_formats, NULL);
+                }
+                
                 if (!(filter = avfilter_get_by_name(neg->conversion_filter))) {
                     av_log(log_ctx, AV_LOG_ERROR,
                            "'%s' filter not present, cannot convert formats.\n",
@@ -578,9 +677,26 @@ static int query_formats(AVFilterGraph *graph, void *log_ctx)
                         (ret = MERGE(m, outlink)) <= 0) {
                         if (ret < 0)
                             return ret;
-                        av_log(log_ctx, AV_LOG_ERROR,
-                               "Impossible to convert between the formats supported by the filter "
-                               "'%s' and the filter '%s'\n", link->src->name, link->dst->name);
+                        {
+                            AVBPrint inlink_formats, outlink_formats;
+                            av_bprint_init(&inlink_formats, 0, AV_BPRINT_SIZE_AUTOMATIC);
+                            av_bprint_init(&outlink_formats, 0, AV_BPRINT_SIZE_AUTOMATIC);
+                            
+                            describe_link_formats_cfg(inlink, &inlink->outcfg, &inlink_formats);
+                            describe_link_formats_cfg(outlink, &outlink->incfg, &outlink_formats);
+                            
+                            av_log(log_ctx, AV_LOG_ERROR,
+                                   "Impossible to convert between the formats supported by the filter "
+                                   "'%s' and the filter '%s'\n"
+                                   "Input link %s supports: %s\n"
+                                   "Output link %s supports: %s\n",
+                                   link->src->name, link->dst->name,
+                                   link->src->name, inlink_formats.str ? inlink_formats.str : "unknown",
+                                   link->dst->name, outlink_formats.str ? outlink_formats.str : "unknown");
+                            
+                            av_bprint_finalize(&inlink_formats, NULL);
+                            av_bprint_finalize(&outlink_formats, NULL);
+                        }
                         return AVERROR(ENOSYS);
                     }
                 }
diff --git a/libavfilter/vf_colorspace_cuda.c b/libavfilter/vf_colorspace_cuda.c
index 54d6228cd1bb4..8ec0f24996998 100644
--- a/libavfilter/vf_colorspace_cuda.c
+++ b/libavfilter/vf_colorspace_cuda.c
@@ -32,6 +32,7 @@
 
 #include "avfilter.h"
 #include "filters.h"
+#include "colorspace.h"
 
 #include "cuda/load_helper.h"
 
@@ -59,13 +60,57 @@ typedef struct CUDAColorspaceContext {
     CUstream cu_stream;
     CUmodule cu_module;
     CUfunction cu_convert[AVCOL_RANGE_NB];
+    CUfunction cu_convert_colorspace;
+
 
     enum AVPixelFormat pix_fmt;
     enum AVColorRange range;
+    enum AVColorSpace in_csp, out_csp, user_csp;
+    enum AVColorPrimaries in_prm, out_prm, user_prm;
+    enum AVColorTransferCharacteristic in_trc, out_trc, user_trc;
+
+    // Conversion matrices for YUV-to-YUV colorspace conversion
+    float yuv2yuv_matrix[3][3];
+    float yuv_offset[2]; // input and output offsets
+    int colorspace_conversion_needed;
 
     int num_planes;
 } CUDAColorspaceContext;
 
+static int calculate_yuv2yuv_matrix(CUDAColorspaceContext* s)
+{
+    const AVLumaCoefficients *in_lumacoef, *out_lumacoef;
+    double yuv2rgb[3][3], rgb2yuv[3][3], yuv2yuv[3][3];
+    int i, j;
+
+    // Get luma coefficients for input and output colorspaces
+    in_lumacoef = av_csp_luma_coeffs_from_avcsp(s->in_csp);
+    out_lumacoef = av_csp_luma_coeffs_from_avcsp(s->out_csp);
+
+    if (!in_lumacoef || !out_lumacoef) {
+        return AVERROR(EINVAL);
+    }
+
+    // Calculate YUV to RGB matrix for input colorspace
+    ff_fill_rgb2yuv_table(in_lumacoef, rgb2yuv);
+    ff_matrix_invert_3x3(rgb2yuv, yuv2rgb);
+
+    // Calculate RGB to YUV matrix for output colorspace
+    ff_fill_rgb2yuv_table(out_lumacoef, rgb2yuv);
+
+    // Combine: YUV_in -> RGB -> YUV_out
+    ff_matrix_mul_3x3(yuv2yuv, yuv2rgb, rgb2yuv);
+
+    // Convert to float for CUDA kernel
+    for (i = 0; i < 3; i++) {
+        for (j = 0; j < 3; j++) {
+            s->yuv2yuv_matrix[i][j] = (float)yuv2yuv[i][j];
+        }
+    }
+
+    return 0;
+}
+
 static av_cold int cudacolorspace_init(AVFilterContext* ctx)
 {
     CUDAColorspaceContext* s = ctx->priv;
@@ -78,6 +123,7 @@ static av_cold int cudacolorspace_init(AVFilterContext* ctx)
     if (!s->tmp_frame)
         return AVERROR(ENOMEM);
 
+
     return 0;
 }
 
@@ -85,6 +131,7 @@ static av_cold void cudacolorspace_uninit(AVFilterContext* ctx)
 {
     CUDAColorspaceContext* s = ctx->priv;
 
+
     if (s->hwctx && s->cu_module) {
         CudaFunctions* cu = s->hwctx->internal->cuda_dl;
         CUcontext dummy;
@@ -172,13 +219,29 @@ static av_cold int init_processing_chain(AVFilterContext* ctx, int width,
         return AVERROR(EINVAL);
     }
 
-    if ((AVCOL_RANGE_MPEG != s->range) && (AVCOL_RANGE_JPEG != s->range)) {
+    if ((AVCOL_RANGE_MPEG != s->range) && (AVCOL_RANGE_JPEG != s->range) && 
+        (AVCOL_RANGE_UNSPECIFIED != s->range)) {
         av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n");
         return AVERROR(EINVAL);
     }
 
     s->num_planes = av_pix_fmt_count_planes(s->pix_fmt);
 
+    // Determine if colorspace conversion is needed
+    s->colorspace_conversion_needed = (s->in_csp != AVCOL_SPC_UNSPECIFIED && 
+                                       s->out_csp != AVCOL_SPC_UNSPECIFIED && 
+                                       s->in_csp != s->out_csp);
+
+    if (s->colorspace_conversion_needed) {
+        ret = calculate_yuv2yuv_matrix(s);
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to calculate colorspace conversion matrix\n");
+            return ret;
+        }
+        av_log(ctx, AV_LOG_INFO, "Colorspace conversion: %s -> %s\n",
+               av_color_space_name(s->in_csp), av_color_space_name(s->out_csp));
+    }
+
     ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, width, height);
     if (ret < 0)
         return ret;
@@ -218,6 +281,10 @@ static av_cold int cudacolorspace_load_functions(AVFilterContext* ctx)
     if (ret < 0)
         goto fail;
 
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_convert_colorspace, s->cu_module, "colorspace_convert_cuda"));
+    if (ret < 0)
+        goto fail;
+
 fail:
     CHECK_CU(cu->cuCtxPopCurrent(&dummy));
     return ret;
@@ -236,6 +303,19 @@ static av_cold int cudacolorspace_config_props(AVFilterLink* outlink)
     outlink->w = inlink->w;
     outlink->h = inlink->h;
 
+    // Set defaults - actual colorspace will be determined from frame properties
+    s->in_csp = inlink->colorspace != AVCOL_SPC_UNSPECIFIED ? inlink->colorspace : AVCOL_SPC_BT709;
+    s->in_prm = AVCOL_PRI_BT709;  // Default, will be updated from frame
+    s->in_trc = AVCOL_TRC_BT709;  // Default, will be updated from frame
+
+    // Set output colorspace properties
+    s->out_csp = s->user_csp != AVCOL_SPC_UNSPECIFIED ? s->user_csp : s->in_csp;
+    s->out_prm = s->user_prm != AVCOL_PRI_UNSPECIFIED ? s->user_prm : s->in_prm;
+    s->out_trc = s->user_trc != AVCOL_TRC_UNSPECIFIED ? s->user_trc : s->in_trc;
+
+    // Set output link properties
+    outlink->colorspace = s->out_csp;
+
     ret = init_processing_chain(ctx, inlink->w, inlink->h);
     if (ret < 0)
         return ret;
@@ -272,45 +352,108 @@ static int conv_cuda_convert(AVFilterContext* ctx, AVFrame* out, AVFrame* in)
     if (ret < 0)
         return ret;
 
-    out->color_range = s->range;
-
-    for (int i = 0; i < s->num_planes; i++) {
-        int width = in->width, height = in->height, comp_id = (i > 0);
-
-        switch (s->pix_fmt) {
-        case AV_PIX_FMT_YUV444P:
-            break;
-        case AV_PIX_FMT_YUV420P:
-            width = comp_id ? in->width / 2 : in->width;
-            /* fall-through */
-        case AV_PIX_FMT_NV12:
-            height = comp_id ? in->height / 2 : in->height;
-            break;
-        default:
-            av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n",
-                   av_get_pix_fmt_name(s->pix_fmt));
-            return AVERROR(EINVAL);
-        }
-
-        if (!s->cu_convert[out->color_range]) {
-            av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n");
-            return AVERROR(EINVAL);
+    // Update input colorspace properties from frame
+    s->in_csp = in->colorspace != AVCOL_SPC_UNSPECIFIED ? in->colorspace : s->in_csp;
+    s->in_prm = in->color_primaries != AVCOL_PRI_UNSPECIFIED ? in->color_primaries : s->in_prm;
+    s->in_trc = in->color_trc != AVCOL_TRC_UNSPECIFIED ? in->color_trc : s->in_trc;
+
+    // Update output colorspace if user didn't specify
+    if (s->user_csp == AVCOL_SPC_UNSPECIFIED) s->out_csp = s->in_csp;
+    if (s->user_prm == AVCOL_PRI_UNSPECIFIED) s->out_prm = s->in_prm;
+    if (s->user_trc == AVCOL_TRC_UNSPECIFIED) s->out_trc = s->in_trc;
+
+    // Check if we need to recalculate matrix
+    int need_recalc = (s->colorspace_conversion_needed && 
+                      (s->in_csp != s->out_csp));
+    
+    if (need_recalc) {
+        ret = calculate_yuv2yuv_matrix(s);
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to recalculate colorspace conversion matrix\n");
+            goto fail;
         }
+    }
 
-        if (in->color_range != out->color_range) {
-            void* args[] = {&in->data[i], &out->data[i], &in->linesize[i],
-                            &comp_id};
+    // Set output frame properties
+    out->color_range = s->range != AVCOL_RANGE_UNSPECIFIED ? s->range : in->color_range;
+    out->colorspace = s->out_csp;
+    out->color_primaries = s->out_prm;
+    out->color_trc = s->out_trc;
+
+    // If only colorspace conversion is needed (no range conversion)
+    if (s->colorspace_conversion_needed && (in->color_range == out->color_range || s->range == AVCOL_RANGE_UNSPECIFIED)) {
+        for (int i = 0; i < s->num_planes; i++) {
+            int width = in->width, height = in->height;
+
+            switch (s->pix_fmt) {
+            case AV_PIX_FMT_YUV444P:
+                break;
+            case AV_PIX_FMT_YUV420P:
+                width = (i > 0) ? in->width / 2 : in->width;
+                /* fall-through */
+            case AV_PIX_FMT_NV12:
+                height = (i > 0) ? in->height / 2 : in->height;
+                break;
+            default:
+                av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n",
+                       av_get_pix_fmt_name(s->pix_fmt));
+                return AVERROR(EINVAL);
+            }
+
+            void* args[] = {&in->data[i], &out->data[i], &in->linesize[i], &out->linesize[i],
+                            &width, &height, &i, 
+                            &s->yuv2yuv_matrix[0][0], &s->yuv2yuv_matrix[0][1], &s->yuv2yuv_matrix[0][2],
+                            &s->yuv2yuv_matrix[1][0], &s->yuv2yuv_matrix[1][1], &s->yuv2yuv_matrix[1][2],
+                            &s->yuv2yuv_matrix[2][0], &s->yuv2yuv_matrix[2][1], &s->yuv2yuv_matrix[2][2]};
             ret = CHECK_CU(cu->cuLaunchKernel(
-                s->cu_convert[out->color_range], DIV_UP(width, BLOCKX),
+                s->cu_convert_colorspace, DIV_UP(width, BLOCKX),
                 DIV_UP(height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream,
                 args, NULL));
-        } else {
-            ret = av_hwframe_transfer_data(out, in, 0);
             if (ret < 0)
-                return ret;
+                goto fail;
+        }
+    } else {
+        // Handle range conversion or passthrough
+        for (int i = 0; i < s->num_planes; i++) {
+            int width = in->width, height = in->height, comp_id = (i > 0);
+
+            switch (s->pix_fmt) {
+            case AV_PIX_FMT_YUV444P:
+                break;
+            case AV_PIX_FMT_YUV420P:
+                width = comp_id ? in->width / 2 : in->width;
+                /* fall-through */
+            case AV_PIX_FMT_NV12:
+                height = comp_id ? in->height / 2 : in->height;
+                break;
+            default:
+                av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n",
+                       av_get_pix_fmt_name(s->pix_fmt));
+                return AVERROR(EINVAL);
+            }
+
+            if (s->range != AVCOL_RANGE_UNSPECIFIED && in->color_range != out->color_range) {
+                if (!s->cu_convert[out->color_range]) {
+                    av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n");
+                    return AVERROR(EINVAL);
+                }
+
+                void* args[] = {&in->data[i], &out->data[i], &in->linesize[i], &comp_id};
+                ret = CHECK_CU(cu->cuLaunchKernel(
+                    s->cu_convert[out->color_range], DIV_UP(width, BLOCKX),
+                    DIV_UP(height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream,
+                    args, NULL));
+                if (ret < 0)
+                    goto fail;
+            } else {
+                ret = av_hwframe_transfer_data(out, in, 0);
+                if (ret < 0)
+                    goto fail;
+            }
         }
     }
 
+fail:
     CHECK_CU(cu->cuCtxPopCurrent(&dummy));
     return ret;
 }
@@ -344,6 +487,7 @@ static int cudacolorspace_conv(AVFilterContext* ctx, AVFrame* out, AVFrame* in)
     return 0;
 }
 
+
 static int cudacolorspace_filter_frame(AVFilterLink* link, AVFrame* in)
 {
     AVFilterContext* ctx = link->dst;
@@ -392,6 +536,27 @@ static const AVOption options[] = {
         {"mpeg", "Limited range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, .unit = "range"},
         {"pc",   "Full range",    0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, .unit = "range"},
         {"jpeg", "Full range",    0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, .unit = "range"},
+
+    {"space", "Output colorspace", OFFSET(user_csp), AV_OPT_TYPE_INT, { .i64 = AVCOL_SPC_UNSPECIFIED }, AVCOL_SPC_RGB, AVCOL_SPC_NB - 1, FLAGS, .unit = "csp"},
+        {"bt709",       NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_SPC_BT709 },       0, 0, FLAGS, .unit = "csp"},
+        {"bt470bg",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_SPC_BT470BG },     0, 0, FLAGS, .unit = "csp"},
+        {"smpte170m",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_SPC_SMPTE170M },   0, 0, FLAGS, .unit = "csp"},
+        {"bt2020nc",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_SPC_BT2020_NCL },  0, 0, FLAGS, .unit = "csp"},
+        {"bt2020ncl",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_SPC_BT2020_NCL },  0, 0, FLAGS, .unit = "csp"},
+        {"bt601",       NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_SPC_SMPTE170M },   0, 0, FLAGS, .unit = "csp"},
+
+    {"primaries", "Output color primaries", OFFSET(user_prm), AV_OPT_TYPE_INT, { .i64 = AVCOL_PRI_UNSPECIFIED }, AVCOL_PRI_RESERVED0, AVCOL_PRI_NB - 1, FLAGS, .unit = "prm"},
+        {"bt709",       NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_PRI_BT709 },      0, 0, FLAGS, .unit = "prm"},
+        {"bt470bg",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_PRI_BT470BG },    0, 0, FLAGS, .unit = "prm"},
+        {"smpte170m",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_PRI_SMPTE170M },  0, 0, FLAGS, .unit = "prm"},
+        {"bt2020",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_PRI_BT2020 },     0, 0, FLAGS, .unit = "prm"},
+
+    {"trc", "Output transfer characteristics", OFFSET(user_trc), AV_OPT_TYPE_INT, { .i64 = AVCOL_TRC_UNSPECIFIED }, AVCOL_TRC_RESERVED0, AVCOL_TRC_NB - 1, FLAGS, .unit = "trc"},
+        {"bt709",       NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_TRC_BT709 },      0, 0, FLAGS, .unit = "trc"},
+        {"bt2020-10",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_TRC_BT2020_10 },  0, 0, FLAGS, .unit = "trc"},
+        {"smpte170m",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_TRC_SMPTE170M },  0, 0, FLAGS, .unit = "trc"},
+    
+
     {NULL},
 };
 
diff --git a/libavfilter/vf_colorspace_cuda.cu b/libavfilter/vf_colorspace_cuda.cu
index 59db9f69f3c36..b3ab4a5e498cd 100644
--- a/libavfilter/vf_colorspace_cuda.cu
+++ b/libavfilter/vf_colorspace_cuda.cu
@@ -91,4 +91,36 @@ __global__ void to_mpeg_cuda(const unsigned char* src, unsigned char* dst,
     dst[x + y * pitch] = static_cast<unsigned char>(dst_);
 }
 
+__global__ void colorspace_convert_cuda(const unsigned char* src, unsigned char* dst,
+                                       int src_pitch, int dst_pitch, int width, int height, int plane_id,
+                                       float m00, float m01, float m02,
+                                       float m10, float m11, float m12,
+                                       float m20, float m21, float m22)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x >= width || y >= height)
+        return;
+
+    if (plane_id == 0) {
+        // Luma plane (Y) - apply only first row of matrix
+        float src_y = (float)src[x + y * src_pitch];
+        float dst_y = m00 * src_y + m01 * 128.0f + m02 * 128.0f;
+        dst[x + y * dst_pitch] = (unsigned char)clamp((int)(dst_y + 0.5f), 0, 255);
+    } else if (plane_id == 1) {
+        // U plane - apply second row of matrix, but we only have U component
+        float src_u = (float)src[x + y * src_pitch] - 128.0f;
+        // For U plane conversion, we apply the matrix considering U input and zero V
+        float dst_u = m10 * 128.0f + m11 * (src_u + 128.0f) + m12 * 128.0f;
+        dst[x + y * dst_pitch] = (unsigned char)clamp((int)(dst_u + 0.5f), 0, 255);
+    } else {
+        // V plane - apply third row of matrix, but we only have V component  
+        float src_v = (float)src[x + y * src_pitch] - 128.0f;
+        // For V plane conversion, we apply the matrix considering zero U input and V
+        float dst_v = m20 * 128.0f + m21 * 128.0f + m22 * (src_v + 128.0f);
+        dst[x + y * dst_pitch] = (unsigned char)clamp((int)(dst_v + 0.5f), 0, 255);
+    }
+}
+
 }
diff --git a/libavfilter/vf_lut3d_cuda.c b/libavfilter/vf_lut3d_cuda.c
new file mode 100644
index 0000000000000..16fceb4a1c3d0
--- /dev/null
+++ b/libavfilter/vf_lut3d_cuda.c
@@ -0,0 +1,604 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <float.h>
+#include <stdio.h>
+
+#include "libavutil/common.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/internal.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/mem.h"
+#include "libavutil/file_open.h"
+#include "libavutil/intfloat.h"
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+
+#include "avfilter.h"
+#include "filters.h"
+#include "video.h"
+#include "lut3d.h"
+
+#include "cuda/load_helper.h"
+#include "vf_lut3d_cuda.h"
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_0RGB32,
+    AV_PIX_FMT_0BGR32,
+    AV_PIX_FMT_RGB32,
+    AV_PIX_FMT_BGR32,
+    AV_PIX_FMT_RGB48,
+    AV_PIX_FMT_BGR48,
+    AV_PIX_FMT_RGBA64,
+    AV_PIX_FMT_BGRA64,
+};
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define BLOCKX 32
+#define BLOCKY 16
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+typedef struct LUT3DCudaContext {
+    const AVClass *class;
+
+    AVCUDADeviceContext *hwctx;
+
+    enum AVPixelFormat in_fmt, out_fmt;
+    const AVPixFmtDescriptor *in_desc, *out_desc;
+
+    AVBufferRef *frames_ctx;
+    AVFrame     *frame;
+    AVFrame *tmp_frame;
+
+    CUcontext   cu_ctx;
+    CUmodule    cu_module;
+    CUfunction  cu_func;
+    CUstream    cu_stream;
+    CUdeviceptr cu_lut;
+
+    struct rgbvec *lut;
+    int lutsize;
+    int lutsize2;
+    struct rgbvec scale;
+    int interpolation;
+    char *file;
+
+    int bit_depth;
+    int passthrough;
+} LUT3DCudaContext;
+
+static int format_is_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+static int get_bit_depth(enum AVPixelFormat fmt)
+{
+    switch (fmt) {
+    case AV_PIX_FMT_0RGB32:
+    case AV_PIX_FMT_0BGR32:
+    case AV_PIX_FMT_RGB32:
+    case AV_PIX_FMT_BGR32:
+        return 8;
+    case AV_PIX_FMT_RGB48:
+    case AV_PIX_FMT_BGR48:
+        return 12;
+    case AV_PIX_FMT_RGBA64:
+    case AV_PIX_FMT_BGRA64:
+        return 10;
+    default:
+        return 8;
+    }
+}
+
+static av_cold int lut3d_cuda_init(AVFilterContext *ctx)
+{
+    LUT3DCudaContext *s = ctx->priv;
+
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
+    s->tmp_frame = av_frame_alloc();
+    if (!s->tmp_frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold void lut3d_cuda_uninit(AVFilterContext *ctx)
+{
+    LUT3DCudaContext *s = ctx->priv;
+
+    if (s->hwctx && s->cu_module) {
+        CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+        CUcontext dummy;
+
+        CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+        if (s->cu_lut) {
+            CHECK_CU(cu->cuMemFree(s->cu_lut));
+            s->cu_lut = 0;
+        }
+        CHECK_CU(cu->cuModuleUnload(s->cu_module));
+        s->cu_module = NULL;
+        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    }
+
+    av_freep(&s->lut);
+    av_frame_free(&s->frame);
+    av_buffer_unref(&s->frames_ctx);
+    av_frame_free(&s->tmp_frame);
+}
+
+static av_cold int init_hwframe_ctx(LUT3DCudaContext *s, AVBufferRef *device_ctx, int width, int height)
+{
+    AVBufferRef *out_ref = NULL;
+    AVHWFramesContext *out_ctx;
+    int ret;
+
+    out_ref = av_hwframe_ctx_alloc(device_ctx);
+    if (!out_ref)
+        return AVERROR(ENOMEM);
+    out_ctx = (AVHWFramesContext*)out_ref->data;
+
+    out_ctx->format    = AV_PIX_FMT_CUDA;
+    out_ctx->sw_format = s->out_fmt;
+    out_ctx->width     = FFALIGN(width,  32);
+    out_ctx->height    = FFALIGN(height, 32);
+
+    ret = av_hwframe_ctx_init(out_ref);
+    if (ret < 0)
+        goto fail;
+
+    av_frame_unref(s->frame);
+    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);
+    if (ret < 0)
+        goto fail;
+
+    s->frame->width  = width;
+    s->frame->height = height;
+
+    av_buffer_unref(&s->frames_ctx);
+    s->frames_ctx = out_ref;
+
+    return 0;
+fail:
+    av_buffer_unref(&out_ref);
+    return ret;
+}
+
+#define MAX_LINE_SIZE 512
+
+static int skip_line(const char *p)
+{
+    while (*p && av_isspace(*p))
+        p++;
+    return !*p || *p == '#';
+}
+
+#define NEXT_LINE(loop_cond) do {                           \
+    if (!fgets(line, sizeof(line), f)) {                    \
+        av_log(ctx, AV_LOG_ERROR, "Unexpected EOF\n");      \
+        return AVERROR_INVALIDDATA;                         \
+    }                                                       \
+} while (loop_cond)
+
+static int parse_cube(AVFilterContext *ctx, FILE *f)
+{
+    LUT3DCudaContext *lut3d = ctx->priv;
+    char line[MAX_LINE_SIZE];
+    float min[3] = {0.0, 0.0, 0.0};
+    float max[3] = {1.0, 1.0, 1.0};
+    int size = 0, size2, i, j, k;
+
+    while (fgets(line, sizeof(line), f)) {
+        if (!strncmp(line, "LUT_3D_SIZE", 11)) {
+            size = strtol(line + 12, NULL, 0);
+            if (size < 2 || size > MAX_LEVEL) {
+                av_log(ctx, AV_LOG_ERROR, "Too large or invalid 3D LUT size\n");
+                return AVERROR(EINVAL);
+            }
+            lut3d->lutsize = size;
+            lut3d->lutsize2 = size * size;
+            break;
+        }
+    }
+
+    if (!size) {
+        av_log(ctx, AV_LOG_ERROR, "3D LUT size not found\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (!(lut3d->lut = av_malloc_array(size * size * size, sizeof(*lut3d->lut))))
+        return AVERROR(ENOMEM);
+
+    size2 = size * size;
+    rewind(f);
+
+    for (i = 0; i < size; i++) {
+        for (j = 0; j < size; j++) {
+            for (k = 0; k < size; k++) {
+                struct rgbvec *vec = &lut3d->lut[i * size2 + j * size + k];
+
+                do {
+try_again:
+                    NEXT_LINE(0);
+                    if (!strncmp(line, "DOMAIN_", 7)) {
+                        float *vals = NULL;
+                        if      (!strncmp(line + 7, "MIN ", 4)) vals = min;
+                        else if (!strncmp(line + 7, "MAX ", 4)) vals = max;
+                        if (!vals)
+                            return AVERROR_INVALIDDATA;
+                        if (av_sscanf(line + 11, "%f %f %f", vals, vals + 1, vals + 2) != 3)
+                            return AVERROR_INVALIDDATA;
+                        av_log(ctx, AV_LOG_DEBUG, "min: %f %f %f | max: %f %f %f\n",
+                               min[0], min[1], min[2], max[0], max[1], max[2]);
+                        goto try_again;
+                    } else if (!strncmp(line, "TITLE", 5)) {
+                        goto try_again;
+                    }
+                } while (skip_line(line));
+                if (av_sscanf(line, "%f %f %f", &vec->r, &vec->g, &vec->b) != 3)
+                    return AVERROR_INVALIDDATA;
+                vec->r = av_clipf(vec->r, 0.f, 1.f);
+                vec->g = av_clipf(vec->g, 0.f, 1.f);
+                vec->b = av_clipf(vec->b, 0.f, 1.f);
+            }
+        }
+    }
+
+    lut3d->scale.r = av_clipf(1. / (max[0] - min[0]), 0.f, 1.f) * (lut3d->lutsize - 1);
+    lut3d->scale.g = av_clipf(1. / (max[1] - min[1]), 0.f, 1.f) * (lut3d->lutsize - 1);
+    lut3d->scale.b = av_clipf(1. / (max[2] - min[2]), 0.f, 1.f) * (lut3d->lutsize - 1);
+
+    return 0;
+}
+
+static av_cold int lut3d_cuda_load_cube(AVFilterContext *ctx, const char *fname)
+{
+    FILE *f;
+    int ret;
+
+    f = avpriv_fopen_utf8(fname, "r");
+    if (!f) {
+        ret = AVERROR_INVALIDDATA;
+        av_log(ctx, AV_LOG_ERROR, "Cannot read file '%s': %s\n", fname, av_err2str(ret));
+        return ret;
+    }
+
+    ret = parse_cube(ctx, f);
+    fclose(f);
+    return ret;
+}
+
+static av_cold int lut3d_cuda_load_functions(AVFilterContext *ctx)
+{
+    LUT3DCudaContext *s = ctx->priv;
+    CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    char function_name[128];
+    int ret;
+
+    extern const unsigned char ff_vf_lut3d_cuda_ptx_data[];
+    extern const unsigned int ff_vf_lut3d_cuda_ptx_len;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module,
+                              ff_vf_lut3d_cuda_ptx_data, ff_vf_lut3d_cuda_ptx_len);
+    if (ret < 0)
+        goto fail;
+
+    switch(s->interpolation) {
+        case INTERPOLATE_NEAREST:
+            snprintf(function_name, sizeof(function_name), "lut3d_interp_%d_nearest", s->bit_depth);
+            break;
+        case INTERPOLATE_TRILINEAR:
+            snprintf(function_name, sizeof(function_name), "lut3d_interp_%d_trilinear", s->bit_depth);
+            break;
+        case INTERPOLATE_TETRAHEDRAL:
+        default:
+            snprintf(function_name, sizeof(function_name), "lut3d_interp_%d_tetrahedral", s->bit_depth);
+            break;
+    }
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func, s->cu_module, function_name));
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_FATAL, "Failed to load CUDA function %s\n", function_name);
+        goto fail;
+    }
+
+    ret = CHECK_CU(cu->cuMemAlloc(&s->cu_lut, s->lutsize * s->lutsize * s->lutsize * 3 * sizeof(float)));
+    if (ret < 0)
+        goto fail;
+
+    ret = CHECK_CU(cu->cuMemcpyHtoD(s->cu_lut, s->lut, s->lutsize * s->lutsize * s->lutsize * sizeof(struct rgbvec)));
+    if (ret < 0)
+        goto fail;
+
+fail:
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+
+    return ret;
+}
+
+static av_cold int init_processing_chain(AVFilterContext *ctx, int in_width, int in_height,
+                                         int out_width, int out_height)
+{
+    LUT3DCudaContext *s = ctx->priv;
+    FilterLink     *inl = ff_filter_link(ctx->inputs[0]);
+    FilterLink    *outl = ff_filter_link(ctx->outputs[0]);
+
+    AVHWFramesContext *in_frames_ctx;
+    enum AVPixelFormat in_format;
+    enum AVPixelFormat out_format;
+    int ret;
+
+    if (!inl->hw_frames_ctx) {
+        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
+        return AVERROR(EINVAL);
+    }
+    in_frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data;
+    in_format     = in_frames_ctx->sw_format;
+    out_format    = in_format;
+
+    if (!format_is_supported(in_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
+               av_get_pix_fmt_name(in_format));
+        return AVERROR(ENOSYS);
+    }
+    if (!format_is_supported(out_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
+               av_get_pix_fmt_name(out_format));
+        return AVERROR(ENOSYS);
+    }
+
+    s->in_fmt = in_format;
+    s->out_fmt = out_format;
+    s->in_desc  = av_pix_fmt_desc_get(s->in_fmt);
+    s->out_desc = av_pix_fmt_desc_get(s->out_fmt);
+    s->bit_depth = get_bit_depth(s->in_fmt);
+
+    if (in_width == out_width && in_height == out_height && in_format == out_format) {
+        s->frames_ctx = av_buffer_ref(inl->hw_frames_ctx);
+        if (!s->frames_ctx)
+            return AVERROR(ENOMEM);
+    } else {
+        s->passthrough = 0;
+
+        ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, out_width, out_height);
+        if (ret < 0)
+            return ret;
+    }
+
+    outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
+    if (!outl->hw_frames_ctx)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold int lut3d_cuda_config_props(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    AVFilterLink *inlink = outlink->src->inputs[0];
+    FilterLink      *inl = ff_filter_link(inlink);
+    LUT3DCudaContext *s  = ctx->priv;
+    AVHWFramesContext     *frames_ctx;
+    AVCUDADeviceContext *device_hwctx;
+    int ret;
+
+    if (s->file) {
+        ret = lut3d_cuda_load_cube(ctx, s->file);
+        if (ret < 0)
+            return ret;
+    }
+
+    outlink->w = inlink->w;
+    outlink->h = inlink->h;
+
+    ret = init_processing_chain(ctx, inlink->w, inlink->h, outlink->w, outlink->h);
+    if (ret < 0)
+        return ret;
+
+    frames_ctx   = (AVHWFramesContext*)inl->hw_frames_ctx->data;
+    device_hwctx = frames_ctx->device_ctx->hwctx;
+
+    s->hwctx = device_hwctx;
+    s->cu_stream = s->hwctx->stream;
+
+    av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d fmt:%s -> w:%d h:%d fmt:%s%s\n",
+           inlink->w, inlink->h, av_get_pix_fmt_name(s->in_fmt),
+           outlink->w, outlink->h, av_get_pix_fmt_name(s->out_fmt),
+           s->passthrough ? " (passthrough)" : "");
+
+    ret = lut3d_cuda_load_functions(ctx);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+static int call_lut3d_kernel(AVFilterContext *ctx, AVFrame *out, AVFrame *in)
+{
+    LUT3DCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx;
+    CUtexObject tex = 0;
+    int ret;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    CUDA_TEXTURE_DESC tex_desc = {
+        .filterMode = CU_TR_FILTER_MODE_POINT,
+        .flags = CU_TRSF_READ_AS_INTEGER,
+    };
+
+    CUDA_RESOURCE_DESC res_desc = {
+        .resType = CU_RESOURCE_TYPE_PITCH2D,
+        .res.pitch2D.format = (s->bit_depth > 8) ? CU_AD_FORMAT_UNSIGNED_INT16 : CU_AD_FORMAT_UNSIGNED_INT8,
+        .res.pitch2D.numChannels = 4,
+        .res.pitch2D.width = in->width,
+        .res.pitch2D.height = in->height,
+        .res.pitch2D.pitchInBytes = in->linesize[0],
+        .res.pitch2D.devPtr = (CUdeviceptr)in->data[0],
+    };
+
+    ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+    if (ret < 0)
+        goto exit;
+
+    void *args[] = {
+        &tex,
+        &out->data[0],
+        &out->width, &out->height, &out->linesize[0],
+        &s->cu_lut, &s->lutsize, 
+        &s->scale.r, &s->scale.g, &s->scale.b
+    };
+
+    ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func,
+                                       DIV_UP(out->width, BLOCKX), DIV_UP(out->height, BLOCKY), 1,
+                                       BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL));
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+
+    return ret;
+}
+
+static int lut3d_cuda_filter_frame(AVFilterLink *link, AVFrame *in)
+{
+    AVFilterContext       *ctx = link->dst;
+    LUT3DCudaContext        *s = ctx->priv;
+    AVFilterLink      *outlink = ctx->outputs[0];
+    CudaFunctions          *cu = s->hwctx->internal->cuda_dl;
+
+    AVFrame *out = NULL;
+    CUcontext dummy;
+    int ret = 0;
+
+    if (s->passthrough)
+        return ff_filter_frame(outlink, in);
+
+    out = av_frame_alloc();
+    if (!out) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+    if (ret < 0)
+        goto fail;
+
+    ret = av_hwframe_get_buffer(s->frames_ctx, out, 0);
+    if (ret < 0)
+        goto fail;
+
+    ret = call_lut3d_kernel(ctx, out, in);
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    if (ret < 0)
+        goto fail;
+
+    ret = av_frame_copy_props(out, in);
+    if (ret < 0)
+        goto fail;
+
+    av_frame_free(&in);
+    return ff_filter_frame(outlink, out);
+
+fail:
+    av_frame_free(&in);
+    av_frame_free(&out);
+    return ret;
+}
+
+#define OFFSET(x) offsetof(LUT3DCudaContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
+#define TFLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM)
+
+static const AVOption lut3d_cuda_options[] = {
+    { "file", "set 3D LUT file name", OFFSET(file), AV_OPT_TYPE_STRING, {.str=NULL}, .flags = FLAGS },
+    { "interp", "select interpolation mode", OFFSET(interpolation), AV_OPT_TYPE_INT, {.i64=INTERPOLATE_TETRAHEDRAL}, 0, NB_INTERP_MODE-1, TFLAGS, .unit = "interp_mode" },
+        { "nearest",     "use values from the nearest defined points",            0, AV_OPT_TYPE_CONST, {.i64=INTERPOLATE_NEAREST},     0, 0, TFLAGS, .unit = "interp_mode" },
+        { "trilinear",   "interpolate values using the 8 points defining a cube", 0, AV_OPT_TYPE_CONST, {.i64=INTERPOLATE_TRILINEAR},   0, 0, TFLAGS, .unit = "interp_mode" },
+        { "tetrahedral", "interpolate values using a tetrahedron",                0, AV_OPT_TYPE_CONST, {.i64=INTERPOLATE_TETRAHEDRAL}, 0, 0, TFLAGS, .unit = "interp_mode" },
+    { "passthrough", "Do not process frames at all if parameters match", OFFSET(passthrough), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS },
+    { NULL },
+};
+
+static const AVClass lut3d_cuda_class = {
+    .class_name = "lut3d_cuda",
+    .item_name  = av_default_item_name,
+    .option     = lut3d_cuda_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVFilterPad lut3d_cuda_inputs[] = {
+    {
+        .name        = "default",
+        .type        = AVMEDIA_TYPE_VIDEO,
+        .filter_frame = lut3d_cuda_filter_frame,
+    },
+};
+
+static const AVFilterPad lut3d_cuda_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = lut3d_cuda_config_props,
+    },
+};
+
+const FFFilter ff_vf_lut3d_cuda = {
+    .p.name        = "lut3d_cuda",
+    .p.description = NULL_IF_CONFIG_SMALL("GPU accelerated 3D LUT filter"),
+
+    .p.priv_class  = &lut3d_cuda_class,
+
+    .init          = lut3d_cuda_init,
+    .uninit        = lut3d_cuda_uninit,
+
+    .priv_size = sizeof(LUT3DCudaContext),
+
+    FILTER_INPUTS(lut3d_cuda_inputs),
+    FILTER_OUTPUTS(lut3d_cuda_outputs),
+
+    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
+
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
\ No newline at end of file
diff --git a/libavfilter/vf_lut3d_cuda.cu b/libavfilter/vf_lut3d_cuda.cu
new file mode 100644
index 0000000000000..a142299e13d76
--- /dev/null
+++ b/libavfilter/vf_lut3d_cuda.cu
@@ -0,0 +1,423 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "cuda/vector_helpers.cuh"
+
+#define BLOCKX 32
+#define BLOCKY 16
+
+extern "C" {
+
+__device__ static inline float3 lerpf3(float3 a, float3 b, float t)
+{
+    return make_float3(
+        a.x + (b.x - a.x) * t,
+        a.y + (b.y - a.y) * t,
+        a.z + (b.z - a.z) * t
+    );
+}
+
+__device__ static inline float3 interp_nearest_cuda(float3 *lut, int lutsize, float3 s)
+{
+    int r = (int)(s.x + 0.5f);
+    int g = (int)(s.y + 0.5f);
+    int b = (int)(s.z + 0.5f);
+    
+    r = min(max(r, 0), lutsize - 1);
+    g = min(max(g, 0), lutsize - 1);
+    b = min(max(b, 0), lutsize - 1);
+    
+    return lut[r * lutsize * lutsize + g * lutsize + b];
+}
+
+__device__ static inline float3 interp_trilinear_cuda(float3 *lut, int lutsize, float3 s)
+{
+    int lutsize2 = lutsize * lutsize;
+    
+    int prev_r = (int)s.x;
+    int prev_g = (int)s.y;
+    int prev_b = (int)s.z;
+    
+    int next_r = min(prev_r + 1, lutsize - 1);
+    int next_g = min(prev_g + 1, lutsize - 1);
+    int next_b = min(prev_b + 1, lutsize - 1);
+    
+    float3 d = make_float3(s.x - prev_r, s.y - prev_g, s.z - prev_b);
+    
+    float3 c000 = lut[prev_r * lutsize2 + prev_g * lutsize + prev_b];
+    float3 c001 = lut[prev_r * lutsize2 + prev_g * lutsize + next_b];
+    float3 c010 = lut[prev_r * lutsize2 + next_g * lutsize + prev_b];
+    float3 c011 = lut[prev_r * lutsize2 + next_g * lutsize + next_b];
+    float3 c100 = lut[next_r * lutsize2 + prev_g * lutsize + prev_b];
+    float3 c101 = lut[next_r * lutsize2 + prev_g * lutsize + next_b];
+    float3 c110 = lut[next_r * lutsize2 + next_g * lutsize + prev_b];
+    float3 c111 = lut[next_r * lutsize2 + next_g * lutsize + next_b];
+    
+    float3 c00 = lerpf3(c000, c100, d.x);
+    float3 c10 = lerpf3(c010, c110, d.x);
+    float3 c01 = lerpf3(c001, c101, d.x);
+    float3 c11 = lerpf3(c011, c111, d.x);
+    float3 c0 = lerpf3(c00, c10, d.y);
+    float3 c1 = lerpf3(c01, c11, d.y);
+    
+    return lerpf3(c0, c1, d.z);
+}
+
+__device__ static inline float3 interp_tetrahedral_cuda(float3 *lut, int lutsize, float3 s)
+{
+    int lutsize2 = lutsize * lutsize;
+    
+    int prev_r = (int)s.x;
+    int prev_g = (int)s.y;
+    int prev_b = (int)s.z;
+    
+    int next_r = min(prev_r + 1, lutsize - 1);
+    int next_g = min(prev_g + 1, lutsize - 1);
+    int next_b = min(prev_b + 1, lutsize - 1);
+    
+    float3 d = make_float3(s.x - prev_r, s.y - prev_g, s.z - prev_b);
+    
+    float3 c000 = lut[prev_r * lutsize2 + prev_g * lutsize + prev_b];
+    float3 c111 = lut[next_r * lutsize2 + next_g * lutsize + next_b];
+    
+    float3 c;
+    if (d.x > d.y && d.x > d.z) {
+        if (d.y > d.z) {
+            float3 c100 = lut[next_r * lutsize2 + prev_g * lutsize + prev_b];
+            float3 c110 = lut[next_r * lutsize2 + next_g * lutsize + prev_b];
+            c = make_float3(
+                c000.x + d.x * (c100.x - c000.x) + d.y * (c110.x - c100.x) + d.z * (c111.x - c110.x),
+                c000.y + d.x * (c100.y - c000.y) + d.y * (c110.y - c100.y) + d.z * (c111.y - c110.y),
+                c000.z + d.x * (c100.z - c000.z) + d.y * (c110.z - c100.z) + d.z * (c111.z - c110.z)
+            );
+        } else {
+            float3 c100 = lut[next_r * lutsize2 + prev_g * lutsize + prev_b];
+            float3 c101 = lut[next_r * lutsize2 + prev_g * lutsize + next_b];
+            c = make_float3(
+                c000.x + d.x * (c100.x - c000.x) + d.z * (c101.x - c100.x) + d.y * (c111.x - c101.x),
+                c000.y + d.x * (c100.y - c000.y) + d.z * (c101.y - c100.y) + d.y * (c111.y - c101.y),
+                c000.z + d.x * (c100.z - c000.z) + d.z * (c101.z - c100.z) + d.y * (c111.z - c101.z)
+            );
+        }
+    } else if (d.y > d.z) {
+        if (d.x > d.z) {
+            float3 c010 = lut[prev_r * lutsize2 + next_g * lutsize + prev_b];
+            float3 c110 = lut[next_r * lutsize2 + next_g * lutsize + prev_b];
+            c = make_float3(
+                c000.x + d.y * (c010.x - c000.x) + d.x * (c110.x - c010.x) + d.z * (c111.x - c110.x),
+                c000.y + d.y * (c010.y - c000.y) + d.x * (c110.y - c010.y) + d.z * (c111.y - c110.y),
+                c000.z + d.y * (c010.z - c000.z) + d.x * (c110.z - c010.z) + d.z * (c111.z - c110.z)
+            );
+        } else {
+            float3 c010 = lut[prev_r * lutsize2 + next_g * lutsize + prev_b];
+            float3 c011 = lut[prev_r * lutsize2 + next_g * lutsize + next_b];
+            c = make_float3(
+                c000.x + d.y * (c010.x - c000.x) + d.z * (c011.x - c010.x) + d.x * (c111.x - c011.x),
+                c000.y + d.y * (c010.y - c000.y) + d.z * (c011.y - c010.y) + d.x * (c111.y - c011.y),
+                c000.z + d.y * (c010.z - c000.z) + d.z * (c011.z - c010.z) + d.x * (c111.z - c011.z)
+            );
+        }
+    } else {
+        if (d.x > d.y) {
+            float3 c001 = lut[prev_r * lutsize2 + prev_g * lutsize + next_b];
+            float3 c101 = lut[next_r * lutsize2 + prev_g * lutsize + next_b];
+            c = make_float3(
+                c000.x + d.z * (c001.x - c000.x) + d.x * (c101.x - c001.x) + d.y * (c111.x - c101.x),
+                c000.y + d.z * (c001.y - c000.y) + d.x * (c101.y - c001.y) + d.y * (c111.y - c101.y),
+                c000.z + d.z * (c001.z - c000.z) + d.x * (c101.z - c001.z) + d.y * (c111.z - c101.z)
+            );
+        } else {
+            float3 c001 = lut[prev_r * lutsize2 + prev_g * lutsize + next_b];
+            float3 c011 = lut[prev_r * lutsize2 + next_g * lutsize + next_b];
+            c = make_float3(
+                c000.x + d.z * (c001.x - c000.x) + d.y * (c011.x - c001.x) + d.x * (c111.x - c011.x),
+                c000.y + d.z * (c001.y - c000.y) + d.y * (c011.y - c001.y) + d.x * (c111.y - c011.y),
+                c000.z + d.z * (c001.z - c000.z) + d.y * (c011.z - c001.z) + d.x * (c111.z - c011.z)
+            );
+        }
+    }
+    
+    return c;
+}
+
+__global__ void lut3d_interp_8_nearest(
+    cudaTextureObject_t src_tex,
+    uchar4 *dst, int dst_width, int dst_height, int dst_pitch,
+    float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= dst_width || y >= dst_height)
+        return;
+    
+    uchar4 src = tex2D<uchar4>(src_tex, x, y);
+    
+    float3 s = make_float3(
+        src.x / 255.0f * scale_r,
+        src.y / 255.0f * scale_g,
+        src.z / 255.0f * scale_b
+    );
+    
+    float3 result = interp_nearest_cuda(lut, lutsize, s);
+    
+    dst[y * (dst_pitch / sizeof(uchar4)) + x] = make_uchar4(
+        (unsigned char)(result.x * 255.0f + 0.5f),
+        (unsigned char)(result.y * 255.0f + 0.5f),
+        (unsigned char)(result.z * 255.0f + 0.5f),
+        src.w
+    );
+}
+
+__global__ void lut3d_interp_10_nearest(
+    cudaTextureObject_t src_tex,
+    ushort4 *dst, int dst_width, int dst_height, int dst_pitch,
+    float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= dst_width || y >= dst_height)
+        return;
+    
+    ushort4 src = tex2D<ushort4>(src_tex, x, y);
+    
+    float3 s = make_float3(
+        (src.x >> 6) * scale_r,  // Convert 10-bit to normalized range
+        (src.y >> 6) * scale_g,
+        (src.z >> 6) * scale_b
+    );
+    
+    float3 result = interp_nearest_cuda(lut, lutsize, s);
+    
+    dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4(
+        (unsigned short)(result.x * 1023.0f + 0.5f) << 6,  // Convert back to 10-bit in 16-bit container
+        (unsigned short)(result.y * 1023.0f + 0.5f) << 6,
+        (unsigned short)(result.z * 1023.0f + 0.5f) << 6,
+        src.w
+    );
+}
+
+__global__ void lut3d_interp_12_nearest(
+    cudaTextureObject_t src_tex,
+    ushort4 *dst, int dst_width, int dst_height, int dst_pitch,
+    float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= dst_width || y >= dst_height)
+        return;
+    
+    ushort4 src = tex2D<ushort4>(src_tex, x, y);
+    
+    float3 s = make_float3(
+        (src.x >> 4) * scale_r,  // Convert 12-bit to normalized range
+        (src.y >> 4) * scale_g,
+        (src.z >> 4) * scale_b
+    );
+    
+    float3 result = interp_nearest_cuda(lut, lutsize, s);
+    
+    dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4(
+        (unsigned short)(result.x * 4095.0f + 0.5f) << 4,  // Convert back to 12-bit in 16-bit container
+        (unsigned short)(result.y * 4095.0f + 0.5f) << 4,
+        (unsigned short)(result.z * 4095.0f + 0.5f) << 4,
+        src.w
+    );
+}
+
+__global__ void lut3d_interp_8_trilinear(
+    cudaTextureObject_t src_tex,
+    uchar4 *dst, int dst_width, int dst_height, int dst_pitch,
+    float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= dst_width || y >= dst_height)
+        return;
+    
+    uchar4 src = tex2D<uchar4>(src_tex, x, y);
+    
+    float3 s = make_float3(
+        src.x / 255.0f * scale_r,
+        src.y / 255.0f * scale_g,
+        src.z / 255.0f * scale_b
+    );
+    
+    float3 result = interp_trilinear_cuda(lut, lutsize, s);
+    
+    dst[y * (dst_pitch / sizeof(uchar4)) + x] = make_uchar4(
+        (unsigned char)(result.x * 255.0f + 0.5f),
+        (unsigned char)(result.y * 255.0f + 0.5f),
+        (unsigned char)(result.z * 255.0f + 0.5f),
+        src.w
+    );
+}
+
+__global__ void lut3d_interp_10_trilinear(
+    cudaTextureObject_t src_tex,
+    ushort4 *dst, int dst_width, int dst_height, int dst_pitch,
+    float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= dst_width || y >= dst_height)
+        return;
+    
+    ushort4 src = tex2D<ushort4>(src_tex, x, y);
+    
+    float3 s = make_float3(
+        (src.x >> 6) / 1023.0f * scale_r,
+        (src.y >> 6) / 1023.0f * scale_g,
+        (src.z >> 6) / 1023.0f * scale_b
+    );
+    
+    float3 result = interp_trilinear_cuda(lut, lutsize, s);
+    
+    dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4(
+        (unsigned short)(result.x * 1023.0f + 0.5f) << 6,
+        (unsigned short)(result.y * 1023.0f + 0.5f) << 6,
+        (unsigned short)(result.z * 1023.0f + 0.5f) << 6,
+        src.w
+    );
+}
+
+__global__ void lut3d_interp_12_trilinear(
+    cudaTextureObject_t src_tex,
+    ushort4 *dst, int dst_width, int dst_height, int dst_pitch,
+    float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= dst_width || y >= dst_height)
+        return;
+    
+    ushort4 src = tex2D<ushort4>(src_tex, x, y);
+    
+    float3 s = make_float3(
+        (src.x >> 4) / 4095.0f * scale_r,
+        (src.y >> 4) / 4095.0f * scale_g,
+        (src.z >> 4) / 4095.0f * scale_b
+    );
+    
+    float3 result = interp_trilinear_cuda(lut, lutsize, s);
+    
+    dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4(
+        (unsigned short)(result.x * 4095.0f + 0.5f) << 4,
+        (unsigned short)(result.y * 4095.0f + 0.5f) << 4,
+        (unsigned short)(result.z * 4095.0f + 0.5f) << 4,
+        src.w
+    );
+}
+
+__global__ void lut3d_interp_8_tetrahedral(
+    cudaTextureObject_t src_tex,
+    uchar4 *dst, int dst_width, int dst_height, int dst_pitch,
+    float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= dst_width || y >= dst_height)
+        return;
+    
+    uchar4 src = tex2D<uchar4>(src_tex, x, y);
+    
+    float3 s = make_float3(
+        src.x / 255.0f * scale_r,
+        src.y / 255.0f * scale_g,
+        src.z / 255.0f * scale_b
+    );
+    
+    float3 result = interp_tetrahedral_cuda(lut, lutsize, s);
+    
+    dst[y * (dst_pitch / sizeof(uchar4)) + x] = make_uchar4(
+        (unsigned char)(result.x * 255.0f + 0.5f),
+        (unsigned char)(result.y * 255.0f + 0.5f),
+        (unsigned char)(result.z * 255.0f + 0.5f),
+        src.w
+    );
+}
+
+__global__ void lut3d_interp_10_tetrahedral(
+    cudaTextureObject_t src_tex,
+    ushort4 *dst, int dst_width, int dst_height, int dst_pitch,
+    float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= dst_width || y >= dst_height)
+        return;
+    
+    ushort4 src = tex2D<ushort4>(src_tex, x, y);
+    
+    float3 s = make_float3(
+        (src.x >> 6) / 1023.0f * scale_r,
+        (src.y >> 6) / 1023.0f * scale_g,
+        (src.z >> 6) / 1023.0f * scale_b
+    );
+    
+    float3 result = interp_tetrahedral_cuda(lut, lutsize, s);
+    
+    dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4(
+        (unsigned short)(result.x * 1023.0f + 0.5f) << 6,
+        (unsigned short)(result.y * 1023.0f + 0.5f) << 6,
+        (unsigned short)(result.z * 1023.0f + 0.5f) << 6,
+        src.w
+    );
+}
+
+__global__ void lut3d_interp_12_tetrahedral(
+    cudaTextureObject_t src_tex,
+    ushort4 *dst, int dst_width, int dst_height, int dst_pitch,
+    float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= dst_width || y >= dst_height)
+        return;
+    
+    ushort4 src = tex2D<ushort4>(src_tex, x, y);
+    
+    float3 s = make_float3(
+        (src.x >> 4) / 4095.0f * scale_r,
+        (src.y >> 4) / 4095.0f * scale_g,
+        (src.z >> 4) / 4095.0f * scale_b
+    );
+    
+    float3 result = interp_tetrahedral_cuda(lut, lutsize, s);
+    
+    dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4(
+        (unsigned short)(result.x * 4095.0f + 0.5f) << 4,
+        (unsigned short)(result.y * 4095.0f + 0.5f) << 4,
+        (unsigned short)(result.z * 4095.0f + 0.5f) << 4,
+        src.w
+    );
+}
+
+}
\ No newline at end of file
diff --git a/libavfilter/vf_lut3d_cuda.h b/libavfilter/vf_lut3d_cuda.h
new file mode 100644
index 0000000000000..236f338c8cc8d
--- /dev/null
+++ b/libavfilter/vf_lut3d_cuda.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef AVFILTER_LUT3D_CUDA_H
+#define AVFILTER_LUT3D_CUDA_H
+
+#endif
\ No newline at end of file
diff --git a/libavfilter/vf_tonemap_cuda.c b/libavfilter/vf_tonemap_cuda.c
new file mode 100644
index 0000000000000..370e364ad9779
--- /dev/null
+++ b/libavfilter/vf_tonemap_cuda.c
@@ -0,0 +1,637 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <float.h>
+#include <stdio.h>
+
+#include "libavutil/common.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/internal.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/csp.h"
+
+#include "avfilter.h"
+#include "filters.h"
+#include "video.h"
+
+#include "cuda/load_helper.h"
+#include "vf_tonemap_cuda.h"
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_GBRPF32,
+    AV_PIX_FMT_RGB48,
+    AV_PIX_FMT_RGBA64,
+    AV_PIX_FMT_P010LE,
+    AV_PIX_FMT_P016LE,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV420P10LE,
+    AV_PIX_FMT_YUV420P16LE,
+    AV_PIX_FMT_NV12,
+};
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define BLOCKX 32
+#define BLOCKY 16
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+enum TonemapAlgorithm {
+    TONEMAP_NONE,
+    TONEMAP_LINEAR,
+    TONEMAP_GAMMA,
+    TONEMAP_CLIP,
+    TONEMAP_REINHARD,
+    TONEMAP_HABLE,
+    TONEMAP_MOBIUS,
+    TONEMAP_MAX,
+};
+
+typedef struct TonemapCudaContext {
+    const AVClass *class;
+
+    AVCUDADeviceContext *hwctx;
+
+    enum AVPixelFormat in_fmt, out_fmt;
+    const AVPixFmtDescriptor *in_desc, *out_desc;
+
+    AVBufferRef *frames_ctx;
+    AVFrame     *frame;
+    AVFrame *tmp_frame;
+
+    CUcontext   cu_ctx;
+    CUmodule    cu_module;
+    CUfunction  cu_func;
+    CUstream    cu_stream;
+
+
+    enum TonemapAlgorithm tonemap;
+    double param;
+    double desat;
+    double peak;
+
+    const AVLumaCoefficients *coeffs;
+    int passthrough;
+} TonemapCudaContext;
+
+static int format_is_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+static av_cold int tonemap_cuda_init(AVFilterContext *ctx)
+{
+    TonemapCudaContext *s = ctx->priv;
+
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
+    s->tmp_frame = av_frame_alloc();
+    if (!s->tmp_frame)
+        return AVERROR(ENOMEM);
+
+
+    switch(s->tonemap) {
+    case TONEMAP_GAMMA:
+        if (isnan(s->param))
+            s->param = 1.8f;
+        break;
+    case TONEMAP_REINHARD:
+        if (!isnan(s->param))
+            s->param = (1.0f - s->param) / s->param;
+        break;
+    case TONEMAP_MOBIUS:
+        if (isnan(s->param))
+            s->param = 0.3f;
+        break;
+    }
+
+    if (isnan(s->param))
+        s->param = 1.0f;
+
+    return 0;
+}
+
+static av_cold void tonemap_cuda_uninit(AVFilterContext *ctx)
+{
+    TonemapCudaContext *s = ctx->priv;
+
+
+    if (s->hwctx && s->cu_module) {
+        CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+        CUcontext dummy;
+
+        CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+        CHECK_CU(cu->cuModuleUnload(s->cu_module));
+        s->cu_module = NULL;
+        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    }
+
+    av_frame_free(&s->frame);
+    av_buffer_unref(&s->frames_ctx);
+    av_frame_free(&s->tmp_frame);
+}
+
+static av_cold int init_hwframe_ctx(TonemapCudaContext *s, AVBufferRef *device_ctx, int width, int height)
+{
+    AVBufferRef *out_ref = NULL;
+    AVHWFramesContext *out_ctx;
+    int ret;
+
+    out_ref = av_hwframe_ctx_alloc(device_ctx);
+    if (!out_ref)
+        return AVERROR(ENOMEM);
+    out_ctx = (AVHWFramesContext*)out_ref->data;
+
+    out_ctx->format    = AV_PIX_FMT_CUDA;
+    out_ctx->sw_format = s->out_fmt;
+    out_ctx->width     = FFALIGN(width,  32);
+    out_ctx->height    = FFALIGN(height, 32);
+
+    ret = av_hwframe_ctx_init(out_ref);
+    if (ret < 0)
+        goto fail;
+
+    av_frame_unref(s->frame);
+    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);
+    if (ret < 0)
+        goto fail;
+
+    s->frame->width  = width;
+    s->frame->height = height;
+
+    av_buffer_unref(&s->frames_ctx);
+    s->frames_ctx = out_ref;
+
+    return 0;
+fail:
+    av_buffer_unref(&out_ref);
+    return ret;
+}
+
+static av_cold int tonemap_cuda_load_functions(AVFilterContext *ctx)
+{
+    TonemapCudaContext *s = ctx->priv;
+    CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    char function_name[128];
+    int ret;
+
+    extern const unsigned char ff_vf_tonemap_cuda_ptx_data[];
+    extern const unsigned int ff_vf_tonemap_cuda_ptx_len;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module,
+                              ff_vf_tonemap_cuda_ptx_data, ff_vf_tonemap_cuda_ptx_len);
+    if (ret < 0)
+        goto fail;
+
+    if (s->in_fmt == AV_PIX_FMT_GBRPF32) {
+        snprintf(function_name, sizeof(function_name), "tonemap_cuda_planar_float");
+    } else if ((s->in_fmt == AV_PIX_FMT_P010LE || s->in_fmt == AV_PIX_FMT_P016LE) && s->out_fmt == AV_PIX_FMT_NV12) {
+        snprintf(function_name, sizeof(function_name), "tonemap_cuda_p016_to_nv12");
+    } else if (s->in_fmt == AV_PIX_FMT_YUV420P10LE || s->in_fmt == AV_PIX_FMT_YUV420P16LE) {
+        snprintf(function_name, sizeof(function_name), "tonemap_cuda_yuv420p10");
+    } else {
+        snprintf(function_name, sizeof(function_name), "tonemap_cuda_16bit");
+    }
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func, s->cu_module, function_name));
+    if (ret < 0) {
+        av_log(ctx, AV_LOG_FATAL, "Failed to load CUDA function %s\n", function_name);
+        goto fail;
+    }
+
+fail:
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+
+    return ret;
+}
+
+static av_cold int init_processing_chain(AVFilterContext *ctx, int in_width, int in_height,
+                                         int out_width, int out_height)
+{
+    TonemapCudaContext *s = ctx->priv;
+    FilterLink     *inl = ff_filter_link(ctx->inputs[0]);
+    FilterLink    *outl = ff_filter_link(ctx->outputs[0]);
+
+    AVHWFramesContext *in_frames_ctx;
+    enum AVPixelFormat in_format;
+    enum AVPixelFormat out_format;
+    int ret;
+
+    if (!inl->hw_frames_ctx) {
+        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
+        return AVERROR(EINVAL);
+    }
+    in_frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data;
+    in_format     = in_frames_ctx->sw_format;
+    // Convert HDR to SDR format for encoder compatibility
+    if (in_format == AV_PIX_FMT_P010LE || in_format == AV_PIX_FMT_P016LE) {
+        out_format = AV_PIX_FMT_NV12;
+    } else if (in_format == AV_PIX_FMT_YUV420P10LE || in_format == AV_PIX_FMT_YUV420P16LE) {
+        out_format = AV_PIX_FMT_YUV420P;
+    } else {
+        out_format = in_format;
+    }
+
+    if (!format_is_supported(in_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
+               av_get_pix_fmt_name(in_format));
+        return AVERROR(ENOSYS);
+    }
+    if (!format_is_supported(out_format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
+               av_get_pix_fmt_name(out_format));
+        return AVERROR(ENOSYS);
+    }
+
+    s->in_fmt = in_format;
+    s->out_fmt = out_format;
+    s->in_desc  = av_pix_fmt_desc_get(s->in_fmt);
+    s->out_desc = av_pix_fmt_desc_get(s->out_fmt);
+
+    if (in_width == out_width && in_height == out_height && in_format == out_format) {
+        s->frames_ctx = av_buffer_ref(inl->hw_frames_ctx);
+        if (!s->frames_ctx)
+            return AVERROR(ENOMEM);
+    } else {
+        s->passthrough = 0;
+
+        ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, out_width, out_height);
+        if (ret < 0)
+            return ret;
+    }
+
+    outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
+    if (!outl->hw_frames_ctx)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold int tonemap_cuda_config_props(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    AVFilterLink *inlink = outlink->src->inputs[0];
+    FilterLink      *inl = ff_filter_link(inlink);
+    TonemapCudaContext *s  = ctx->priv;
+    AVHWFramesContext     *frames_ctx;
+    AVCUDADeviceContext *device_hwctx;
+    int ret;
+
+    outlink->w = inlink->w;
+    outlink->h = inlink->h;
+
+    ret = init_processing_chain(ctx, inlink->w, inlink->h, outlink->w, outlink->h);
+    if (ret < 0)
+        return ret;
+
+    frames_ctx   = (AVHWFramesContext*)inl->hw_frames_ctx->data;
+    device_hwctx = frames_ctx->device_ctx->hwctx;
+
+    s->hwctx = device_hwctx;
+    s->cu_stream = s->hwctx->stream;
+
+    s->coeffs = av_csp_luma_coeffs_from_avcsp(inlink->colorspace);
+
+    av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d fmt:%s -> w:%d h:%d fmt:%s%s\n",
+           inlink->w, inlink->h, av_get_pix_fmt_name(s->in_fmt),
+           outlink->w, outlink->h, av_get_pix_fmt_name(s->out_fmt),
+           s->passthrough ? " (passthrough)" : "");
+
+    ret = tonemap_cuda_load_functions(ctx);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+static int call_tonemap_kernel(AVFilterContext *ctx, AVFrame *out, AVFrame *in)
+{
+    TonemapCudaContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx;
+    CUtexObject tex = 0;
+    int ret;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    if (s->in_fmt == AV_PIX_FMT_GBRPF32) {
+        // Planar float format
+        float coeff_r = av_q2d(s->coeffs->cr);
+        float coeff_g = av_q2d(s->coeffs->cg);
+        float coeff_b = av_q2d(s->coeffs->cb);
+        void *args[] = {
+            &in->data[0], &in->data[1], &in->data[2],
+            &out->data[0], &out->data[1], &out->data[2],
+            &out->width, &out->height, &in->linesize[0], &out->linesize[0],
+            &s->tonemap, &s->param, &s->desat, &s->peak,
+            &coeff_r, &coeff_g, &coeff_b
+        };
+
+        ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func,
+                                           DIV_UP(out->width, BLOCKX), DIV_UP(out->height, BLOCKY), 1,
+                                           BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL));
+    } else if ((s->in_fmt == AV_PIX_FMT_P010LE || s->in_fmt == AV_PIX_FMT_P016LE) && s->out_fmt == AV_PIX_FMT_NV12) {
+        // P016 to NV12 conversion
+        CUDA_TEXTURE_DESC tex_desc_y = {
+            .filterMode = CU_TR_FILTER_MODE_POINT,
+            .flags = CU_TRSF_READ_AS_INTEGER,
+        };
+        
+        CUDA_TEXTURE_DESC tex_desc_uv = {
+            .filterMode = CU_TR_FILTER_MODE_POINT,
+            .flags = CU_TRSF_READ_AS_INTEGER,
+        };
+
+        CUDA_RESOURCE_DESC res_desc_y = {
+            .resType = CU_RESOURCE_TYPE_PITCH2D,
+            .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16,
+            .res.pitch2D.numChannels = 1,
+            .res.pitch2D.width = in->width,
+            .res.pitch2D.height = in->height,
+            .res.pitch2D.pitchInBytes = in->linesize[0],
+            .res.pitch2D.devPtr = (CUdeviceptr)in->data[0],
+        };
+
+        CUDA_RESOURCE_DESC res_desc_uv = {
+            .resType = CU_RESOURCE_TYPE_PITCH2D,
+            .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16,
+            .res.pitch2D.numChannels = 2,
+            .res.pitch2D.width = in->width / 2,
+            .res.pitch2D.height = in->height / 2,
+            .res.pitch2D.pitchInBytes = in->linesize[1],
+            .res.pitch2D.devPtr = (CUdeviceptr)in->data[1],
+        };
+
+        CUtexObject tex_y = 0, tex_uv = 0;
+        ret = CHECK_CU(cu->cuTexObjectCreate(&tex_y, &res_desc_y, &tex_desc_y, NULL));
+        if (ret < 0) goto exit;
+        
+        ret = CHECK_CU(cu->cuTexObjectCreate(&tex_uv, &res_desc_uv, &tex_desc_uv, NULL));
+        if (ret < 0) goto exit;
+
+        float coeff_r = av_q2d(s->coeffs->cr);
+        float coeff_g = av_q2d(s->coeffs->cg);
+        float coeff_b = av_q2d(s->coeffs->cb);
+
+        void *args[] = {
+            &tex_y, &tex_uv,
+            &out->data[0], &out->data[1],
+            &out->width, &out->height, &out->linesize[0], &out->linesize[1],
+            &s->tonemap, &s->param, &s->desat, &s->peak,
+            &coeff_r, &coeff_g, &coeff_b
+        };
+
+        ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func,
+                                           DIV_UP(out->width, BLOCKX), DIV_UP(out->height, BLOCKY), 1,
+                                           BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL));
+        
+        if (tex_y) CHECK_CU(cu->cuTexObjectDestroy(tex_y));
+        if (tex_uv) CHECK_CU(cu->cuTexObjectDestroy(tex_uv));
+    } else if (s->in_fmt == AV_PIX_FMT_YUV420P10LE || s->in_fmt == AV_PIX_FMT_YUV420P16LE) {
+        // YUV format handling
+        CUDA_TEXTURE_DESC tex_desc_y = {
+            .filterMode = CU_TR_FILTER_MODE_POINT,
+            .flags = CU_TRSF_READ_AS_INTEGER,
+        };
+        
+        CUDA_TEXTURE_DESC tex_desc_uv = {
+            .filterMode = CU_TR_FILTER_MODE_POINT,
+            .flags = CU_TRSF_READ_AS_INTEGER,
+        };
+
+        CUDA_RESOURCE_DESC res_desc_y = {
+            .resType = CU_RESOURCE_TYPE_PITCH2D,
+            .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16,
+            .res.pitch2D.numChannels = 1,
+            .res.pitch2D.width = in->width,
+            .res.pitch2D.height = in->height,
+            .res.pitch2D.pitchInBytes = in->linesize[0],
+            .res.pitch2D.devPtr = (CUdeviceptr)in->data[0],
+        };
+
+        CUDA_RESOURCE_DESC res_desc_uv = {
+            .resType = CU_RESOURCE_TYPE_PITCH2D,
+            .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16,
+            .res.pitch2D.numChannels = 2,
+            .res.pitch2D.width = in->width / 2,
+            .res.pitch2D.height = in->height / 2,
+            .res.pitch2D.pitchInBytes = in->linesize[1],
+            .res.pitch2D.devPtr = (CUdeviceptr)in->data[1],
+        };
+
+        CUtexObject tex_y = 0, tex_uv = 0;
+        ret = CHECK_CU(cu->cuTexObjectCreate(&tex_y, &res_desc_y, &tex_desc_y, NULL));
+        if (ret < 0) goto exit;
+        
+        ret = CHECK_CU(cu->cuTexObjectCreate(&tex_uv, &res_desc_uv, &tex_desc_uv, NULL));
+        if (ret < 0) goto exit;
+
+        float coeff_r = av_q2d(s->coeffs->cr);
+        float coeff_g = av_q2d(s->coeffs->cg);
+        float coeff_b = av_q2d(s->coeffs->cb);
+
+        void *args[] = {
+            &tex_y, &tex_uv,
+            &out->data[0], &out->data[1], &out->data[2],
+            &out->width, &out->height, &out->linesize[0], &out->linesize[1],
+            &s->tonemap, &s->param, &s->desat, &s->peak,
+            &coeff_r, &coeff_g, &coeff_b
+        };
+
+        ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func,
+                                           DIV_UP(out->width, BLOCKX), DIV_UP(out->height, BLOCKY), 1,
+                                           BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL));
+        
+        if (tex_y) CHECK_CU(cu->cuTexObjectDestroy(tex_y));
+        if (tex_uv) CHECK_CU(cu->cuTexObjectDestroy(tex_uv));
+    } else {
+        // 16-bit formats with texture
+        CUDA_TEXTURE_DESC tex_desc = {
+            .filterMode = CU_TR_FILTER_MODE_POINT,
+            .flags = CU_TRSF_READ_AS_INTEGER,
+        };
+
+        CUDA_RESOURCE_DESC res_desc = {
+            .resType = CU_RESOURCE_TYPE_PITCH2D,
+            .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16,
+            .res.pitch2D.numChannels = 4,
+            .res.pitch2D.width = in->width,
+            .res.pitch2D.height = in->height,
+            .res.pitch2D.pitchInBytes = in->linesize[0],
+            .res.pitch2D.devPtr = (CUdeviceptr)in->data[0],
+        };
+
+        ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
+        if (ret < 0)
+            goto exit;
+
+        int bit_depth = (s->in_fmt == AV_PIX_FMT_RGB48) ? 16 : 16;
+        float coeff_r = av_q2d(s->coeffs->cr);
+        float coeff_g = av_q2d(s->coeffs->cg);
+        float coeff_b = av_q2d(s->coeffs->cb);
+
+        void *args[] = {
+            &tex,
+            &out->data[0],
+            &out->width, &out->height, &out->linesize[0],
+            &s->tonemap, &s->param, &s->desat, &s->peak,
+            &coeff_r, &coeff_g, &coeff_b,
+            &bit_depth
+        };
+
+        ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func,
+                                           DIV_UP(out->width, BLOCKX), DIV_UP(out->height, BLOCKY), 1,
+                                           BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL));
+    }
+
+exit:
+    if (tex)
+        CHECK_CU(cu->cuTexObjectDestroy(tex));
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+
+    return ret;
+}
+
+
+static int tonemap_cuda_filter_frame(AVFilterLink *link, AVFrame *in)
+{
+    AVFilterContext       *ctx = link->dst;
+    TonemapCudaContext        *s = ctx->priv;
+    AVFilterLink      *outlink = ctx->outputs[0];
+    CudaFunctions          *cu = s->hwctx->internal->cuda_dl;
+
+    AVFrame *out = NULL;
+    CUcontext dummy;
+    int ret = 0;
+
+    if (s->passthrough)
+        return ff_filter_frame(outlink, in);
+
+    out = av_frame_alloc();
+    if (!out) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+    if (ret < 0)
+        goto fail;
+
+    ret = av_hwframe_get_buffer(s->frames_ctx, out, 0);
+    if (ret < 0)
+        goto fail;
+
+    ret = call_tonemap_kernel(ctx, out, in);
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    if (ret < 0)
+        goto fail;
+
+    ret = av_frame_copy_props(out, in);
+    if (ret < 0)
+        goto fail;
+
+    av_frame_free(&in);
+    return ff_filter_frame(outlink, out);
+
+fail:
+    av_frame_free(&in);
+    av_frame_free(&out);
+    return ret;
+}
+
+#define OFFSET(x) offsetof(TonemapCudaContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
+
+static const AVOption tonemap_cuda_options[] = {
+    { "tonemap",      "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64=TONEMAP_NONE}, 0, TONEMAP_MAX-1, FLAGS, .unit = "tonemap" },
+        { "none",     0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_NONE},     0, 0, FLAGS, .unit = "tonemap" },
+        { "linear",   0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_LINEAR},   0, 0, FLAGS, .unit = "tonemap" },
+        { "gamma",    0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_GAMMA},    0, 0, FLAGS, .unit = "tonemap" },
+        { "clip",     0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_CLIP},     0, 0, FLAGS, .unit = "tonemap" },
+        { "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_REINHARD}, 0, 0, FLAGS, .unit = "tonemap" },
+        { "hable",    0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_HABLE},    0, 0, FLAGS, .unit = "tonemap" },
+        { "mobius",   0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_MOBIUS},   0, 0, FLAGS, .unit = "tonemap" },
+    { "param",        "tonemap parameter", OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl=NAN}, DBL_MIN, DBL_MAX, FLAGS },
+    { "desat",        "desaturation strength", OFFSET(desat), AV_OPT_TYPE_DOUBLE, {.dbl=2}, 0, DBL_MAX, FLAGS },
+    { "peak",         "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS },
+    { "passthrough",  "Do not process frames at all if parameters match", OFFSET(passthrough), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS },
+    { NULL },
+};
+
+static const AVClass tonemap_cuda_class = {
+    .class_name = "tonemap_cuda",
+    .item_name  = av_default_item_name,
+    .option     = tonemap_cuda_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVFilterPad tonemap_cuda_inputs[] = {
+    {
+        .name        = "default",
+        .type        = AVMEDIA_TYPE_VIDEO,
+        .filter_frame = tonemap_cuda_filter_frame,
+    },
+};
+
+static const AVFilterPad tonemap_cuda_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = tonemap_cuda_config_props,
+    },
+};
+
+const FFFilter ff_vf_tonemap_cuda = {
+    .p.name        = "tonemap_cuda",
+    .p.description = NULL_IF_CONFIG_SMALL("GPU accelerated HDR to SDR tonemap filter"),
+
+    .p.priv_class  = &tonemap_cuda_class,
+
+    .init          = tonemap_cuda_init,
+    .uninit        = tonemap_cuda_uninit,
+
+    .priv_size = sizeof(TonemapCudaContext),
+
+    FILTER_INPUTS(tonemap_cuda_inputs),
+    FILTER_OUTPUTS(tonemap_cuda_outputs),
+
+    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
+
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
\ No newline at end of file
diff --git a/libavfilter/vf_tonemap_cuda.cu b/libavfilter/vf_tonemap_cuda.cu
new file mode 100644
index 0000000000000..3338e5d17d675
--- /dev/null
+++ b/libavfilter/vf_tonemap_cuda.cu
@@ -0,0 +1,308 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "cuda/vector_helpers.cuh"
+
+#define BLOCKX 32
+#define BLOCKY 16
+
+extern "C" {
+
+enum TonemapAlgorithm {
+    TONEMAP_NONE,
+    TONEMAP_LINEAR,
+    TONEMAP_GAMMA,
+    TONEMAP_CLIP,
+    TONEMAP_REINHARD,
+    TONEMAP_HABLE,
+    TONEMAP_MOBIUS,
+    TONEMAP_MAX,
+};
+
+__device__ static inline float hable_cuda(float in)
+{
+    float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f;
+    return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f;
+}
+
+__device__ static inline float mobius_cuda(float in, float j, float peak)
+{
+    float a, b;
+
+    if (in <= j)
+        return in;
+
+    a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak);
+    b = (j * j - 2.0f * j * peak + peak) / fmaxf(peak - 1.0f, 1e-6f);
+
+    return (b * b + 2.0f * b * j + j * j) / (b - a) * (in + a) / (in + b);
+}
+
+__device__ static inline float apply_tonemap_cuda(int algorithm, float sig, float param, float peak)
+{
+    switch (algorithm) {
+    case TONEMAP_NONE:
+        return sig;
+    case TONEMAP_LINEAR:
+        return sig * param / peak;
+    case TONEMAP_GAMMA:
+        return sig > 0.05f ? powf(sig / peak, 1.0f / param)
+                           : sig * powf(0.05f / peak, 1.0f / param) / 0.05f;
+    case TONEMAP_CLIP:
+        return fminf(fmaxf(sig * param, 0.0f), 1.0f);
+    case TONEMAP_HABLE:
+        return hable_cuda(sig) / hable_cuda(peak);
+    case TONEMAP_REINHARD:
+        return sig / (sig + param) * (peak + param) / peak;
+    case TONEMAP_MOBIUS:
+        return mobius_cuda(sig, param, peak);
+    default:
+        return sig;
+    }
+}
+
+__global__ void tonemap_cuda_float(
+    cudaTextureObject_t src_tex,
+    float *dst_r, float *dst_g, float *dst_b,
+    int dst_width, int dst_height, int dst_pitch,
+    int algorithm, float param, float desat, float peak,
+    float coeff_r, float coeff_g, float coeff_b)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= dst_width || y >= dst_height)
+        return;
+    
+    float4 src = tex2D<float4>(src_tex, x, y);
+    float r = src.x;
+    float g = src.y;
+    float b = src.z;
+    
+    // Desaturate to prevent unnatural colors
+    if (desat > 0) {
+        float luma = coeff_r * r + coeff_g * g + coeff_b * b;
+        float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f);
+        r = r * (1.0f - overbright) + luma * overbright;
+        g = g * (1.0f - overbright) + luma * overbright;
+        b = b * (1.0f - overbright) + luma * overbright;
+    }
+    
+    // Pick the brightest component
+    float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f);
+    float sig_orig = sig;
+    
+    // Apply tonemap algorithm
+    sig = apply_tonemap_cuda(algorithm, sig, param, peak);
+    
+    // Apply the computed scale factor to the color
+    float scale = sig / sig_orig;
+    r *= scale;
+    g *= scale;
+    b *= scale;
+    
+    int idx = y * (dst_pitch / sizeof(float)) + x;
+    dst_r[idx] = r;
+    dst_g[idx] = g;
+    dst_b[idx] = b;
+}
+
+__global__ void tonemap_cuda_planar_float(
+    float *src_r, float *src_g, float *src_b,
+    float *dst_r, float *dst_g, float *dst_b,
+    int width, int height, int src_pitch, int dst_pitch,
+    int algorithm, float param, float desat, float peak,
+    float coeff_r, float coeff_g, float coeff_b)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= width || y >= height)
+        return;
+    
+    int src_idx = y * (src_pitch / sizeof(float)) + x;
+    int dst_idx = y * (dst_pitch / sizeof(float)) + x;
+    
+    float r = src_r[src_idx];
+    float g = src_g[src_idx];
+    float b = src_b[src_idx];
+    
+    // Desaturate to prevent unnatural colors
+    if (desat > 0) {
+        float luma = coeff_r * r + coeff_g * g + coeff_b * b;
+        float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f);
+        r = r * (1.0f - overbright) + luma * overbright;
+        g = g * (1.0f - overbright) + luma * overbright;
+        b = b * (1.0f - overbright) + luma * overbright;
+    }
+    
+    // Pick the brightest component
+    float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f);
+    float sig_orig = sig;
+    
+    // Apply tonemap algorithm
+    sig = apply_tonemap_cuda(algorithm, sig, param, peak);
+    
+    // Apply the computed scale factor to the color
+    float scale = sig / sig_orig;
+    r *= scale;
+    g *= scale;
+    b *= scale;
+    
+    dst_r[dst_idx] = r;
+    dst_g[dst_idx] = g;
+    dst_b[dst_idx] = b;
+}
+
+__global__ void tonemap_cuda_16bit(
+    cudaTextureObject_t src_tex,
+    ushort4 *dst, int dst_width, int dst_height, int dst_pitch,
+    int algorithm, float param, float desat, float peak,
+    float coeff_r, float coeff_g, float coeff_b, int bit_depth)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= dst_width || y >= dst_height)
+        return;
+    
+    ushort4 src = tex2D<ushort4>(src_tex, x, y);
+    float max_val = (1 << bit_depth) - 1;
+    
+    float r = src.x / max_val;
+    float g = src.y / max_val;
+    float b = src.z / max_val;
+    
+    // Desaturate to prevent unnatural colors
+    if (desat > 0) {
+        float luma = coeff_r * r + coeff_g * g + coeff_b * b;
+        float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f);
+        r = r * (1.0f - overbright) + luma * overbright;
+        g = g * (1.0f - overbright) + luma * overbright;
+        b = b * (1.0f - overbright) + luma * overbright;
+    }
+    
+    // Pick the brightest component
+    float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f);
+    float sig_orig = sig;
+    
+    // Apply tonemap algorithm
+    sig = apply_tonemap_cuda(algorithm, sig, param, peak);
+    
+    // Apply the computed scale factor to the color
+    float scale = sig / sig_orig;
+    r *= scale;
+    g *= scale;
+    b *= scale;
+    
+    dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4(
+        (unsigned short)(r * max_val + 0.5f),
+        (unsigned short)(g * max_val + 0.5f),
+        (unsigned short)(b * max_val + 0.5f),
+        src.w
+    );
+}
+
+__global__ void tonemap_cuda_p016_to_nv12(
+    cudaTextureObject_t src_y_tex, cudaTextureObject_t src_uv_tex,
+    unsigned char *dst_y, unsigned char *dst_uv,
+    int dst_width, int dst_height, int dst_pitch_y, int dst_pitch_uv,
+    int algorithm, float param, float desat, float peak,
+    float coeff_r, float coeff_g, float coeff_b)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    
+    if (x >= dst_width || y >= dst_height)
+        return;
+    
+    // Sample Y component (16-bit)
+    unsigned short y_val = tex2D<unsigned short>(src_y_tex, x, y);
+    
+    // Sample UV components (subsampled, 16-bit)
+    int uv_x = x / 2;
+    int uv_y = y / 2;
+    ushort2 uv_val = tex2D<ushort2>(src_uv_tex, uv_x, uv_y);
+    
+    // Convert YUV to RGB (BT.2020 for HDR, 10-bit in 16-bit container)
+    float Y = (y_val - 64.0f * 64.0f) / (940.0f * 64.0f); // 10-bit scaling (shifted up by 6 bits in 16-bit)
+    float U = (uv_val.x - 512.0f * 64.0f) / (896.0f * 64.0f);
+    float V = (uv_val.y - 512.0f * 64.0f) / (896.0f * 64.0f);
+    
+    // BT.2020 YUV to RGB conversion
+    float r = Y + 1.7166f * V;
+    float g = Y - 0.1873f * U - 0.6526f * V;
+    float b = Y + 2.0426f * U;
+    
+    r = fmaxf(r, 0.0f);
+    g = fmaxf(g, 0.0f);
+    b = fmaxf(b, 0.0f);
+    
+    // Desaturate to prevent unnatural colors
+    if (desat > 0) {
+        float luma = coeff_r * r + coeff_g * g + coeff_b * b;
+        float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f);
+        r = r * (1.0f - overbright) + luma * overbright;
+        g = g * (1.0f - overbright) + luma * overbright;
+        b = b * (1.0f - overbright) + luma * overbright;
+    }
+    
+    // Pick the brightest component
+    float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f);
+    float sig_orig = sig;
+    
+    // Apply tonemap algorithm
+    sig = apply_tonemap_cuda(algorithm, sig, param, peak);
+    
+    // Apply the computed scale factor to the color
+    float scale = sig / sig_orig;
+    r *= scale;
+    g *= scale;
+    b *= scale;
+    
+    // Convert back to YUV (BT.709 for SDR)
+    float out_Y = 0.2126f * r + 0.7152f * g + 0.0722f * b;
+    float out_U = -0.1146f * r - 0.3854f * g + 0.5000f * b;
+    float out_V = 0.5000f * r - 0.4542f * g - 0.0458f * b;
+    
+    // Scale to 8-bit range
+    unsigned char y_out = (unsigned char)(out_Y * 219.0f + 16.0f + 0.5f);
+    unsigned char u_out = (unsigned char)(out_U * 224.0f + 128.0f + 0.5f);
+    unsigned char v_out = (unsigned char)(out_V * 224.0f + 128.0f + 0.5f);
+    
+    // Clamp to valid range
+    y_out = min(max(y_out, (unsigned char)16), (unsigned char)235);
+    u_out = min(max(u_out, (unsigned char)16), (unsigned char)240);
+    v_out = min(max(v_out, (unsigned char)16), (unsigned char)240);
+    
+    // Write Y component
+    dst_y[y * dst_pitch_y + x] = y_out;
+    
+    // Write UV components in NV12 format (only for even pixels to maintain 4:2:0 subsampling)
+    if (x % 2 == 0 && y % 2 == 0) {
+        int uv_idx = (y / 2) * dst_pitch_uv + (x / 2) * 2;
+        dst_uv[uv_idx] = u_out;
+        dst_uv[uv_idx + 1] = v_out;
+    }
+}
+
+}
\ No newline at end of file
diff --git a/libavfilter/vf_tonemap_cuda.h b/libavfilter/vf_tonemap_cuda.h
new file mode 100644
index 0000000000000..175221c3c78db
--- /dev/null
+++ b/libavfilter/vf_tonemap_cuda.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef AVFILTER_VF_TONEMAP_CUDA_H
+#define AVFILTER_VF_TONEMAP_CUDA_H
+
+#include "avfilter.h"
+
+#endif /* AVFILTER_VF_TONEMAP_CUDA_H */
\ No newline at end of file