diff --git a/configure b/configure index e2e9fc26d8c4d..69a6d75cc582f 100755 --- a/configure +++ b/configure @@ -3355,6 +3355,10 @@ scale_npp_filter_deps="ffnvcodec libnpp" scale2ref_npp_filter_deps="ffnvcodec libnpp" scale_cuda_filter_deps="ffnvcodec" scale_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +lut3d_cuda_filter_deps="ffnvcodec" +lut3d_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +tonemap_cuda_filter_deps="ffnvcodec" +tonemap_cuda_filter_deps_any="cuda_nvcc cuda_llvm" thumbnail_cuda_filter_deps="ffnvcodec" thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm" transpose_npp_filter_deps="ffnvcodec libnpp" diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 97f8f1727203e..6a37adf15a416 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -35,6 +35,7 @@ OBJS-$(CONFIG_QSVVPP) += qsvvpp.o OBJS-$(CONFIG_SCENE_SAD) += scene_sad.o OBJS-$(CONFIG_DNN) += dnn_filter_common.o + # audio filters OBJS-$(CONFIG_AAP_FILTER) += af_aap.o OBJS-$(CONFIG_ABENCH_FILTER) += f_bench.o @@ -379,6 +380,8 @@ OBJS-$(CONFIG_LUT1D_FILTER) += vf_lut3d.o OBJS-$(CONFIG_LUT_FILTER) += vf_lut.o OBJS-$(CONFIG_LUT2_FILTER) += vf_lut2.o framesync.o OBJS-$(CONFIG_LUT3D_FILTER) += vf_lut3d.o framesync.o +OBJS-$(CONFIG_LUT3D_CUDA_FILTER) += vf_lut3d_cuda.o \ + vf_lut3d_cuda.ptx.o cuda/load_helper.o OBJS-$(CONFIG_LUTRGB_FILTER) += vf_lut.o OBJS-$(CONFIG_LUTYUV_FILTER) += vf_lut.o OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += vf_maskedclamp.o framesync.o @@ -530,6 +533,8 @@ OBJS-$(CONFIG_TMEDIAN_FILTER) += vf_xmedian.o framesync.o OBJS-$(CONFIG_TMIDEQUALIZER_FILTER) += vf_tmidequalizer.o OBJS-$(CONFIG_TMIX_FILTER) += vf_mix.o framesync.o OBJS-$(CONFIG_TONEMAP_FILTER) += vf_tonemap.o +OBJS-$(CONFIG_TONEMAP_CUDA_FILTER) += vf_tonemap_cuda.o \ + vf_tonemap_cuda.ptx.o cuda/load_helper.o OBJS-$(CONFIG_TONEMAP_OPENCL_FILTER) += vf_tonemap_opencl.o opencl.o \ opencl/tonemap.o opencl/colorspace_common.o OBJS-$(CONFIG_TONEMAP_VAAPI_FILTER) += vf_tonemap_vaapi.o vaapi_vpp.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 3bc045b28f552..4789bc2bb41ee 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -354,6 +354,7 @@ extern const FFFilter ff_vf_lut; extern const FFFilter ff_vf_lut1d; extern const FFFilter ff_vf_lut2; extern const FFFilter ff_vf_lut3d; +extern const FFFilter ff_vf_lut3d_cuda; extern const FFFilter ff_vf_lutrgb; extern const FFFilter ff_vf_lutyuv; extern const FFFilter ff_vf_maskedclamp; @@ -499,6 +500,7 @@ extern const FFFilter ff_vf_tmedian; extern const FFFilter ff_vf_tmidequalizer; extern const FFFilter ff_vf_tmix; extern const FFFilter ff_vf_tonemap; +extern const FFFilter ff_vf_tonemap_cuda; extern const FFFilter ff_vf_tonemap_opencl; extern const FFFilter ff_vf_tonemap_vaapi; extern const FFFilter ff_vf_tpad; diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c index 2d6036df7423e..d07949fde2c88 100644 --- a/libavfilter/avfiltergraph.c +++ b/libavfilter/avfiltergraph.c @@ -448,6 +448,85 @@ static int formats_declared(AVFilterContext *f) * was made and the negotiation is stuck; * a negative error code if some other error happened */ +static void describe_link_formats_cfg(AVFilterLink *link, AVFilterFormatsConfig *cfg, AVBPrint *bp) +{ + if (!link || !cfg) + return; + + av_bprintf(bp, "formats="); + if (cfg->formats && cfg->formats->nb_formats > 0) { + av_bprintf(bp, "["); + for (unsigned i = 0; i < cfg->formats->nb_formats; i++) { + if (i > 0) av_bprintf(bp, ", "); + if (link->type == AVMEDIA_TYPE_VIDEO) { + av_bprintf(bp, "%s", av_get_pix_fmt_name(cfg->formats->formats[i])); + } else if (link->type == AVMEDIA_TYPE_AUDIO) { + av_bprintf(bp, "%s", av_get_sample_fmt_name(cfg->formats->formats[i])); + } else { + av_bprintf(bp, "%d", cfg->formats->formats[i]); + } + } + av_bprintf(bp, "]"); + } else { + av_bprintf(bp, "none"); + } + + if (link->type == AVMEDIA_TYPE_AUDIO) { + av_bprintf(bp, ", sample_rates="); + if (cfg->samplerates && cfg->samplerates->nb_formats > 0) { + av_bprintf(bp, "["); + for (unsigned i = 0; i < cfg->samplerates->nb_formats; i++) { + if (i > 0) av_bprintf(bp, ", "); + av_bprintf(bp, "%d", cfg->samplerates->formats[i]); + } + av_bprintf(bp, "]"); + } else { + av_bprintf(bp, "none"); + } + + av_bprintf(bp, ", channel_layouts="); + if (cfg->channel_layouts && cfg->channel_layouts->nb_channel_layouts > 0) { + av_bprintf(bp, "["); + for (int i = 0; i < cfg->channel_layouts->nb_channel_layouts; i++) { + if (i > 0) av_bprintf(bp, ", "); + char buf[256]; + av_channel_layout_describe(&cfg->channel_layouts->channel_layouts[i], buf, sizeof(buf)); + av_bprintf(bp, "%s", buf); + } + av_bprintf(bp, "]"); + } else if (cfg->channel_layouts && cfg->channel_layouts->all_layouts) { + av_bprintf(bp, "all"); + } else { + av_bprintf(bp, "none"); + } + } else if (link->type == AVMEDIA_TYPE_VIDEO) { + av_bprintf(bp, ", color_spaces="); + if (cfg->color_spaces && cfg->color_spaces->nb_formats > 0) { + av_bprintf(bp, "["); + for (unsigned i = 0; i < cfg->color_spaces->nb_formats; i++) { + if (i > 0) av_bprintf(bp, ", "); + av_bprintf(bp, "%s", av_color_space_name(cfg->color_spaces->formats[i])); + } + av_bprintf(bp, "]"); + } else { + av_bprintf(bp, "none"); + } + + av_bprintf(bp, ", color_ranges="); + if (cfg->color_ranges && cfg->color_ranges->nb_formats > 0) { + av_bprintf(bp, "["); + for (unsigned i = 0; i < cfg->color_ranges->nb_formats; i++) { + if (i > 0) av_bprintf(bp, ", "); + av_bprintf(bp, "%s", av_color_range_name(cfg->color_ranges->formats[i])); + } + av_bprintf(bp, "]"); + } else { + av_bprintf(bp, "none"); + } + } +} + + static int query_formats(AVFilterGraph *graph, void *log_ctx) { int i, j, ret; @@ -526,6 +605,26 @@ static int query_formats(AVFilterGraph *graph, void *log_ctx) } /* couldn't merge format lists. auto-insert conversion filter */ + { + AVBPrint src_formats, dst_formats; + av_bprint_init(&src_formats, 0, AV_BPRINT_SIZE_AUTOMATIC); + av_bprint_init(&dst_formats, 0, AV_BPRINT_SIZE_AUTOMATIC); + + describe_link_formats_cfg(link, &link->outcfg, &src_formats); + describe_link_formats_cfg(link, &link->incfg, &dst_formats); + + av_log(log_ctx, AV_LOG_INFO, + "Auto-inserting filter '%s' between filters '%s' and '%s'\n" + "Source link supports: %s\n" + "Destination link supports: %s\n", + neg->conversion_filter, link->src->name, link->dst->name, + src_formats.str ? src_formats.str : "unknown", + dst_formats.str ? dst_formats.str : "unknown"); + + av_bprint_finalize(&src_formats, NULL); + av_bprint_finalize(&dst_formats, NULL); + } + if (!(filter = avfilter_get_by_name(neg->conversion_filter))) { av_log(log_ctx, AV_LOG_ERROR, "'%s' filter not present, cannot convert formats.\n", @@ -578,9 +677,26 @@ static int query_formats(AVFilterGraph *graph, void *log_ctx) (ret = MERGE(m, outlink)) <= 0) { if (ret < 0) return ret; - av_log(log_ctx, AV_LOG_ERROR, - "Impossible to convert between the formats supported by the filter " - "'%s' and the filter '%s'\n", link->src->name, link->dst->name); + { + AVBPrint inlink_formats, outlink_formats; + av_bprint_init(&inlink_formats, 0, AV_BPRINT_SIZE_AUTOMATIC); + av_bprint_init(&outlink_formats, 0, AV_BPRINT_SIZE_AUTOMATIC); + + describe_link_formats_cfg(inlink, &inlink->outcfg, &inlink_formats); + describe_link_formats_cfg(outlink, &outlink->incfg, &outlink_formats); + + av_log(log_ctx, AV_LOG_ERROR, + "Impossible to convert between the formats supported by the filter " + "'%s' and the filter '%s'\n" + "Input link %s supports: %s\n" + "Output link %s supports: %s\n", + link->src->name, link->dst->name, + link->src->name, inlink_formats.str ? inlink_formats.str : "unknown", + link->dst->name, outlink_formats.str ? outlink_formats.str : "unknown"); + + av_bprint_finalize(&inlink_formats, NULL); + av_bprint_finalize(&outlink_formats, NULL); + } return AVERROR(ENOSYS); } } diff --git a/libavfilter/vf_colorspace_cuda.c b/libavfilter/vf_colorspace_cuda.c index 54d6228cd1bb4..8ec0f24996998 100644 --- a/libavfilter/vf_colorspace_cuda.c +++ b/libavfilter/vf_colorspace_cuda.c @@ -32,6 +32,7 @@ #include "avfilter.h" #include "filters.h" +#include "colorspace.h" #include "cuda/load_helper.h" @@ -59,13 +60,57 @@ typedef struct CUDAColorspaceContext { CUstream cu_stream; CUmodule cu_module; CUfunction cu_convert[AVCOL_RANGE_NB]; + CUfunction cu_convert_colorspace; + enum AVPixelFormat pix_fmt; enum AVColorRange range; + enum AVColorSpace in_csp, out_csp, user_csp; + enum AVColorPrimaries in_prm, out_prm, user_prm; + enum AVColorTransferCharacteristic in_trc, out_trc, user_trc; + + // Conversion matrices for YUV-to-YUV colorspace conversion + float yuv2yuv_matrix[3][3]; + float yuv_offset[2]; // input and output offsets + int colorspace_conversion_needed; int num_planes; } CUDAColorspaceContext; +static int calculate_yuv2yuv_matrix(CUDAColorspaceContext* s) +{ + const AVLumaCoefficients *in_lumacoef, *out_lumacoef; + double yuv2rgb[3][3], rgb2yuv[3][3], yuv2yuv[3][3]; + int i, j; + + // Get luma coefficients for input and output colorspaces + in_lumacoef = av_csp_luma_coeffs_from_avcsp(s->in_csp); + out_lumacoef = av_csp_luma_coeffs_from_avcsp(s->out_csp); + + if (!in_lumacoef || !out_lumacoef) { + return AVERROR(EINVAL); + } + + // Calculate YUV to RGB matrix for input colorspace + ff_fill_rgb2yuv_table(in_lumacoef, rgb2yuv); + ff_matrix_invert_3x3(rgb2yuv, yuv2rgb); + + // Calculate RGB to YUV matrix for output colorspace + ff_fill_rgb2yuv_table(out_lumacoef, rgb2yuv); + + // Combine: YUV_in -> RGB -> YUV_out + ff_matrix_mul_3x3(yuv2yuv, yuv2rgb, rgb2yuv); + + // Convert to float for CUDA kernel + for (i = 0; i < 3; i++) { + for (j = 0; j < 3; j++) { + s->yuv2yuv_matrix[i][j] = (float)yuv2yuv[i][j]; + } + } + + return 0; +} + static av_cold int cudacolorspace_init(AVFilterContext* ctx) { CUDAColorspaceContext* s = ctx->priv; @@ -78,6 +123,7 @@ static av_cold int cudacolorspace_init(AVFilterContext* ctx) if (!s->tmp_frame) return AVERROR(ENOMEM); + return 0; } @@ -85,6 +131,7 @@ static av_cold void cudacolorspace_uninit(AVFilterContext* ctx) { CUDAColorspaceContext* s = ctx->priv; + if (s->hwctx && s->cu_module) { CudaFunctions* cu = s->hwctx->internal->cuda_dl; CUcontext dummy; @@ -172,13 +219,29 @@ static av_cold int init_processing_chain(AVFilterContext* ctx, int width, return AVERROR(EINVAL); } - if ((AVCOL_RANGE_MPEG != s->range) && (AVCOL_RANGE_JPEG != s->range)) { + if ((AVCOL_RANGE_MPEG != s->range) && (AVCOL_RANGE_JPEG != s->range) && + (AVCOL_RANGE_UNSPECIFIED != s->range)) { av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n"); return AVERROR(EINVAL); } s->num_planes = av_pix_fmt_count_planes(s->pix_fmt); + // Determine if colorspace conversion is needed + s->colorspace_conversion_needed = (s->in_csp != AVCOL_SPC_UNSPECIFIED && + s->out_csp != AVCOL_SPC_UNSPECIFIED && + s->in_csp != s->out_csp); + + if (s->colorspace_conversion_needed) { + ret = calculate_yuv2yuv_matrix(s); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, "Failed to calculate colorspace conversion matrix\n"); + return ret; + } + av_log(ctx, AV_LOG_INFO, "Colorspace conversion: %s -> %s\n", + av_color_space_name(s->in_csp), av_color_space_name(s->out_csp)); + } + ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, width, height); if (ret < 0) return ret; @@ -218,6 +281,10 @@ static av_cold int cudacolorspace_load_functions(AVFilterContext* ctx) if (ret < 0) goto fail; + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_convert_colorspace, s->cu_module, "colorspace_convert_cuda")); + if (ret < 0) + goto fail; + fail: CHECK_CU(cu->cuCtxPopCurrent(&dummy)); return ret; @@ -236,6 +303,19 @@ static av_cold int cudacolorspace_config_props(AVFilterLink* outlink) outlink->w = inlink->w; outlink->h = inlink->h; + // Set defaults - actual colorspace will be determined from frame properties + s->in_csp = inlink->colorspace != AVCOL_SPC_UNSPECIFIED ? inlink->colorspace : AVCOL_SPC_BT709; + s->in_prm = AVCOL_PRI_BT709; // Default, will be updated from frame + s->in_trc = AVCOL_TRC_BT709; // Default, will be updated from frame + + // Set output colorspace properties + s->out_csp = s->user_csp != AVCOL_SPC_UNSPECIFIED ? s->user_csp : s->in_csp; + s->out_prm = s->user_prm != AVCOL_PRI_UNSPECIFIED ? s->user_prm : s->in_prm; + s->out_trc = s->user_trc != AVCOL_TRC_UNSPECIFIED ? s->user_trc : s->in_trc; + + // Set output link properties + outlink->colorspace = s->out_csp; + ret = init_processing_chain(ctx, inlink->w, inlink->h); if (ret < 0) return ret; @@ -272,45 +352,108 @@ static int conv_cuda_convert(AVFilterContext* ctx, AVFrame* out, AVFrame* in) if (ret < 0) return ret; - out->color_range = s->range; - - for (int i = 0; i < s->num_planes; i++) { - int width = in->width, height = in->height, comp_id = (i > 0); - - switch (s->pix_fmt) { - case AV_PIX_FMT_YUV444P: - break; - case AV_PIX_FMT_YUV420P: - width = comp_id ? in->width / 2 : in->width; - /* fall-through */ - case AV_PIX_FMT_NV12: - height = comp_id ? in->height / 2 : in->height; - break; - default: - av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n", - av_get_pix_fmt_name(s->pix_fmt)); - return AVERROR(EINVAL); - } - - if (!s->cu_convert[out->color_range]) { - av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n"); - return AVERROR(EINVAL); + // Update input colorspace properties from frame + s->in_csp = in->colorspace != AVCOL_SPC_UNSPECIFIED ? in->colorspace : s->in_csp; + s->in_prm = in->color_primaries != AVCOL_PRI_UNSPECIFIED ? in->color_primaries : s->in_prm; + s->in_trc = in->color_trc != AVCOL_TRC_UNSPECIFIED ? in->color_trc : s->in_trc; + + // Update output colorspace if user didn't specify + if (s->user_csp == AVCOL_SPC_UNSPECIFIED) s->out_csp = s->in_csp; + if (s->user_prm == AVCOL_PRI_UNSPECIFIED) s->out_prm = s->in_prm; + if (s->user_trc == AVCOL_TRC_UNSPECIFIED) s->out_trc = s->in_trc; + + // Check if we need to recalculate matrix + int need_recalc = (s->colorspace_conversion_needed && + (s->in_csp != s->out_csp)); + + if (need_recalc) { + ret = calculate_yuv2yuv_matrix(s); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, "Failed to recalculate colorspace conversion matrix\n"); + goto fail; } + } - if (in->color_range != out->color_range) { - void* args[] = {&in->data[i], &out->data[i], &in->linesize[i], - &comp_id}; + // Set output frame properties + out->color_range = s->range != AVCOL_RANGE_UNSPECIFIED ? s->range : in->color_range; + out->colorspace = s->out_csp; + out->color_primaries = s->out_prm; + out->color_trc = s->out_trc; + + // If only colorspace conversion is needed (no range conversion) + if (s->colorspace_conversion_needed && (in->color_range == out->color_range || s->range == AVCOL_RANGE_UNSPECIFIED)) { + for (int i = 0; i < s->num_planes; i++) { + int width = in->width, height = in->height; + + switch (s->pix_fmt) { + case AV_PIX_FMT_YUV444P: + break; + case AV_PIX_FMT_YUV420P: + width = (i > 0) ? in->width / 2 : in->width; + /* fall-through */ + case AV_PIX_FMT_NV12: + height = (i > 0) ? in->height / 2 : in->height; + break; + default: + av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n", + av_get_pix_fmt_name(s->pix_fmt)); + return AVERROR(EINVAL); + } + + void* args[] = {&in->data[i], &out->data[i], &in->linesize[i], &out->linesize[i], + &width, &height, &i, + &s->yuv2yuv_matrix[0][0], &s->yuv2yuv_matrix[0][1], &s->yuv2yuv_matrix[0][2], + &s->yuv2yuv_matrix[1][0], &s->yuv2yuv_matrix[1][1], &s->yuv2yuv_matrix[1][2], + &s->yuv2yuv_matrix[2][0], &s->yuv2yuv_matrix[2][1], &s->yuv2yuv_matrix[2][2]}; ret = CHECK_CU(cu->cuLaunchKernel( - s->cu_convert[out->color_range], DIV_UP(width, BLOCKX), + s->cu_convert_colorspace, DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); - } else { - ret = av_hwframe_transfer_data(out, in, 0); if (ret < 0) - return ret; + goto fail; + } + } else { + // Handle range conversion or passthrough + for (int i = 0; i < s->num_planes; i++) { + int width = in->width, height = in->height, comp_id = (i > 0); + + switch (s->pix_fmt) { + case AV_PIX_FMT_YUV444P: + break; + case AV_PIX_FMT_YUV420P: + width = comp_id ? in->width / 2 : in->width; + /* fall-through */ + case AV_PIX_FMT_NV12: + height = comp_id ? in->height / 2 : in->height; + break; + default: + av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n", + av_get_pix_fmt_name(s->pix_fmt)); + return AVERROR(EINVAL); + } + + if (s->range != AVCOL_RANGE_UNSPECIFIED && in->color_range != out->color_range) { + if (!s->cu_convert[out->color_range]) { + av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n"); + return AVERROR(EINVAL); + } + + void* args[] = {&in->data[i], &out->data[i], &in->linesize[i], &comp_id}; + ret = CHECK_CU(cu->cuLaunchKernel( + s->cu_convert[out->color_range], DIV_UP(width, BLOCKX), + DIV_UP(height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream, + args, NULL)); + if (ret < 0) + goto fail; + } else { + ret = av_hwframe_transfer_data(out, in, 0); + if (ret < 0) + goto fail; + } } } +fail: CHECK_CU(cu->cuCtxPopCurrent(&dummy)); return ret; } @@ -344,6 +487,7 @@ static int cudacolorspace_conv(AVFilterContext* ctx, AVFrame* out, AVFrame* in) return 0; } + static int cudacolorspace_filter_frame(AVFilterLink* link, AVFrame* in) { AVFilterContext* ctx = link->dst; @@ -392,6 +536,27 @@ static const AVOption options[] = { {"mpeg", "Limited range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, .unit = "range"}, {"pc", "Full range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, .unit = "range"}, {"jpeg", "Full range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, .unit = "range"}, + + {"space", "Output colorspace", OFFSET(user_csp), AV_OPT_TYPE_INT, { .i64 = AVCOL_SPC_UNSPECIFIED }, AVCOL_SPC_RGB, AVCOL_SPC_NB - 1, FLAGS, .unit = "csp"}, + {"bt709", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_SPC_BT709 }, 0, 0, FLAGS, .unit = "csp"}, + {"bt470bg", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_SPC_BT470BG }, 0, 0, FLAGS, .unit = "csp"}, + {"smpte170m", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_SPC_SMPTE170M }, 0, 0, FLAGS, .unit = "csp"}, + {"bt2020nc", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_SPC_BT2020_NCL }, 0, 0, FLAGS, .unit = "csp"}, + {"bt2020ncl", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_SPC_BT2020_NCL }, 0, 0, FLAGS, .unit = "csp"}, + {"bt601", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_SPC_SMPTE170M }, 0, 0, FLAGS, .unit = "csp"}, + + {"primaries", "Output color primaries", OFFSET(user_prm), AV_OPT_TYPE_INT, { .i64 = AVCOL_PRI_UNSPECIFIED }, AVCOL_PRI_RESERVED0, AVCOL_PRI_NB - 1, FLAGS, .unit = "prm"}, + {"bt709", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_PRI_BT709 }, 0, 0, FLAGS, .unit = "prm"}, + {"bt470bg", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_PRI_BT470BG }, 0, 0, FLAGS, .unit = "prm"}, + {"smpte170m", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_PRI_SMPTE170M }, 0, 0, FLAGS, .unit = "prm"}, + {"bt2020", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_PRI_BT2020 }, 0, 0, FLAGS, .unit = "prm"}, + + {"trc", "Output transfer characteristics", OFFSET(user_trc), AV_OPT_TYPE_INT, { .i64 = AVCOL_TRC_UNSPECIFIED }, AVCOL_TRC_RESERVED0, AVCOL_TRC_NB - 1, FLAGS, .unit = "trc"}, + {"bt709", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_TRC_BT709 }, 0, 0, FLAGS, .unit = "trc"}, + {"bt2020-10", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_TRC_BT2020_10 }, 0, 0, FLAGS, .unit = "trc"}, + {"smpte170m", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_TRC_SMPTE170M }, 0, 0, FLAGS, .unit = "trc"}, + + {NULL}, }; diff --git a/libavfilter/vf_colorspace_cuda.cu b/libavfilter/vf_colorspace_cuda.cu index 59db9f69f3c36..b3ab4a5e498cd 100644 --- a/libavfilter/vf_colorspace_cuda.cu +++ b/libavfilter/vf_colorspace_cuda.cu @@ -91,4 +91,36 @@ __global__ void to_mpeg_cuda(const unsigned char* src, unsigned char* dst, dst[x + y * pitch] = static_cast(dst_); } +__global__ void colorspace_convert_cuda(const unsigned char* src, unsigned char* dst, + int src_pitch, int dst_pitch, int width, int height, int plane_id, + float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width || y >= height) + return; + + if (plane_id == 0) { + // Luma plane (Y) - apply only first row of matrix + float src_y = (float)src[x + y * src_pitch]; + float dst_y = m00 * src_y + m01 * 128.0f + m02 * 128.0f; + dst[x + y * dst_pitch] = (unsigned char)clamp((int)(dst_y + 0.5f), 0, 255); + } else if (plane_id == 1) { + // U plane - apply second row of matrix, but we only have U component + float src_u = (float)src[x + y * src_pitch] - 128.0f; + // For U plane conversion, we apply the matrix considering U input and zero V + float dst_u = m10 * 128.0f + m11 * (src_u + 128.0f) + m12 * 128.0f; + dst[x + y * dst_pitch] = (unsigned char)clamp((int)(dst_u + 0.5f), 0, 255); + } else { + // V plane - apply third row of matrix, but we only have V component + float src_v = (float)src[x + y * src_pitch] - 128.0f; + // For V plane conversion, we apply the matrix considering zero U input and V + float dst_v = m20 * 128.0f + m21 * 128.0f + m22 * (src_v + 128.0f); + dst[x + y * dst_pitch] = (unsigned char)clamp((int)(dst_v + 0.5f), 0, 255); + } +} + } diff --git a/libavfilter/vf_lut3d_cuda.c b/libavfilter/vf_lut3d_cuda.c new file mode 100644 index 0000000000000..16fceb4a1c3d0 --- /dev/null +++ b/libavfilter/vf_lut3d_cuda.c @@ -0,0 +1,604 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +#include "libavutil/common.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/internal.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/mem.h" +#include "libavutil/file_open.h" +#include "libavutil/intfloat.h" +#include "libavutil/avassert.h" +#include "libavutil/avstring.h" + +#include "avfilter.h" +#include "filters.h" +#include "video.h" +#include "lut3d.h" + +#include "cuda/load_helper.h" +#include "vf_lut3d_cuda.h" + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_0RGB32, + AV_PIX_FMT_0BGR32, + AV_PIX_FMT_RGB32, + AV_PIX_FMT_BGR32, + AV_PIX_FMT_RGB48, + AV_PIX_FMT_BGR48, + AV_PIX_FMT_RGBA64, + AV_PIX_FMT_BGRA64, +}; + +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) +#define BLOCKX 32 +#define BLOCKY 16 + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + +typedef struct LUT3DCudaContext { + const AVClass *class; + + AVCUDADeviceContext *hwctx; + + enum AVPixelFormat in_fmt, out_fmt; + const AVPixFmtDescriptor *in_desc, *out_desc; + + AVBufferRef *frames_ctx; + AVFrame *frame; + AVFrame *tmp_frame; + + CUcontext cu_ctx; + CUmodule cu_module; + CUfunction cu_func; + CUstream cu_stream; + CUdeviceptr cu_lut; + + struct rgbvec *lut; + int lutsize; + int lutsize2; + struct rgbvec scale; + int interpolation; + char *file; + + int bit_depth; + int passthrough; +} LUT3DCudaContext; + +static int format_is_supported(enum AVPixelFormat fmt) +{ + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + +static int get_bit_depth(enum AVPixelFormat fmt) +{ + switch (fmt) { + case AV_PIX_FMT_0RGB32: + case AV_PIX_FMT_0BGR32: + case AV_PIX_FMT_RGB32: + case AV_PIX_FMT_BGR32: + return 8; + case AV_PIX_FMT_RGB48: + case AV_PIX_FMT_BGR48: + return 12; + case AV_PIX_FMT_RGBA64: + case AV_PIX_FMT_BGRA64: + return 10; + default: + return 8; + } +} + +static av_cold int lut3d_cuda_init(AVFilterContext *ctx) +{ + LUT3DCudaContext *s = ctx->priv; + + s->frame = av_frame_alloc(); + if (!s->frame) + return AVERROR(ENOMEM); + + s->tmp_frame = av_frame_alloc(); + if (!s->tmp_frame) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold void lut3d_cuda_uninit(AVFilterContext *ctx) +{ + LUT3DCudaContext *s = ctx->priv; + + if (s->hwctx && s->cu_module) { + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy; + + CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (s->cu_lut) { + CHECK_CU(cu->cuMemFree(s->cu_lut)); + s->cu_lut = 0; + } + CHECK_CU(cu->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + } + + av_freep(&s->lut); + av_frame_free(&s->frame); + av_buffer_unref(&s->frames_ctx); + av_frame_free(&s->tmp_frame); +} + +static av_cold int init_hwframe_ctx(LUT3DCudaContext *s, AVBufferRef *device_ctx, int width, int height) +{ + AVBufferRef *out_ref = NULL; + AVHWFramesContext *out_ctx; + int ret; + + out_ref = av_hwframe_ctx_alloc(device_ctx); + if (!out_ref) + return AVERROR(ENOMEM); + out_ctx = (AVHWFramesContext*)out_ref->data; + + out_ctx->format = AV_PIX_FMT_CUDA; + out_ctx->sw_format = s->out_fmt; + out_ctx->width = FFALIGN(width, 32); + out_ctx->height = FFALIGN(height, 32); + + ret = av_hwframe_ctx_init(out_ref); + if (ret < 0) + goto fail; + + av_frame_unref(s->frame); + ret = av_hwframe_get_buffer(out_ref, s->frame, 0); + if (ret < 0) + goto fail; + + s->frame->width = width; + s->frame->height = height; + + av_buffer_unref(&s->frames_ctx); + s->frames_ctx = out_ref; + + return 0; +fail: + av_buffer_unref(&out_ref); + return ret; +} + +#define MAX_LINE_SIZE 512 + +static int skip_line(const char *p) +{ + while (*p && av_isspace(*p)) + p++; + return !*p || *p == '#'; +} + +#define NEXT_LINE(loop_cond) do { \ + if (!fgets(line, sizeof(line), f)) { \ + av_log(ctx, AV_LOG_ERROR, "Unexpected EOF\n"); \ + return AVERROR_INVALIDDATA; \ + } \ +} while (loop_cond) + +static int parse_cube(AVFilterContext *ctx, FILE *f) +{ + LUT3DCudaContext *lut3d = ctx->priv; + char line[MAX_LINE_SIZE]; + float min[3] = {0.0, 0.0, 0.0}; + float max[3] = {1.0, 1.0, 1.0}; + int size = 0, size2, i, j, k; + + while (fgets(line, sizeof(line), f)) { + if (!strncmp(line, "LUT_3D_SIZE", 11)) { + size = strtol(line + 12, NULL, 0); + if (size < 2 || size > MAX_LEVEL) { + av_log(ctx, AV_LOG_ERROR, "Too large or invalid 3D LUT size\n"); + return AVERROR(EINVAL); + } + lut3d->lutsize = size; + lut3d->lutsize2 = size * size; + break; + } + } + + if (!size) { + av_log(ctx, AV_LOG_ERROR, "3D LUT size not found\n"); + return AVERROR(EINVAL); + } + + if (!(lut3d->lut = av_malloc_array(size * size * size, sizeof(*lut3d->lut)))) + return AVERROR(ENOMEM); + + size2 = size * size; + rewind(f); + + for (i = 0; i < size; i++) { + for (j = 0; j < size; j++) { + for (k = 0; k < size; k++) { + struct rgbvec *vec = &lut3d->lut[i * size2 + j * size + k]; + + do { +try_again: + NEXT_LINE(0); + if (!strncmp(line, "DOMAIN_", 7)) { + float *vals = NULL; + if (!strncmp(line + 7, "MIN ", 4)) vals = min; + else if (!strncmp(line + 7, "MAX ", 4)) vals = max; + if (!vals) + return AVERROR_INVALIDDATA; + if (av_sscanf(line + 11, "%f %f %f", vals, vals + 1, vals + 2) != 3) + return AVERROR_INVALIDDATA; + av_log(ctx, AV_LOG_DEBUG, "min: %f %f %f | max: %f %f %f\n", + min[0], min[1], min[2], max[0], max[1], max[2]); + goto try_again; + } else if (!strncmp(line, "TITLE", 5)) { + goto try_again; + } + } while (skip_line(line)); + if (av_sscanf(line, "%f %f %f", &vec->r, &vec->g, &vec->b) != 3) + return AVERROR_INVALIDDATA; + vec->r = av_clipf(vec->r, 0.f, 1.f); + vec->g = av_clipf(vec->g, 0.f, 1.f); + vec->b = av_clipf(vec->b, 0.f, 1.f); + } + } + } + + lut3d->scale.r = av_clipf(1. / (max[0] - min[0]), 0.f, 1.f) * (lut3d->lutsize - 1); + lut3d->scale.g = av_clipf(1. / (max[1] - min[1]), 0.f, 1.f) * (lut3d->lutsize - 1); + lut3d->scale.b = av_clipf(1. / (max[2] - min[2]), 0.f, 1.f) * (lut3d->lutsize - 1); + + return 0; +} + +static av_cold int lut3d_cuda_load_cube(AVFilterContext *ctx, const char *fname) +{ + FILE *f; + int ret; + + f = avpriv_fopen_utf8(fname, "r"); + if (!f) { + ret = AVERROR_INVALIDDATA; + av_log(ctx, AV_LOG_ERROR, "Cannot read file '%s': %s\n", fname, av_err2str(ret)); + return ret; + } + + ret = parse_cube(ctx, f); + fclose(f); + return ret; +} + +static av_cold int lut3d_cuda_load_functions(AVFilterContext *ctx) +{ + LUT3DCudaContext *s = ctx->priv; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + char function_name[128]; + int ret; + + extern const unsigned char ff_vf_lut3d_cuda_ptx_data[]; + extern const unsigned int ff_vf_lut3d_cuda_ptx_len; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, + ff_vf_lut3d_cuda_ptx_data, ff_vf_lut3d_cuda_ptx_len); + if (ret < 0) + goto fail; + + switch(s->interpolation) { + case INTERPOLATE_NEAREST: + snprintf(function_name, sizeof(function_name), "lut3d_interp_%d_nearest", s->bit_depth); + break; + case INTERPOLATE_TRILINEAR: + snprintf(function_name, sizeof(function_name), "lut3d_interp_%d_trilinear", s->bit_depth); + break; + case INTERPOLATE_TETRAHEDRAL: + default: + snprintf(function_name, sizeof(function_name), "lut3d_interp_%d_tetrahedral", s->bit_depth); + break; + } + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func, s->cu_module, function_name)); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed to load CUDA function %s\n", function_name); + goto fail; + } + + ret = CHECK_CU(cu->cuMemAlloc(&s->cu_lut, s->lutsize * s->lutsize * s->lutsize * 3 * sizeof(float))); + if (ret < 0) + goto fail; + + ret = CHECK_CU(cu->cuMemcpyHtoD(s->cu_lut, s->lut, s->lutsize * s->lutsize * s->lutsize * sizeof(struct rgbvec))); + if (ret < 0) + goto fail; + +fail: + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + return ret; +} + +static av_cold int init_processing_chain(AVFilterContext *ctx, int in_width, int in_height, + int out_width, int out_height) +{ + LUT3DCudaContext *s = ctx->priv; + FilterLink *inl = ff_filter_link(ctx->inputs[0]); + FilterLink *outl = ff_filter_link(ctx->outputs[0]); + + AVHWFramesContext *in_frames_ctx; + enum AVPixelFormat in_format; + enum AVPixelFormat out_format; + int ret; + + if (!inl->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); + return AVERROR(EINVAL); + } + in_frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data; + in_format = in_frames_ctx->sw_format; + out_format = in_format; + + if (!format_is_supported(in_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", + av_get_pix_fmt_name(in_format)); + return AVERROR(ENOSYS); + } + if (!format_is_supported(out_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", + av_get_pix_fmt_name(out_format)); + return AVERROR(ENOSYS); + } + + s->in_fmt = in_format; + s->out_fmt = out_format; + s->in_desc = av_pix_fmt_desc_get(s->in_fmt); + s->out_desc = av_pix_fmt_desc_get(s->out_fmt); + s->bit_depth = get_bit_depth(s->in_fmt); + + if (in_width == out_width && in_height == out_height && in_format == out_format) { + s->frames_ctx = av_buffer_ref(inl->hw_frames_ctx); + if (!s->frames_ctx) + return AVERROR(ENOMEM); + } else { + s->passthrough = 0; + + ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, out_width, out_height); + if (ret < 0) + return ret; + } + + outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx); + if (!outl->hw_frames_ctx) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold int lut3d_cuda_config_props(AVFilterLink *outlink) +{ + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + FilterLink *inl = ff_filter_link(inlink); + LUT3DCudaContext *s = ctx->priv; + AVHWFramesContext *frames_ctx; + AVCUDADeviceContext *device_hwctx; + int ret; + + if (s->file) { + ret = lut3d_cuda_load_cube(ctx, s->file); + if (ret < 0) + return ret; + } + + outlink->w = inlink->w; + outlink->h = inlink->h; + + ret = init_processing_chain(ctx, inlink->w, inlink->h, outlink->w, outlink->h); + if (ret < 0) + return ret; + + frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data; + device_hwctx = frames_ctx->device_ctx->hwctx; + + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; + + av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d fmt:%s -> w:%d h:%d fmt:%s%s\n", + inlink->w, inlink->h, av_get_pix_fmt_name(s->in_fmt), + outlink->w, outlink->h, av_get_pix_fmt_name(s->out_fmt), + s->passthrough ? " (passthrough)" : ""); + + ret = lut3d_cuda_load_functions(ctx); + if (ret < 0) + return ret; + + return 0; +} + +static int call_lut3d_kernel(AVFilterContext *ctx, AVFrame *out, AVFrame *in) +{ + LUT3DCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CUtexObject tex = 0; + int ret; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = (s->bit_depth > 8) ? CU_AD_FORMAT_UNSIGNED_INT16 : CU_AD_FORMAT_UNSIGNED_INT8, + .res.pitch2D.numChannels = 4, + .res.pitch2D.width = in->width, + .res.pitch2D.height = in->height, + .res.pitch2D.pitchInBytes = in->linesize[0], + .res.pitch2D.devPtr = (CUdeviceptr)in->data[0], + }; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + void *args[] = { + &tex, + &out->data[0], + &out->width, &out->height, &out->linesize[0], + &s->cu_lut, &s->lutsize, + &s->scale.r, &s->scale.g, &s->scale.b + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); + +exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + return ret; +} + +static int lut3d_cuda_filter_frame(AVFilterLink *link, AVFrame *in) +{ + AVFilterContext *ctx = link->dst; + LUT3DCudaContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + AVFrame *out = NULL; + CUcontext dummy; + int ret = 0; + + if (s->passthrough) + return ff_filter_frame(outlink, in); + + out = av_frame_alloc(); + if (!out) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + goto fail; + + ret = av_hwframe_get_buffer(s->frames_ctx, out, 0); + if (ret < 0) + goto fail; + + ret = call_lut3d_kernel(ctx, out, in); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + if (ret < 0) + goto fail; + + ret = av_frame_copy_props(out, in); + if (ret < 0) + goto fail; + + av_frame_free(&in); + return ff_filter_frame(outlink, out); + +fail: + av_frame_free(&in); + av_frame_free(&out); + return ret; +} + +#define OFFSET(x) offsetof(LUT3DCudaContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) +#define TFLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM) + +static const AVOption lut3d_cuda_options[] = { + { "file", "set 3D LUT file name", OFFSET(file), AV_OPT_TYPE_STRING, {.str=NULL}, .flags = FLAGS }, + { "interp", "select interpolation mode", OFFSET(interpolation), AV_OPT_TYPE_INT, {.i64=INTERPOLATE_TETRAHEDRAL}, 0, NB_INTERP_MODE-1, TFLAGS, .unit = "interp_mode" }, + { "nearest", "use values from the nearest defined points", 0, AV_OPT_TYPE_CONST, {.i64=INTERPOLATE_NEAREST}, 0, 0, TFLAGS, .unit = "interp_mode" }, + { "trilinear", "interpolate values using the 8 points defining a cube", 0, AV_OPT_TYPE_CONST, {.i64=INTERPOLATE_TRILINEAR}, 0, 0, TFLAGS, .unit = "interp_mode" }, + { "tetrahedral", "interpolate values using a tetrahedron", 0, AV_OPT_TYPE_CONST, {.i64=INTERPOLATE_TETRAHEDRAL}, 0, 0, TFLAGS, .unit = "interp_mode" }, + { "passthrough", "Do not process frames at all if parameters match", OFFSET(passthrough), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS }, + { NULL }, +}; + +static const AVClass lut3d_cuda_class = { + .class_name = "lut3d_cuda", + .item_name = av_default_item_name, + .option = lut3d_cuda_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const AVFilterPad lut3d_cuda_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = lut3d_cuda_filter_frame, + }, +}; + +static const AVFilterPad lut3d_cuda_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = lut3d_cuda_config_props, + }, +}; + +const FFFilter ff_vf_lut3d_cuda = { + .p.name = "lut3d_cuda", + .p.description = NULL_IF_CONFIG_SMALL("GPU accelerated 3D LUT filter"), + + .p.priv_class = &lut3d_cuda_class, + + .init = lut3d_cuda_init, + .uninit = lut3d_cuda_uninit, + + .priv_size = sizeof(LUT3DCudaContext), + + FILTER_INPUTS(lut3d_cuda_inputs), + FILTER_OUTPUTS(lut3d_cuda_outputs), + + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), + + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; \ No newline at end of file diff --git a/libavfilter/vf_lut3d_cuda.cu b/libavfilter/vf_lut3d_cuda.cu new file mode 100644 index 0000000000000..a142299e13d76 --- /dev/null +++ b/libavfilter/vf_lut3d_cuda.cu @@ -0,0 +1,423 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "cuda/vector_helpers.cuh" + +#define BLOCKX 32 +#define BLOCKY 16 + +extern "C" { + +__device__ static inline float3 lerpf3(float3 a, float3 b, float t) +{ + return make_float3( + a.x + (b.x - a.x) * t, + a.y + (b.y - a.y) * t, + a.z + (b.z - a.z) * t + ); +} + +__device__ static inline float3 interp_nearest_cuda(float3 *lut, int lutsize, float3 s) +{ + int r = (int)(s.x + 0.5f); + int g = (int)(s.y + 0.5f); + int b = (int)(s.z + 0.5f); + + r = min(max(r, 0), lutsize - 1); + g = min(max(g, 0), lutsize - 1); + b = min(max(b, 0), lutsize - 1); + + return lut[r * lutsize * lutsize + g * lutsize + b]; +} + +__device__ static inline float3 interp_trilinear_cuda(float3 *lut, int lutsize, float3 s) +{ + int lutsize2 = lutsize * lutsize; + + int prev_r = (int)s.x; + int prev_g = (int)s.y; + int prev_b = (int)s.z; + + int next_r = min(prev_r + 1, lutsize - 1); + int next_g = min(prev_g + 1, lutsize - 1); + int next_b = min(prev_b + 1, lutsize - 1); + + float3 d = make_float3(s.x - prev_r, s.y - prev_g, s.z - prev_b); + + float3 c000 = lut[prev_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c001 = lut[prev_r * lutsize2 + prev_g * lutsize + next_b]; + float3 c010 = lut[prev_r * lutsize2 + next_g * lutsize + prev_b]; + float3 c011 = lut[prev_r * lutsize2 + next_g * lutsize + next_b]; + float3 c100 = lut[next_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c101 = lut[next_r * lutsize2 + prev_g * lutsize + next_b]; + float3 c110 = lut[next_r * lutsize2 + next_g * lutsize + prev_b]; + float3 c111 = lut[next_r * lutsize2 + next_g * lutsize + next_b]; + + float3 c00 = lerpf3(c000, c100, d.x); + float3 c10 = lerpf3(c010, c110, d.x); + float3 c01 = lerpf3(c001, c101, d.x); + float3 c11 = lerpf3(c011, c111, d.x); + float3 c0 = lerpf3(c00, c10, d.y); + float3 c1 = lerpf3(c01, c11, d.y); + + return lerpf3(c0, c1, d.z); +} + +__device__ static inline float3 interp_tetrahedral_cuda(float3 *lut, int lutsize, float3 s) +{ + int lutsize2 = lutsize * lutsize; + + int prev_r = (int)s.x; + int prev_g = (int)s.y; + int prev_b = (int)s.z; + + int next_r = min(prev_r + 1, lutsize - 1); + int next_g = min(prev_g + 1, lutsize - 1); + int next_b = min(prev_b + 1, lutsize - 1); + + float3 d = make_float3(s.x - prev_r, s.y - prev_g, s.z - prev_b); + + float3 c000 = lut[prev_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c111 = lut[next_r * lutsize2 + next_g * lutsize + next_b]; + + float3 c; + if (d.x > d.y && d.x > d.z) { + if (d.y > d.z) { + float3 c100 = lut[next_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c110 = lut[next_r * lutsize2 + next_g * lutsize + prev_b]; + c = make_float3( + c000.x + d.x * (c100.x - c000.x) + d.y * (c110.x - c100.x) + d.z * (c111.x - c110.x), + c000.y + d.x * (c100.y - c000.y) + d.y * (c110.y - c100.y) + d.z * (c111.y - c110.y), + c000.z + d.x * (c100.z - c000.z) + d.y * (c110.z - c100.z) + d.z * (c111.z - c110.z) + ); + } else { + float3 c100 = lut[next_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c101 = lut[next_r * lutsize2 + prev_g * lutsize + next_b]; + c = make_float3( + c000.x + d.x * (c100.x - c000.x) + d.z * (c101.x - c100.x) + d.y * (c111.x - c101.x), + c000.y + d.x * (c100.y - c000.y) + d.z * (c101.y - c100.y) + d.y * (c111.y - c101.y), + c000.z + d.x * (c100.z - c000.z) + d.z * (c101.z - c100.z) + d.y * (c111.z - c101.z) + ); + } + } else if (d.y > d.z) { + if (d.x > d.z) { + float3 c010 = lut[prev_r * lutsize2 + next_g * lutsize + prev_b]; + float3 c110 = lut[next_r * lutsize2 + next_g * lutsize + prev_b]; + c = make_float3( + c000.x + d.y * (c010.x - c000.x) + d.x * (c110.x - c010.x) + d.z * (c111.x - c110.x), + c000.y + d.y * (c010.y - c000.y) + d.x * (c110.y - c010.y) + d.z * (c111.y - c110.y), + c000.z + d.y * (c010.z - c000.z) + d.x * (c110.z - c010.z) + d.z * (c111.z - c110.z) + ); + } else { + float3 c010 = lut[prev_r * lutsize2 + next_g * lutsize + prev_b]; + float3 c011 = lut[prev_r * lutsize2 + next_g * lutsize + next_b]; + c = make_float3( + c000.x + d.y * (c010.x - c000.x) + d.z * (c011.x - c010.x) + d.x * (c111.x - c011.x), + c000.y + d.y * (c010.y - c000.y) + d.z * (c011.y - c010.y) + d.x * (c111.y - c011.y), + c000.z + d.y * (c010.z - c000.z) + d.z * (c011.z - c010.z) + d.x * (c111.z - c011.z) + ); + } + } else { + if (d.x > d.y) { + float3 c001 = lut[prev_r * lutsize2 + prev_g * lutsize + next_b]; + float3 c101 = lut[next_r * lutsize2 + prev_g * lutsize + next_b]; + c = make_float3( + c000.x + d.z * (c001.x - c000.x) + d.x * (c101.x - c001.x) + d.y * (c111.x - c101.x), + c000.y + d.z * (c001.y - c000.y) + d.x * (c101.y - c001.y) + d.y * (c111.y - c101.y), + c000.z + d.z * (c001.z - c000.z) + d.x * (c101.z - c001.z) + d.y * (c111.z - c101.z) + ); + } else { + float3 c001 = lut[prev_r * lutsize2 + prev_g * lutsize + next_b]; + float3 c011 = lut[prev_r * lutsize2 + next_g * lutsize + next_b]; + c = make_float3( + c000.x + d.z * (c001.x - c000.x) + d.y * (c011.x - c001.x) + d.x * (c111.x - c011.x), + c000.y + d.z * (c001.y - c000.y) + d.y * (c011.y - c001.y) + d.x * (c111.y - c011.y), + c000.z + d.z * (c001.z - c000.z) + d.y * (c011.z - c001.z) + d.x * (c111.z - c011.z) + ); + } + } + + return c; +} + +__global__ void lut3d_interp_8_nearest( + cudaTextureObject_t src_tex, + uchar4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + uchar4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + src.x / 255.0f * scale_r, + src.y / 255.0f * scale_g, + src.z / 255.0f * scale_b + ); + + float3 result = interp_nearest_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(uchar4)) + x] = make_uchar4( + (unsigned char)(result.x * 255.0f + 0.5f), + (unsigned char)(result.y * 255.0f + 0.5f), + (unsigned char)(result.z * 255.0f + 0.5f), + src.w + ); +} + +__global__ void lut3d_interp_10_nearest( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 6) * scale_r, // Convert 10-bit to normalized range + (src.y >> 6) * scale_g, + (src.z >> 6) * scale_b + ); + + float3 result = interp_nearest_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 1023.0f + 0.5f) << 6, // Convert back to 10-bit in 16-bit container + (unsigned short)(result.y * 1023.0f + 0.5f) << 6, + (unsigned short)(result.z * 1023.0f + 0.5f) << 6, + src.w + ); +} + +__global__ void lut3d_interp_12_nearest( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 4) * scale_r, // Convert 12-bit to normalized range + (src.y >> 4) * scale_g, + (src.z >> 4) * scale_b + ); + + float3 result = interp_nearest_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 4095.0f + 0.5f) << 4, // Convert back to 12-bit in 16-bit container + (unsigned short)(result.y * 4095.0f + 0.5f) << 4, + (unsigned short)(result.z * 4095.0f + 0.5f) << 4, + src.w + ); +} + +__global__ void lut3d_interp_8_trilinear( + cudaTextureObject_t src_tex, + uchar4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + uchar4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + src.x / 255.0f * scale_r, + src.y / 255.0f * scale_g, + src.z / 255.0f * scale_b + ); + + float3 result = interp_trilinear_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(uchar4)) + x] = make_uchar4( + (unsigned char)(result.x * 255.0f + 0.5f), + (unsigned char)(result.y * 255.0f + 0.5f), + (unsigned char)(result.z * 255.0f + 0.5f), + src.w + ); +} + +__global__ void lut3d_interp_10_trilinear( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 6) / 1023.0f * scale_r, + (src.y >> 6) / 1023.0f * scale_g, + (src.z >> 6) / 1023.0f * scale_b + ); + + float3 result = interp_trilinear_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 1023.0f + 0.5f) << 6, + (unsigned short)(result.y * 1023.0f + 0.5f) << 6, + (unsigned short)(result.z * 1023.0f + 0.5f) << 6, + src.w + ); +} + +__global__ void lut3d_interp_12_trilinear( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 4) / 4095.0f * scale_r, + (src.y >> 4) / 4095.0f * scale_g, + (src.z >> 4) / 4095.0f * scale_b + ); + + float3 result = interp_trilinear_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 4095.0f + 0.5f) << 4, + (unsigned short)(result.y * 4095.0f + 0.5f) << 4, + (unsigned short)(result.z * 4095.0f + 0.5f) << 4, + src.w + ); +} + +__global__ void lut3d_interp_8_tetrahedral( + cudaTextureObject_t src_tex, + uchar4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + uchar4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + src.x / 255.0f * scale_r, + src.y / 255.0f * scale_g, + src.z / 255.0f * scale_b + ); + + float3 result = interp_tetrahedral_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(uchar4)) + x] = make_uchar4( + (unsigned char)(result.x * 255.0f + 0.5f), + (unsigned char)(result.y * 255.0f + 0.5f), + (unsigned char)(result.z * 255.0f + 0.5f), + src.w + ); +} + +__global__ void lut3d_interp_10_tetrahedral( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 6) / 1023.0f * scale_r, + (src.y >> 6) / 1023.0f * scale_g, + (src.z >> 6) / 1023.0f * scale_b + ); + + float3 result = interp_tetrahedral_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 1023.0f + 0.5f) << 6, + (unsigned short)(result.y * 1023.0f + 0.5f) << 6, + (unsigned short)(result.z * 1023.0f + 0.5f) << 6, + src.w + ); +} + +__global__ void lut3d_interp_12_tetrahedral( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 4) / 4095.0f * scale_r, + (src.y >> 4) / 4095.0f * scale_g, + (src.z >> 4) / 4095.0f * scale_b + ); + + float3 result = interp_tetrahedral_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 4095.0f + 0.5f) << 4, + (unsigned short)(result.y * 4095.0f + 0.5f) << 4, + (unsigned short)(result.z * 4095.0f + 0.5f) << 4, + src.w + ); +} + +} \ No newline at end of file diff --git a/libavfilter/vf_lut3d_cuda.h b/libavfilter/vf_lut3d_cuda.h new file mode 100644 index 0000000000000..236f338c8cc8d --- /dev/null +++ b/libavfilter/vf_lut3d_cuda.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef AVFILTER_LUT3D_CUDA_H +#define AVFILTER_LUT3D_CUDA_H + +#endif \ No newline at end of file diff --git a/libavfilter/vf_tonemap_cuda.c b/libavfilter/vf_tonemap_cuda.c new file mode 100644 index 0000000000000..370e364ad9779 --- /dev/null +++ b/libavfilter/vf_tonemap_cuda.c @@ -0,0 +1,637 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +#include "libavutil/common.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/internal.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/csp.h" + +#include "avfilter.h" +#include "filters.h" +#include "video.h" + +#include "cuda/load_helper.h" +#include "vf_tonemap_cuda.h" + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_GBRPF32, + AV_PIX_FMT_RGB48, + AV_PIX_FMT_RGBA64, + AV_PIX_FMT_P010LE, + AV_PIX_FMT_P016LE, + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_YUV420P10LE, + AV_PIX_FMT_YUV420P16LE, + AV_PIX_FMT_NV12, +}; + +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) +#define BLOCKX 32 +#define BLOCKY 16 + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + +enum TonemapAlgorithm { + TONEMAP_NONE, + TONEMAP_LINEAR, + TONEMAP_GAMMA, + TONEMAP_CLIP, + TONEMAP_REINHARD, + TONEMAP_HABLE, + TONEMAP_MOBIUS, + TONEMAP_MAX, +}; + +typedef struct TonemapCudaContext { + const AVClass *class; + + AVCUDADeviceContext *hwctx; + + enum AVPixelFormat in_fmt, out_fmt; + const AVPixFmtDescriptor *in_desc, *out_desc; + + AVBufferRef *frames_ctx; + AVFrame *frame; + AVFrame *tmp_frame; + + CUcontext cu_ctx; + CUmodule cu_module; + CUfunction cu_func; + CUstream cu_stream; + + + enum TonemapAlgorithm tonemap; + double param; + double desat; + double peak; + + const AVLumaCoefficients *coeffs; + int passthrough; +} TonemapCudaContext; + +static int format_is_supported(enum AVPixelFormat fmt) +{ + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + +static av_cold int tonemap_cuda_init(AVFilterContext *ctx) +{ + TonemapCudaContext *s = ctx->priv; + + s->frame = av_frame_alloc(); + if (!s->frame) + return AVERROR(ENOMEM); + + s->tmp_frame = av_frame_alloc(); + if (!s->tmp_frame) + return AVERROR(ENOMEM); + + + switch(s->tonemap) { + case TONEMAP_GAMMA: + if (isnan(s->param)) + s->param = 1.8f; + break; + case TONEMAP_REINHARD: + if (!isnan(s->param)) + s->param = (1.0f - s->param) / s->param; + break; + case TONEMAP_MOBIUS: + if (isnan(s->param)) + s->param = 0.3f; + break; + } + + if (isnan(s->param)) + s->param = 1.0f; + + return 0; +} + +static av_cold void tonemap_cuda_uninit(AVFilterContext *ctx) +{ + TonemapCudaContext *s = ctx->priv; + + + if (s->hwctx && s->cu_module) { + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy; + + CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + CHECK_CU(cu->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + } + + av_frame_free(&s->frame); + av_buffer_unref(&s->frames_ctx); + av_frame_free(&s->tmp_frame); +} + +static av_cold int init_hwframe_ctx(TonemapCudaContext *s, AVBufferRef *device_ctx, int width, int height) +{ + AVBufferRef *out_ref = NULL; + AVHWFramesContext *out_ctx; + int ret; + + out_ref = av_hwframe_ctx_alloc(device_ctx); + if (!out_ref) + return AVERROR(ENOMEM); + out_ctx = (AVHWFramesContext*)out_ref->data; + + out_ctx->format = AV_PIX_FMT_CUDA; + out_ctx->sw_format = s->out_fmt; + out_ctx->width = FFALIGN(width, 32); + out_ctx->height = FFALIGN(height, 32); + + ret = av_hwframe_ctx_init(out_ref); + if (ret < 0) + goto fail; + + av_frame_unref(s->frame); + ret = av_hwframe_get_buffer(out_ref, s->frame, 0); + if (ret < 0) + goto fail; + + s->frame->width = width; + s->frame->height = height; + + av_buffer_unref(&s->frames_ctx); + s->frames_ctx = out_ref; + + return 0; +fail: + av_buffer_unref(&out_ref); + return ret; +} + +static av_cold int tonemap_cuda_load_functions(AVFilterContext *ctx) +{ + TonemapCudaContext *s = ctx->priv; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + char function_name[128]; + int ret; + + extern const unsigned char ff_vf_tonemap_cuda_ptx_data[]; + extern const unsigned int ff_vf_tonemap_cuda_ptx_len; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, + ff_vf_tonemap_cuda_ptx_data, ff_vf_tonemap_cuda_ptx_len); + if (ret < 0) + goto fail; + + if (s->in_fmt == AV_PIX_FMT_GBRPF32) { + snprintf(function_name, sizeof(function_name), "tonemap_cuda_planar_float"); + } else if ((s->in_fmt == AV_PIX_FMT_P010LE || s->in_fmt == AV_PIX_FMT_P016LE) && s->out_fmt == AV_PIX_FMT_NV12) { + snprintf(function_name, sizeof(function_name), "tonemap_cuda_p016_to_nv12"); + } else if (s->in_fmt == AV_PIX_FMT_YUV420P10LE || s->in_fmt == AV_PIX_FMT_YUV420P16LE) { + snprintf(function_name, sizeof(function_name), "tonemap_cuda_yuv420p10"); + } else { + snprintf(function_name, sizeof(function_name), "tonemap_cuda_16bit"); + } + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func, s->cu_module, function_name)); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed to load CUDA function %s\n", function_name); + goto fail; + } + +fail: + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + return ret; +} + +static av_cold int init_processing_chain(AVFilterContext *ctx, int in_width, int in_height, + int out_width, int out_height) +{ + TonemapCudaContext *s = ctx->priv; + FilterLink *inl = ff_filter_link(ctx->inputs[0]); + FilterLink *outl = ff_filter_link(ctx->outputs[0]); + + AVHWFramesContext *in_frames_ctx; + enum AVPixelFormat in_format; + enum AVPixelFormat out_format; + int ret; + + if (!inl->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); + return AVERROR(EINVAL); + } + in_frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data; + in_format = in_frames_ctx->sw_format; + // Convert HDR to SDR format for encoder compatibility + if (in_format == AV_PIX_FMT_P010LE || in_format == AV_PIX_FMT_P016LE) { + out_format = AV_PIX_FMT_NV12; + } else if (in_format == AV_PIX_FMT_YUV420P10LE || in_format == AV_PIX_FMT_YUV420P16LE) { + out_format = AV_PIX_FMT_YUV420P; + } else { + out_format = in_format; + } + + if (!format_is_supported(in_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", + av_get_pix_fmt_name(in_format)); + return AVERROR(ENOSYS); + } + if (!format_is_supported(out_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", + av_get_pix_fmt_name(out_format)); + return AVERROR(ENOSYS); + } + + s->in_fmt = in_format; + s->out_fmt = out_format; + s->in_desc = av_pix_fmt_desc_get(s->in_fmt); + s->out_desc = av_pix_fmt_desc_get(s->out_fmt); + + if (in_width == out_width && in_height == out_height && in_format == out_format) { + s->frames_ctx = av_buffer_ref(inl->hw_frames_ctx); + if (!s->frames_ctx) + return AVERROR(ENOMEM); + } else { + s->passthrough = 0; + + ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, out_width, out_height); + if (ret < 0) + return ret; + } + + outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx); + if (!outl->hw_frames_ctx) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold int tonemap_cuda_config_props(AVFilterLink *outlink) +{ + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + FilterLink *inl = ff_filter_link(inlink); + TonemapCudaContext *s = ctx->priv; + AVHWFramesContext *frames_ctx; + AVCUDADeviceContext *device_hwctx; + int ret; + + outlink->w = inlink->w; + outlink->h = inlink->h; + + ret = init_processing_chain(ctx, inlink->w, inlink->h, outlink->w, outlink->h); + if (ret < 0) + return ret; + + frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data; + device_hwctx = frames_ctx->device_ctx->hwctx; + + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; + + s->coeffs = av_csp_luma_coeffs_from_avcsp(inlink->colorspace); + + av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d fmt:%s -> w:%d h:%d fmt:%s%s\n", + inlink->w, inlink->h, av_get_pix_fmt_name(s->in_fmt), + outlink->w, outlink->h, av_get_pix_fmt_name(s->out_fmt), + s->passthrough ? " (passthrough)" : ""); + + ret = tonemap_cuda_load_functions(ctx); + if (ret < 0) + return ret; + + return 0; +} + +static int call_tonemap_kernel(AVFilterContext *ctx, AVFrame *out, AVFrame *in) +{ + TonemapCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CUtexObject tex = 0; + int ret; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + if (s->in_fmt == AV_PIX_FMT_GBRPF32) { + // Planar float format + float coeff_r = av_q2d(s->coeffs->cr); + float coeff_g = av_q2d(s->coeffs->cg); + float coeff_b = av_q2d(s->coeffs->cb); + void *args[] = { + &in->data[0], &in->data[1], &in->data[2], + &out->data[0], &out->data[1], &out->data[2], + &out->width, &out->height, &in->linesize[0], &out->linesize[0], + &s->tonemap, &s->param, &s->desat, &s->peak, + &coeff_r, &coeff_g, &coeff_b + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); + } else if ((s->in_fmt == AV_PIX_FMT_P010LE || s->in_fmt == AV_PIX_FMT_P016LE) && s->out_fmt == AV_PIX_FMT_NV12) { + // P016 to NV12 conversion + CUDA_TEXTURE_DESC tex_desc_y = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_TEXTURE_DESC tex_desc_uv = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc_y = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 1, + .res.pitch2D.width = in->width, + .res.pitch2D.height = in->height, + .res.pitch2D.pitchInBytes = in->linesize[0], + .res.pitch2D.devPtr = (CUdeviceptr)in->data[0], + }; + + CUDA_RESOURCE_DESC res_desc_uv = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 2, + .res.pitch2D.width = in->width / 2, + .res.pitch2D.height = in->height / 2, + .res.pitch2D.pitchInBytes = in->linesize[1], + .res.pitch2D.devPtr = (CUdeviceptr)in->data[1], + }; + + CUtexObject tex_y = 0, tex_uv = 0; + ret = CHECK_CU(cu->cuTexObjectCreate(&tex_y, &res_desc_y, &tex_desc_y, NULL)); + if (ret < 0) goto exit; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex_uv, &res_desc_uv, &tex_desc_uv, NULL)); + if (ret < 0) goto exit; + + float coeff_r = av_q2d(s->coeffs->cr); + float coeff_g = av_q2d(s->coeffs->cg); + float coeff_b = av_q2d(s->coeffs->cb); + + void *args[] = { + &tex_y, &tex_uv, + &out->data[0], &out->data[1], + &out->width, &out->height, &out->linesize[0], &out->linesize[1], + &s->tonemap, &s->param, &s->desat, &s->peak, + &coeff_r, &coeff_g, &coeff_b + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); + + if (tex_y) CHECK_CU(cu->cuTexObjectDestroy(tex_y)); + if (tex_uv) CHECK_CU(cu->cuTexObjectDestroy(tex_uv)); + } else if (s->in_fmt == AV_PIX_FMT_YUV420P10LE || s->in_fmt == AV_PIX_FMT_YUV420P16LE) { + // YUV format handling + CUDA_TEXTURE_DESC tex_desc_y = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_TEXTURE_DESC tex_desc_uv = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc_y = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 1, + .res.pitch2D.width = in->width, + .res.pitch2D.height = in->height, + .res.pitch2D.pitchInBytes = in->linesize[0], + .res.pitch2D.devPtr = (CUdeviceptr)in->data[0], + }; + + CUDA_RESOURCE_DESC res_desc_uv = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 2, + .res.pitch2D.width = in->width / 2, + .res.pitch2D.height = in->height / 2, + .res.pitch2D.pitchInBytes = in->linesize[1], + .res.pitch2D.devPtr = (CUdeviceptr)in->data[1], + }; + + CUtexObject tex_y = 0, tex_uv = 0; + ret = CHECK_CU(cu->cuTexObjectCreate(&tex_y, &res_desc_y, &tex_desc_y, NULL)); + if (ret < 0) goto exit; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex_uv, &res_desc_uv, &tex_desc_uv, NULL)); + if (ret < 0) goto exit; + + float coeff_r = av_q2d(s->coeffs->cr); + float coeff_g = av_q2d(s->coeffs->cg); + float coeff_b = av_q2d(s->coeffs->cb); + + void *args[] = { + &tex_y, &tex_uv, + &out->data[0], &out->data[1], &out->data[2], + &out->width, &out->height, &out->linesize[0], &out->linesize[1], + &s->tonemap, &s->param, &s->desat, &s->peak, + &coeff_r, &coeff_g, &coeff_b + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); + + if (tex_y) CHECK_CU(cu->cuTexObjectDestroy(tex_y)); + if (tex_uv) CHECK_CU(cu->cuTexObjectDestroy(tex_uv)); + } else { + // 16-bit formats with texture + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 4, + .res.pitch2D.width = in->width, + .res.pitch2D.height = in->height, + .res.pitch2D.pitchInBytes = in->linesize[0], + .res.pitch2D.devPtr = (CUdeviceptr)in->data[0], + }; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + int bit_depth = (s->in_fmt == AV_PIX_FMT_RGB48) ? 16 : 16; + float coeff_r = av_q2d(s->coeffs->cr); + float coeff_g = av_q2d(s->coeffs->cg); + float coeff_b = av_q2d(s->coeffs->cb); + + void *args[] = { + &tex, + &out->data[0], + &out->width, &out->height, &out->linesize[0], + &s->tonemap, &s->param, &s->desat, &s->peak, + &coeff_r, &coeff_g, &coeff_b, + &bit_depth + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); + } + +exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + return ret; +} + + +static int tonemap_cuda_filter_frame(AVFilterLink *link, AVFrame *in) +{ + AVFilterContext *ctx = link->dst; + TonemapCudaContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + AVFrame *out = NULL; + CUcontext dummy; + int ret = 0; + + if (s->passthrough) + return ff_filter_frame(outlink, in); + + out = av_frame_alloc(); + if (!out) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + goto fail; + + ret = av_hwframe_get_buffer(s->frames_ctx, out, 0); + if (ret < 0) + goto fail; + + ret = call_tonemap_kernel(ctx, out, in); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + if (ret < 0) + goto fail; + + ret = av_frame_copy_props(out, in); + if (ret < 0) + goto fail; + + av_frame_free(&in); + return ff_filter_frame(outlink, out); + +fail: + av_frame_free(&in); + av_frame_free(&out); + return ret; +} + +#define OFFSET(x) offsetof(TonemapCudaContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) + +static const AVOption tonemap_cuda_options[] = { + { "tonemap", "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64=TONEMAP_NONE}, 0, TONEMAP_MAX-1, FLAGS, .unit = "tonemap" }, + { "none", 0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_NONE}, 0, 0, FLAGS, .unit = "tonemap" }, + { "linear", 0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_LINEAR}, 0, 0, FLAGS, .unit = "tonemap" }, + { "gamma", 0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_GAMMA}, 0, 0, FLAGS, .unit = "tonemap" }, + { "clip", 0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_CLIP}, 0, 0, FLAGS, .unit = "tonemap" }, + { "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_REINHARD}, 0, 0, FLAGS, .unit = "tonemap" }, + { "hable", 0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_HABLE}, 0, 0, FLAGS, .unit = "tonemap" }, + { "mobius", 0, 0, AV_OPT_TYPE_CONST, {.i64=TONEMAP_MOBIUS}, 0, 0, FLAGS, .unit = "tonemap" }, + { "param", "tonemap parameter", OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl=NAN}, DBL_MIN, DBL_MAX, FLAGS }, + { "desat", "desaturation strength", OFFSET(desat), AV_OPT_TYPE_DOUBLE, {.dbl=2}, 0, DBL_MAX, FLAGS }, + { "peak", "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS }, + { "passthrough", "Do not process frames at all if parameters match", OFFSET(passthrough), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS }, + { NULL }, +}; + +static const AVClass tonemap_cuda_class = { + .class_name = "tonemap_cuda", + .item_name = av_default_item_name, + .option = tonemap_cuda_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const AVFilterPad tonemap_cuda_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = tonemap_cuda_filter_frame, + }, +}; + +static const AVFilterPad tonemap_cuda_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = tonemap_cuda_config_props, + }, +}; + +const FFFilter ff_vf_tonemap_cuda = { + .p.name = "tonemap_cuda", + .p.description = NULL_IF_CONFIG_SMALL("GPU accelerated HDR to SDR tonemap filter"), + + .p.priv_class = &tonemap_cuda_class, + + .init = tonemap_cuda_init, + .uninit = tonemap_cuda_uninit, + + .priv_size = sizeof(TonemapCudaContext), + + FILTER_INPUTS(tonemap_cuda_inputs), + FILTER_OUTPUTS(tonemap_cuda_outputs), + + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), + + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; \ No newline at end of file diff --git a/libavfilter/vf_tonemap_cuda.cu b/libavfilter/vf_tonemap_cuda.cu new file mode 100644 index 0000000000000..3338e5d17d675 --- /dev/null +++ b/libavfilter/vf_tonemap_cuda.cu @@ -0,0 +1,308 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "cuda/vector_helpers.cuh" + +#define BLOCKX 32 +#define BLOCKY 16 + +extern "C" { + +enum TonemapAlgorithm { + TONEMAP_NONE, + TONEMAP_LINEAR, + TONEMAP_GAMMA, + TONEMAP_CLIP, + TONEMAP_REINHARD, + TONEMAP_HABLE, + TONEMAP_MOBIUS, + TONEMAP_MAX, +}; + +__device__ static inline float hable_cuda(float in) +{ + float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f; + return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f; +} + +__device__ static inline float mobius_cuda(float in, float j, float peak) +{ + float a, b; + + if (in <= j) + return in; + + a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak); + b = (j * j - 2.0f * j * peak + peak) / fmaxf(peak - 1.0f, 1e-6f); + + return (b * b + 2.0f * b * j + j * j) / (b - a) * (in + a) / (in + b); +} + +__device__ static inline float apply_tonemap_cuda(int algorithm, float sig, float param, float peak) +{ + switch (algorithm) { + case TONEMAP_NONE: + return sig; + case TONEMAP_LINEAR: + return sig * param / peak; + case TONEMAP_GAMMA: + return sig > 0.05f ? powf(sig / peak, 1.0f / param) + : sig * powf(0.05f / peak, 1.0f / param) / 0.05f; + case TONEMAP_CLIP: + return fminf(fmaxf(sig * param, 0.0f), 1.0f); + case TONEMAP_HABLE: + return hable_cuda(sig) / hable_cuda(peak); + case TONEMAP_REINHARD: + return sig / (sig + param) * (peak + param) / peak; + case TONEMAP_MOBIUS: + return mobius_cuda(sig, param, peak); + default: + return sig; + } +} + +__global__ void tonemap_cuda_float( + cudaTextureObject_t src_tex, + float *dst_r, float *dst_g, float *dst_b, + int dst_width, int dst_height, int dst_pitch, + int algorithm, float param, float desat, float peak, + float coeff_r, float coeff_g, float coeff_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + float4 src = tex2D(src_tex, x, y); + float r = src.x; + float g = src.y; + float b = src.z; + + // Desaturate to prevent unnatural colors + if (desat > 0) { + float luma = coeff_r * r + coeff_g * g + coeff_b * b; + float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f); + r = r * (1.0f - overbright) + luma * overbright; + g = g * (1.0f - overbright) + luma * overbright; + b = b * (1.0f - overbright) + luma * overbright; + } + + // Pick the brightest component + float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f); + float sig_orig = sig; + + // Apply tonemap algorithm + sig = apply_tonemap_cuda(algorithm, sig, param, peak); + + // Apply the computed scale factor to the color + float scale = sig / sig_orig; + r *= scale; + g *= scale; + b *= scale; + + int idx = y * (dst_pitch / sizeof(float)) + x; + dst_r[idx] = r; + dst_g[idx] = g; + dst_b[idx] = b; +} + +__global__ void tonemap_cuda_planar_float( + float *src_r, float *src_g, float *src_b, + float *dst_r, float *dst_g, float *dst_b, + int width, int height, int src_pitch, int dst_pitch, + int algorithm, float param, float desat, float peak, + float coeff_r, float coeff_g, float coeff_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width || y >= height) + return; + + int src_idx = y * (src_pitch / sizeof(float)) + x; + int dst_idx = y * (dst_pitch / sizeof(float)) + x; + + float r = src_r[src_idx]; + float g = src_g[src_idx]; + float b = src_b[src_idx]; + + // Desaturate to prevent unnatural colors + if (desat > 0) { + float luma = coeff_r * r + coeff_g * g + coeff_b * b; + float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f); + r = r * (1.0f - overbright) + luma * overbright; + g = g * (1.0f - overbright) + luma * overbright; + b = b * (1.0f - overbright) + luma * overbright; + } + + // Pick the brightest component + float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f); + float sig_orig = sig; + + // Apply tonemap algorithm + sig = apply_tonemap_cuda(algorithm, sig, param, peak); + + // Apply the computed scale factor to the color + float scale = sig / sig_orig; + r *= scale; + g *= scale; + b *= scale; + + dst_r[dst_idx] = r; + dst_g[dst_idx] = g; + dst_b[dst_idx] = b; +} + +__global__ void tonemap_cuda_16bit( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + int algorithm, float param, float desat, float peak, + float coeff_r, float coeff_g, float coeff_b, int bit_depth) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + float max_val = (1 << bit_depth) - 1; + + float r = src.x / max_val; + float g = src.y / max_val; + float b = src.z / max_val; + + // Desaturate to prevent unnatural colors + if (desat > 0) { + float luma = coeff_r * r + coeff_g * g + coeff_b * b; + float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f); + r = r * (1.0f - overbright) + luma * overbright; + g = g * (1.0f - overbright) + luma * overbright; + b = b * (1.0f - overbright) + luma * overbright; + } + + // Pick the brightest component + float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f); + float sig_orig = sig; + + // Apply tonemap algorithm + sig = apply_tonemap_cuda(algorithm, sig, param, peak); + + // Apply the computed scale factor to the color + float scale = sig / sig_orig; + r *= scale; + g *= scale; + b *= scale; + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(r * max_val + 0.5f), + (unsigned short)(g * max_val + 0.5f), + (unsigned short)(b * max_val + 0.5f), + src.w + ); +} + +__global__ void tonemap_cuda_p016_to_nv12( + cudaTextureObject_t src_y_tex, cudaTextureObject_t src_uv_tex, + unsigned char *dst_y, unsigned char *dst_uv, + int dst_width, int dst_height, int dst_pitch_y, int dst_pitch_uv, + int algorithm, float param, float desat, float peak, + float coeff_r, float coeff_g, float coeff_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + // Sample Y component (16-bit) + unsigned short y_val = tex2D(src_y_tex, x, y); + + // Sample UV components (subsampled, 16-bit) + int uv_x = x / 2; + int uv_y = y / 2; + ushort2 uv_val = tex2D(src_uv_tex, uv_x, uv_y); + + // Convert YUV to RGB (BT.2020 for HDR, 10-bit in 16-bit container) + float Y = (y_val - 64.0f * 64.0f) / (940.0f * 64.0f); // 10-bit scaling (shifted up by 6 bits in 16-bit) + float U = (uv_val.x - 512.0f * 64.0f) / (896.0f * 64.0f); + float V = (uv_val.y - 512.0f * 64.0f) / (896.0f * 64.0f); + + // BT.2020 YUV to RGB conversion + float r = Y + 1.7166f * V; + float g = Y - 0.1873f * U - 0.6526f * V; + float b = Y + 2.0426f * U; + + r = fmaxf(r, 0.0f); + g = fmaxf(g, 0.0f); + b = fmaxf(b, 0.0f); + + // Desaturate to prevent unnatural colors + if (desat > 0) { + float luma = coeff_r * r + coeff_g * g + coeff_b * b; + float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f); + r = r * (1.0f - overbright) + luma * overbright; + g = g * (1.0f - overbright) + luma * overbright; + b = b * (1.0f - overbright) + luma * overbright; + } + + // Pick the brightest component + float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f); + float sig_orig = sig; + + // Apply tonemap algorithm + sig = apply_tonemap_cuda(algorithm, sig, param, peak); + + // Apply the computed scale factor to the color + float scale = sig / sig_orig; + r *= scale; + g *= scale; + b *= scale; + + // Convert back to YUV (BT.709 for SDR) + float out_Y = 0.2126f * r + 0.7152f * g + 0.0722f * b; + float out_U = -0.1146f * r - 0.3854f * g + 0.5000f * b; + float out_V = 0.5000f * r - 0.4542f * g - 0.0458f * b; + + // Scale to 8-bit range + unsigned char y_out = (unsigned char)(out_Y * 219.0f + 16.0f + 0.5f); + unsigned char u_out = (unsigned char)(out_U * 224.0f + 128.0f + 0.5f); + unsigned char v_out = (unsigned char)(out_V * 224.0f + 128.0f + 0.5f); + + // Clamp to valid range + y_out = min(max(y_out, (unsigned char)16), (unsigned char)235); + u_out = min(max(u_out, (unsigned char)16), (unsigned char)240); + v_out = min(max(v_out, (unsigned char)16), (unsigned char)240); + + // Write Y component + dst_y[y * dst_pitch_y + x] = y_out; + + // Write UV components in NV12 format (only for even pixels to maintain 4:2:0 subsampling) + if (x % 2 == 0 && y % 2 == 0) { + int uv_idx = (y / 2) * dst_pitch_uv + (x / 2) * 2; + dst_uv[uv_idx] = u_out; + dst_uv[uv_idx + 1] = v_out; + } +} + +} \ No newline at end of file diff --git a/libavfilter/vf_tonemap_cuda.h b/libavfilter/vf_tonemap_cuda.h new file mode 100644 index 0000000000000..175221c3c78db --- /dev/null +++ b/libavfilter/vf_tonemap_cuda.h @@ -0,0 +1,28 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef AVFILTER_VF_TONEMAP_CUDA_H +#define AVFILTER_VF_TONEMAP_CUDA_H + +#include "avfilter.h" + +#endif /* AVFILTER_VF_TONEMAP_CUDA_H */ \ No newline at end of file