From 6d9882551fa0c5793d1859e02ee9c3904ab7aa9a Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Fri, 15 May 2026 09:41:22 -0700 Subject: [PATCH] torchcodec-xpu: delegate CPU fallback to CpuDeviceInterface Changes: * Delegate CPU fallback to CpuDeviceInterface * Introduce `FORCE_CPU_FALLBACK` to easy verify the fallback Fixes: https://github.com/intel/torchlib-xpu/issues/44 Requires: https://github.com/meta-pytorch/torchcodec/pull/1346 Signed-off-by: Dmitry Rogozhkin --- packages/torchcodec-xpu/README.md | 2 + .../src/torchcodec_xpu/XpuDeviceInterface.cpp | 69 +++++++------------ .../src/torchcodec_xpu/XpuDeviceInterface.h | 4 ++ 3 files changed, 30 insertions(+), 45 deletions(-) diff --git a/packages/torchcodec-xpu/README.md b/packages/torchcodec-xpu/README.md index afaf306..b782a47 100644 --- a/packages/torchcodec-xpu/README.md +++ b/packages/torchcodec-xpu/README.md @@ -24,6 +24,8 @@ The following environment variables can be used to customize the behavior of Int * `USE_SYCL_KERNELS = on|off` (default: `off`) - use SYCL kernels for augmentation such as color space conversion instead of VAAPI interface. If SYCL kernels are requested but can not be used due to hardware limitations, then fallback to VAAPI will be attempted. +* `FORCE_CPU_FALLBACL=on|off` (default: `off`) - force CPU fallback. + ## Known limitations * [IntelĀ® Data Center GPU Max Series][PVC] (Ponte Vecchio, PVC) GPUs are not supported due to missing hardware media engines diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp index 9794fb9..e017bd5 100644 --- a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp +++ b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp @@ -28,6 +28,7 @@ namespace facebook::torchcodec { namespace xpu { const char* USE_SYCL_KERNELS = std::getenv("USE_SYCL_KERNELS"); +const char* FORCE_CPU_FALLBACK = std::getenv("FORCE_CPU_FALLBACK"); static bool g_xpu = registerDeviceInterface( DeviceInterfaceKey(StableDeviceType::XPU), @@ -65,6 +66,13 @@ inline bool use_sycl_color_conversion_kernel() { #endif } +inline bool force_cpu_fallback() { + if (!FORCE_CPU_FALLBACK) { + return false; + } + return to_bool(FORCE_CPU_FALLBACK); +} + bool has_fp64(const StableDevice& device) { int deviceIndex = getDeviceIndex(device); sycl::device syclDevice = c10::xpu::get_raw_device(deviceIndex); @@ -127,36 +135,6 @@ torch::stable::Tensor allocateEmptyHWCTensor( device); } -// Self-contained SW NV12/YUV->RGB24 conversion for the CPU fallback path. -// Uses libswscale directly rather than delegating to CpuDeviceInterface: the -// relevant torchcodec symbols (CpuDeviceInterface ctor, createDeviceInterface) -// are not exported from the installed libtorchcodec_core6.so, so delegation is -// not linkable against the shipped wheel. -void convertSWFrameToRGB_sws( - AVFrame* avFrame, - torch::stable::Tensor& dstRGB_CPU) { - const int width = avFrame->width; - const int height = avFrame->height; - auto srcFormat = static_cast(avFrame->format); - - SwsContext* sws = sws_getContext(width, height, srcFormat, width, height, - AV_PIX_FMT_RGB24, SWS_BILINEAR, nullptr, nullptr, nullptr); - TORCH_CHECK( - sws != nullptr, "sws_getContext failed for ", av_get_pix_fmt_name(srcFormat), - " -> RGB24 at ", width, "x", - height); - - uint8_t* dstData[4] = {static_cast(dstRGB_CPU.mutable_data_ptr()), - nullptr, nullptr, nullptr}; - int dstLinesize[4] = {width * 3, 0, 0, 0}; - - int scaled = sws_scale(sws, avFrame->data, avFrame->linesize, 0, - height, dstData, dstLinesize); - sws_freeContext(sws); - TORCH_CHECK( - scaled == height, "sws_scale produced ", scaled, " lines, expected ", height); -} - } // namespace xpu int getDeviceIndex(const StableDevice& device) { @@ -184,11 +162,13 @@ XpuDeviceInterface::XpuDeviceInterface(const StableDevice& device) {1}, kStableUInt8, std::nullopt, StableDevice(device)); auto arch = xpu::getArchitecture(device); - // Checking for devices which don't have HW media engines so we can skip - // initialization of VAAPI context. - if (arch != sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc && + if (!xpu::force_cpu_fallback()) { + // Checking for devices which don't have HW media engines so we can skip + // initialization of VAAPI context. + if (arch != sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc && arch != sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc_vg) { ctx_ = xpu::getVaapiContext(device_); + } } if (xpu::use_sycl_color_conversion_kernel()) { @@ -216,6 +196,13 @@ void XpuDeviceInterface::initialize( TORCH_CHECK(avStream != nullptr, "avStream is null"); codecContext_ = codecContext; timeBase_ = avStream->time_base; + + cpuInterface_ = createDeviceInterface(kStableCPU); + STD_TORCH_CHECK( + cpuInterface_ != nullptr, "Failed to create CPU device interface"); + cpuInterface_->initialize(avStream, avFormatCtx, codecContext); + cpuInterface_->initializeVideo( + VideoStreamOptions(), {}, /*resizedOutputDims=*/std::nullopt); } void XpuDeviceInterface::initializeVideo( @@ -369,25 +356,17 @@ void XpuDeviceInterface::convertAVFrameToFrameOutput( // general or on this particular device. In this case we have a frame on the // CPU. We send the frame back to the XPU device when we're done. - // Self-contained SW->RGB conversion via libswscale. We do not delegate to - // CpuDeviceInterface because its constructor and the createDeviceInterface - // factory are not exported from the installed libtorchcodec_core wheel. - auto frameDims = FrameDims(avFrame->height, avFrame->width); - torch::stable::Tensor cpuRGB = torch::stable::empty( - {frameDims.height, frameDims.width, 3}, - kStableUInt8, - std::nullopt, - StableDevice(kStableCPU)); - xpu::convertSWFrameToRGB_sws(avFrame.get(), cpuRGB); + FrameOutput cpuFrameOutput; + cpuInterface_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput); // Finally, we need to send the frame back to the GPU. Note that the // pre-allocated tensor is on the GPU, so we can't send that to the CPU // device interface. We copy it over here. if (preAllocatedOutputTensor.has_value()) { - torch::stable::copy_(preAllocatedOutputTensor.value(), cpuRGB); + torch::stable::copy_(preAllocatedOutputTensor.value(), cpuFrameOutput.data); frameOutput.data = preAllocatedOutputTensor.value(); } else { - frameOutput.data = torch::stable::to(cpuRGB, device_); + frameOutput.data = torch::stable::to(cpuFrameOutput.data, device_); } return; } diff --git a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h index c6643c4..71b7566 100644 --- a/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h +++ b/packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.h @@ -39,6 +39,10 @@ class XpuDeviceInterface : public DeviceInterface { std::nullopt) override; private: + // We sometimes encounter frames that cannot be decoded on the XPU device. + // Rather than erroring out, we decode them on the CPU. + std::unique_ptr cpuInterface_; + VideoStreamOptions videoStreamOptions_; AVRational timeBase_; bool has_fp64_;