Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/torchcodec-xpu/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ The following environment variables can be used to customize the behavior of Int

* `USE_SYCL_KERNELS = on|off` (default: `off`) - use SYCL kernels for augmentation such as color space conversion instead of VAAPI interface. If SYCL kernels are requested but can not be used due to hardware limitations, then fallback to VAAPI will be attempted.

* `FORCE_CPU_FALLBACL=on|off` (default: `off`) - force CPU fallback.

## Known limitations

* [Intel® Data Center GPU Max Series][PVC] (Ponte Vecchio, PVC) GPUs are not supported due to missing hardware media engines
Expand Down
69 changes: 24 additions & 45 deletions packages/torchcodec-xpu/src/torchcodec_xpu/XpuDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ namespace facebook::torchcodec {
namespace xpu {

const char* USE_SYCL_KERNELS = std::getenv("USE_SYCL_KERNELS");
const char* FORCE_CPU_FALLBACK = std::getenv("FORCE_CPU_FALLBACK");

static bool g_xpu = registerDeviceInterface(
DeviceInterfaceKey(StableDeviceType::XPU),
Expand Down Expand Up @@ -65,6 +66,13 @@ inline bool use_sycl_color_conversion_kernel() {
#endif
}

inline bool force_cpu_fallback() {
if (!FORCE_CPU_FALLBACK) {
return false;
}
return to_bool(FORCE_CPU_FALLBACK);
}

bool has_fp64(const StableDevice& device) {
int deviceIndex = getDeviceIndex(device);
sycl::device syclDevice = c10::xpu::get_raw_device(deviceIndex);
Expand Down Expand Up @@ -127,36 +135,6 @@ torch::stable::Tensor allocateEmptyHWCTensor(
device);
}

// Self-contained SW NV12/YUV->RGB24 conversion for the CPU fallback path.
// Uses libswscale directly rather than delegating to CpuDeviceInterface: the
// relevant torchcodec symbols (CpuDeviceInterface ctor, createDeviceInterface)
// are not exported from the installed libtorchcodec_core6.so, so delegation is
// not linkable against the shipped wheel.
void convertSWFrameToRGB_sws(
AVFrame* avFrame,
torch::stable::Tensor& dstRGB_CPU) {
const int width = avFrame->width;
const int height = avFrame->height;
auto srcFormat = static_cast<AVPixelFormat>(avFrame->format);

SwsContext* sws = sws_getContext(width, height, srcFormat, width, height,
AV_PIX_FMT_RGB24, SWS_BILINEAR, nullptr, nullptr, nullptr);
TORCH_CHECK(
sws != nullptr, "sws_getContext failed for ", av_get_pix_fmt_name(srcFormat),
" -> RGB24 at ", width, "x",
height);

uint8_t* dstData[4] = {static_cast<uint8_t*>(dstRGB_CPU.mutable_data_ptr()),
nullptr, nullptr, nullptr};
int dstLinesize[4] = {width * 3, 0, 0, 0};

int scaled = sws_scale(sws, avFrame->data, avFrame->linesize, 0,
height, dstData, dstLinesize);
sws_freeContext(sws);
TORCH_CHECK(
scaled == height, "sws_scale produced ", scaled, " lines, expected ", height);
}

} // namespace xpu

int getDeviceIndex(const StableDevice& device) {
Expand Down Expand Up @@ -184,11 +162,13 @@ XpuDeviceInterface::XpuDeviceInterface(const StableDevice& device)
{1}, kStableUInt8, std::nullopt, StableDevice(device));

auto arch = xpu::getArchitecture(device);
// Checking for devices which don't have HW media engines so we can skip
// initialization of VAAPI context.
if (arch != sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc &&
if (!xpu::force_cpu_fallback()) {
// Checking for devices which don't have HW media engines so we can skip
// initialization of VAAPI context.
if (arch != sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc &&
arch != sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc_vg) {
ctx_ = xpu::getVaapiContext(device_);
}
}

if (xpu::use_sycl_color_conversion_kernel()) {
Expand Down Expand Up @@ -216,6 +196,13 @@ void XpuDeviceInterface::initialize(
TORCH_CHECK(avStream != nullptr, "avStream is null");
codecContext_ = codecContext;
timeBase_ = avStream->time_base;

cpuInterface_ = createDeviceInterface(kStableCPU);
STD_TORCH_CHECK(
cpuInterface_ != nullptr, "Failed to create CPU device interface");
cpuInterface_->initialize(avStream, avFormatCtx, codecContext);
cpuInterface_->initializeVideo(
VideoStreamOptions(), {}, /*resizedOutputDims=*/std::nullopt);
}

void XpuDeviceInterface::initializeVideo(
Expand Down Expand Up @@ -369,25 +356,17 @@ void XpuDeviceInterface::convertAVFrameToFrameOutput(
// general or on this particular device. In this case we have a frame on the
// CPU. We send the frame back to the XPU device when we're done.

// Self-contained SW->RGB conversion via libswscale. We do not delegate to
// CpuDeviceInterface because its constructor and the createDeviceInterface
// factory are not exported from the installed libtorchcodec_core wheel.
auto frameDims = FrameDims(avFrame->height, avFrame->width);
torch::stable::Tensor cpuRGB = torch::stable::empty(
{frameDims.height, frameDims.width, 3},
kStableUInt8,
std::nullopt,
StableDevice(kStableCPU));
xpu::convertSWFrameToRGB_sws(avFrame.get(), cpuRGB);
FrameOutput cpuFrameOutput;
cpuInterface_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);

// Finally, we need to send the frame back to the GPU. Note that the
// pre-allocated tensor is on the GPU, so we can't send that to the CPU
// device interface. We copy it over here.
if (preAllocatedOutputTensor.has_value()) {
torch::stable::copy_(preAllocatedOutputTensor.value(), cpuRGB);
torch::stable::copy_(preAllocatedOutputTensor.value(), cpuFrameOutput.data);
frameOutput.data = preAllocatedOutputTensor.value();
} else {
frameOutput.data = torch::stable::to(cpuRGB, device_);
frameOutput.data = torch::stable::to(cpuFrameOutput.data, device_);
}
return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ class XpuDeviceInterface : public DeviceInterface {
std::nullopt) override;

private:
// We sometimes encounter frames that cannot be decoded on the XPU device.
// Rather than erroring out, we decode them on the CPU.
std::unique_ptr<DeviceInterface> cpuInterface_;

VideoStreamOptions videoStreamOptions_;
AVRational timeBase_;
bool has_fp64_;
Expand Down
Loading