From ee8946b65357d58bf7d96c56e4882cdf9a10c1ef Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:22:42 +0000 Subject: [PATCH 01/54] feat: add playback pipeline benchmark for measuring per-stage performance Creates a comprehensive benchmark (playback-pipeline-benchmark) that measures each stage of the playback pipeline independently: - Decode-only performance (frame retrieval latency) - Full pipeline (decode + GPU render + readback) at multiple resolutions - Scrubbing performance (random-access frame rendering) Reports min/avg/p50/p95/p99/max statistics for each stage, effective FPS, and frame budget utilization analysis. Usage: cargo run -p cap-editor --example playback-pipeline-benchmark -- \ --recording-path /path/to/recording [--fps 60] [--frames 300] Co-authored-by: Richie McIlroy --- Cargo.lock | 1 + crates/editor/Cargo.toml | 5 + .../examples/playback-pipeline-benchmark.rs | 690 ++++++++++++++++++ 3 files changed, 696 insertions(+) create mode 100644 crates/editor/examples/playback-pipeline-benchmark.rs diff --git a/Cargo.lock b/Cargo.lock index c4f8665e90a..2107d7eae85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1315,6 +1315,7 @@ dependencies = [ "tokio", "tokio-util", "tracing", + "tracing-subscriber", "workspace-hack", ] diff --git a/crates/editor/Cargo.toml b/crates/editor/Cargo.toml index c612d1e33fe..1d52c776f89 100644 --- a/crates/editor/Cargo.toml +++ b/crates/editor/Cargo.toml @@ -10,6 +10,10 @@ workspace = true name = "decode-benchmark" path = "examples/decode-benchmark.rs" +[[example]] +name = "playback-pipeline-benchmark" +path = "examples/playback-pipeline-benchmark.rs" + [dependencies] cap-media = { path = "../media" } cap-project = { path = "../project" } @@ -30,4 +34,5 @@ flume.workspace = true tokio-util = "0.7.15" ringbuf = "0.4.8" lru = "0.12" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git 
a/crates/editor/examples/playback-pipeline-benchmark.rs b/crates/editor/examples/playback-pipeline-benchmark.rs new file mode 100644 index 00000000000..c1524f5faa4 --- /dev/null +++ b/crates/editor/examples/playback-pipeline-benchmark.rs @@ -0,0 +1,690 @@ +use cap_project::{ + ProjectConfiguration, RecordingMeta, RecordingMetaInner, StudioRecordingMeta, + TimelineConfiguration, TimelineSegment, XY, +}; +use cap_rendering::{ + FrameRenderer, ProjectRecordingsMeta, ProjectUniforms, RenderVideoConstants, RendererLayers, + ZoomFocusInterpolator, decoder::spawn_decoder, + spring_mass_damper::SpringMassDamperSimulationConfig, +}; +use std::{ + path::{Path, PathBuf}, + sync::Arc, + time::Instant, +}; + +fn percentile(data: &[f64], p: f64) -> f64 { + if data.is_empty() { + return 0.0; + } + let mut sorted: Vec = data.iter().copied().filter(|x| !x.is_nan()).collect(); + if sorted.is_empty() { + return 0.0; + } + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let idx = ((p / 100.0) * (sorted.len() - 1) as f64).round() as usize; + sorted[idx.min(sorted.len() - 1)] +} + +fn print_stats(label: &str, times_ms: &[f64]) { + if times_ms.is_empty() { + println!(" {label}: no data"); + return; + } + let avg = times_ms.iter().sum::() / times_ms.len() as f64; + let min = times_ms.iter().copied().fold(f64::INFINITY, f64::min); + let max = times_ms.iter().copied().fold(f64::NEG_INFINITY, f64::max); + let p50 = percentile(times_ms, 50.0); + let p95 = percentile(times_ms, 95.0); + let p99 = percentile(times_ms, 99.0); + + println!(" {label}:"); + println!(" avg={avg:.2}ms min={min:.2}ms max={max:.2}ms"); + println!(" p50={p50:.2}ms p95={p95:.2}ms p99={p99:.2}ms"); +} + +#[derive(Default)] +struct PipelineTimings { + decode_ms: Vec, + render_ms: Vec, + total_ms: Vec, + decode_failures: usize, + render_failures: usize, + frames_rendered: usize, +} + +impl PipelineTimings { + fn print_report(&self, label: &str) { + println!("\n{}", "=".repeat(60)); + println!(" {label}"); + 
println!("{}", "=".repeat(60)); + + println!(" Frames rendered: {}", self.frames_rendered); + if self.decode_failures > 0 { + println!(" Decode failures: {}", self.decode_failures); + } + if self.render_failures > 0 { + println!(" Render failures: {}", self.render_failures); + } + + if !self.total_ms.is_empty() { + let total_time: f64 = self.total_ms.iter().sum(); + let effective_fps = self.frames_rendered as f64 / (total_time / 1000.0); + println!(" Effective FPS: {effective_fps:.1}"); + println!(" Total time: {total_time:.0}ms for {} frames", self.frames_rendered); + } + + println!(); + print_stats("Decode", &self.decode_ms); + print_stats("GPU Render+Readback", &self.render_ms); + print_stats("Total (decode+render)", &self.total_ms); + } +} + +async fn load_recording( + recording_path: &Path, +) -> Result< + ( + RecordingMeta, + Box, + ProjectConfiguration, + Arc, + ), + String, +> { + let recording_meta = RecordingMeta::load_for_project(recording_path) + .map_err(|e| format!("Failed to load recording meta: {e}"))?; + + let RecordingMetaInner::Studio(meta) = &recording_meta.inner else { + return Err("Not a studio recording".to_string()); + }; + let meta = meta.clone(); + + let mut project = recording_meta.project_config(); + + if project.timeline.is_none() { + let timeline_segments = match meta.as_ref() { + StudioRecordingMeta::SingleSegment { segment } => { + let display_path = recording_meta.path(&segment.display.path); + let duration = + match cap_rendering::Video::new(&display_path, 0.0) { + Ok(v) => v.duration, + Err(_) => 5.0, + }; + vec![TimelineSegment { + recording_clip: 0, + start: 0.0, + end: duration, + timescale: 1.0, + }] + } + StudioRecordingMeta::MultipleSegments { inner } => inner + .segments + .iter() + .enumerate() + .filter_map(|(i, segment)| { + let display_path = recording_meta.path(&segment.display.path); + let duration = match cap_rendering::Video::new(&display_path, 0.0) { + Ok(v) => v.duration, + Err(_) => 5.0, + }; + if duration <= 0.0 
{ + return None; + } + Some(TimelineSegment { + recording_clip: i as u32, + start: 0.0, + end: duration, + timescale: 1.0, + }) + }) + .collect(), + }; + + if !timeline_segments.is_empty() { + project.timeline = Some(TimelineConfiguration { + segments: timeline_segments, + zoom_segments: Vec::new(), + scene_segments: Vec::new(), + mask_segments: Vec::new(), + text_segments: Vec::new(), + }); + } + } + + let recordings = Arc::new( + ProjectRecordingsMeta::new(&recording_meta.project_path, meta.as_ref()) + .map_err(|e| format!("Failed to create recordings meta: {e}"))?, + ); + + Ok((recording_meta, meta, project, recordings)) +} + +async fn run_decode_only_benchmark( + recording_meta: &RecordingMeta, + meta: &StudioRecordingMeta, + project: &ProjectConfiguration, + fps: u32, + frame_count: usize, +) -> PipelineTimings { + let mut timings = PipelineTimings::default(); + + let display_path = match meta { + StudioRecordingMeta::SingleSegment { segment } => { + recording_meta.path(&segment.display.path) + } + StudioRecordingMeta::MultipleSegments { inner } => { + recording_meta.path(&inner.segments[0].display.path) + } + }; + + let display_fps = match meta { + StudioRecordingMeta::SingleSegment { segment } => segment.display.fps, + StudioRecordingMeta::MultipleSegments { inner } => inner.segments[0].display.fps, + }; + + let decoder = match spawn_decoder("benchmark-screen", display_path, display_fps, 0.0, false).await { + Ok(d) => d, + Err(e) => { + eprintln!("Failed to create decoder: {e}"); + return timings; + } + }; + + println!(" Decoder type: {}", decoder.decoder_type()); + println!( + " Hardware accelerated: {}", + decoder.is_hardware_accelerated() + ); + let (w, h) = decoder.video_dimensions(); + println!(" Video dimensions: {w}x{h}"); + + let duration = project + .timeline + .as_ref() + .map(|t| t.duration()) + .unwrap_or(10.0); + let max_frames = ((duration * fps as f64).ceil() as usize).min(frame_count); + + println!(" Decoding {max_frames} frames at 
{fps}fps..."); + + for i in 0..max_frames { + let time = i as f32 / fps as f32; + let start = Instant::now(); + match decoder.get_frame(time).await { + Some(_frame) => { + let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0; + timings.decode_ms.push(elapsed_ms); + timings.total_ms.push(elapsed_ms); + timings.frames_rendered += 1; + } + None => { + timings.decode_failures += 1; + } + } + } + + timings +} + +async fn run_full_pipeline_benchmark( + recording_meta: &RecordingMeta, + meta: &StudioRecordingMeta, + project: &ProjectConfiguration, + recordings: &ProjectRecordingsMeta, + fps: u32, + frame_count: usize, + resolution_base: XY, +) -> PipelineTimings { + let mut timings = PipelineTimings::default(); + + let render_constants = match RenderVideoConstants::new( + &recordings.segments, + recording_meta.clone(), + (*meta).clone(), + ) + .await + { + Ok(rc) => Arc::new(rc), + Err(e) => { + eprintln!("Failed to create render constants: {e}"); + return timings; + } + }; + + println!( + " GPU adapter: {} (software={})", + render_constants._adapter.get_info().name, + render_constants.is_software_adapter + ); + + let segments = + match cap_editor::create_segments(recording_meta, meta, false).await { + Ok(s) => s, + Err(e) => { + eprintln!("Failed to create segments: {e}"); + return timings; + } + }; + + if segments.is_empty() { + eprintln!("No segments found"); + return timings; + } + + let mut frame_renderer = FrameRenderer::new(&render_constants); + let mut layers = RendererLayers::new_with_options( + &render_constants.device, + &render_constants.queue, + render_constants.is_software_adapter, + ); + + let first_segment = &segments[0]; + let (screen_w, screen_h) = first_segment.decoders.screen_video_dimensions(); + let camera_dims = first_segment.decoders.camera_video_dimensions(); + layers.prepare_for_video_dimensions( + &render_constants.device, + screen_w, + screen_h, + camera_dims.map(|(w, _)| w), + camera_dims.map(|(_, h)| h), + ); + + let duration = project + 
.timeline + .as_ref() + .map(|t| t.duration()) + .unwrap_or(10.0); + let max_frames = ((duration * fps as f64).ceil() as usize).min(frame_count); + + println!(" Rendering {max_frames} frames at {fps}fps, resolution base: {}x{}...", resolution_base.x, resolution_base.y); + + let cursor_smoothing = + (!project.cursor.raw).then_some(SpringMassDamperSimulationConfig { + tension: project.cursor.tension, + mass: project.cursor.mass, + friction: project.cursor.friction, + }); + + for i in 0..max_frames { + let frame_time = i as f64 / fps as f64; + + let Some((segment_time, segment)) = project.get_segment_time(frame_time) else { + break; + }; + + let segment_media = match segments.get(segment.recording_clip as usize) { + Some(sm) => sm, + None => { + timings.decode_failures += 1; + continue; + } + }; + + let clip_offsets = project + .clips + .iter() + .find(|v| v.index == segment.recording_clip) + .map(|v| v.offsets) + .unwrap_or_default(); + + let decode_start = Instant::now(); + let segment_frames_opt = if i == 0 { + segment_media + .decoders + .get_frames_initial( + segment_time as f32, + !project.camera.hide, + clip_offsets, + ) + .await + } else { + segment_media + .decoders + .get_frames(segment_time as f32, !project.camera.hide, clip_offsets) + .await + }; + let decode_elapsed_ms = decode_start.elapsed().as_secs_f64() * 1000.0; + + let Some(segment_frames) = segment_frames_opt else { + timings.decode_failures += 1; + continue; + }; + + timings.decode_ms.push(decode_elapsed_ms); + + let zoom_focus_interpolator = ZoomFocusInterpolator::new( + &segment_media.cursor, + cursor_smoothing, + project.screen_movement_spring, + duration, + ); + + let uniforms = ProjectUniforms::new( + &render_constants, + project, + i as u32, + fps, + resolution_base, + &segment_media.cursor, + &segment_frames, + duration, + &zoom_focus_interpolator, + ); + + let render_start = Instant::now(); + match frame_renderer + .render( + segment_frames, + uniforms, + &segment_media.cursor, + &mut 
layers, + ) + .await + { + Ok(_frame) => { + let render_elapsed_ms = render_start.elapsed().as_secs_f64() * 1000.0; + timings.render_ms.push(render_elapsed_ms); + timings.total_ms.push(decode_elapsed_ms + render_elapsed_ms); + timings.frames_rendered += 1; + } + Err(e) => { + timings.render_failures += 1; + if timings.render_failures <= 3 { + eprintln!(" Render failed at frame {i}: {e}"); + } + } + } + + if (i + 1) % 100 == 0 { + println!(" Progress: {}/{max_frames} frames", i + 1); + } + } + + timings +} + +async fn run_scrubbing_benchmark( + recording_meta: &RecordingMeta, + meta: &StudioRecordingMeta, + project: &ProjectConfiguration, + recordings: &ProjectRecordingsMeta, + fps: u32, + resolution_base: XY, +) -> PipelineTimings { + let mut timings = PipelineTimings::default(); + + let render_constants = match RenderVideoConstants::new( + &recordings.segments, + recording_meta.clone(), + (*meta).clone(), + ) + .await + { + Ok(rc) => Arc::new(rc), + Err(e) => { + eprintln!("Failed to create render constants: {e}"); + return timings; + } + }; + + let segments = + match cap_editor::create_segments(recording_meta, meta, false).await { + Ok(s) => s, + Err(e) => { + eprintln!("Failed to create segments: {e}"); + return timings; + } + }; + + if segments.is_empty() { + eprintln!("No segments found"); + return timings; + } + + let mut frame_renderer = FrameRenderer::new(&render_constants); + let mut layers = RendererLayers::new_with_options( + &render_constants.device, + &render_constants.queue, + render_constants.is_software_adapter, + ); + + let first_segment = &segments[0]; + let (screen_w, screen_h) = first_segment.decoders.screen_video_dimensions(); + let camera_dims = first_segment.decoders.camera_video_dimensions(); + layers.prepare_for_video_dimensions( + &render_constants.device, + screen_w, + screen_h, + camera_dims.map(|(w, _)| w), + camera_dims.map(|(_, h)| h), + ); + + let duration = project + .timeline + .as_ref() + .map(|t| t.duration()) + .unwrap_or(10.0); 
+ + let cursor_smoothing = + (!project.cursor.raw).then_some(SpringMassDamperSimulationConfig { + tension: project.cursor.tension, + mass: project.cursor.mass, + friction: project.cursor.friction, + }); + + let scrub_positions: Vec = { + let golden_ratio = 1.618_034; + let mut positions = Vec::with_capacity(50); + let mut pos = 0.0; + for _ in 0..50 { + pos = (pos + golden_ratio * duration) % duration; + positions.push(pos); + } + positions + }; + + println!(" Scrubbing to {} random positions...", scrub_positions.len()); + + for (i, &scrub_time) in scrub_positions.iter().enumerate() { + let Some((segment_time, segment)) = project.get_segment_time(scrub_time) else { + continue; + }; + + let segment_media = match segments.get(segment.recording_clip as usize) { + Some(sm) => sm, + None => continue, + }; + + let clip_offsets = project + .clips + .iter() + .find(|v| v.index == segment.recording_clip) + .map(|v| v.offsets) + .unwrap_or_default(); + + let decode_start = Instant::now(); + let segment_frames_opt = segment_media + .decoders + .get_frames_initial(segment_time as f32, !project.camera.hide, clip_offsets) + .await; + let decode_elapsed_ms = decode_start.elapsed().as_secs_f64() * 1000.0; + + let Some(segment_frames) = segment_frames_opt else { + timings.decode_failures += 1; + continue; + }; + + timings.decode_ms.push(decode_elapsed_ms); + + let frame_number = (scrub_time * fps as f64).round() as u32; + + let zoom_focus_interpolator = ZoomFocusInterpolator::new( + &segment_media.cursor, + cursor_smoothing, + project.screen_movement_spring, + duration, + ); + + let uniforms = ProjectUniforms::new( + &render_constants, + project, + frame_number, + fps, + resolution_base, + &segment_media.cursor, + &segment_frames, + duration, + &zoom_focus_interpolator, + ); + + let render_start = Instant::now(); + match frame_renderer + .render( + segment_frames, + uniforms, + &segment_media.cursor, + &mut layers, + ) + .await + { + Ok(_frame) => { + let render_elapsed_ms = 
render_start.elapsed().as_secs_f64() * 1000.0; + timings.render_ms.push(render_elapsed_ms); + timings.total_ms.push(decode_elapsed_ms + render_elapsed_ms); + timings.frames_rendered += 1; + } + Err(e) => { + timings.render_failures += 1; + if timings.render_failures <= 3 { + eprintln!(" Render failed at scrub {i}: {e}"); + } + } + } + } + + timings +} + +#[tokio::main] +async fn main() { + tracing_subscriber::fmt() + .with_env_filter( + tracing_subscriber::EnvFilter::from_default_env() + .add_directive(tracing::Level::WARN.into()), + ) + .init(); + + ffmpeg::init().expect("Failed to initialize FFmpeg"); + + let args: Vec = std::env::args().collect(); + + let recording_path = args + .iter() + .position(|a| a == "--recording-path") + .and_then(|i| args.get(i + 1)) + .map(PathBuf::from) + .expect("Usage: playback-pipeline-benchmark --recording-path [--fps ] [--frames ]"); + + let fps = args + .iter() + .position(|a| a == "--fps") + .and_then(|i| args.get(i + 1)) + .and_then(|s| s.parse().ok()) + .unwrap_or(60); + + let frame_count = args + .iter() + .position(|a| a == "--frames") + .and_then(|i| args.get(i + 1)) + .and_then(|s| s.parse().ok()) + .unwrap_or(300); + + println!("{}", "=".repeat(60)); + println!(" CAP PLAYBACK PIPELINE BENCHMARK"); + println!("{}", "=".repeat(60)); + println!(); + println!("Recording: {}", recording_path.display()); + println!("Target FPS: {fps}"); + println!("Max frames: {frame_count}"); + println!(); + + let (recording_meta, meta, project, recordings) = match load_recording(&recording_path).await { + Ok(r) => r, + Err(e) => { + eprintln!("Failed to load recording: {e}"); + std::process::exit(1); + } + }; + + let duration = project + .timeline + .as_ref() + .map(|t| t.duration()) + .unwrap_or(0.0); + println!("Recording duration: {duration:.2}s"); + + let resolutions = [ + (XY::new(1920, 1080), "Full (1920x1080)"), + (XY::new(1248, 702), "Half (1248x702)"), + (XY::new(480, 270), "Quarter (480x270)"), + ]; + + println!("\n--- DECODE-ONLY 
BENCHMARK ---"); + let decode_timings = run_decode_only_benchmark( + &recording_meta, + meta.as_ref(), + &project, + fps, + frame_count, + ) + .await; + decode_timings.print_report("DECODE-ONLY"); + + for (resolution_base, label) in &resolutions { + println!("\n--- FULL PIPELINE: {label} ---"); + let pipeline_timings = run_full_pipeline_benchmark( + &recording_meta, + meta.as_ref(), + &project, + &recordings, + fps, + frame_count, + *resolution_base, + ) + .await; + pipeline_timings.print_report(&format!("FULL PIPELINE - {label}")); + } + + println!("\n--- SCRUBBING BENCHMARK (Half resolution) ---"); + let scrub_timings = run_scrubbing_benchmark( + &recording_meta, + meta.as_ref(), + &project, + &recordings, + fps, + XY::new(1248, 702), + ) + .await; + scrub_timings.print_report("SCRUBBING (Half resolution)"); + + println!("\n{}", "=".repeat(60)); + println!(" BENCHMARK COMPLETE"); + println!("{}", "=".repeat(60)); + + let target_frame_time_ms = 1000.0 / fps as f64; + println!("\nTarget frame time at {fps}fps: {target_frame_time_ms:.2}ms"); + + if !decode_timings.decode_ms.is_empty() { + let decode_p95 = percentile(&decode_timings.decode_ms, 95.0); + let decode_budget_pct = decode_p95 / target_frame_time_ms * 100.0; + println!( + "Decode p95 ({decode_p95:.2}ms) uses {decode_budget_pct:.0}% of frame budget" + ); + } + + for (_resolution_base, label) in &resolutions { + println!("\n{label}:"); + println!(" Frame budget: {target_frame_time_ms:.2}ms"); + } +} From e782d12d0a13371c6dbe33b6568800e0731d3140 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:22:42 +0000 Subject: [PATCH 02/54] feat: add playback pipeline benchmark for measuring per-stage performance Creates a comprehensive benchmark (playback-pipeline-benchmark) that measures each stage of the playback pipeline independently: - Decode-only performance (frame retrieval latency) - Full pipeline (decode + GPU render + readback) at multiple 
resolutions - Scrubbing performance (random-access frame rendering) Reports min/avg/p50/p95/p99/max statistics for each stage, effective FPS, and frame budget utilization analysis. Usage: cargo run -p cap-editor --example playback-pipeline-benchmark -- \ --recording-path /path/to/recording [--fps 60] [--frames 300] --- Cargo.lock | 1 + crates/editor/Cargo.toml | 5 + .../examples/playback-pipeline-benchmark.rs | 690 ++++++++++++++++++ 3 files changed, 696 insertions(+) create mode 100644 crates/editor/examples/playback-pipeline-benchmark.rs diff --git a/Cargo.lock b/Cargo.lock index c4f8665e90a..2107d7eae85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1315,6 +1315,7 @@ dependencies = [ "tokio", "tokio-util", "tracing", + "tracing-subscriber", "workspace-hack", ] diff --git a/crates/editor/Cargo.toml b/crates/editor/Cargo.toml index c612d1e33fe..1d52c776f89 100644 --- a/crates/editor/Cargo.toml +++ b/crates/editor/Cargo.toml @@ -10,6 +10,10 @@ workspace = true name = "decode-benchmark" path = "examples/decode-benchmark.rs" +[[example]] +name = "playback-pipeline-benchmark" +path = "examples/playback-pipeline-benchmark.rs" + [dependencies] cap-media = { path = "../media" } cap-project = { path = "../project" } @@ -30,4 +34,5 @@ flume.workspace = true tokio-util = "0.7.15" ringbuf = "0.4.8" lru = "0.12" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/crates/editor/examples/playback-pipeline-benchmark.rs b/crates/editor/examples/playback-pipeline-benchmark.rs new file mode 100644 index 00000000000..c1524f5faa4 --- /dev/null +++ b/crates/editor/examples/playback-pipeline-benchmark.rs @@ -0,0 +1,690 @@ +use cap_project::{ + ProjectConfiguration, RecordingMeta, RecordingMetaInner, StudioRecordingMeta, + TimelineConfiguration, TimelineSegment, XY, +}; +use cap_rendering::{ + FrameRenderer, ProjectRecordingsMeta, ProjectUniforms, RenderVideoConstants, RendererLayers, + 
ZoomFocusInterpolator, decoder::spawn_decoder, + spring_mass_damper::SpringMassDamperSimulationConfig, +}; +use std::{ + path::{Path, PathBuf}, + sync::Arc, + time::Instant, +}; + +fn percentile(data: &[f64], p: f64) -> f64 { + if data.is_empty() { + return 0.0; + } + let mut sorted: Vec = data.iter().copied().filter(|x| !x.is_nan()).collect(); + if sorted.is_empty() { + return 0.0; + } + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let idx = ((p / 100.0) * (sorted.len() - 1) as f64).round() as usize; + sorted[idx.min(sorted.len() - 1)] +} + +fn print_stats(label: &str, times_ms: &[f64]) { + if times_ms.is_empty() { + println!(" {label}: no data"); + return; + } + let avg = times_ms.iter().sum::() / times_ms.len() as f64; + let min = times_ms.iter().copied().fold(f64::INFINITY, f64::min); + let max = times_ms.iter().copied().fold(f64::NEG_INFINITY, f64::max); + let p50 = percentile(times_ms, 50.0); + let p95 = percentile(times_ms, 95.0); + let p99 = percentile(times_ms, 99.0); + + println!(" {label}:"); + println!(" avg={avg:.2}ms min={min:.2}ms max={max:.2}ms"); + println!(" p50={p50:.2}ms p95={p95:.2}ms p99={p99:.2}ms"); +} + +#[derive(Default)] +struct PipelineTimings { + decode_ms: Vec, + render_ms: Vec, + total_ms: Vec, + decode_failures: usize, + render_failures: usize, + frames_rendered: usize, +} + +impl PipelineTimings { + fn print_report(&self, label: &str) { + println!("\n{}", "=".repeat(60)); + println!(" {label}"); + println!("{}", "=".repeat(60)); + + println!(" Frames rendered: {}", self.frames_rendered); + if self.decode_failures > 0 { + println!(" Decode failures: {}", self.decode_failures); + } + if self.render_failures > 0 { + println!(" Render failures: {}", self.render_failures); + } + + if !self.total_ms.is_empty() { + let total_time: f64 = self.total_ms.iter().sum(); + let effective_fps = self.frames_rendered as f64 / (total_time / 1000.0); + println!(" Effective FPS: {effective_fps:.1}"); + println!(" Total time: {total_time:.0}ms for 
{} frames", self.frames_rendered); + } + + println!(); + print_stats("Decode", &self.decode_ms); + print_stats("GPU Render+Readback", &self.render_ms); + print_stats("Total (decode+render)", &self.total_ms); + } +} + +async fn load_recording( + recording_path: &Path, +) -> Result< + ( + RecordingMeta, + Box, + ProjectConfiguration, + Arc, + ), + String, +> { + let recording_meta = RecordingMeta::load_for_project(recording_path) + .map_err(|e| format!("Failed to load recording meta: {e}"))?; + + let RecordingMetaInner::Studio(meta) = &recording_meta.inner else { + return Err("Not a studio recording".to_string()); + }; + let meta = meta.clone(); + + let mut project = recording_meta.project_config(); + + if project.timeline.is_none() { + let timeline_segments = match meta.as_ref() { + StudioRecordingMeta::SingleSegment { segment } => { + let display_path = recording_meta.path(&segment.display.path); + let duration = + match cap_rendering::Video::new(&display_path, 0.0) { + Ok(v) => v.duration, + Err(_) => 5.0, + }; + vec![TimelineSegment { + recording_clip: 0, + start: 0.0, + end: duration, + timescale: 1.0, + }] + } + StudioRecordingMeta::MultipleSegments { inner } => inner + .segments + .iter() + .enumerate() + .filter_map(|(i, segment)| { + let display_path = recording_meta.path(&segment.display.path); + let duration = match cap_rendering::Video::new(&display_path, 0.0) { + Ok(v) => v.duration, + Err(_) => 5.0, + }; + if duration <= 0.0 { + return None; + } + Some(TimelineSegment { + recording_clip: i as u32, + start: 0.0, + end: duration, + timescale: 1.0, + }) + }) + .collect(), + }; + + if !timeline_segments.is_empty() { + project.timeline = Some(TimelineConfiguration { + segments: timeline_segments, + zoom_segments: Vec::new(), + scene_segments: Vec::new(), + mask_segments: Vec::new(), + text_segments: Vec::new(), + }); + } + } + + let recordings = Arc::new( + ProjectRecordingsMeta::new(&recording_meta.project_path, meta.as_ref()) + .map_err(|e| format!("Failed 
to create recordings meta: {e}"))?, + ); + + Ok((recording_meta, meta, project, recordings)) +} + +async fn run_decode_only_benchmark( + recording_meta: &RecordingMeta, + meta: &StudioRecordingMeta, + project: &ProjectConfiguration, + fps: u32, + frame_count: usize, +) -> PipelineTimings { + let mut timings = PipelineTimings::default(); + + let display_path = match meta { + StudioRecordingMeta::SingleSegment { segment } => { + recording_meta.path(&segment.display.path) + } + StudioRecordingMeta::MultipleSegments { inner } => { + recording_meta.path(&inner.segments[0].display.path) + } + }; + + let display_fps = match meta { + StudioRecordingMeta::SingleSegment { segment } => segment.display.fps, + StudioRecordingMeta::MultipleSegments { inner } => inner.segments[0].display.fps, + }; + + let decoder = match spawn_decoder("benchmark-screen", display_path, display_fps, 0.0, false).await { + Ok(d) => d, + Err(e) => { + eprintln!("Failed to create decoder: {e}"); + return timings; + } + }; + + println!(" Decoder type: {}", decoder.decoder_type()); + println!( + " Hardware accelerated: {}", + decoder.is_hardware_accelerated() + ); + let (w, h) = decoder.video_dimensions(); + println!(" Video dimensions: {w}x{h}"); + + let duration = project + .timeline + .as_ref() + .map(|t| t.duration()) + .unwrap_or(10.0); + let max_frames = ((duration * fps as f64).ceil() as usize).min(frame_count); + + println!(" Decoding {max_frames} frames at {fps}fps..."); + + for i in 0..max_frames { + let time = i as f32 / fps as f32; + let start = Instant::now(); + match decoder.get_frame(time).await { + Some(_frame) => { + let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0; + timings.decode_ms.push(elapsed_ms); + timings.total_ms.push(elapsed_ms); + timings.frames_rendered += 1; + } + None => { + timings.decode_failures += 1; + } + } + } + + timings +} + +async fn run_full_pipeline_benchmark( + recording_meta: &RecordingMeta, + meta: &StudioRecordingMeta, + project: &ProjectConfiguration, 
+ recordings: &ProjectRecordingsMeta, + fps: u32, + frame_count: usize, + resolution_base: XY, +) -> PipelineTimings { + let mut timings = PipelineTimings::default(); + + let render_constants = match RenderVideoConstants::new( + &recordings.segments, + recording_meta.clone(), + (*meta).clone(), + ) + .await + { + Ok(rc) => Arc::new(rc), + Err(e) => { + eprintln!("Failed to create render constants: {e}"); + return timings; + } + }; + + println!( + " GPU adapter: {} (software={})", + render_constants._adapter.get_info().name, + render_constants.is_software_adapter + ); + + let segments = + match cap_editor::create_segments(recording_meta, meta, false).await { + Ok(s) => s, + Err(e) => { + eprintln!("Failed to create segments: {e}"); + return timings; + } + }; + + if segments.is_empty() { + eprintln!("No segments found"); + return timings; + } + + let mut frame_renderer = FrameRenderer::new(&render_constants); + let mut layers = RendererLayers::new_with_options( + &render_constants.device, + &render_constants.queue, + render_constants.is_software_adapter, + ); + + let first_segment = &segments[0]; + let (screen_w, screen_h) = first_segment.decoders.screen_video_dimensions(); + let camera_dims = first_segment.decoders.camera_video_dimensions(); + layers.prepare_for_video_dimensions( + &render_constants.device, + screen_w, + screen_h, + camera_dims.map(|(w, _)| w), + camera_dims.map(|(_, h)| h), + ); + + let duration = project + .timeline + .as_ref() + .map(|t| t.duration()) + .unwrap_or(10.0); + let max_frames = ((duration * fps as f64).ceil() as usize).min(frame_count); + + println!(" Rendering {max_frames} frames at {fps}fps, resolution base: {}x{}...", resolution_base.x, resolution_base.y); + + let cursor_smoothing = + (!project.cursor.raw).then_some(SpringMassDamperSimulationConfig { + tension: project.cursor.tension, + mass: project.cursor.mass, + friction: project.cursor.friction, + }); + + for i in 0..max_frames { + let frame_time = i as f64 / fps as f64; + + 
let Some((segment_time, segment)) = project.get_segment_time(frame_time) else { + break; + }; + + let segment_media = match segments.get(segment.recording_clip as usize) { + Some(sm) => sm, + None => { + timings.decode_failures += 1; + continue; + } + }; + + let clip_offsets = project + .clips + .iter() + .find(|v| v.index == segment.recording_clip) + .map(|v| v.offsets) + .unwrap_or_default(); + + let decode_start = Instant::now(); + let segment_frames_opt = if i == 0 { + segment_media + .decoders + .get_frames_initial( + segment_time as f32, + !project.camera.hide, + clip_offsets, + ) + .await + } else { + segment_media + .decoders + .get_frames(segment_time as f32, !project.camera.hide, clip_offsets) + .await + }; + let decode_elapsed_ms = decode_start.elapsed().as_secs_f64() * 1000.0; + + let Some(segment_frames) = segment_frames_opt else { + timings.decode_failures += 1; + continue; + }; + + timings.decode_ms.push(decode_elapsed_ms); + + let zoom_focus_interpolator = ZoomFocusInterpolator::new( + &segment_media.cursor, + cursor_smoothing, + project.screen_movement_spring, + duration, + ); + + let uniforms = ProjectUniforms::new( + &render_constants, + project, + i as u32, + fps, + resolution_base, + &segment_media.cursor, + &segment_frames, + duration, + &zoom_focus_interpolator, + ); + + let render_start = Instant::now(); + match frame_renderer + .render( + segment_frames, + uniforms, + &segment_media.cursor, + &mut layers, + ) + .await + { + Ok(_frame) => { + let render_elapsed_ms = render_start.elapsed().as_secs_f64() * 1000.0; + timings.render_ms.push(render_elapsed_ms); + timings.total_ms.push(decode_elapsed_ms + render_elapsed_ms); + timings.frames_rendered += 1; + } + Err(e) => { + timings.render_failures += 1; + if timings.render_failures <= 3 { + eprintln!(" Render failed at frame {i}: {e}"); + } + } + } + + if (i + 1) % 100 == 0 { + println!(" Progress: {}/{max_frames} frames", i + 1); + } + } + + timings +} + +async fn run_scrubbing_benchmark( + 
recording_meta: &RecordingMeta, + meta: &StudioRecordingMeta, + project: &ProjectConfiguration, + recordings: &ProjectRecordingsMeta, + fps: u32, + resolution_base: XY, +) -> PipelineTimings { + let mut timings = PipelineTimings::default(); + + let render_constants = match RenderVideoConstants::new( + &recordings.segments, + recording_meta.clone(), + (*meta).clone(), + ) + .await + { + Ok(rc) => Arc::new(rc), + Err(e) => { + eprintln!("Failed to create render constants: {e}"); + return timings; + } + }; + + let segments = + match cap_editor::create_segments(recording_meta, meta, false).await { + Ok(s) => s, + Err(e) => { + eprintln!("Failed to create segments: {e}"); + return timings; + } + }; + + if segments.is_empty() { + eprintln!("No segments found"); + return timings; + } + + let mut frame_renderer = FrameRenderer::new(&render_constants); + let mut layers = RendererLayers::new_with_options( + &render_constants.device, + &render_constants.queue, + render_constants.is_software_adapter, + ); + + let first_segment = &segments[0]; + let (screen_w, screen_h) = first_segment.decoders.screen_video_dimensions(); + let camera_dims = first_segment.decoders.camera_video_dimensions(); + layers.prepare_for_video_dimensions( + &render_constants.device, + screen_w, + screen_h, + camera_dims.map(|(w, _)| w), + camera_dims.map(|(_, h)| h), + ); + + let duration = project + .timeline + .as_ref() + .map(|t| t.duration()) + .unwrap_or(10.0); + + let cursor_smoothing = + (!project.cursor.raw).then_some(SpringMassDamperSimulationConfig { + tension: project.cursor.tension, + mass: project.cursor.mass, + friction: project.cursor.friction, + }); + + let scrub_positions: Vec = { + let golden_ratio = 1.618_034; + let mut positions = Vec::with_capacity(50); + let mut pos = 0.0; + for _ in 0..50 { + pos = (pos + golden_ratio * duration) % duration; + positions.push(pos); + } + positions + }; + + println!(" Scrubbing to {} random positions...", scrub_positions.len()); + + for (i, 
&scrub_time) in scrub_positions.iter().enumerate() { + let Some((segment_time, segment)) = project.get_segment_time(scrub_time) else { + continue; + }; + + let segment_media = match segments.get(segment.recording_clip as usize) { + Some(sm) => sm, + None => continue, + }; + + let clip_offsets = project + .clips + .iter() + .find(|v| v.index == segment.recording_clip) + .map(|v| v.offsets) + .unwrap_or_default(); + + let decode_start = Instant::now(); + let segment_frames_opt = segment_media + .decoders + .get_frames_initial(segment_time as f32, !project.camera.hide, clip_offsets) + .await; + let decode_elapsed_ms = decode_start.elapsed().as_secs_f64() * 1000.0; + + let Some(segment_frames) = segment_frames_opt else { + timings.decode_failures += 1; + continue; + }; + + timings.decode_ms.push(decode_elapsed_ms); + + let frame_number = (scrub_time * fps as f64).round() as u32; + + let zoom_focus_interpolator = ZoomFocusInterpolator::new( + &segment_media.cursor, + cursor_smoothing, + project.screen_movement_spring, + duration, + ); + + let uniforms = ProjectUniforms::new( + &render_constants, + project, + frame_number, + fps, + resolution_base, + &segment_media.cursor, + &segment_frames, + duration, + &zoom_focus_interpolator, + ); + + let render_start = Instant::now(); + match frame_renderer + .render( + segment_frames, + uniforms, + &segment_media.cursor, + &mut layers, + ) + .await + { + Ok(_frame) => { + let render_elapsed_ms = render_start.elapsed().as_secs_f64() * 1000.0; + timings.render_ms.push(render_elapsed_ms); + timings.total_ms.push(decode_elapsed_ms + render_elapsed_ms); + timings.frames_rendered += 1; + } + Err(e) => { + timings.render_failures += 1; + if timings.render_failures <= 3 { + eprintln!(" Render failed at scrub {i}: {e}"); + } + } + } + } + + timings +} + +#[tokio::main] +async fn main() { + tracing_subscriber::fmt() + .with_env_filter( + tracing_subscriber::EnvFilter::from_default_env() + .add_directive(tracing::Level::WARN.into()), + ) + 
.init(); + + ffmpeg::init().expect("Failed to initialize FFmpeg"); + + let args: Vec = std::env::args().collect(); + + let recording_path = args + .iter() + .position(|a| a == "--recording-path") + .and_then(|i| args.get(i + 1)) + .map(PathBuf::from) + .expect("Usage: playback-pipeline-benchmark --recording-path [--fps ] [--frames ]"); + + let fps = args + .iter() + .position(|a| a == "--fps") + .and_then(|i| args.get(i + 1)) + .and_then(|s| s.parse().ok()) + .unwrap_or(60); + + let frame_count = args + .iter() + .position(|a| a == "--frames") + .and_then(|i| args.get(i + 1)) + .and_then(|s| s.parse().ok()) + .unwrap_or(300); + + println!("{}", "=".repeat(60)); + println!(" CAP PLAYBACK PIPELINE BENCHMARK"); + println!("{}", "=".repeat(60)); + println!(); + println!("Recording: {}", recording_path.display()); + println!("Target FPS: {fps}"); + println!("Max frames: {frame_count}"); + println!(); + + let (recording_meta, meta, project, recordings) = match load_recording(&recording_path).await { + Ok(r) => r, + Err(e) => { + eprintln!("Failed to load recording: {e}"); + std::process::exit(1); + } + }; + + let duration = project + .timeline + .as_ref() + .map(|t| t.duration()) + .unwrap_or(0.0); + println!("Recording duration: {duration:.2}s"); + + let resolutions = [ + (XY::new(1920, 1080), "Full (1920x1080)"), + (XY::new(1248, 702), "Half (1248x702)"), + (XY::new(480, 270), "Quarter (480x270)"), + ]; + + println!("\n--- DECODE-ONLY BENCHMARK ---"); + let decode_timings = run_decode_only_benchmark( + &recording_meta, + meta.as_ref(), + &project, + fps, + frame_count, + ) + .await; + decode_timings.print_report("DECODE-ONLY"); + + for (resolution_base, label) in &resolutions { + println!("\n--- FULL PIPELINE: {label} ---"); + let pipeline_timings = run_full_pipeline_benchmark( + &recording_meta, + meta.as_ref(), + &project, + &recordings, + fps, + frame_count, + *resolution_base, + ) + .await; + pipeline_timings.print_report(&format!("FULL PIPELINE - {label}")); + } + 
+ println!("\n--- SCRUBBING BENCHMARK (Half resolution) ---"); + let scrub_timings = run_scrubbing_benchmark( + &recording_meta, + meta.as_ref(), + &project, + &recordings, + fps, + XY::new(1248, 702), + ) + .await; + scrub_timings.print_report("SCRUBBING (Half resolution)"); + + println!("\n{}", "=".repeat(60)); + println!(" BENCHMARK COMPLETE"); + println!("{}", "=".repeat(60)); + + let target_frame_time_ms = 1000.0 / fps as f64; + println!("\nTarget frame time at {fps}fps: {target_frame_time_ms:.2}ms"); + + if !decode_timings.decode_ms.is_empty() { + let decode_p95 = percentile(&decode_timings.decode_ms, 95.0); + let decode_budget_pct = decode_p95 / target_frame_time_ms * 100.0; + println!( + "Decode p95 ({decode_p95:.2}ms) uses {decode_budget_pct:.0}% of frame budget" + ); + } + + for (_resolution_base, label) in &resolutions { + println!("\n{label}:"); + println!(" Frame budget: {target_frame_time_ms:.2}ms"); + } +} From 1432fd717b566fe39940b11ba6e387eec5c370ac Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:24:14 +0000 Subject: [PATCH 03/54] improve: optimize playback prefetch and decode parallelism Key changes to playback.rs: - Increase PARALLEL_DECODE_TASKS from 4 to 6 (sustained) - Add INITIAL_PARALLEL_DECODE_TASKS of 8 during ramp-up phase - Increase PREFETCH_BUFFER_SIZE from 60 to 90 frames - Increase FRAME_CACHE_SIZE from 60 to 90 (matches decoder cache) - Reduce warmup threshold from 20 to 10 frames (faster first frame) - Reduce warmup timeout from 1000ms to 500ms - Reduce frame wait timeouts from 200ms to 100ms for better responsiveness - Reduce aggressive skip threshold from 10 to 6 frames behind - Reduce PREFETCH_BEHIND from 15 to 10 (focus more on forward frames) These changes should improve: - First-frame latency (faster warmup) - Sustained playback FPS (more parallel decoding) - Frame availability (larger cache and prefetch buffer) - Responsiveness (shorter wait timeouts) Co-authored-by: Richie McIlroy --- 
crates/editor/src/playback.rs | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index 000f209c6be..841b5a90c37 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -34,11 +34,13 @@ use crate::{ audio::AudioSegment, editor, editor_instance::SegmentMedia, segments::get_audio_segments, }; -const PREFETCH_BUFFER_SIZE: usize = 60; -const PARALLEL_DECODE_TASKS: usize = 4; -const MAX_PREFETCH_AHEAD: u32 = 60; -const PREFETCH_BEHIND: u32 = 15; -const FRAME_CACHE_SIZE: usize = 60; +const PREFETCH_BUFFER_SIZE: usize = 90; +const PARALLEL_DECODE_TASKS: usize = 6; +const INITIAL_PARALLEL_DECODE_TASKS: usize = 8; +const MAX_PREFETCH_AHEAD: u32 = 90; +const PREFETCH_BEHIND: u32 = 10; +const FRAME_CACHE_SIZE: usize = 90; +const RAMP_UP_FRAME_COUNT: u32 = 15; #[derive(Debug)] pub enum PlaybackStartError { @@ -161,8 +163,6 @@ impl Playback { let mut in_flight: FuturesUnordered = FuturesUnordered::new(); let mut frames_decoded: u32 = 0; let mut prefetched_behind: HashSet = HashSet::new(); - const INITIAL_PARALLEL_TASKS: usize = 4; - const RAMP_UP_AFTER_FRAMES: u32 = 5; let mut cached_project = prefetch_project.borrow().clone(); @@ -203,8 +203,8 @@ impl Playback { let current_playback_frame = *playback_position_rx.borrow(); let max_prefetch_frame = current_playback_frame + MAX_PREFETCH_AHEAD; - let effective_parallel = if frames_decoded < RAMP_UP_AFTER_FRAMES { - INITIAL_PARALLEL_TASKS + let effective_parallel = if frames_decoded < RAMP_UP_FRAME_COUNT { + INITIAL_PARALLEL_DECODE_TASKS } else { PARALLEL_DECODE_TASKS }; @@ -382,13 +382,13 @@ impl Playback { let mut prefetch_buffer: VecDeque = VecDeque::with_capacity(PREFETCH_BUFFER_SIZE); let mut frame_cache = FrameCache::new(FRAME_CACHE_SIZE); - let aggressive_skip_threshold = 10u32; + let aggressive_skip_threshold = 6u32; let mut total_frames_rendered = 0u64; let mut _total_frames_skipped = 0u64; 
- let warmup_target_frames = 20usize; - let warmup_after_first_timeout = Duration::from_millis(1000); + let warmup_target_frames = 10usize; + let warmup_after_first_timeout = Duration::from_millis(500); let warmup_no_frames_timeout = Duration::from_secs(5); let warmup_start = Instant::now(); let mut first_frame_time: Option = None; @@ -506,7 +506,7 @@ impl Playback { if is_in_flight { let wait_start = Instant::now(); - let max_wait = Duration::from_millis(200); + let max_wait = Duration::from_millis(100); let mut found_frame = None; while wait_start.elapsed() < max_wait { @@ -557,7 +557,7 @@ impl Playback { let _ = frame_request_tx.send(frame_number); let wait_result = tokio::time::timeout( - Duration::from_millis(200), + Duration::from_millis(100), prefetch_rx.recv(), ) .await; @@ -604,7 +604,7 @@ impl Playback { guard.insert(frame_number); } - let max_wait = Duration::from_millis(200); + let max_wait = Duration::from_millis(100); let data = tokio::select! { _ = stop_rx.changed() => { if let Ok(mut guard) = main_in_flight.write() { From 4438a5a019918684c18972637bbee1e6191e2f08 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:24:14 +0000 Subject: [PATCH 04/54] improve: optimize playback prefetch and decode parallelism Key changes to playback.rs: - Increase PARALLEL_DECODE_TASKS from 4 to 6 (sustained) - Add INITIAL_PARALLEL_DECODE_TASKS of 8 during ramp-up phase - Increase PREFETCH_BUFFER_SIZE from 60 to 90 frames - Increase FRAME_CACHE_SIZE from 60 to 90 (matches decoder cache) - Reduce warmup threshold from 20 to 10 frames (faster first frame) - Reduce warmup timeout from 1000ms to 500ms - Reduce frame wait timeouts from 200ms to 100ms for better responsiveness - Reduce aggressive skip threshold from 10 to 6 frames behind - Reduce PREFETCH_BEHIND from 15 to 10 (focus more on forward frames) These changes should improve: - First-frame latency (faster warmup) - Sustained playback FPS (more 
parallel decoding) - Frame availability (larger cache and prefetch buffer) - Responsiveness (shorter wait timeouts) --- crates/editor/src/playback.rs | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index 000f209c6be..841b5a90c37 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -34,11 +34,13 @@ use crate::{ audio::AudioSegment, editor, editor_instance::SegmentMedia, segments::get_audio_segments, }; -const PREFETCH_BUFFER_SIZE: usize = 60; -const PARALLEL_DECODE_TASKS: usize = 4; -const MAX_PREFETCH_AHEAD: u32 = 60; -const PREFETCH_BEHIND: u32 = 15; -const FRAME_CACHE_SIZE: usize = 60; +const PREFETCH_BUFFER_SIZE: usize = 90; +const PARALLEL_DECODE_TASKS: usize = 6; +const INITIAL_PARALLEL_DECODE_TASKS: usize = 8; +const MAX_PREFETCH_AHEAD: u32 = 90; +const PREFETCH_BEHIND: u32 = 10; +const FRAME_CACHE_SIZE: usize = 90; +const RAMP_UP_FRAME_COUNT: u32 = 15; #[derive(Debug)] pub enum PlaybackStartError { @@ -161,8 +163,6 @@ impl Playback { let mut in_flight: FuturesUnordered = FuturesUnordered::new(); let mut frames_decoded: u32 = 0; let mut prefetched_behind: HashSet = HashSet::new(); - const INITIAL_PARALLEL_TASKS: usize = 4; - const RAMP_UP_AFTER_FRAMES: u32 = 5; let mut cached_project = prefetch_project.borrow().clone(); @@ -203,8 +203,8 @@ impl Playback { let current_playback_frame = *playback_position_rx.borrow(); let max_prefetch_frame = current_playback_frame + MAX_PREFETCH_AHEAD; - let effective_parallel = if frames_decoded < RAMP_UP_AFTER_FRAMES { - INITIAL_PARALLEL_TASKS + let effective_parallel = if frames_decoded < RAMP_UP_FRAME_COUNT { + INITIAL_PARALLEL_DECODE_TASKS } else { PARALLEL_DECODE_TASKS }; @@ -382,13 +382,13 @@ impl Playback { let mut prefetch_buffer: VecDeque = VecDeque::with_capacity(PREFETCH_BUFFER_SIZE); let mut frame_cache = FrameCache::new(FRAME_CACHE_SIZE); - let aggressive_skip_threshold = 
10u32; + let aggressive_skip_threshold = 6u32; let mut total_frames_rendered = 0u64; let mut _total_frames_skipped = 0u64; - let warmup_target_frames = 20usize; - let warmup_after_first_timeout = Duration::from_millis(1000); + let warmup_target_frames = 10usize; + let warmup_after_first_timeout = Duration::from_millis(500); let warmup_no_frames_timeout = Duration::from_secs(5); let warmup_start = Instant::now(); let mut first_frame_time: Option = None; @@ -506,7 +506,7 @@ impl Playback { if is_in_flight { let wait_start = Instant::now(); - let max_wait = Duration::from_millis(200); + let max_wait = Duration::from_millis(100); let mut found_frame = None; while wait_start.elapsed() < max_wait { @@ -557,7 +557,7 @@ impl Playback { let _ = frame_request_tx.send(frame_number); let wait_result = tokio::time::timeout( - Duration::from_millis(200), + Duration::from_millis(100), prefetch_rx.recv(), ) .await; @@ -604,7 +604,7 @@ impl Playback { guard.insert(frame_number); } - let max_wait = Duration::from_millis(200); + let max_wait = Duration::from_millis(100); let data = tokio::select! { _ = stop_rx.changed() => { if let Ok(mut guard) = main_in_flight.write() { From 2ef2580b2f8cd577155faf4bdbf50121a7c96f1b Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:29:33 +0000 Subject: [PATCH 05/54] improve: batch GPU command submissions for YUV conversion and rendering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, YUV→RGBA conversion (compute shader) created its own command encoder and called queue.submit() independently, then the main render pass created another encoder and submitted again. This meant 2+ GPU submissions per frame. 
Changes: - Add convert_nv12_to_encoder() and convert_yuv420p_to_encoder() methods to YuvToRgbaConverter that dispatch compute passes on an external encoder instead of creating their own - Add prepare_with_encoder() to DisplayLayer that uses the batched conversion methods - Add prepare_with_encoder() to RendererLayers that passes the shared encoder through to display layer - Modify produce_frame() to create the command encoder first and pass it to both prepare and render phases Result: Single queue.submit() per frame instead of 2+, reducing GPU overhead and improving frame throughput. Co-authored-by: Richie McIlroy --- crates/rendering/src/layers/display.rs | 218 +++++++++++++++++++++++++ crates/rendering/src/lib.rs | 87 +++++++++- crates/rendering/src/yuv_converter.rs | 144 ++++++++++++++++ 3 files changed, 445 insertions(+), 4 deletions(-) diff --git a/crates/rendering/src/layers/display.rs b/crates/rendering/src/layers/display.rs index f2f3df3a40e..05d545ce0a4 100644 --- a/crates/rendering/src/layers/display.rs +++ b/crates/rendering/src/layers/display.rs @@ -401,6 +401,224 @@ impl DisplayLayer { (skipped, actual_width, actual_height) } + pub fn prepare_with_encoder( + &mut self, + device: &wgpu::Device, + queue: &wgpu::Queue, + segment_frames: &DecodedSegmentFrames, + frame_size: XY, + uniforms: CompositeVideoFrameUniforms, + encoder: &mut wgpu::CommandEncoder, + ) -> (bool, u32, u32) { + self.pending_copy = None; + + let actual_width = segment_frames.screen_frame.width(); + let actual_height = segment_frames.screen_frame.height(); + let format = segment_frames.screen_frame.format(); + let current_recording_time = segment_frames.recording_time; + + let skipped = self + .last_recording_time + .is_some_and(|last| (last - current_recording_time).abs() < 0.001); + + if !skipped { + let next_texture = 1 - self.current_texture; + + if self.frame_textures[next_texture].width() != frame_size.x + || self.frame_textures[next_texture].height() != frame_size.y + { + 
self.frame_textures[next_texture] = + CompositeVideoFramePipeline::create_frame_texture( + device, + frame_size.x, + frame_size.y, + ); + self.frame_texture_views[next_texture] = + self.frame_textures[next_texture].create_view(&Default::default()); + + self.bind_groups[next_texture] = Some(self.pipeline.bind_group( + device, + &self.uniforms_buffer, + &self.frame_texture_views[next_texture], + )); + } + + let frame_uploaded = match format { + PixelFormat::Rgba => { + let frame_data = segment_frames.screen_frame.data(); + let src_bytes_per_row = frame_size.x * 4; + + queue.write_texture( + wgpu::TexelCopyTextureInfo { + texture: &self.frame_textures[next_texture], + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + frame_data, + wgpu::TexelCopyBufferLayout { + offset: 0, + bytes_per_row: Some(src_bytes_per_row), + rows_per_image: Some(frame_size.y), + }, + wgpu::Extent3d { + width: frame_size.x, + height: frame_size.y, + depth_or_array_layers: 1, + }, + ); + true + } + PixelFormat::Nv12 => { + let screen_frame = &segment_frames.screen_frame; + + if !self.prefer_cpu_conversion { + if let (Some(y_data), Some(uv_data)) = + (screen_frame.y_plane(), screen_frame.uv_plane()) + { + let y_stride = screen_frame.y_stride(); + let uv_stride = screen_frame.uv_stride(); + + #[cfg(target_os = "windows")] + let convert_width = actual_width; + #[cfg(target_os = "windows")] + let convert_height = actual_height; + #[cfg(not(target_os = "windows"))] + let convert_width = frame_size.x; + #[cfg(not(target_os = "windows"))] + let convert_height = frame_size.y; + + let convert_result = self.yuv_converter.convert_nv12_to_encoder( + device, + queue, + encoder, + y_data, + uv_data, + convert_width, + convert_height, + y_stride, + uv_stride, + ); + + match convert_result { + Ok(_) => { + if self.yuv_converter.output_texture().is_some() { + self.pending_copy = Some(PendingTextureCopy { + width: convert_width, + height: convert_height, + dst_texture_index: 
next_texture, + }); + true + } else { + false + } + } + Err(_) => false, + } + } else { + false + } + } else if let (Some(y_data), Some(uv_data)) = + (screen_frame.y_plane(), screen_frame.uv_plane()) + { + let y_stride = screen_frame.y_stride(); + let uv_stride = screen_frame.uv_stride(); + let convert_result = self.yuv_converter.convert_nv12_cpu( + device, + queue, + y_data, + uv_data, + frame_size.x, + frame_size.y, + y_stride, + uv_stride, + ); + + match convert_result { + Ok(_) => { + if self.yuv_converter.output_texture().is_some() { + self.pending_copy = Some(PendingTextureCopy { + width: frame_size.x, + height: frame_size.y, + dst_texture_index: next_texture, + }); + true + } else { + false + } + } + Err(_) => false, + } + } else { + false + } + } + PixelFormat::Yuv420p => { + let screen_frame = &segment_frames.screen_frame; + let y_plane = screen_frame.y_plane(); + let u_plane = screen_frame.u_plane(); + let v_plane = screen_frame.v_plane(); + + if let (Some(y_data), Some(u_data), Some(v_data)) = (y_plane, u_plane, v_plane) + { + let convert_result = if self.prefer_cpu_conversion { + self.yuv_converter.convert_yuv420p_cpu( + device, + queue, + y_data, + u_data, + v_data, + frame_size.x, + frame_size.y, + screen_frame.y_stride(), + screen_frame.uv_stride(), + ) + } else { + self.yuv_converter.convert_yuv420p_to_encoder( + device, + queue, + encoder, + y_data, + u_data, + v_data, + frame_size.x, + frame_size.y, + screen_frame.y_stride(), + screen_frame.uv_stride(), + ) + }; + + match convert_result { + Ok(_) => { + if self.yuv_converter.output_texture().is_some() { + self.pending_copy = Some(PendingTextureCopy { + width: frame_size.x, + height: frame_size.y, + dst_texture_index: next_texture, + }); + true + } else { + false + } + } + Err(_) => false, + } + } else { + false + } + } + }; + + if frame_uploaded { + self.last_recording_time = Some(current_recording_time); + self.current_texture = next_texture; + } + } + + uniforms.write_to_buffer(queue, 
&self.uniforms_buffer); + (skipped, actual_width, actual_height) + } + pub fn copy_to_texture(&mut self, encoder: &mut wgpu::CommandEncoder) { let Some(pending) = self.pending_copy.take() else { return; diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index 9848f21859d..7956b0582ad 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -2013,6 +2013,85 @@ impl RendererLayers { Ok(()) } + pub async fn prepare_with_encoder( + &mut self, + constants: &RenderVideoConstants, + uniforms: &ProjectUniforms, + segment_frames: &DecodedSegmentFrames, + cursor: &CursorEvents, + encoder: &mut wgpu::CommandEncoder, + ) -> Result<(), RenderingError> { + self.background + .prepare( + constants, + uniforms, + Background::from(uniforms.project.background.source.clone()), + ) + .await?; + + if uniforms.project.background.blur > 0.0 { + self.background_blur.prepare(&constants.queue, uniforms); + } + + self.display.prepare_with_encoder( + &constants.device, + &constants.queue, + segment_frames, + constants.options.screen_size, + uniforms.display, + encoder, + ); + + self.cursor.prepare( + segment_frames, + uniforms.resolution_base, + cursor, + &uniforms.zoom, + uniforms, + constants, + ); + + self.camera.prepare( + &constants.device, + &constants.queue, + uniforms.camera, + constants.options.camera_size.and_then(|size| { + segment_frames + .camera_frame + .as_ref() + .map(|frame| (size, frame, segment_frames.recording_time)) + }), + ); + + self.camera_only.prepare( + &constants.device, + &constants.queue, + uniforms.camera_only, + constants.options.camera_size.and_then(|size| { + segment_frames + .camera_frame + .as_ref() + .map(|frame| (size, frame, segment_frames.recording_time)) + }), + ); + + self.text.prepare( + &constants.device, + &constants.queue, + uniforms.output_size, + &uniforms.texts, + ); + + self.captions.prepare( + uniforms, + segment_frames, + XY::new(uniforms.output_size.0, uniforms.output_size.1), + constants, + ); + + 
Ok(()) + } + pub fn render( &mut self, device: &wgpu::Device, @@ -2112,16 +2191,16 @@ async fn produce_frame( layers: &mut RendererLayers, session: &mut RenderSession, ) -> Result { - layers - .prepare(constants, &uniforms, &segment_frames, cursor) - .await?; - let mut encoder = constants.device.create_command_encoder( &(wgpu::CommandEncoderDescriptor { label: Some("Render Encoder"), }), ); + layers + .prepare_with_encoder(constants, &uniforms, &segment_frames, cursor, &mut encoder) + .await?; + layers.render( &constants.device, &constants.queue, diff --git a/crates/rendering/src/yuv_converter.rs b/crates/rendering/src/yuv_converter.rs index e93c29f30aa..30b107ce485 100644 --- a/crates/rendering/src/yuv_converter.rs +++ b/crates/rendering/src/yuv_converter.rs @@ -753,6 +753,81 @@ impl YuvToRgbaConverter { Ok(self.current_output_view()) } + #[allow(clippy::too_many_arguments)] + pub fn convert_nv12_to_encoder( + &mut self, + device: &wgpu::Device, + queue: &wgpu::Queue, + encoder: &mut wgpu::CommandEncoder, + y_data: &[u8], + uv_data: &[u8], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + ) -> Result<&wgpu::TextureView, YuvConversionError> { + let (effective_width, effective_height, _downscaled) = + validate_dimensions(width, height, self.gpu_max_texture_size)?; + self.ensure_texture_size(device, effective_width, effective_height); + self.swap_output_buffer(); + + upload_plane_with_stride(queue, &self.y_texture, y_data, width, height, y_stride, "Y")?; + + let half_height = height / 2; + let expected_uv_size = (uv_stride * half_height) as usize; + if uv_data.len() < expected_uv_size { + return Err(YuvConversionError::PlaneSizeMismatch { + plane: "UV", + expected: expected_uv_size, + actual: uv_data.len(), + }); + } + + queue.write_texture( + wgpu::TexelCopyTextureInfo { + texture: &self.uv_texture, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + uv_data, + wgpu::TexelCopyBufferLayout { + offset: 0, + 
bytes_per_row: Some(uv_stride), + rows_per_image: Some(half_height), + }, + wgpu::Extent3d { + width: width / 2, + height: half_height, + depth_or_array_layers: 1, + }, + ); + + let output_index = self.current_output; + let bind_group = self.bind_group_cache.get_or_create_nv12( + device, + &self.pipelines.nv12_bind_group_layout, + &self.y_view, + &self.uv_view, + &self.output_views[output_index], + output_index, + self.allocated_width, + self.allocated_height, + ); + + { + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("NV12 Conversion Pass (Batched)"), + ..Default::default() + }); + compute_pass.set_pipeline(&self.pipelines.nv12_pipeline); + compute_pass.set_bind_group(0, bind_group, &[]); + compute_pass.dispatch_workgroups(width.div_ceil(8), height.div_ceil(8), 1); + } + + Ok(self.current_output_view()) + } + #[cfg(target_os = "macos")] pub fn convert_nv12_from_iosurface( &mut self, @@ -913,6 +988,75 @@ impl YuvToRgbaConverter { Ok(self.current_output_view()) } + #[allow(clippy::too_many_arguments)] + pub fn convert_yuv420p_to_encoder( + &mut self, + device: &wgpu::Device, + queue: &wgpu::Queue, + encoder: &mut wgpu::CommandEncoder, + y_data: &[u8], + u_data: &[u8], + v_data: &[u8], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + ) -> Result<&wgpu::TextureView, YuvConversionError> { + let (effective_width, effective_height, _downscaled) = + validate_dimensions(width, height, self.gpu_max_texture_size)?; + self.ensure_texture_size(device, effective_width, effective_height); + self.swap_output_buffer(); + + upload_plane_with_stride(queue, &self.y_texture, y_data, width, height, y_stride, "Y")?; + + let half_width = width / 2; + let half_height = height / 2; + + upload_plane_with_stride( + queue, + &self.u_texture, + u_data, + half_width, + half_height, + uv_stride, + "U", + )?; + upload_plane_with_stride( + queue, + &self.v_texture, + v_data, + half_width, + half_height, + uv_stride, + "V", + )?; + + 
let output_index = self.current_output; + let bind_group = self.bind_group_cache.get_or_create_yuv420p( + device, + &self.pipelines.yuv420p_bind_group_layout, + &self.y_view, + &self.u_view, + &self.v_view, + &self.output_views[output_index], + output_index, + self.allocated_width, + self.allocated_height, + ); + + { + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("YUV420P Conversion Pass (Batched)"), + ..Default::default() + }); + compute_pass.set_pipeline(&self.pipelines.yuv420p_pipeline); + compute_pass.set_bind_group(0, bind_group, &[]); + compute_pass.dispatch_workgroups(width.div_ceil(8), height.div_ceil(8), 1); + } + + Ok(self.current_output_view()) + } + #[cfg(target_os = "windows")] #[allow(clippy::too_many_arguments)] pub fn convert_nv12_from_d3d11_texture( From b4797a0df87085c83332db9ecc312ba1d1fee662 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:29:33 +0000 Subject: [PATCH 06/54] improve: batch GPU command submissions for YUV conversion and rendering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, YUV→RGBA conversion (compute shader) created its own command encoder and called queue.submit() independently, then the main render pass created another encoder and submitted again. This meant 2+ GPU submissions per frame. 
Changes: - Add convert_nv12_to_encoder() and convert_yuv420p_to_encoder() methods to YuvToRgbaConverter that dispatch compute passes on an external encoder instead of creating their own - Add prepare_with_encoder() to DisplayLayer that uses the batched conversion methods - Add prepare_with_encoder() to RendererLayers that passes the shared encoder through to display layer - Modify produce_frame() to create the command encoder first and pass it to both prepare and render phases Result: Single queue.submit() per frame instead of 2+, reducing GPU overhead and improving frame throughput. --- crates/rendering/src/layers/display.rs | 218 +++++++++++++++++++++++++ crates/rendering/src/lib.rs | 87 +++++++++- crates/rendering/src/yuv_converter.rs | 144 ++++++++++++++++ 3 files changed, 445 insertions(+), 4 deletions(-) diff --git a/crates/rendering/src/layers/display.rs b/crates/rendering/src/layers/display.rs index f2f3df3a40e..05d545ce0a4 100644 --- a/crates/rendering/src/layers/display.rs +++ b/crates/rendering/src/layers/display.rs @@ -401,6 +401,224 @@ impl DisplayLayer { (skipped, actual_width, actual_height) } + pub fn prepare_with_encoder( + &mut self, + device: &wgpu::Device, + queue: &wgpu::Queue, + segment_frames: &DecodedSegmentFrames, + frame_size: XY, + uniforms: CompositeVideoFrameUniforms, + encoder: &mut wgpu::CommandEncoder, + ) -> (bool, u32, u32) { + self.pending_copy = None; + + let actual_width = segment_frames.screen_frame.width(); + let actual_height = segment_frames.screen_frame.height(); + let format = segment_frames.screen_frame.format(); + let current_recording_time = segment_frames.recording_time; + + let skipped = self + .last_recording_time + .is_some_and(|last| (last - current_recording_time).abs() < 0.001); + + if !skipped { + let next_texture = 1 - self.current_texture; + + if self.frame_textures[next_texture].width() != frame_size.x + || self.frame_textures[next_texture].height() != frame_size.y + { + self.frame_textures[next_texture] = + 
CompositeVideoFramePipeline::create_frame_texture( + device, + frame_size.x, + frame_size.y, + ); + self.frame_texture_views[next_texture] = + self.frame_textures[next_texture].create_view(&Default::default()); + + self.bind_groups[next_texture] = Some(self.pipeline.bind_group( + device, + &self.uniforms_buffer, + &self.frame_texture_views[next_texture], + )); + } + + let frame_uploaded = match format { + PixelFormat::Rgba => { + let frame_data = segment_frames.screen_frame.data(); + let src_bytes_per_row = frame_size.x * 4; + + queue.write_texture( + wgpu::TexelCopyTextureInfo { + texture: &self.frame_textures[next_texture], + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + frame_data, + wgpu::TexelCopyBufferLayout { + offset: 0, + bytes_per_row: Some(src_bytes_per_row), + rows_per_image: Some(frame_size.y), + }, + wgpu::Extent3d { + width: frame_size.x, + height: frame_size.y, + depth_or_array_layers: 1, + }, + ); + true + } + PixelFormat::Nv12 => { + let screen_frame = &segment_frames.screen_frame; + + if !self.prefer_cpu_conversion { + if let (Some(y_data), Some(uv_data)) = + (screen_frame.y_plane(), screen_frame.uv_plane()) + { + let y_stride = screen_frame.y_stride(); + let uv_stride = screen_frame.uv_stride(); + + #[cfg(target_os = "windows")] + let convert_width = actual_width; + #[cfg(target_os = "windows")] + let convert_height = actual_height; + #[cfg(not(target_os = "windows"))] + let convert_width = frame_size.x; + #[cfg(not(target_os = "windows"))] + let convert_height = frame_size.y; + + let convert_result = self.yuv_converter.convert_nv12_to_encoder( + device, + queue, + encoder, + y_data, + uv_data, + convert_width, + convert_height, + y_stride, + uv_stride, + ); + + match convert_result { + Ok(_) => { + if self.yuv_converter.output_texture().is_some() { + self.pending_copy = Some(PendingTextureCopy { + width: convert_width, + height: convert_height, + dst_texture_index: next_texture, + }); + true + } else { 
+ false + } + } + Err(_) => false, + } + } else { + false + } + } else if let (Some(y_data), Some(uv_data)) = + (screen_frame.y_plane(), screen_frame.uv_plane()) + { + let y_stride = screen_frame.y_stride(); + let uv_stride = screen_frame.uv_stride(); + let convert_result = self.yuv_converter.convert_nv12_cpu( + device, + queue, + y_data, + uv_data, + frame_size.x, + frame_size.y, + y_stride, + uv_stride, + ); + + match convert_result { + Ok(_) => { + if self.yuv_converter.output_texture().is_some() { + self.pending_copy = Some(PendingTextureCopy { + width: frame_size.x, + height: frame_size.y, + dst_texture_index: next_texture, + }); + true + } else { + false + } + } + Err(_) => false, + } + } else { + false + } + } + PixelFormat::Yuv420p => { + let screen_frame = &segment_frames.screen_frame; + let y_plane = screen_frame.y_plane(); + let u_plane = screen_frame.u_plane(); + let v_plane = screen_frame.v_plane(); + + if let (Some(y_data), Some(u_data), Some(v_data)) = (y_plane, u_plane, v_plane) + { + let convert_result = if self.prefer_cpu_conversion { + self.yuv_converter.convert_yuv420p_cpu( + device, + queue, + y_data, + u_data, + v_data, + frame_size.x, + frame_size.y, + screen_frame.y_stride(), + screen_frame.uv_stride(), + ) + } else { + self.yuv_converter.convert_yuv420p_to_encoder( + device, + queue, + encoder, + y_data, + u_data, + v_data, + frame_size.x, + frame_size.y, + screen_frame.y_stride(), + screen_frame.uv_stride(), + ) + }; + + match convert_result { + Ok(_) => { + if self.yuv_converter.output_texture().is_some() { + self.pending_copy = Some(PendingTextureCopy { + width: frame_size.x, + height: frame_size.y, + dst_texture_index: next_texture, + }); + true + } else { + false + } + } + Err(_) => false, + } + } else { + false + } + } + }; + + if frame_uploaded { + self.last_recording_time = Some(current_recording_time); + self.current_texture = next_texture; + } + } + + uniforms.write_to_buffer(queue, &self.uniforms_buffer); + (skipped, 
actual_width, actual_height) + } + pub fn copy_to_texture(&mut self, encoder: &mut wgpu::CommandEncoder) { let Some(pending) = self.pending_copy.take() else { return; diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index 9848f21859d..7956b0582ad 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -2013,6 +2013,85 @@ impl RendererLayers { Ok(()) } + pub async fn prepare_with_encoder( + &mut self, + constants: &RenderVideoConstants, + uniforms: &ProjectUniforms, + segment_frames: &DecodedSegmentFrames, + cursor: &CursorEvents, + encoder: &mut wgpu::CommandEncoder, + ) -> Result<(), RenderingError> { + self.background + .prepare( + constants, + uniforms, + Background::from(uniforms.project.background.source.clone()), + ) + .await?; + + if uniforms.project.background.blur > 0.0 { + self.background_blur.prepare(&constants.queue, uniforms); + } + + self.display.prepare_with_encoder( + &constants.device, + &constants.queue, + segment_frames, + constants.options.screen_size, + uniforms.display, + encoder, + ); + + self.cursor.prepare( + segment_frames, + uniforms.resolution_base, + cursor, + &uniforms.zoom, + uniforms, + constants, + ); + + self.camera.prepare( + &constants.device, + &constants.queue, + uniforms.camera, + constants.options.camera_size.and_then(|size| { + segment_frames + .camera_frame + .as_ref() + .map(|frame| (size, frame, segment_frames.recording_time)) + }), + ); + + self.camera_only.prepare( + &constants.device, + &constants.queue, + uniforms.camera_only, + constants.options.camera_size.and_then(|size| { + segment_frames + .camera_frame + .as_ref() + .map(|frame| (size, frame, segment_frames.recording_time)) + }), + ); + + self.text.prepare( + &constants.device, + &constants.queue, + uniforms.output_size, + &uniforms.texts, + ); + + self.captions.prepare( + uniforms, + segment_frames, + XY::new(uniforms.output_size.0, uniforms.output_size.1), + constants, + ); + + Ok(()) + } + pub fn render( &mut self, 
device: &wgpu::Device, @@ -2112,16 +2191,16 @@ async fn produce_frame( layers: &mut RendererLayers, session: &mut RenderSession, ) -> Result { - layers - .prepare(constants, &uniforms, &segment_frames, cursor) - .await?; - let mut encoder = constants.device.create_command_encoder( &(wgpu::CommandEncoderDescriptor { label: Some("Render Encoder"), }), ); + layers + .prepare_with_encoder(constants, &uniforms, &segment_frames, cursor, &mut encoder) + .await?; + layers.render( &constants.device, &constants.queue, diff --git a/crates/rendering/src/yuv_converter.rs b/crates/rendering/src/yuv_converter.rs index e93c29f30aa..30b107ce485 100644 --- a/crates/rendering/src/yuv_converter.rs +++ b/crates/rendering/src/yuv_converter.rs @@ -753,6 +753,81 @@ impl YuvToRgbaConverter { Ok(self.current_output_view()) } + #[allow(clippy::too_many_arguments)] + pub fn convert_nv12_to_encoder( + &mut self, + device: &wgpu::Device, + queue: &wgpu::Queue, + encoder: &mut wgpu::CommandEncoder, + y_data: &[u8], + uv_data: &[u8], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + ) -> Result<&wgpu::TextureView, YuvConversionError> { + let (effective_width, effective_height, _downscaled) = + validate_dimensions(width, height, self.gpu_max_texture_size)?; + self.ensure_texture_size(device, effective_width, effective_height); + self.swap_output_buffer(); + + upload_plane_with_stride(queue, &self.y_texture, y_data, width, height, y_stride, "Y")?; + + let half_height = height / 2; + let expected_uv_size = (uv_stride * half_height) as usize; + if uv_data.len() < expected_uv_size { + return Err(YuvConversionError::PlaneSizeMismatch { + plane: "UV", + expected: expected_uv_size, + actual: uv_data.len(), + }); + } + + queue.write_texture( + wgpu::TexelCopyTextureInfo { + texture: &self.uv_texture, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + uv_data, + wgpu::TexelCopyBufferLayout { + offset: 0, + bytes_per_row: Some(uv_stride), + 
rows_per_image: Some(half_height), + }, + wgpu::Extent3d { + width: width / 2, + height: half_height, + depth_or_array_layers: 1, + }, + ); + + let output_index = self.current_output; + let bind_group = self.bind_group_cache.get_or_create_nv12( + device, + &self.pipelines.nv12_bind_group_layout, + &self.y_view, + &self.uv_view, + &self.output_views[output_index], + output_index, + self.allocated_width, + self.allocated_height, + ); + + { + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("NV12 Conversion Pass (Batched)"), + ..Default::default() + }); + compute_pass.set_pipeline(&self.pipelines.nv12_pipeline); + compute_pass.set_bind_group(0, bind_group, &[]); + compute_pass.dispatch_workgroups(width.div_ceil(8), height.div_ceil(8), 1); + } + + Ok(self.current_output_view()) + } + #[cfg(target_os = "macos")] pub fn convert_nv12_from_iosurface( &mut self, @@ -913,6 +988,75 @@ impl YuvToRgbaConverter { Ok(self.current_output_view()) } + #[allow(clippy::too_many_arguments)] + pub fn convert_yuv420p_to_encoder( + &mut self, + device: &wgpu::Device, + queue: &wgpu::Queue, + encoder: &mut wgpu::CommandEncoder, + y_data: &[u8], + u_data: &[u8], + v_data: &[u8], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + ) -> Result<&wgpu::TextureView, YuvConversionError> { + let (effective_width, effective_height, _downscaled) = + validate_dimensions(width, height, self.gpu_max_texture_size)?; + self.ensure_texture_size(device, effective_width, effective_height); + self.swap_output_buffer(); + + upload_plane_with_stride(queue, &self.y_texture, y_data, width, height, y_stride, "Y")?; + + let half_width = width / 2; + let half_height = height / 2; + + upload_plane_with_stride( + queue, + &self.u_texture, + u_data, + half_width, + half_height, + uv_stride, + "U", + )?; + upload_plane_with_stride( + queue, + &self.v_texture, + v_data, + half_width, + half_height, + uv_stride, + "V", + )?; + + let output_index = 
self.current_output; + let bind_group = self.bind_group_cache.get_or_create_yuv420p( + device, + &self.pipelines.yuv420p_bind_group_layout, + &self.y_view, + &self.u_view, + &self.v_view, + &self.output_views[output_index], + output_index, + self.allocated_width, + self.allocated_height, + ); + + { + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("YUV420P Conversion Pass (Batched)"), + ..Default::default() + }); + compute_pass.set_pipeline(&self.pipelines.yuv420p_pipeline); + compute_pass.set_bind_group(0, bind_group, &[]); + compute_pass.dispatch_workgroups(width.div_ceil(8), height.div_ceil(8), 1); + } + + Ok(self.current_output_view()) + } + #[cfg(target_os = "windows")] #[allow(clippy::too_many_arguments)] pub fn convert_nv12_from_d3d11_texture( From 238a993b745916fa2bb02598507d2f92e5148572 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:30:24 +0000 Subject: [PATCH 07/54] improve: pipeline GPU readback to overlap with next frame rendering Previously finish_encoder() would: 1. Discard any pending readback result 2. Submit current frame readback 3. Wait synchronously for current readback Now it: 1. Wait for previous frame's readback (should already be done) 2. Submit current frame's readback (starts async) 3. Return the previous frame immediately This means GPU readback of frame N overlaps with CPU processing and decode of frame N+1, reducing the synchronous wait time per frame. The first frame still waits synchronously, but subsequent frames benefit from the pipelining. 
Co-authored-by: Richie McIlroy --- crates/rendering/src/frame_pipeline.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index 2517b6a6785..e2953af9445 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -428,9 +428,11 @@ pub async fn finish_encoder( uniforms: &ProjectUniforms, encoder: wgpu::CommandEncoder, ) -> Result { - if let Some(prev) = session.pipelined_readback.take_pending() { - let _ = prev.wait(device).await; - } + let previous_frame = if let Some(prev) = session.pipelined_readback.take_pending() { + Some(prev.wait(device).await?) + } else { + None + }; session.pipelined_readback.perform_resize_if_needed(device); @@ -444,6 +446,10 @@ pub async fn finish_encoder( .pipelined_readback .submit_readback(device, queue, texture, uniforms, encoder)?; + if let Some(prev_frame) = previous_frame { + return Ok(prev_frame); + } + let pending = session .pipelined_readback .take_pending() From 4442719739b662f76c9a4e919aaf4b931d17f535 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:30:24 +0000 Subject: [PATCH 08/54] improve: pipeline GPU readback to overlap with next frame rendering Previously finish_encoder() would: 1. Discard any pending readback result 2. Submit current frame readback 3. Wait synchronously for current readback Now it: 1. Wait for previous frame's readback (should already be done) 2. Submit current frame's readback (starts async) 3. Return the previous frame immediately This means GPU readback of frame N overlaps with CPU processing and decode of frame N+1, reducing the synchronous wait time per frame. The first frame still waits synchronously, but subsequent frames benefit from the pipelining. 
--- crates/rendering/src/frame_pipeline.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index 2517b6a6785..e2953af9445 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -428,9 +428,11 @@ pub async fn finish_encoder( uniforms: &ProjectUniforms, encoder: wgpu::CommandEncoder, ) -> Result { - if let Some(prev) = session.pipelined_readback.take_pending() { - let _ = prev.wait(device).await; - } + let previous_frame = if let Some(prev) = session.pipelined_readback.take_pending() { + Some(prev.wait(device).await?) + } else { + None + }; session.pipelined_readback.perform_resize_if_needed(device); @@ -444,6 +446,10 @@ pub async fn finish_encoder( .pipelined_readback .submit_readback(device, queue, texture, uniforms, encoder)?; + if let Some(prev_frame) = previous_frame { + return Ok(prev_frame); + } + let pending = session .pipelined_readback .take_pending() From 7787f24fde08c7b77751c2db32299318ad7262dd Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:32:16 +0000 Subject: [PATCH 09/54] improve: tighten audio-video sync thresholds for better playback sync Audio sync improvements: - Pre-rendered audio: tighten jump detection threshold from 100ms to 50ms (audio now re-syncs when video playhead drifts by more than 50ms) - macOS (non-prerendered): reduce sync threshold from 120ms to 80ms - Windows (non-prerendered): reduce sync threshold from 200ms to 100ms - Windows: reduce hard seek threshold from 500ms to 300ms - Windows: reduce min sync interval callbacks from 50 to 30 for more responsive correction These changes address user reports of audio being behind video or taking a couple of seconds to sync up during playback. 
Co-authored-by: Richie McIlroy --- crates/editor/src/playback.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index 841b5a90c37..6ac8571a4a9 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -1040,14 +1040,14 @@ impl AudioPlayback { #[cfg(target_os = "windows")] const FIXED_LATENCY_SECS: f64 = 0.08; #[cfg(target_os = "windows")] - const SYNC_THRESHOLD_SECS: f64 = 0.20; + const SYNC_THRESHOLD_SECS: f64 = 0.10; #[cfg(target_os = "windows")] - const HARD_SEEK_THRESHOLD_SECS: f64 = 0.5; + const HARD_SEEK_THRESHOLD_SECS: f64 = 0.3; #[cfg(target_os = "windows")] - const MIN_SYNC_INTERVAL_CALLBACKS: u32 = 50; + const MIN_SYNC_INTERVAL_CALLBACKS: u32 = 30; #[cfg(not(target_os = "windows"))] - const SYNC_THRESHOLD_SECS: f64 = 0.12; + const SYNC_THRESHOLD_SECS: f64 = 0.08; #[cfg(target_os = "windows")] let mut callbacks_since_last_sync: u32 = MIN_SYNC_INTERVAL_CALLBACKS; @@ -1205,7 +1205,7 @@ impl AudioPlayback { let video_playhead = *playhead_rx_for_stream.borrow_and_update(); let jump = (video_playhead - last_video_playhead).abs(); - if jump > 0.1 { + if jump > 0.05 { audio_buffer.set_playhead(video_playhead); } From c7d49d3b36d24e8101ff9fc63fe3f1c55cd9d764 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:32:16 +0000 Subject: [PATCH 10/54] improve: tighten audio-video sync thresholds for better playback sync Audio sync improvements: - Pre-rendered audio: tighten jump detection threshold from 100ms to 50ms (audio now re-syncs when video playhead drifts by more than 50ms) - macOS (non-prerendered): reduce sync threshold from 120ms to 80ms - Windows (non-prerendered): reduce sync threshold from 200ms to 100ms - Windows: reduce hard seek threshold from 500ms to 300ms - Windows: reduce min sync interval callbacks from 50 to 30 for more responsive correction These changes address 
user reports of audio being behind video or taking a couple of seconds to sync up during playback. --- crates/editor/src/playback.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index 841b5a90c37..6ac8571a4a9 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -1040,14 +1040,14 @@ impl AudioPlayback { #[cfg(target_os = "windows")] const FIXED_LATENCY_SECS: f64 = 0.08; #[cfg(target_os = "windows")] - const SYNC_THRESHOLD_SECS: f64 = 0.20; + const SYNC_THRESHOLD_SECS: f64 = 0.10; #[cfg(target_os = "windows")] - const HARD_SEEK_THRESHOLD_SECS: f64 = 0.5; + const HARD_SEEK_THRESHOLD_SECS: f64 = 0.3; #[cfg(target_os = "windows")] - const MIN_SYNC_INTERVAL_CALLBACKS: u32 = 50; + const MIN_SYNC_INTERVAL_CALLBACKS: u32 = 30; #[cfg(not(target_os = "windows"))] - const SYNC_THRESHOLD_SECS: f64 = 0.12; + const SYNC_THRESHOLD_SECS: f64 = 0.08; #[cfg(target_os = "windows")] let mut callbacks_since_last_sync: u32 = MIN_SYNC_INTERVAL_CALLBACKS; @@ -1205,7 +1205,7 @@ impl AudioPlayback { let video_playhead = *playhead_rx_for_stream.borrow_and_update(); let jump = (video_playhead - last_video_playhead).abs(); - if jump > 0.1 { + if jump > 0.05 { audio_buffer.set_playhead(video_playhead); } From 0545b6aabc7170dabbb4454a95900fb677341780 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:37:35 +0000 Subject: [PATCH 11/54] improve: avoid cloning CursorEvents per frame, use Arc sharing ZoomFocusInterpolator was cloning CursorEvents (Vec of cursor moves and clicks) on every frame during playback. For recordings with many cursor events, this adds unnecessary allocation pressure. 
Changes: - Add ZoomFocusInterpolator::new_arc() that accepts Arc<CursorEvents> - Playback and preview paths now use Arc sharing instead of deep cloning - Reduces per-frame allocation during playback - Renderer channel capacity reduced from 8 to 4 to reduce stale frame queuing and wasted decode work Co-authored-by: Richie McIlroy --- crates/editor/src/editor.rs | 2 +- crates/editor/src/editor_instance.rs | 4 ++-- crates/editor/src/playback.rs | 4 ++-- .../rendering/src/zoom_focus_interpolation.rs | 19 +++++++++++++++++-- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/crates/editor/src/editor.rs b/crates/editor/src/editor.rs index 012869016a6..a80d18b3992 100644 --- a/crates/editor/src/editor.rs +++ b/crates/editor/src/editor.rs @@ -55,7 +55,7 @@ impl Renderer { let total_frames = (30_f64 * max_duration).ceil() as u32; - let (tx, rx) = mpsc::channel(8); + let (tx, rx) = mpsc::channel(4); let this = Self { rx, diff --git a/crates/editor/src/editor_instance.rs b/crates/editor/src/editor_instance.rs index 16b05e33e8f..a48f65c0632 100644 --- a/crates/editor/src/editor_instance.rs +++ b/crates/editor/src/editor_instance.rs @@ -513,8 +513,8 @@ impl EditorInstance { }, ); - let zoom_focus_interpolator = ZoomFocusInterpolator::new( - &segment_medias.cursor, + let zoom_focus_interpolator = ZoomFocusInterpolator::new_arc( + segment_medias.cursor.clone(), cursor_smoothing, project.screen_movement_spring, total_duration, diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index 6ac8571a4a9..dc304ff53da 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -657,8 +657,8 @@ impl Playback { friction: cached_project.cursor.friction, }); - let zoom_focus_interpolator = ZoomFocusInterpolator::new( - &segment_media.cursor, + let zoom_focus_interpolator = ZoomFocusInterpolator::new_arc( + segment_media.cursor.clone(), cursor_smoothing, cached_project.screen_movement_spring, duration, diff --git 
a/crates/rendering/src/zoom_focus_interpolation.rs b/crates/rendering/src/zoom_focus_interpolation.rs index a622b9c7d09..c6980262540 100644 --- a/crates/rendering/src/zoom_focus_interpolation.rs +++ b/crates/rendering/src/zoom_focus_interpolation.rs @@ -16,7 +16,7 @@ struct SmoothedFocusEvent { pub struct ZoomFocusInterpolator { events: Option>, - cursor_events: CursorEvents, + cursor_events: std::sync::Arc, cursor_smoothing: Option, screen_spring: ScreenMovementSpring, duration_secs: f64, @@ -31,7 +31,22 @@ impl ZoomFocusInterpolator { ) -> Self { Self { events: None, - cursor_events: cursor_events.clone(), + cursor_events: std::sync::Arc::new(cursor_events.clone()), + cursor_smoothing, + screen_spring, + duration_secs, + } + } + + pub fn new_arc( + cursor_events: std::sync::Arc, + cursor_smoothing: Option, + screen_spring: ScreenMovementSpring, + duration_secs: f64, + ) -> Self { + Self { + events: None, + cursor_events, cursor_smoothing, screen_spring, duration_secs, From d0d230d80d5a8c98d4cc26428db59a104bd755a8 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:37:35 +0000 Subject: [PATCH 12/54] improve: avoid cloning CursorEvents per frame, use Arc sharing ZoomFocusInterpolator was cloning CursorEvents (Vec of cursor moves and clicks) on every frame during playback. For recordings with many cursor events, this adds unnecessary allocation pressure. 
Changes: - Add ZoomFocusInterpolator::new_arc() that accepts Arc<CursorEvents> - Playback and preview paths now use Arc sharing instead of deep cloning - Reduces per-frame allocation during playback - Renderer channel capacity reduced from 8 to 4 to reduce stale frame queuing and wasted decode work --- crates/editor/src/editor.rs | 2 +- crates/editor/src/editor_instance.rs | 4 ++-- crates/editor/src/playback.rs | 4 ++-- .../rendering/src/zoom_focus_interpolation.rs | 19 +++++++++++++++++-- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/crates/editor/src/editor.rs b/crates/editor/src/editor.rs index 012869016a6..a80d18b3992 100644 --- a/crates/editor/src/editor.rs +++ b/crates/editor/src/editor.rs @@ -55,7 +55,7 @@ impl Renderer { let total_frames = (30_f64 * max_duration).ceil() as u32; - let (tx, rx) = mpsc::channel(8); + let (tx, rx) = mpsc::channel(4); let this = Self { rx, diff --git a/crates/editor/src/editor_instance.rs b/crates/editor/src/editor_instance.rs index 16b05e33e8f..a48f65c0632 100644 --- a/crates/editor/src/editor_instance.rs +++ b/crates/editor/src/editor_instance.rs @@ -513,8 +513,8 @@ impl EditorInstance { }, ); - let zoom_focus_interpolator = ZoomFocusInterpolator::new( - &segment_medias.cursor, + let zoom_focus_interpolator = ZoomFocusInterpolator::new_arc( + segment_medias.cursor.clone(), cursor_smoothing, project.screen_movement_spring, total_duration, diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index 6ac8571a4a9..dc304ff53da 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -657,8 +657,8 @@ impl Playback { friction: cached_project.cursor.friction, }); - let zoom_focus_interpolator = ZoomFocusInterpolator::new( - &segment_media.cursor, + let zoom_focus_interpolator = ZoomFocusInterpolator::new_arc( + segment_media.cursor.clone(), cursor_smoothing, cached_project.screen_movement_spring, duration, diff --git a/crates/rendering/src/zoom_focus_interpolation.rs 
b/crates/rendering/src/zoom_focus_interpolation.rs index a622b9c7d09..c6980262540 100644 --- a/crates/rendering/src/zoom_focus_interpolation.rs +++ b/crates/rendering/src/zoom_focus_interpolation.rs @@ -16,7 +16,7 @@ struct SmoothedFocusEvent { pub struct ZoomFocusInterpolator { events: Option>, - cursor_events: CursorEvents, + cursor_events: std::sync::Arc, cursor_smoothing: Option, screen_spring: ScreenMovementSpring, duration_secs: f64, @@ -31,7 +31,22 @@ impl ZoomFocusInterpolator { ) -> Self { Self { events: None, - cursor_events: cursor_events.clone(), + cursor_events: std::sync::Arc::new(cursor_events.clone()), + cursor_smoothing, + screen_spring, + duration_secs, + } + } + + pub fn new_arc( + cursor_events: std::sync::Arc, + cursor_smoothing: Option, + screen_spring: ScreenMovementSpring, + duration_secs: f64, + ) -> Self { + Self { + events: None, + cursor_events, cursor_smoothing, screen_spring, duration_secs, From db091b95cf803b2eeb89946f037f19bb616b14f7 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:38:46 +0000 Subject: [PATCH 13/54] improve: reduce decoder/GPU timeouts and optimize GPU poll loop - Reduce decoder frame timeout from 5000ms to 2000ms (normal frames) - Reduce decoder initial seek timeout from 20000ms to 10000ms - Reduce GPU buffer wait timeout from 30s to 10s - Improve GPU readback poll loop: use progressive backoff instead of tight busy-wait (yield for first 10 polls, 100us sleep for 10-100 polls, 1ms sleep after that) - Reduces CPU usage during GPU readback waiting while maintaining low latency for fast readbacks Co-authored-by: Richie McIlroy --- crates/rendering/src/decoder/mod.rs | 4 ++-- crates/rendering/src/frame_pipeline.rs | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/crates/rendering/src/decoder/mod.rs b/crates/rendering/src/decoder/mod.rs index 77c9b540fc6..1782b5bd865 100644 --- a/crates/rendering/src/decoder/mod.rs +++ b/crates/rendering/src/decoder/mod.rs @@ -456,8 +456,8 @@ 
pub struct AsyncVideoDecoderHandle { } impl AsyncVideoDecoderHandle { - const NORMAL_TIMEOUT_MS: u64 = 5000; - const INITIAL_SEEK_TIMEOUT_MS: u64 = 20000; + const NORMAL_TIMEOUT_MS: u64 = 2000; + const INITIAL_SEEK_TIMEOUT_MS: u64 = 10000; pub async fn get_frame(&self, time: f32) -> Option { self.get_frame_with_timeout(time, Self::NORMAL_TIMEOUT_MS) diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index e2953af9445..fa811e0c96a 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -5,7 +5,7 @@ use wgpu::COPY_BYTES_PER_ROW_ALIGNMENT; use crate::{ProjectUniforms, RenderingError}; -const GPU_BUFFER_WAIT_TIMEOUT_SECS: u64 = 30; +const GPU_BUFFER_WAIT_TIMEOUT_SECS: u64 = 10; pub struct PendingReadback { rx: oneshot::Receiver>, @@ -43,8 +43,12 @@ impl PendingReadback { Err(oneshot::error::TryRecvError::Empty) => { device.poll(wgpu::PollType::Poll)?; poll_count += 1; - if poll_count.is_multiple_of(3) { + if poll_count < 10 { tokio::task::yield_now().await; + } else if poll_count < 100 { + tokio::time::sleep(std::time::Duration::from_micros(100)).await; + } else { + tokio::time::sleep(std::time::Duration::from_millis(1)).await; } if poll_count.is_multiple_of(10000) { tracing::warn!( From e230beb905ce3966b49de921528a98640c0fdc69 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:38:46 +0000 Subject: [PATCH 14/54] improve: reduce decoder/GPU timeouts and optimize GPU poll loop - Reduce decoder frame timeout from 5000ms to 2000ms (normal frames) - Reduce decoder initial seek timeout from 20000ms to 10000ms - Reduce GPU buffer wait timeout from 30s to 10s - Improve GPU readback poll loop: use progressive backoff instead of tight busy-wait (yield for first 10 polls, 100us sleep for 10-100 polls, 1ms sleep after that) - Reduces CPU usage during GPU readback waiting while maintaining low latency for fast readbacks --- 
crates/rendering/src/decoder/mod.rs | 4 ++-- crates/rendering/src/frame_pipeline.rs | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/crates/rendering/src/decoder/mod.rs b/crates/rendering/src/decoder/mod.rs index 77c9b540fc6..1782b5bd865 100644 --- a/crates/rendering/src/decoder/mod.rs +++ b/crates/rendering/src/decoder/mod.rs @@ -456,8 +456,8 @@ pub struct AsyncVideoDecoderHandle { } impl AsyncVideoDecoderHandle { - const NORMAL_TIMEOUT_MS: u64 = 5000; - const INITIAL_SEEK_TIMEOUT_MS: u64 = 20000; + const NORMAL_TIMEOUT_MS: u64 = 2000; + const INITIAL_SEEK_TIMEOUT_MS: u64 = 10000; pub async fn get_frame(&self, time: f32) -> Option { self.get_frame_with_timeout(time, Self::NORMAL_TIMEOUT_MS) diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index e2953af9445..fa811e0c96a 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -5,7 +5,7 @@ use wgpu::COPY_BYTES_PER_ROW_ALIGNMENT; use crate::{ProjectUniforms, RenderingError}; -const GPU_BUFFER_WAIT_TIMEOUT_SECS: u64 = 30; +const GPU_BUFFER_WAIT_TIMEOUT_SECS: u64 = 10; pub struct PendingReadback { rx: oneshot::Receiver>, @@ -43,8 +43,12 @@ impl PendingReadback { Err(oneshot::error::TryRecvError::Empty) => { device.poll(wgpu::PollType::Poll)?; poll_count += 1; - if poll_count.is_multiple_of(3) { + if poll_count < 10 { tokio::task::yield_now().await; + } else if poll_count < 100 { + tokio::time::sleep(std::time::Duration::from_micros(100)).await; + } else { + tokio::time::sleep(std::time::Duration::from_millis(1)).await; } if poll_count.is_multiple_of(10000) { tracing::warn!( From a700d6a0ab6481ec245c7ff09a094ceed1806d34 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:39:06 +0000 Subject: [PATCH 15/54] chore: format code with cargo fmt Co-authored-by: Richie McIlroy --- .../examples/playback-pipeline-benchmark.rs | 125 ++++++++---------- 1 file changed, 54 insertions(+), 71 deletions(-) 
diff --git a/crates/editor/examples/playback-pipeline-benchmark.rs b/crates/editor/examples/playback-pipeline-benchmark.rs index c1524f5faa4..8e04f349d87 100644 --- a/crates/editor/examples/playback-pipeline-benchmark.rs +++ b/crates/editor/examples/playback-pipeline-benchmark.rs @@ -71,7 +71,10 @@ impl PipelineTimings { let total_time: f64 = self.total_ms.iter().sum(); let effective_fps = self.frames_rendered as f64 / (total_time / 1000.0); println!(" Effective FPS: {effective_fps:.1}"); - println!(" Total time: {total_time:.0}ms for {} frames", self.frames_rendered); + println!( + " Total time: {total_time:.0}ms for {} frames", + self.frames_rendered + ); } println!(); @@ -106,11 +109,10 @@ async fn load_recording( let timeline_segments = match meta.as_ref() { StudioRecordingMeta::SingleSegment { segment } => { let display_path = recording_meta.path(&segment.display.path); - let duration = - match cap_rendering::Video::new(&display_path, 0.0) { - Ok(v) => v.duration, - Err(_) => 5.0, - }; + let duration = match cap_rendering::Video::new(&display_path, 0.0) { + Ok(v) => v.duration, + Err(_) => 5.0, + }; vec![TimelineSegment { recording_clip: 0, start: 0.0, @@ -183,13 +185,14 @@ async fn run_decode_only_benchmark( StudioRecordingMeta::MultipleSegments { inner } => inner.segments[0].display.fps, }; - let decoder = match spawn_decoder("benchmark-screen", display_path, display_fps, 0.0, false).await { - Ok(d) => d, - Err(e) => { - eprintln!("Failed to create decoder: {e}"); - return timings; - } - }; + let decoder = + match spawn_decoder("benchmark-screen", display_path, display_fps, 0.0, false).await { + Ok(d) => d, + Err(e) => { + eprintln!("Failed to create decoder: {e}"); + return timings; + } + }; println!(" Decoder type: {}", decoder.decoder_type()); println!( @@ -258,14 +261,13 @@ async fn run_full_pipeline_benchmark( render_constants.is_software_adapter ); - let segments = - match cap_editor::create_segments(recording_meta, meta, false).await { - Ok(s) => s, - 
Err(e) => { - eprintln!("Failed to create segments: {e}"); - return timings; - } - }; + let segments = match cap_editor::create_segments(recording_meta, meta, false).await { + Ok(s) => s, + Err(e) => { + eprintln!("Failed to create segments: {e}"); + return timings; + } + }; if segments.is_empty() { eprintln!("No segments found"); @@ -297,14 +299,16 @@ async fn run_full_pipeline_benchmark( .unwrap_or(10.0); let max_frames = ((duration * fps as f64).ceil() as usize).min(frame_count); - println!(" Rendering {max_frames} frames at {fps}fps, resolution base: {}x{}...", resolution_base.x, resolution_base.y); + println!( + " Rendering {max_frames} frames at {fps}fps, resolution base: {}x{}...", + resolution_base.x, resolution_base.y + ); - let cursor_smoothing = - (!project.cursor.raw).then_some(SpringMassDamperSimulationConfig { - tension: project.cursor.tension, - mass: project.cursor.mass, - friction: project.cursor.friction, - }); + let cursor_smoothing = (!project.cursor.raw).then_some(SpringMassDamperSimulationConfig { + tension: project.cursor.tension, + mass: project.cursor.mass, + friction: project.cursor.friction, + }); for i in 0..max_frames { let frame_time = i as f64 / fps as f64; @@ -332,11 +336,7 @@ async fn run_full_pipeline_benchmark( let segment_frames_opt = if i == 0 { segment_media .decoders - .get_frames_initial( - segment_time as f32, - !project.camera.hide, - clip_offsets, - ) + .get_frames_initial(segment_time as f32, !project.camera.hide, clip_offsets) .await } else { segment_media @@ -374,12 +374,7 @@ async fn run_full_pipeline_benchmark( let render_start = Instant::now(); match frame_renderer - .render( - segment_frames, - uniforms, - &segment_media.cursor, - &mut layers, - ) + .render(segment_frames, uniforms, &segment_media.cursor, &mut layers) .await { Ok(_frame) => { @@ -428,14 +423,13 @@ async fn run_scrubbing_benchmark( } }; - let segments = - match cap_editor::create_segments(recording_meta, meta, false).await { - Ok(s) => s, - Err(e) => 
{ - eprintln!("Failed to create segments: {e}"); - return timings; - } - }; + let segments = match cap_editor::create_segments(recording_meta, meta, false).await { + Ok(s) => s, + Err(e) => { + eprintln!("Failed to create segments: {e}"); + return timings; + } + }; if segments.is_empty() { eprintln!("No segments found"); @@ -466,12 +460,11 @@ async fn run_scrubbing_benchmark( .map(|t| t.duration()) .unwrap_or(10.0); - let cursor_smoothing = - (!project.cursor.raw).then_some(SpringMassDamperSimulationConfig { - tension: project.cursor.tension, - mass: project.cursor.mass, - friction: project.cursor.friction, - }); + let cursor_smoothing = (!project.cursor.raw).then_some(SpringMassDamperSimulationConfig { + tension: project.cursor.tension, + mass: project.cursor.mass, + friction: project.cursor.friction, + }); let scrub_positions: Vec = { let golden_ratio = 1.618_034; @@ -484,7 +477,10 @@ async fn run_scrubbing_benchmark( positions }; - println!(" Scrubbing to {} random positions...", scrub_positions.len()); + println!( + " Scrubbing to {} random positions...", + scrub_positions.len() + ); for (i, &scrub_time) in scrub_positions.iter().enumerate() { let Some((segment_time, segment)) = project.get_segment_time(scrub_time) else { @@ -540,12 +536,7 @@ async fn run_scrubbing_benchmark( let render_start = Instant::now(); match frame_renderer - .render( - segment_frames, - uniforms, - &segment_media.cursor, - &mut layers, - ) + .render(segment_frames, uniforms, &segment_media.cursor, &mut layers) .await { Ok(_frame) => { @@ -631,14 +622,8 @@ async fn main() { ]; println!("\n--- DECODE-ONLY BENCHMARK ---"); - let decode_timings = run_decode_only_benchmark( - &recording_meta, - meta.as_ref(), - &project, - fps, - frame_count, - ) - .await; + let decode_timings = + run_decode_only_benchmark(&recording_meta, meta.as_ref(), &project, fps, frame_count).await; decode_timings.print_report("DECODE-ONLY"); for (resolution_base, label) in &resolutions { @@ -678,9 +663,7 @@ async fn 
main() { if !decode_timings.decode_ms.is_empty() { let decode_p95 = percentile(&decode_timings.decode_ms, 95.0); let decode_budget_pct = decode_p95 / target_frame_time_ms * 100.0; - println!( - "Decode p95 ({decode_p95:.2}ms) uses {decode_budget_pct:.0}% of frame budget" - ); + println!("Decode p95 ({decode_p95:.2}ms) uses {decode_budget_pct:.0}% of frame budget"); } for (_resolution_base, label) in &resolutions { From 7f984b05ceaa63295a3b7be4f74263e255d408bf Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:39:06 +0000 Subject: [PATCH 16/54] chore: format code with cargo fmt --- .../examples/playback-pipeline-benchmark.rs | 125 ++++++++---------- 1 file changed, 54 insertions(+), 71 deletions(-) diff --git a/crates/editor/examples/playback-pipeline-benchmark.rs b/crates/editor/examples/playback-pipeline-benchmark.rs index c1524f5faa4..8e04f349d87 100644 --- a/crates/editor/examples/playback-pipeline-benchmark.rs +++ b/crates/editor/examples/playback-pipeline-benchmark.rs @@ -71,7 +71,10 @@ impl PipelineTimings { let total_time: f64 = self.total_ms.iter().sum(); let effective_fps = self.frames_rendered as f64 / (total_time / 1000.0); println!(" Effective FPS: {effective_fps:.1}"); - println!(" Total time: {total_time:.0}ms for {} frames", self.frames_rendered); + println!( + " Total time: {total_time:.0}ms for {} frames", + self.frames_rendered + ); } println!(); @@ -106,11 +109,10 @@ async fn load_recording( let timeline_segments = match meta.as_ref() { StudioRecordingMeta::SingleSegment { segment } => { let display_path = recording_meta.path(&segment.display.path); - let duration = - match cap_rendering::Video::new(&display_path, 0.0) { - Ok(v) => v.duration, - Err(_) => 5.0, - }; + let duration = match cap_rendering::Video::new(&display_path, 0.0) { + Ok(v) => v.duration, + Err(_) => 5.0, + }; vec![TimelineSegment { recording_clip: 0, start: 0.0, @@ -183,13 +185,14 @@ async fn 
run_decode_only_benchmark( StudioRecordingMeta::MultipleSegments { inner } => inner.segments[0].display.fps, }; - let decoder = match spawn_decoder("benchmark-screen", display_path, display_fps, 0.0, false).await { - Ok(d) => d, - Err(e) => { - eprintln!("Failed to create decoder: {e}"); - return timings; - } - }; + let decoder = + match spawn_decoder("benchmark-screen", display_path, display_fps, 0.0, false).await { + Ok(d) => d, + Err(e) => { + eprintln!("Failed to create decoder: {e}"); + return timings; + } + }; println!(" Decoder type: {}", decoder.decoder_type()); println!( @@ -258,14 +261,13 @@ async fn run_full_pipeline_benchmark( render_constants.is_software_adapter ); - let segments = - match cap_editor::create_segments(recording_meta, meta, false).await { - Ok(s) => s, - Err(e) => { - eprintln!("Failed to create segments: {e}"); - return timings; - } - }; + let segments = match cap_editor::create_segments(recording_meta, meta, false).await { + Ok(s) => s, + Err(e) => { + eprintln!("Failed to create segments: {e}"); + return timings; + } + }; if segments.is_empty() { eprintln!("No segments found"); @@ -297,14 +299,16 @@ async fn run_full_pipeline_benchmark( .unwrap_or(10.0); let max_frames = ((duration * fps as f64).ceil() as usize).min(frame_count); - println!(" Rendering {max_frames} frames at {fps}fps, resolution base: {}x{}...", resolution_base.x, resolution_base.y); + println!( + " Rendering {max_frames} frames at {fps}fps, resolution base: {}x{}...", + resolution_base.x, resolution_base.y + ); - let cursor_smoothing = - (!project.cursor.raw).then_some(SpringMassDamperSimulationConfig { - tension: project.cursor.tension, - mass: project.cursor.mass, - friction: project.cursor.friction, - }); + let cursor_smoothing = (!project.cursor.raw).then_some(SpringMassDamperSimulationConfig { + tension: project.cursor.tension, + mass: project.cursor.mass, + friction: project.cursor.friction, + }); for i in 0..max_frames { let frame_time = i as f64 / fps as f64; 
@@ -332,11 +336,7 @@ async fn run_full_pipeline_benchmark( let segment_frames_opt = if i == 0 { segment_media .decoders - .get_frames_initial( - segment_time as f32, - !project.camera.hide, - clip_offsets, - ) + .get_frames_initial(segment_time as f32, !project.camera.hide, clip_offsets) .await } else { segment_media @@ -374,12 +374,7 @@ async fn run_full_pipeline_benchmark( let render_start = Instant::now(); match frame_renderer - .render( - segment_frames, - uniforms, - &segment_media.cursor, - &mut layers, - ) + .render(segment_frames, uniforms, &segment_media.cursor, &mut layers) .await { Ok(_frame) => { @@ -428,14 +423,13 @@ async fn run_scrubbing_benchmark( } }; - let segments = - match cap_editor::create_segments(recording_meta, meta, false).await { - Ok(s) => s, - Err(e) => { - eprintln!("Failed to create segments: {e}"); - return timings; - } - }; + let segments = match cap_editor::create_segments(recording_meta, meta, false).await { + Ok(s) => s, + Err(e) => { + eprintln!("Failed to create segments: {e}"); + return timings; + } + }; if segments.is_empty() { eprintln!("No segments found"); @@ -466,12 +460,11 @@ async fn run_scrubbing_benchmark( .map(|t| t.duration()) .unwrap_or(10.0); - let cursor_smoothing = - (!project.cursor.raw).then_some(SpringMassDamperSimulationConfig { - tension: project.cursor.tension, - mass: project.cursor.mass, - friction: project.cursor.friction, - }); + let cursor_smoothing = (!project.cursor.raw).then_some(SpringMassDamperSimulationConfig { + tension: project.cursor.tension, + mass: project.cursor.mass, + friction: project.cursor.friction, + }); let scrub_positions: Vec = { let golden_ratio = 1.618_034; @@ -484,7 +477,10 @@ async fn run_scrubbing_benchmark( positions }; - println!(" Scrubbing to {} random positions...", scrub_positions.len()); + println!( + " Scrubbing to {} random positions...", + scrub_positions.len() + ); for (i, &scrub_time) in scrub_positions.iter().enumerate() { let Some((segment_time, segment)) = 
project.get_segment_time(scrub_time) else { @@ -540,12 +536,7 @@ async fn run_scrubbing_benchmark( let render_start = Instant::now(); match frame_renderer - .render( - segment_frames, - uniforms, - &segment_media.cursor, - &mut layers, - ) + .render(segment_frames, uniforms, &segment_media.cursor, &mut layers) .await { Ok(_frame) => { @@ -631,14 +622,8 @@ async fn main() { ]; println!("\n--- DECODE-ONLY BENCHMARK ---"); - let decode_timings = run_decode_only_benchmark( - &recording_meta, - meta.as_ref(), - &project, - fps, - frame_count, - ) - .await; + let decode_timings = + run_decode_only_benchmark(&recording_meta, meta.as_ref(), &project, fps, frame_count).await; decode_timings.print_report("DECODE-ONLY"); for (resolution_base, label) in &resolutions { @@ -678,9 +663,7 @@ async fn main() { if !decode_timings.decode_ms.is_empty() { let decode_p95 = percentile(&decode_timings.decode_ms, 95.0); let decode_budget_pct = decode_p95 / target_frame_time_ms * 100.0; - println!( - "Decode p95 ({decode_p95:.2}ms) uses {decode_budget_pct:.0}% of frame budget" - ); + println!("Decode p95 ({decode_p95:.2}ms) uses {decode_budget_pct:.0}% of frame budget"); } for (_resolution_base, label) in &resolutions { From 9aad0f3df35ea9a501952ab1c4da5ced70948f5d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:43:42 +0000 Subject: [PATCH 17/54] fix: flush pipelined GPU readback to capture last frame in exports The pipelined readback optimization returns the previous frame's readback result while submitting the current frame's readback. This means the very last frame in a render loop would be lost since no subsequent call would collect it. 
Changes: - Add flush_pending_readback() function to frame_pipeline - Add flush_pipeline() method to FrameRenderer - Call flush_pipeline() at end of render_video_to_channel() to capture the last frame during exports - For playback, losing the last frame is acceptable (playback loop manages its own frame counting) Co-authored-by: Richie McIlroy --- crates/rendering/src/frame_pipeline.rs | 11 +++++++++++ crates/rendering/src/lib.rs | 18 +++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index fa811e0c96a..13e3c472eb1 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -461,3 +461,14 @@ pub async fn finish_encoder( pending.wait(device).await } + +pub async fn flush_pending_readback( + session: &mut RenderSession, + device: &wgpu::Device, +) -> Option> { + if let Some(pending) = session.pipelined_readback.take_pending() { + Some(pending.wait(device).await) + } else { + None + } +} diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index 7956b0582ad..6c63d295188 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -7,7 +7,7 @@ use composite_frame::CompositeVideoFrameUniforms; use core::f64; use cursor_interpolation::{InterpolatedCursorPosition, interpolate_cursor}; use decoder::{AsyncVideoDecoderHandle, spawn_decoder}; -use frame_pipeline::{RenderSession, finish_encoder}; +use frame_pipeline::{RenderSession, finish_encoder, flush_pending_readback}; use futures::FutureExt; use futures::future::OptionFuture; use layers::{ @@ -532,6 +532,14 @@ pub async fn render_video_to_channel( sender.send((frame, current_frame_number)).await?; } + if let Some(Ok(final_frame)) = frame_renderer.flush_pipeline().await { + if final_frame.width > 0 && final_frame.height > 0 { + sender + .send((final_frame, frame_number.saturating_sub(1))) + .await?; + } + } + let total_time = 
start_time.elapsed(); println!( "Render complete. Processed {frame_number} frames in {:?} seconds", @@ -1875,6 +1883,14 @@ impl<'a> FrameRenderer<'a> { Err(last_error.unwrap_or(RenderingError::BufferMapWaitingFailed)) } + + pub async fn flush_pipeline(&mut self) -> Option> { + if let Some(session) = &mut self.session { + flush_pending_readback(session, &self.constants.device).await + } else { + None + } + } } pub struct RendererLayers { From 1effdbd92e99c11b7687aed4af2dad0c78bb4f45 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:43:42 +0000 Subject: [PATCH 18/54] fix: flush pipelined GPU readback to capture last frame in exports The pipelined readback optimization returns the previous frame's readback result while submitting the current frame's readback. This means the very last frame in a render loop would be lost since no subsequent call would collect it. Changes: - Add flush_pending_readback() function to frame_pipeline - Add flush_pipeline() method to FrameRenderer - Call flush_pipeline() at end of render_video_to_channel() to capture the last frame during exports - For playback, losing the last frame is acceptable (playback loop manages its own frame counting) --- crates/rendering/src/frame_pipeline.rs | 11 +++++++++++ crates/rendering/src/lib.rs | 18 +++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index fa811e0c96a..13e3c472eb1 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -461,3 +461,14 @@ pub async fn finish_encoder( pending.wait(device).await } + +pub async fn flush_pending_readback( + session: &mut RenderSession, + device: &wgpu::Device, +) -> Option> { + if let Some(pending) = session.pipelined_readback.take_pending() { + Some(pending.wait(device).await) + } else { + None + } +} diff --git 
a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index 7956b0582ad..6c63d295188 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -7,7 +7,7 @@ use composite_frame::CompositeVideoFrameUniforms; use core::f64; use cursor_interpolation::{InterpolatedCursorPosition, interpolate_cursor}; use decoder::{AsyncVideoDecoderHandle, spawn_decoder}; -use frame_pipeline::{RenderSession, finish_encoder}; +use frame_pipeline::{RenderSession, finish_encoder, flush_pending_readback}; use futures::FutureExt; use futures::future::OptionFuture; use layers::{ @@ -532,6 +532,14 @@ pub async fn render_video_to_channel( sender.send((frame, current_frame_number)).await?; } + if let Some(Ok(final_frame)) = frame_renderer.flush_pipeline().await { + if final_frame.width > 0 && final_frame.height > 0 { + sender + .send((final_frame, frame_number.saturating_sub(1))) + .await?; + } + } + let total_time = start_time.elapsed(); println!( "Render complete. Processed {frame_number} frames in {:?} seconds", @@ -1875,6 +1883,14 @@ impl<'a> FrameRenderer<'a> { Err(last_error.unwrap_or(RenderingError::BufferMapWaitingFailed)) } + + pub async fn flush_pipeline(&mut self) -> Option> { + if let Some(session) = &mut self.session { + flush_pending_readback(session, &self.constants.device).await + } else { + None + } + } } pub struct RendererLayers { From c5fc32c4ab83dea3ef40090f1e926de377591ab4 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:48:06 +0000 Subject: [PATCH 19/54] improve: add playback performance instrumentation logging Adds periodic (every 2s) tracing::info! 
logs during playback that report: - Effective FPS (frames rendered / elapsed time) - Total frames rendered and skipped - Cache hit count (frames served from LRU cache) - Prefetch hit count (frames from prefetch buffer) - Sync decode count (frames decoded synchronously on-demand) - Current prefetch buffer utilization This data helps diagnose playback performance issues in real-world usage without requiring the benchmark tool. Co-authored-by: Richie McIlroy --- crates/editor/src/playback.rs | 38 +++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index dc304ff53da..90c1790f215 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -385,7 +385,12 @@ impl Playback { let aggressive_skip_threshold = 6u32; let mut total_frames_rendered = 0u64; - let mut _total_frames_skipped = 0u64; + let mut total_frames_skipped = 0u64; + let mut cache_hits = 0u64; + let mut prefetch_hits = 0u64; + let mut sync_decodes = 0u64; + let mut last_stats_time = Instant::now(); + let stats_interval = Duration::from_secs(2); let warmup_target_frames = 10usize; let warmup_after_first_timeout = Duration::from_millis(500); @@ -486,6 +491,7 @@ impl Playback { let segment_frames_opt = if let Some(cached) = frame_cache.get(frame_number) { was_cached = true; + cache_hits += 1; Some(cached) } else { let prefetched_idx = prefetch_buffer @@ -494,6 +500,7 @@ impl Playback { if let Some(idx) = prefetched_idx { let prefetched = prefetch_buffer.remove(idx).unwrap(); + prefetch_hits += 1; Some(( Arc::new(prefetched.segment_frames), prefetched.segment_index, @@ -549,7 +556,7 @@ impl Playback { )) } else { frame_number = frame_number.saturating_add(1); - _total_frames_skipped += 1; + total_frames_skipped += 1; continue; } } @@ -571,12 +578,12 @@ impl Playback { } else { prefetch_buffer.push_back(prefetched); frame_number = frame_number.saturating_add(1); - _total_frames_skipped += 
1; + total_frames_skipped += 1; continue; } } else { frame_number = frame_number.saturating_add(1); - _total_frames_skipped += 1; + total_frames_skipped += 1; continue; } } else { @@ -617,7 +624,7 @@ impl Playback { guard.remove(&frame_number); } frame_number = frame_number.saturating_add(1); - _total_frames_skipped += 1; + total_frames_skipped += 1; continue; }, data = segment_media @@ -630,6 +637,7 @@ impl Playback { }, }; + sync_decodes += 1; data.map(|frames| (Arc::new(frames), segment.recording_clip)) } } @@ -687,6 +695,24 @@ impl Playback { total_frames_rendered += 1; } + if last_stats_time.elapsed() >= stats_interval { + let effective_fps = total_frames_rendered as f64 + / start.elapsed().as_secs_f64().max(0.001); + let recent_rendered = total_frames_rendered; + let buffer_len = prefetch_buffer.len(); + info!( + effective_fps = format!("{:.1}", effective_fps), + rendered = recent_rendered, + skipped = total_frames_skipped, + cache_hits = cache_hits, + prefetch_hits = prefetch_hits, + sync_decodes = sync_decodes, + prefetch_buffer = buffer_len, + "Playback stats" + ); + last_stats_time = Instant::now(); + } + event_tx.send(PlaybackEvent::Frame(frame_number)).ok(); frame_number = frame_number.saturating_add(1); @@ -712,7 +738,7 @@ impl Playback { let skipped = frames_behind.saturating_sub(1); if skipped > 0 { frame_number += skipped; - _total_frames_skipped += skipped as u64; + total_frames_skipped += skipped as u64; prefetch_buffer.retain(|p| p.frame_number >= frame_number); let _ = frame_request_tx.send(frame_number); From 53e315924667d3665e24fa6ab7fce25883011340 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:48:06 +0000 Subject: [PATCH 20/54] improve: add playback performance instrumentation logging Adds periodic (every 2s) tracing::info! 
logs during playback that report: - Effective FPS (frames rendered / elapsed time) - Total frames rendered and skipped - Cache hit count (frames served from LRU cache) - Prefetch hit count (frames from prefetch buffer) - Sync decode count (frames decoded synchronously on-demand) - Current prefetch buffer utilization This data helps diagnose playback performance issues in real-world usage without requiring the benchmark tool. --- crates/editor/src/playback.rs | 38 +++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index dc304ff53da..90c1790f215 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -385,7 +385,12 @@ impl Playback { let aggressive_skip_threshold = 6u32; let mut total_frames_rendered = 0u64; - let mut _total_frames_skipped = 0u64; + let mut total_frames_skipped = 0u64; + let mut cache_hits = 0u64; + let mut prefetch_hits = 0u64; + let mut sync_decodes = 0u64; + let mut last_stats_time = Instant::now(); + let stats_interval = Duration::from_secs(2); let warmup_target_frames = 10usize; let warmup_after_first_timeout = Duration::from_millis(500); @@ -486,6 +491,7 @@ impl Playback { let segment_frames_opt = if let Some(cached) = frame_cache.get(frame_number) { was_cached = true; + cache_hits += 1; Some(cached) } else { let prefetched_idx = prefetch_buffer @@ -494,6 +500,7 @@ impl Playback { if let Some(idx) = prefetched_idx { let prefetched = prefetch_buffer.remove(idx).unwrap(); + prefetch_hits += 1; Some(( Arc::new(prefetched.segment_frames), prefetched.segment_index, @@ -549,7 +556,7 @@ impl Playback { )) } else { frame_number = frame_number.saturating_add(1); - _total_frames_skipped += 1; + total_frames_skipped += 1; continue; } } @@ -571,12 +578,12 @@ impl Playback { } else { prefetch_buffer.push_back(prefetched); frame_number = frame_number.saturating_add(1); - _total_frames_skipped += 1; + total_frames_skipped += 
1; continue; } } else { frame_number = frame_number.saturating_add(1); - _total_frames_skipped += 1; + total_frames_skipped += 1; continue; } } else { @@ -617,7 +624,7 @@ impl Playback { guard.remove(&frame_number); } frame_number = frame_number.saturating_add(1); - _total_frames_skipped += 1; + total_frames_skipped += 1; continue; }, data = segment_media @@ -630,6 +637,7 @@ impl Playback { }, }; + sync_decodes += 1; data.map(|frames| (Arc::new(frames), segment.recording_clip)) } } @@ -687,6 +695,24 @@ impl Playback { total_frames_rendered += 1; } + if last_stats_time.elapsed() >= stats_interval { + let effective_fps = total_frames_rendered as f64 + / start.elapsed().as_secs_f64().max(0.001); + let recent_rendered = total_frames_rendered; + let buffer_len = prefetch_buffer.len(); + info!( + effective_fps = format!("{:.1}", effective_fps), + rendered = recent_rendered, + skipped = total_frames_skipped, + cache_hits = cache_hits, + prefetch_hits = prefetch_hits, + sync_decodes = sync_decodes, + prefetch_buffer = buffer_len, + "Playback stats" + ); + last_stats_time = Instant::now(); + } + event_tx.send(PlaybackEvent::Frame(frame_number)).ok(); frame_number = frame_number.saturating_add(1); @@ -712,7 +738,7 @@ impl Playback { let skipped = frames_behind.saturating_sub(1); if skipped > 0 { frame_number += skipped; - _total_frames_skipped += skipped as u64; + total_frames_skipped += skipped as u64; prefetch_buffer.retain(|p| p.frame_number >= frame_number); let _ = frame_request_tx.send(frame_number); From ee41523300a2962702e7d67856b536d21b13ee7c Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:53:14 +0000 Subject: [PATCH 21/54] improve: send NV12 frames over WebSocket to reduce bandwidth ~62% MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert RGBA frames to NV12 format before sending over WebSocket, reducing per-frame data from width*height*4 bytes (RGBA) to width*height*1.5 bytes (NV12) — a 62.5% 
bandwidth reduction. At half-res (1248x702) this reduces per-frame size from ~3.5MB to ~1.3MB, dropping bandwidth needs from ~210MB/s to ~78MB/s at 60fps. Changes: - Add WSFrameFormat enum (Rgba/Nv12) to WSFrame struct - Add WSFrame::from_rendered_frame_nv12() constructor that converts RGBA data to NV12 using the existing convert_to_nv12() function - Add pack_ws_frame() helper that selects correct packing format - Editor window frame callbacks now use NV12 format - Screenshot editor and camera legacy paths remain RGBA - WebSocket stats logging now reports actual format (NV12/RGBA) The frontend already has full NV12 support via WebGPU compute shader (renderNv12FrameWebGPU) and Canvas2D fallback, so no frontend changes are needed. Co-authored-by: Richie McIlroy --- apps/desktop/src-tauri/src/camera_legacy.rs | 1 + apps/desktop/src-tauri/src/editor_window.rs | 34 ++++----- apps/desktop/src-tauri/src/frame_ws.rs | 75 ++++++++++++++----- .../src-tauri/src/screenshot_editor.rs | 1 + 4 files changed, 75 insertions(+), 36 deletions(-) diff --git a/apps/desktop/src-tauri/src/camera_legacy.rs b/apps/desktop/src-tauri/src/camera_legacy.rs index ac980a7d1e0..012582207eb 100644 --- a/apps/desktop/src-tauri/src/camera_legacy.rs +++ b/apps/desktop/src-tauri/src/camera_legacy.rs @@ -66,6 +66,7 @@ pub async fn create_camera_preview_ws() -> (Sender, u16, Cance stride: frame.stride(0) as u32, frame_number: 0, target_time_ns: 0, + format: crate::frame_ws::WSFrameFormat::Rgba, created_at: Instant::now(), }) .ok(); diff --git a/apps/desktop/src-tauri/src/editor_window.rs b/apps/desktop/src-tauri/src/editor_window.rs index 87c68a916e4..17288ff82eb 100644 --- a/apps/desktop/src-tauri/src/editor_window.rs +++ b/apps/desktop/src-tauri/src/editor_window.rs @@ -28,15 +28,14 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { &app, path, Box::new(move |frame| { - let _ = frame_tx.send(Some(WSFrame { - data: frame.data, - width: frame.width, - height: frame.height, - 
stride: frame.padded_bytes_per_row, - frame_number: frame.frame_number, - target_time_ns: frame.target_time_ns, - created_at: Instant::now(), - })); + let _ = frame_tx.send(Some(WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.padded_bytes_per_row, + frame.frame_number, + frame.target_time_ns, + ))); }), ) .await?; @@ -219,15 +218,14 @@ impl EditorInstances { window.app_handle(), path, Box::new(move |frame| { - let _ = frame_tx.send(Some(WSFrame { - data: frame.data, - width: frame.width, - height: frame.height, - stride: frame.padded_bytes_per_row, - frame_number: frame.frame_number, - target_time_ns: frame.target_time_ns, - created_at: Instant::now(), - })); + let _ = frame_tx.send(Some(WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.padded_bytes_per_row, + frame.frame_number, + frame.target_time_ns, + ))); }), ) .await?; diff --git a/apps/desktop/src-tauri/src/frame_ws.rs b/apps/desktop/src-tauri/src/frame_ws.rs index 66c9afe20c0..e275d0ca689 100644 --- a/apps/desktop/src-tauri/src/frame_ws.rs +++ b/apps/desktop/src-tauri/src/frame_ws.rs @@ -121,6 +121,12 @@ fn pack_frame_data( data } +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum WSFrameFormat { + Rgba, + Nv12, +} + #[derive(Clone)] pub struct WSFrame { pub data: Vec, @@ -129,10 +135,54 @@ pub struct WSFrame { pub stride: u32, pub frame_number: u32, pub target_time_ns: u64, + pub format: WSFrameFormat, #[allow(dead_code)] pub created_at: Instant, } +impl WSFrame { + pub fn from_rendered_frame_nv12( + data: Vec, + width: u32, + height: u32, + stride: u32, + frame_number: u32, + target_time_ns: u64, + ) -> Self { + let nv12_data = convert_to_nv12(&data, width, height, stride); + Self { + data: nv12_data, + width: width & !1, + height: height & !1, + stride: width & !1, + frame_number, + target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + } + } +} + +fn pack_ws_frame(frame: WSFrame) -> Vec { + match 
frame.format { + WSFrameFormat::Nv12 => pack_nv12_frame( + frame.data, + frame.width, + frame.height, + frame.frame_number, + frame.target_time_ns, + ), + WSFrameFormat::Rgba => pack_frame_data( + frame.data, + frame.stride, + frame.height, + frame.width, + frame.frame_number, + frame.target_time_ns, + ), + } +} + pub async fn create_watch_frame_ws( frame_rx: watch::Receiver>, ) -> (u16, CancellationToken) { @@ -162,14 +212,7 @@ pub async fn create_watch_frame_ws( { let frame_opt = camera_rx.borrow().clone(); if let Some(frame) = frame_opt { - let packed = pack_frame_data( - frame.data, - frame.stride, - frame.height, - frame.width, - frame.frame_number, - frame.target_time_ns, - ); + let packed = pack_ws_frame(frame); if let Err(e) = socket.send(Message::Binary(packed)).await { tracing::error!("Failed to send initial frame to socket: {:?}", e); @@ -198,16 +241,12 @@ pub async fn create_watch_frame_ws( if let Some(frame) = frame_opt { let width = frame.width; let height = frame.height; + let format_label = match frame.format { + WSFrameFormat::Nv12 => "NV12", + WSFrameFormat::Rgba => "RGBA", + }; - let packed = pack_frame_data( - frame.data, - frame.stride, - frame.height, - frame.width, - frame.frame_number, - frame.target_time_ns, - ); - + let packed = pack_ws_frame(frame); let packed_len = packed.len(); match socket.send(Message::Binary(packed)).await { @@ -226,7 +265,7 @@ pub async fn create_watch_frame_ws( mb_per_sec = format!("{:.1}", mb_per_sec), avg_kb = format!("{:.1}", (total_bytes as f64 / total_frames.max(1) as f64) / 1024.0), dims = format!("{}x{}", width, height), - format = "RGBA", + format = format_label, "WS frame stats" ); } diff --git a/apps/desktop/src-tauri/src/screenshot_editor.rs b/apps/desktop/src-tauri/src/screenshot_editor.rs index 042dda22f4c..01af75df033 100644 --- a/apps/desktop/src-tauri/src/screenshot_editor.rs +++ b/apps/desktop/src-tauri/src/screenshot_editor.rs @@ -388,6 +388,7 @@ impl ScreenshotEditorInstances { stride: 
frame.padded_bytes_per_row, frame_number: frame.frame_number, target_time_ns: frame.target_time_ns, + format: crate::frame_ws::WSFrameFormat::Rgba, created_at: Instant::now(), })); } From ee8bfda6b2edc23ea2e569447f2f1669e8fb9709 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:53:14 +0000 Subject: [PATCH 22/54] improve: send NV12 frames over WebSocket to reduce bandwidth ~62% MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert RGBA frames to NV12 format before sending over WebSocket, reducing per-frame data from width*height*4 bytes (RGBA) to width*height*1.5 bytes (NV12) — a 62.5% bandwidth reduction. At half-res (1248x702) this reduces per-frame size from ~3.5MB to ~1.3MB, dropping bandwidth needs from ~210MB/s to ~78MB/s at 60fps. Changes: - Add WSFrameFormat enum (Rgba/Nv12) to WSFrame struct - Add WSFrame::from_rendered_frame_nv12() constructor that converts RGBA data to NV12 using the existing convert_to_nv12() function - Add pack_ws_frame() helper that selects correct packing format - Editor window frame callbacks now use NV12 format - Screenshot editor and camera legacy paths remain RGBA - WebSocket stats logging now reports actual format (NV12/RGBA) The frontend already has full NV12 support via WebGPU compute shader (renderNv12FrameWebGPU) and Canvas2D fallback, so no frontend changes are needed. 
--- apps/desktop/src-tauri/src/camera_legacy.rs | 1 + apps/desktop/src-tauri/src/editor_window.rs | 34 ++++----- apps/desktop/src-tauri/src/frame_ws.rs | 75 ++++++++++++++----- .../src-tauri/src/screenshot_editor.rs | 1 + 4 files changed, 75 insertions(+), 36 deletions(-) diff --git a/apps/desktop/src-tauri/src/camera_legacy.rs b/apps/desktop/src-tauri/src/camera_legacy.rs index ac980a7d1e0..012582207eb 100644 --- a/apps/desktop/src-tauri/src/camera_legacy.rs +++ b/apps/desktop/src-tauri/src/camera_legacy.rs @@ -66,6 +66,7 @@ pub async fn create_camera_preview_ws() -> (Sender, u16, Cance stride: frame.stride(0) as u32, frame_number: 0, target_time_ns: 0, + format: crate::frame_ws::WSFrameFormat::Rgba, created_at: Instant::now(), }) .ok(); diff --git a/apps/desktop/src-tauri/src/editor_window.rs b/apps/desktop/src-tauri/src/editor_window.rs index 87c68a916e4..17288ff82eb 100644 --- a/apps/desktop/src-tauri/src/editor_window.rs +++ b/apps/desktop/src-tauri/src/editor_window.rs @@ -28,15 +28,14 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { &app, path, Box::new(move |frame| { - let _ = frame_tx.send(Some(WSFrame { - data: frame.data, - width: frame.width, - height: frame.height, - stride: frame.padded_bytes_per_row, - frame_number: frame.frame_number, - target_time_ns: frame.target_time_ns, - created_at: Instant::now(), - })); + let _ = frame_tx.send(Some(WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.padded_bytes_per_row, + frame.frame_number, + frame.target_time_ns, + ))); }), ) .await?; @@ -219,15 +218,14 @@ impl EditorInstances { window.app_handle(), path, Box::new(move |frame| { - let _ = frame_tx.send(Some(WSFrame { - data: frame.data, - width: frame.width, - height: frame.height, - stride: frame.padded_bytes_per_row, - frame_number: frame.frame_number, - target_time_ns: frame.target_time_ns, - created_at: Instant::now(), - })); + let _ = frame_tx.send(Some(WSFrame::from_rendered_frame_nv12( + 
frame.data, + frame.width, + frame.height, + frame.padded_bytes_per_row, + frame.frame_number, + frame.target_time_ns, + ))); }), ) .await?; diff --git a/apps/desktop/src-tauri/src/frame_ws.rs b/apps/desktop/src-tauri/src/frame_ws.rs index 66c9afe20c0..e275d0ca689 100644 --- a/apps/desktop/src-tauri/src/frame_ws.rs +++ b/apps/desktop/src-tauri/src/frame_ws.rs @@ -121,6 +121,12 @@ fn pack_frame_data( data } +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum WSFrameFormat { + Rgba, + Nv12, +} + #[derive(Clone)] pub struct WSFrame { pub data: Vec, @@ -129,10 +135,54 @@ pub struct WSFrame { pub stride: u32, pub frame_number: u32, pub target_time_ns: u64, + pub format: WSFrameFormat, #[allow(dead_code)] pub created_at: Instant, } +impl WSFrame { + pub fn from_rendered_frame_nv12( + data: Vec, + width: u32, + height: u32, + stride: u32, + frame_number: u32, + target_time_ns: u64, + ) -> Self { + let nv12_data = convert_to_nv12(&data, width, height, stride); + Self { + data: nv12_data, + width: width & !1, + height: height & !1, + stride: width & !1, + frame_number, + target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + } + } +} + +fn pack_ws_frame(frame: WSFrame) -> Vec { + match frame.format { + WSFrameFormat::Nv12 => pack_nv12_frame( + frame.data, + frame.width, + frame.height, + frame.frame_number, + frame.target_time_ns, + ), + WSFrameFormat::Rgba => pack_frame_data( + frame.data, + frame.stride, + frame.height, + frame.width, + frame.frame_number, + frame.target_time_ns, + ), + } +} + pub async fn create_watch_frame_ws( frame_rx: watch::Receiver>, ) -> (u16, CancellationToken) { @@ -162,14 +212,7 @@ pub async fn create_watch_frame_ws( { let frame_opt = camera_rx.borrow().clone(); if let Some(frame) = frame_opt { - let packed = pack_frame_data( - frame.data, - frame.stride, - frame.height, - frame.width, - frame.frame_number, - frame.target_time_ns, - ); + let packed = pack_ws_frame(frame); if let Err(e) = 
socket.send(Message::Binary(packed)).await { tracing::error!("Failed to send initial frame to socket: {:?}", e); @@ -198,16 +241,12 @@ pub async fn create_watch_frame_ws( if let Some(frame) = frame_opt { let width = frame.width; let height = frame.height; + let format_label = match frame.format { + WSFrameFormat::Nv12 => "NV12", + WSFrameFormat::Rgba => "RGBA", + }; - let packed = pack_frame_data( - frame.data, - frame.stride, - frame.height, - frame.width, - frame.frame_number, - frame.target_time_ns, - ); - + let packed = pack_ws_frame(frame); let packed_len = packed.len(); match socket.send(Message::Binary(packed)).await { @@ -226,7 +265,7 @@ pub async fn create_watch_frame_ws( mb_per_sec = format!("{:.1}", mb_per_sec), avg_kb = format!("{:.1}", (total_bytes as f64 / total_frames.max(1) as f64) / 1024.0), dims = format!("{}x{}", width, height), - format = "RGBA", + format = format_label, "WS frame stats" ); } diff --git a/apps/desktop/src-tauri/src/screenshot_editor.rs b/apps/desktop/src-tauri/src/screenshot_editor.rs index 042dda22f4c..01af75df033 100644 --- a/apps/desktop/src-tauri/src/screenshot_editor.rs +++ b/apps/desktop/src-tauri/src/screenshot_editor.rs @@ -388,6 +388,7 @@ impl ScreenshotEditorInstances { stride: frame.padded_bytes_per_row, frame_number: frame.frame_number, target_time_ns: frame.target_time_ns, + format: crate::frame_ws::WSFrameFormat::Rgba, created_at: Instant::now(), })); } From b659ef5f9e67f1ed453ac15e001fff7eb8807986 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:54:21 +0000 Subject: [PATCH 23/54] improve: add smart frame cache eviction for better memory usage Add evict_far_from() method to FrameCache that removes entries far from the current playhead position. Called when aggressive frame skipping occurs (playback falls behind) to prevent the cache from holding stale frames that will never be used again. 
This keeps the LRU cache focused on frames near the current playhead, improving cache hit rates during normal sequential playback. Co-authored-by: Richie McIlroy --- crates/editor/src/playback.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index 90c1790f215..b59648e0b15 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -100,6 +100,25 @@ impl FrameCache { self.cache .put(frame_number, (segment_frames, segment_index)); } + + fn evict_far_from(&mut self, current_frame: u32, max_distance: u32) { + let keys_to_remove: Vec = self + .cache + .iter() + .filter_map(|(k, _)| { + let distance = if *k > current_frame { + *k - current_frame + } else { + current_frame - *k + }; + if distance > max_distance { Some(*k) } else { None } + }) + .collect(); + + for key in keys_to_remove { + self.cache.pop(&key); + } + } } impl Playback { @@ -741,6 +760,7 @@ impl Playback { total_frames_skipped += skipped as u64; prefetch_buffer.retain(|p| p.frame_number >= frame_number); + frame_cache.evict_far_from(frame_number, MAX_PREFETCH_AHEAD); let _ = frame_request_tx.send(frame_number); let _ = playback_position_tx.send(frame_number); if has_audio From c262261ad6a73f2fe89b713cb4738bbc97c1b028 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:54:21 +0000 Subject: [PATCH 24/54] improve: add smart frame cache eviction for better memory usage Add evict_far_from() method to FrameCache that removes entries far from the current playhead position. Called when aggressive frame skipping occurs (playback falls behind) to prevent the cache from holding stale frames that will never be used again. This keeps the LRU cache focused on frames near the current playhead, improving cache hit rates during normal sequential playback. 
--- crates/editor/src/playback.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index 90c1790f215..b59648e0b15 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -100,6 +100,25 @@ impl FrameCache { self.cache .put(frame_number, (segment_frames, segment_index)); } + + fn evict_far_from(&mut self, current_frame: u32, max_distance: u32) { + let keys_to_remove: Vec = self + .cache + .iter() + .filter_map(|(k, _)| { + let distance = if *k > current_frame { + *k - current_frame + } else { + current_frame - *k + }; + if distance > max_distance { Some(*k) } else { None } + }) + .collect(); + + for key in keys_to_remove { + self.cache.pop(&key); + } + } } impl Playback { @@ -741,6 +760,7 @@ impl Playback { total_frames_skipped += skipped as u64; prefetch_buffer.retain(|p| p.frame_number >= frame_number); + frame_cache.evict_far_from(frame_number, MAX_PREFETCH_AHEAD); let _ = frame_request_tx.send(frame_number); let _ = playback_position_tx.send(frame_number); if has_audio From 0d021bd3733f02be82025e41ca4a2eaefc4316f0 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 18:57:43 +0000 Subject: [PATCH 25/54] chore: format code with cargo fmt Co-authored-by: Richie McIlroy --- crates/editor/src/playback.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index b59648e0b15..4f64aeef86b 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -111,7 +111,11 @@ impl FrameCache { } else { current_frame - *k }; - if distance > max_distance { Some(*k) } else { None } + if distance > max_distance { + Some(*k) + } else { + None + } }) .collect(); @@ -715,8 +719,8 @@ impl Playback { } if last_stats_time.elapsed() >= stats_interval { - let effective_fps = total_frames_rendered as f64 - / start.elapsed().as_secs_f64().max(0.001); + let 
effective_fps = + total_frames_rendered as f64 / start.elapsed().as_secs_f64().max(0.001); let recent_rendered = total_frames_rendered; let buffer_len = prefetch_buffer.len(); info!( From de63d9a74d9f02f18b5b53daccab03082cf804ca Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 18:57:43 +0000 Subject: [PATCH 26/54] chore: format code with cargo fmt --- crates/editor/src/playback.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index b59648e0b15..4f64aeef86b 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -111,7 +111,11 @@ impl FrameCache { } else { current_frame - *k }; - if distance > max_distance { Some(*k) } else { None } + if distance > max_distance { + Some(*k) + } else { + None + } }) .collect(); @@ -715,8 +719,8 @@ impl Playback { } if last_stats_time.elapsed() >= stats_interval { - let effective_fps = total_frames_rendered as f64 - / start.elapsed().as_secs_f64().max(0.001); + let effective_fps = + total_frames_rendered as f64 / start.elapsed().as_secs_f64().max(0.001); let recent_rendered = total_frames_rendered; let buffer_len = prefetch_buffer.len(); info!( From 324d49b9ff1ad843d8ea29063451ce1b425ddaec Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 19:01:14 +0000 Subject: [PATCH 27/54] improve: batch camera layer YUV conversion into shared GPU encoder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the camera layer created its own command encoder and called queue.submit() for both YUV→RGBA conversion and texture copy — 2 extra GPU submissions per frame when camera is visible. 
Changes: - Add copy_from_yuv_output_to_encoder() that copies using external encoder - Add prepare_with_encoder() to CameraLayer that uses batched convert_nv12_to_encoder/convert_yuv420p_to_encoder methods - Update RendererLayers::prepare_with_encoder() to use camera's batched path for both camera and camera_only layers Result: When camera is visible, saves 2-4 queue.submit() calls per frame, reducing GPU overhead. Co-authored-by: Richie McIlroy --- crates/rendering/src/layers/camera.rs | 182 ++++++++++++++++++++++++++ crates/rendering/src/lib.rs | 6 +- 2 files changed, 186 insertions(+), 2 deletions(-) diff --git a/crates/rendering/src/layers/camera.rs b/crates/rendering/src/layers/camera.rs index 700bf52c412..7a6b04d212b 100644 --- a/crates/rendering/src/layers/camera.rs +++ b/crates/rendering/src/layers/camera.rs @@ -289,6 +289,188 @@ impl CameraLayer { } } + fn copy_from_yuv_output_to_encoder( + &self, + encoder: &mut wgpu::CommandEncoder, + next_texture: usize, + frame_size: XY, + ) { + if let Some(output_texture) = self.yuv_converter.output_texture() { + encoder.copy_texture_to_texture( + wgpu::TexelCopyTextureInfo { + texture: output_texture, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + wgpu::TexelCopyTextureInfo { + texture: &self.frame_textures[next_texture], + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + wgpu::Extent3d { + width: frame_size.x, + height: frame_size.y, + depth_or_array_layers: 1, + }, + ); + } + } + + pub fn prepare_with_encoder( + &mut self, + device: &wgpu::Device, + queue: &wgpu::Queue, + uniforms: Option, + frame_data: Option<(XY, &DecodedFrame, f32)>, + encoder: &mut wgpu::CommandEncoder, + ) { + let Some(uniforms) = uniforms else { + self.hidden = true; + return; + }; + + let has_previous_frame = self.last_recording_time.is_some(); + self.hidden = frame_data.is_none() && !has_previous_frame; + + queue.write_buffer(&self.uniforms_buffer, 0, 
bytemuck::cast_slice(&[uniforms])); + + let Some((frame_size, camera_frame, recording_time)) = frame_data else { + return; + }; + + let format = camera_frame.format(); + + let is_same_frame = self + .last_recording_time + .is_some_and(|last| (last - recording_time).abs() < 0.001); + + if !is_same_frame { + let next_texture = 1 - self.current_texture; + + if self.frame_textures[next_texture].width() != frame_size.x + || self.frame_textures[next_texture].height() != frame_size.y + { + self.frame_textures[next_texture] = + CompositeVideoFramePipeline::create_frame_texture( + device, + frame_size.x, + frame_size.y, + ); + self.frame_texture_views[next_texture] = + self.frame_textures[next_texture].create_view(&Default::default()); + + self.bind_groups[next_texture] = Some(self.pipeline.bind_group( + device, + &self.uniforms_buffer, + &self.frame_texture_views[next_texture], + )); + } + + match format { + PixelFormat::Rgba => { + let frame_data_bytes = camera_frame.data(); + let src_bytes_per_row = frame_size.x * 4; + + queue.write_texture( + wgpu::TexelCopyTextureInfo { + texture: &self.frame_textures[next_texture], + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + frame_data_bytes, + wgpu::TexelCopyBufferLayout { + offset: 0, + bytes_per_row: Some(src_bytes_per_row), + rows_per_image: Some(frame_size.y), + }, + wgpu::Extent3d { + width: frame_size.x, + height: frame_size.y, + depth_or_array_layers: 1, + }, + ); + } + PixelFormat::Nv12 => { + if let Err(e) = self.yuv_converter.prepare_for_dimensions( + device, + frame_size.x, + frame_size.y, + ) { + tracing::warn!(error = %e, "YUV converter prepare failed"); + return; + } + + if let (Some(y_data), Some(uv_data)) = + (camera_frame.y_plane(), camera_frame.uv_plane()) + && self + .yuv_converter + .convert_nv12_to_encoder( + device, + queue, + encoder, + y_data, + uv_data, + frame_size.x, + frame_size.y, + camera_frame.y_stride(), + camera_frame.uv_stride(), + ) + .is_ok() + { + 
self.copy_from_yuv_output_to_encoder( + encoder, + next_texture, + frame_size, + ); + } + } + PixelFormat::Yuv420p => { + if let Err(e) = self.yuv_converter.prepare_for_dimensions( + device, + frame_size.x, + frame_size.y, + ) { + tracing::warn!(error = %e, "YUV converter prepare failed"); + return; + } + + if let (Some(y_data), Some(u_data), Some(v_data)) = ( + camera_frame.y_plane(), + camera_frame.u_plane(), + camera_frame.v_plane(), + ) && self + .yuv_converter + .convert_yuv420p_to_encoder( + device, + queue, + encoder, + y_data, + u_data, + v_data, + frame_size.x, + frame_size.y, + camera_frame.y_stride(), + camera_frame.uv_stride(), + ) + .is_ok() + { + self.copy_from_yuv_output_to_encoder( + encoder, + next_texture, + frame_size, + ); + } + } + } + + self.last_recording_time = Some(recording_time); + self.current_texture = next_texture; + } + } + pub fn copy_to_texture(&mut self, _encoder: &mut wgpu::CommandEncoder) {} pub fn render(&self, pass: &mut wgpu::RenderPass<'_>) { diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index 6c63d295188..dbcf768c04e 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -2067,7 +2067,7 @@ impl RendererLayers { constants, ); - self.camera.prepare( + self.camera.prepare_with_encoder( &constants.device, &constants.queue, uniforms.camera, @@ -2077,9 +2077,10 @@ impl RendererLayers { .as_ref() .map(|frame| (size, frame, segment_frames.recording_time)) }), + encoder, ); - self.camera_only.prepare( + self.camera_only.prepare_with_encoder( &constants.device, &constants.queue, uniforms.camera_only, @@ -2089,6 +2090,7 @@ impl RendererLayers { .as_ref() .map(|frame| (size, frame, segment_frames.recording_time)) }), + encoder, ); self.text.prepare( From e2955540e30d4522bdd7c3ea6331ca04a7f3db61 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 19:01:14 +0000 Subject: [PATCH 28/54] improve: batch camera layer YUV 
conversion into shared GPU encoder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the camera layer created its own command encoder and called queue.submit() for both YUV→RGBA conversion and texture copy — 2 extra GPU submissions per frame when camera is visible. Changes: - Add copy_from_yuv_output_to_encoder() that copies using external encoder - Add prepare_with_encoder() to CameraLayer that uses batched convert_nv12_to_encoder/convert_yuv420p_to_encoder methods - Update RendererLayers::prepare_with_encoder() to use camera's batched path for both camera and camera_only layers Result: When camera is visible, saves 2-4 queue.submit() calls per frame, reducing GPU overhead. --- crates/rendering/src/layers/camera.rs | 182 ++++++++++++++++++++++++++ crates/rendering/src/lib.rs | 6 +- 2 files changed, 186 insertions(+), 2 deletions(-) diff --git a/crates/rendering/src/layers/camera.rs b/crates/rendering/src/layers/camera.rs index 700bf52c412..7a6b04d212b 100644 --- a/crates/rendering/src/layers/camera.rs +++ b/crates/rendering/src/layers/camera.rs @@ -289,6 +289,188 @@ impl CameraLayer { } } + fn copy_from_yuv_output_to_encoder( + &self, + encoder: &mut wgpu::CommandEncoder, + next_texture: usize, + frame_size: XY, + ) { + if let Some(output_texture) = self.yuv_converter.output_texture() { + encoder.copy_texture_to_texture( + wgpu::TexelCopyTextureInfo { + texture: output_texture, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + wgpu::TexelCopyTextureInfo { + texture: &self.frame_textures[next_texture], + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + wgpu::Extent3d { + width: frame_size.x, + height: frame_size.y, + depth_or_array_layers: 1, + }, + ); + } + } + + pub fn prepare_with_encoder( + &mut self, + device: &wgpu::Device, + queue: &wgpu::Queue, + uniforms: Option, + frame_data: Option<(XY, &DecodedFrame, f32)>, + encoder: &mut 
wgpu::CommandEncoder, + ) { + let Some(uniforms) = uniforms else { + self.hidden = true; + return; + }; + + let has_previous_frame = self.last_recording_time.is_some(); + self.hidden = frame_data.is_none() && !has_previous_frame; + + queue.write_buffer(&self.uniforms_buffer, 0, bytemuck::cast_slice(&[uniforms])); + + let Some((frame_size, camera_frame, recording_time)) = frame_data else { + return; + }; + + let format = camera_frame.format(); + + let is_same_frame = self + .last_recording_time + .is_some_and(|last| (last - recording_time).abs() < 0.001); + + if !is_same_frame { + let next_texture = 1 - self.current_texture; + + if self.frame_textures[next_texture].width() != frame_size.x + || self.frame_textures[next_texture].height() != frame_size.y + { + self.frame_textures[next_texture] = + CompositeVideoFramePipeline::create_frame_texture( + device, + frame_size.x, + frame_size.y, + ); + self.frame_texture_views[next_texture] = + self.frame_textures[next_texture].create_view(&Default::default()); + + self.bind_groups[next_texture] = Some(self.pipeline.bind_group( + device, + &self.uniforms_buffer, + &self.frame_texture_views[next_texture], + )); + } + + match format { + PixelFormat::Rgba => { + let frame_data_bytes = camera_frame.data(); + let src_bytes_per_row = frame_size.x * 4; + + queue.write_texture( + wgpu::TexelCopyTextureInfo { + texture: &self.frame_textures[next_texture], + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + frame_data_bytes, + wgpu::TexelCopyBufferLayout { + offset: 0, + bytes_per_row: Some(src_bytes_per_row), + rows_per_image: Some(frame_size.y), + }, + wgpu::Extent3d { + width: frame_size.x, + height: frame_size.y, + depth_or_array_layers: 1, + }, + ); + } + PixelFormat::Nv12 => { + if let Err(e) = self.yuv_converter.prepare_for_dimensions( + device, + frame_size.x, + frame_size.y, + ) { + tracing::warn!(error = %e, "YUV converter prepare failed"); + return; + } + + if let (Some(y_data), 
Some(uv_data)) = + (camera_frame.y_plane(), camera_frame.uv_plane()) + && self + .yuv_converter + .convert_nv12_to_encoder( + device, + queue, + encoder, + y_data, + uv_data, + frame_size.x, + frame_size.y, + camera_frame.y_stride(), + camera_frame.uv_stride(), + ) + .is_ok() + { + self.copy_from_yuv_output_to_encoder( + encoder, + next_texture, + frame_size, + ); + } + } + PixelFormat::Yuv420p => { + if let Err(e) = self.yuv_converter.prepare_for_dimensions( + device, + frame_size.x, + frame_size.y, + ) { + tracing::warn!(error = %e, "YUV converter prepare failed"); + return; + } + + if let (Some(y_data), Some(u_data), Some(v_data)) = ( + camera_frame.y_plane(), + camera_frame.u_plane(), + camera_frame.v_plane(), + ) && self + .yuv_converter + .convert_yuv420p_to_encoder( + device, + queue, + encoder, + y_data, + u_data, + v_data, + frame_size.x, + frame_size.y, + camera_frame.y_stride(), + camera_frame.uv_stride(), + ) + .is_ok() + { + self.copy_from_yuv_output_to_encoder( + encoder, + next_texture, + frame_size, + ); + } + } + } + + self.last_recording_time = Some(recording_time); + self.current_texture = next_texture; + } + } + pub fn copy_to_texture(&mut self, _encoder: &mut wgpu::CommandEncoder) {} pub fn render(&self, pass: &mut wgpu::RenderPass<'_>) { diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index 6c63d295188..dbcf768c04e 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -2067,7 +2067,7 @@ impl RendererLayers { constants, ); - self.camera.prepare( + self.camera.prepare_with_encoder( &constants.device, &constants.queue, uniforms.camera, @@ -2077,9 +2077,10 @@ impl RendererLayers { .as_ref() .map(|frame| (size, frame, segment_frames.recording_time)) }), + encoder, ); - self.camera_only.prepare( + self.camera_only.prepare_with_encoder( &constants.device, &constants.queue, uniforms.camera_only, @@ -2089,6 +2090,7 @@ impl RendererLayers { .as_ref() .map(|frame| (size, frame, 
segment_frames.recording_time)) }), + encoder, ); self.text.prepare( From c1d922923643d3c9e91fa84ece69d981283bd3ac Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 19:04:52 +0000 Subject: [PATCH 29/54] =?UTF-8?q?improve:=20optimize=20RGBA=E2=86=92NV12?= =?UTF-8?q?=20conversion=20with=20better=20cache=20access=20pattern?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite convert_to_nv12() for better performance: - Separate Y and UV computation with row-level bounds checking (one check per row instead of per-pixel) - Use row slices for sequential memory access - Process UV pairs in a tight while loop instead of branching per pixel - Use bit shift (>>1) instead of division for averaging - More cache-friendly: process each row's Y values contiguously, then UV values for even rows This conversion runs on every frame in the frame callback, so reducing its cost directly improves sustained FPS. Co-authored-by: Richie McIlroy --- apps/desktop/src-tauri/src/frame_ws.rs | 101 ++++++++++++------------- crates/rendering/src/layers/camera.rs | 12 +-- 2 files changed, 49 insertions(+), 64 deletions(-) diff --git a/apps/desktop/src-tauri/src/frame_ws.rs b/apps/desktop/src-tauri/src/frame_ws.rs index e275d0ca689..cff525e823d 100644 --- a/apps/desktop/src-tauri/src/frame_ws.rs +++ b/apps/desktop/src-tauri/src/frame_ws.rs @@ -10,72 +10,65 @@ static LAST_LOG_TIME: AtomicU64 = AtomicU64::new(0); const NV12_FORMAT_MAGIC: u32 = 0x4e563132; fn convert_to_nv12(data: &[u8], width: u32, height: u32, stride: u32) -> Vec { - let width = width & !1; - let height = height & !1; + let width = (width & !1) as usize; + let height = (height & !1) as usize; if width == 0 || height == 0 { return Vec::new(); } - let y_stride = width; - let uv_stride = width; - let y_size = (y_stride * height) as usize; - let uv_size = (uv_stride * (height / 2)) as usize; - let total_size = y_size + uv_size; - - let stride_bytes = stride as usize; + let 
y_size = width * height; + let uv_size = width * (height / 2); + let stride = stride as usize; - let mut output = vec![0u8; total_size]; + let mut output = vec![0u8; y_size + uv_size]; let (y_plane, uv_plane) = output.split_at_mut(y_size); - for y in 0..height as usize { - let src_row = y * stride_bytes; + for row in 0..height { + let src_offset = row * stride; + let y_offset = row * width; - if src_row >= data.len() { + if src_offset + width * 4 > data.len() { continue; } - let y_row_start = y * y_stride as usize; - let is_uv_row = y % 2 == 0; - let uv_row_start = if is_uv_row { - (y / 2) * uv_stride as usize - } else { - 0 - }; - - for x in 0..width as usize { - let px = src_row + x * 4; - - if px + 2 < data.len() { - let r = data[px] as i32; - let g = data[px + 1] as i32; - let b = data[px + 2] as i32; - - let y_val = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; - y_plane[y_row_start + x] = y_val.clamp(0, 255) as u8; - - if is_uv_row && x % 2 == 0 && x + 1 < width as usize { - let px1 = src_row + (x + 1) * 4; - - let (r1, g1, b1) = if px1 + 2 < data.len() { - (data[px1] as i32, data[px1 + 1] as i32, data[px1 + 2] as i32) - } else { - (r, g, b) - }; - - let avg_r = (r + r1) / 2; - let avg_g = (g + g1) / 2; - let avg_b = (b + b1) / 2; - - let u = ((-38 * avg_r - 74 * avg_g + 112 * avg_b + 128) >> 8) + 128; - let v = ((112 * avg_r - 94 * avg_g - 18 * avg_b + 128) >> 8) + 128; - - let uv_idx = uv_row_start + x; - if uv_idx + 1 < uv_plane.len() { - uv_plane[uv_idx] = u.clamp(0, 255) as u8; - uv_plane[uv_idx + 1] = v.clamp(0, 255) as u8; - } - } + let src_row = &data[src_offset..]; + let y_row = &mut y_plane[y_offset..y_offset + width]; + + for x in 0..width { + let px = x * 4; + let r = src_row[px] as i32; + let g = src_row[px + 1] as i32; + let b = src_row[px + 2] as i32; + y_row[x] = (((66 * r + 129 * g + 25 * b + 128) >> 8) + 16).min(255) as u8; + } + + if row % 2 == 0 { + let uv_offset = (row / 2) * width; + let uv_row = &mut uv_plane[uv_offset..uv_offset + 
width]; + + let mut x = 0; + while x < width { + let px0 = x * 4; + let px1 = (x + 1) * 4; + + let r0 = src_row[px0] as i32; + let g0 = src_row[px0 + 1] as i32; + let b0 = src_row[px0 + 2] as i32; + let r1 = src_row[px1] as i32; + let g1 = src_row[px1 + 1] as i32; + let b1 = src_row[px1 + 2] as i32; + + let avg_r = (r0 + r1) >> 1; + let avg_g = (g0 + g1) >> 1; + let avg_b = (b0 + b1) >> 1; + + uv_row[x] = (((-38 * avg_r - 74 * avg_g + 112 * avg_b + 128) >> 8) + 128) + .clamp(0, 255) as u8; + uv_row[x + 1] = (((112 * avg_r - 94 * avg_g - 18 * avg_b + 128) >> 8) + 128) + .clamp(0, 255) as u8; + + x += 2; } } } diff --git a/crates/rendering/src/layers/camera.rs b/crates/rendering/src/layers/camera.rs index 7a6b04d212b..01c130d5f0e 100644 --- a/crates/rendering/src/layers/camera.rs +++ b/crates/rendering/src/layers/camera.rs @@ -420,11 +420,7 @@ impl CameraLayer { ) .is_ok() { - self.copy_from_yuv_output_to_encoder( - encoder, - next_texture, - frame_size, - ); + self.copy_from_yuv_output_to_encoder(encoder, next_texture, frame_size); } } PixelFormat::Yuv420p => { @@ -457,11 +453,7 @@ impl CameraLayer { ) .is_ok() { - self.copy_from_yuv_output_to_encoder( - encoder, - next_texture, - frame_size, - ); + self.copy_from_yuv_output_to_encoder(encoder, next_texture, frame_size); } } } From 8daed82808ab90c218b4c049e38aa89236a29f76 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 19:04:52 +0000 Subject: [PATCH 30/54] =?UTF-8?q?improve:=20optimize=20RGBA=E2=86=92NV12?= =?UTF-8?q?=20conversion=20with=20better=20cache=20access=20pattern?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite convert_to_nv12() for better performance: - Separate Y and UV computation with row-level bounds checking (one check per row instead of per-pixel) - Use row slices for sequential memory access - Process UV pairs in a tight while loop instead of branching per pixel - Use bit shift 
(>>1) instead of division for averaging - More cache-friendly: process each row's Y values contiguously, then UV values for even rows This conversion runs on every frame in the frame callback, so reducing its cost directly improves sustained FPS. --- apps/desktop/src-tauri/src/frame_ws.rs | 101 ++++++++++++------------- crates/rendering/src/layers/camera.rs | 12 +-- 2 files changed, 49 insertions(+), 64 deletions(-) diff --git a/apps/desktop/src-tauri/src/frame_ws.rs b/apps/desktop/src-tauri/src/frame_ws.rs index e275d0ca689..cff525e823d 100644 --- a/apps/desktop/src-tauri/src/frame_ws.rs +++ b/apps/desktop/src-tauri/src/frame_ws.rs @@ -10,72 +10,65 @@ static LAST_LOG_TIME: AtomicU64 = AtomicU64::new(0); const NV12_FORMAT_MAGIC: u32 = 0x4e563132; fn convert_to_nv12(data: &[u8], width: u32, height: u32, stride: u32) -> Vec { - let width = width & !1; - let height = height & !1; + let width = (width & !1) as usize; + let height = (height & !1) as usize; if width == 0 || height == 0 { return Vec::new(); } - let y_stride = width; - let uv_stride = width; - let y_size = (y_stride * height) as usize; - let uv_size = (uv_stride * (height / 2)) as usize; - let total_size = y_size + uv_size; - - let stride_bytes = stride as usize; + let y_size = width * height; + let uv_size = width * (height / 2); + let stride = stride as usize; - let mut output = vec![0u8; total_size]; + let mut output = vec![0u8; y_size + uv_size]; let (y_plane, uv_plane) = output.split_at_mut(y_size); - for y in 0..height as usize { - let src_row = y * stride_bytes; + for row in 0..height { + let src_offset = row * stride; + let y_offset = row * width; - if src_row >= data.len() { + if src_offset + width * 4 > data.len() { continue; } - let y_row_start = y * y_stride as usize; - let is_uv_row = y % 2 == 0; - let uv_row_start = if is_uv_row { - (y / 2) * uv_stride as usize - } else { - 0 - }; - - for x in 0..width as usize { - let px = src_row + x * 4; - - if px + 2 < data.len() { - let r = data[px] as 
i32; - let g = data[px + 1] as i32; - let b = data[px + 2] as i32; - - let y_val = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; - y_plane[y_row_start + x] = y_val.clamp(0, 255) as u8; - - if is_uv_row && x % 2 == 0 && x + 1 < width as usize { - let px1 = src_row + (x + 1) * 4; - - let (r1, g1, b1) = if px1 + 2 < data.len() { - (data[px1] as i32, data[px1 + 1] as i32, data[px1 + 2] as i32) - } else { - (r, g, b) - }; - - let avg_r = (r + r1) / 2; - let avg_g = (g + g1) / 2; - let avg_b = (b + b1) / 2; - - let u = ((-38 * avg_r - 74 * avg_g + 112 * avg_b + 128) >> 8) + 128; - let v = ((112 * avg_r - 94 * avg_g - 18 * avg_b + 128) >> 8) + 128; - - let uv_idx = uv_row_start + x; - if uv_idx + 1 < uv_plane.len() { - uv_plane[uv_idx] = u.clamp(0, 255) as u8; - uv_plane[uv_idx + 1] = v.clamp(0, 255) as u8; - } - } + let src_row = &data[src_offset..]; + let y_row = &mut y_plane[y_offset..y_offset + width]; + + for x in 0..width { + let px = x * 4; + let r = src_row[px] as i32; + let g = src_row[px + 1] as i32; + let b = src_row[px + 2] as i32; + y_row[x] = (((66 * r + 129 * g + 25 * b + 128) >> 8) + 16).min(255) as u8; + } + + if row % 2 == 0 { + let uv_offset = (row / 2) * width; + let uv_row = &mut uv_plane[uv_offset..uv_offset + width]; + + let mut x = 0; + while x < width { + let px0 = x * 4; + let px1 = (x + 1) * 4; + + let r0 = src_row[px0] as i32; + let g0 = src_row[px0 + 1] as i32; + let b0 = src_row[px0 + 2] as i32; + let r1 = src_row[px1] as i32; + let g1 = src_row[px1 + 1] as i32; + let b1 = src_row[px1 + 2] as i32; + + let avg_r = (r0 + r1) >> 1; + let avg_g = (g0 + g1) >> 1; + let avg_b = (b0 + b1) >> 1; + + uv_row[x] = (((-38 * avg_r - 74 * avg_g + 112 * avg_b + 128) >> 8) + 128) + .clamp(0, 255) as u8; + uv_row[x + 1] = (((112 * avg_r - 94 * avg_g - 18 * avg_b + 128) >> 8) + 128) + .clamp(0, 255) as u8; + + x += 2; } } } diff --git a/crates/rendering/src/layers/camera.rs b/crates/rendering/src/layers/camera.rs index 7a6b04d212b..01c130d5f0e 100644 --- 
a/crates/rendering/src/layers/camera.rs +++ b/crates/rendering/src/layers/camera.rs @@ -420,11 +420,7 @@ impl CameraLayer { ) .is_ok() { - self.copy_from_yuv_output_to_encoder( - encoder, - next_texture, - frame_size, - ); + self.copy_from_yuv_output_to_encoder(encoder, next_texture, frame_size); } } PixelFormat::Yuv420p => { @@ -457,11 +453,7 @@ impl CameraLayer { ) .is_ok() { - self.copy_from_yuv_output_to_encoder( - encoder, - next_texture, - frame_size, - ); + self.copy_from_yuv_output_to_encoder(encoder, next_texture, frame_size); } } } From ab623ee5932a9a64fc48ab8faaf7c52096eb562e Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 19:08:01 +0000 Subject: [PATCH 31/54] fix: address clippy warnings - collapsible if and abs_diff - Use let-chain syntax for collapsible if in flush_pipeline export path (required by workspace clippy deny rule) - Use abs_diff() instead of manual absolute difference pattern in frame cache eviction Co-authored-by: Richie McIlroy --- crates/editor/src/playback.rs | 7 +------ crates/rendering/src/lib.rs | 13 +++++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index 4f64aeef86b..dbedf9f8fd0 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -106,12 +106,7 @@ impl FrameCache { .cache .iter() .filter_map(|(k, _)| { - let distance = if *k > current_frame { - *k - current_frame - } else { - current_frame - *k - }; - if distance > max_distance { + if (*k).abs_diff(current_frame) > max_distance { Some(*k) } else { None diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index dbcf768c04e..1bd37aa807b 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -532,12 +532,13 @@ pub async fn render_video_to_channel( sender.send((frame, current_frame_number)).await?; } - if let Some(Ok(final_frame)) = frame_renderer.flush_pipeline().await { - if final_frame.width > 0 && 
final_frame.height > 0 { - sender - .send((final_frame, frame_number.saturating_sub(1))) - .await?; - } + if let Some(Ok(final_frame)) = frame_renderer.flush_pipeline().await + && final_frame.width > 0 + && final_frame.height > 0 + { + sender + .send((final_frame, frame_number.saturating_sub(1))) + .await?; } let total_time = start_time.elapsed(); From bf15251289f0d382b5b883c7a2f5e37205053920 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 19:08:01 +0000 Subject: [PATCH 32/54] fix: address clippy warnings - collapsible if and abs_diff - Use let-chain syntax for collapsible if in flush_pipeline export path (required by workspace clippy deny rule) - Use abs_diff() instead of manual absolute difference pattern in frame cache eviction --- crates/editor/src/playback.rs | 7 +------ crates/rendering/src/lib.rs | 13 +++++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index 4f64aeef86b..dbedf9f8fd0 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -106,12 +106,7 @@ impl FrameCache { .cache .iter() .filter_map(|(k, _)| { - let distance = if *k > current_frame { - *k - current_frame - } else { - current_frame - *k - }; - if distance > max_distance { + if (*k).abs_diff(current_frame) > max_distance { Some(*k) } else { None diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index dbcf768c04e..1bd37aa807b 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -532,12 +532,13 @@ pub async fn render_video_to_channel( sender.send((frame, current_frame_number)).await?; } - if let Some(Ok(final_frame)) = frame_renderer.flush_pipeline().await { - if final_frame.width > 0 && final_frame.height > 0 { - sender - .send((final_frame, frame_number.saturating_sub(1))) - .await?; - } + if let Some(Ok(final_frame)) = frame_renderer.flush_pipeline().await + && 
final_frame.width > 0 + && final_frame.height > 0 + { + sender + .send((final_frame, frame_number.saturating_sub(1))) + .await?; } let total_time = start_time.elapsed(); From 6fbab5c0294ec8a0da357ac253238d432b7b3db0 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 19:13:57 +0000 Subject: [PATCH 33/54] improve: use Arc in watch channel to avoid per-frame deep clone Previously, the WebSocket handler cloned the entire WSFrame (~1.3MB of NV12 data) on every frame from the watch channel. Now the watch channel stores Arc, so the clone is just an atomic reference count increment. Changes: - Change create_watch_frame_ws to accept watch::Receiver>> - Add pack_ws_frame_ref/pack_nv12_frame_ref/pack_frame_data_ref that take &WSFrame references instead of consuming ownership - Update editor_window.rs to wrap WSFrame in Arc before sending - Update screenshot_editor.rs to wrap WSFrame in Arc before sending - Broadcast-based create_frame_ws (camera_legacy) unchanged as broadcast handles cloning differently This eliminates ~1.3MB of allocation per frame in the WS handler, reducing GC pressure and improving frame delivery throughput. 
Co-authored-by: Richie McIlroy --- apps/desktop/src-tauri/src/editor_window.rs | 32 ++++---- apps/desktop/src-tauri/src/frame_ws.rs | 81 ++++++++++++++++--- .../src-tauri/src/screenshot_editor.rs | 4 +- 3 files changed, 92 insertions(+), 25 deletions(-) diff --git a/apps/desktop/src-tauri/src/editor_window.rs b/apps/desktop/src-tauri/src/editor_window.rs index 17288ff82eb..7af0230041b 100644 --- a/apps/desktop/src-tauri/src/editor_window.rs +++ b/apps/desktop/src-tauri/src/editor_window.rs @@ -28,13 +28,15 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { &app, path, Box::new(move |frame| { - let _ = frame_tx.send(Some(WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.padded_bytes_per_row, - frame.frame_number, - frame.target_time_ns, + let _ = frame_tx.send(Some(std::sync::Arc::new( + WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.padded_bytes_per_row, + frame.frame_number, + frame.target_time_ns, + ), ))); }), ) @@ -218,13 +220,15 @@ impl EditorInstances { window.app_handle(), path, Box::new(move |frame| { - let _ = frame_tx.send(Some(WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.padded_bytes_per_row, - frame.frame_number, - frame.target_time_ns, + let _ = frame_tx.send(Some(std::sync::Arc::new( + WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.padded_bytes_per_row, + frame.frame_number, + frame.target_time_ns, + ), ))); }), ) diff --git a/apps/desktop/src-tauri/src/frame_ws.rs b/apps/desktop/src-tauri/src/frame_ws.rs index cff525e823d..f31f8e8e018 100644 --- a/apps/desktop/src-tauri/src/frame_ws.rs +++ b/apps/desktop/src-tauri/src/frame_ws.rs @@ -114,6 +114,45 @@ fn pack_frame_data( data } +fn pack_nv12_frame_ref( + data: &[u8], + width: u32, + height: u32, + frame_number: u32, + target_time_ns: u64, +) -> Vec { + let y_stride = width; + let metadata_size = 28; + let mut output = 
Vec::with_capacity(data.len() + metadata_size); + output.extend_from_slice(data); + output.extend_from_slice(&y_stride.to_le_bytes()); + output.extend_from_slice(&height.to_le_bytes()); + output.extend_from_slice(&width.to_le_bytes()); + output.extend_from_slice(&frame_number.to_le_bytes()); + output.extend_from_slice(&target_time_ns.to_le_bytes()); + output.extend_from_slice(&NV12_FORMAT_MAGIC.to_le_bytes()); + output +} + +fn pack_frame_data_ref( + data: &[u8], + stride: u32, + height: u32, + width: u32, + frame_number: u32, + target_time_ns: u64, +) -> Vec { + let metadata_size = 24; + let mut output = Vec::with_capacity(data.len() + metadata_size); + output.extend_from_slice(data); + output.extend_from_slice(&stride.to_le_bytes()); + output.extend_from_slice(&height.to_le_bytes()); + output.extend_from_slice(&width.to_le_bytes()); + output.extend_from_slice(&frame_number.to_le_bytes()); + output.extend_from_slice(&target_time_ns.to_le_bytes()); + output +} + #[derive(Clone, Copy, PartialEq, Eq)] pub enum WSFrameFormat { Rgba, @@ -176,8 +215,28 @@ fn pack_ws_frame(frame: WSFrame) -> Vec { } } +fn pack_ws_frame_ref(frame: &WSFrame) -> Vec { + match frame.format { + WSFrameFormat::Nv12 => pack_nv12_frame_ref( + &frame.data, + frame.width, + frame.height, + frame.frame_number, + frame.target_time_ns, + ), + WSFrameFormat::Rgba => pack_frame_data_ref( + &frame.data, + frame.stride, + frame.height, + frame.width, + frame.frame_number, + frame.target_time_ns, + ), + } +} + pub async fn create_watch_frame_ws( - frame_rx: watch::Receiver>, + frame_rx: watch::Receiver>>, ) -> (u16, CancellationToken) { use axum::{ extract::{ @@ -188,7 +247,7 @@ pub async fn create_watch_frame_ws( routing::get, }; - type RouterState = watch::Receiver>; + type RouterState = watch::Receiver>>; #[axum::debug_handler] async fn ws_handler( @@ -198,15 +257,19 @@ pub async fn create_watch_frame_ws( ws.on_upgrade(move |socket| handle_socket(socket, state)) } - async fn handle_socket(mut socket: 
WebSocket, mut camera_rx: watch::Receiver>) { + async fn handle_socket( + mut socket: WebSocket, + mut camera_rx: watch::Receiver>>, + ) { tracing::info!("Socket connection established"); let now = std::time::Instant::now(); { - let frame_opt = camera_rx.borrow().clone(); - if let Some(frame) = frame_opt { - let packed = pack_ws_frame(frame); + let borrowed = camera_rx.borrow(); + if let Some(frame) = borrowed.as_deref() { + let packed = pack_ws_frame_ref(frame); + drop(borrowed); if let Err(e) = socket.send(Message::Binary(packed)).await { tracing::error!("Failed to send initial frame to socket: {:?}", e); return; @@ -230,8 +293,8 @@ pub async fn create_watch_frame_ws( } }, _ = camera_rx.changed() => { - let frame_opt = camera_rx.borrow_and_update().clone(); - if let Some(frame) = frame_opt { + let frame_arc = camera_rx.borrow_and_update().clone(); + if let Some(ref frame) = frame_arc { let width = frame.width; let height = frame.height; let format_label = match frame.format { @@ -239,7 +302,7 @@ pub async fn create_watch_frame_ws( WSFrameFormat::Rgba => "RGBA", }; - let packed = pack_ws_frame(frame); + let packed = pack_ws_frame_ref(frame); let packed_len = packed.len(); match socket.send(Message::Binary(packed)).await { diff --git a/apps/desktop/src-tauri/src/screenshot_editor.rs b/apps/desktop/src-tauri/src/screenshot_editor.rs index 01af75df033..c95624c59b6 100644 --- a/apps/desktop/src-tauri/src/screenshot_editor.rs +++ b/apps/desktop/src-tauri/src/screenshot_editor.rs @@ -381,7 +381,7 @@ impl ScreenshotEditorInstances { match rendered_frame { Ok(frame) => { - let _ = frame_tx.send(Some(WSFrame { + let _ = frame_tx.send(Some(std::sync::Arc::new(WSFrame { data: frame.data, width: frame.width, height: frame.height, @@ -390,7 +390,7 @@ impl ScreenshotEditorInstances { target_time_ns: frame.target_time_ns, format: crate::frame_ws::WSFrameFormat::Rgba, created_at: Instant::now(), - })); + }))); } Err(e) => { tracing::error!("Failed to render screenshot frame: 
{e}"); From 22a64a2d498433fe0e4b6ff503f7b6d99a6d93ed Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 19:13:57 +0000 Subject: [PATCH 34/54] improve: use Arc in watch channel to avoid per-frame deep clone Previously, the WebSocket handler cloned the entire WSFrame (~1.3MB of NV12 data) on every frame from the watch channel. Now the watch channel stores Arc, so the clone is just an atomic reference count increment. Changes: - Change create_watch_frame_ws to accept watch::Receiver>> - Add pack_ws_frame_ref/pack_nv12_frame_ref/pack_frame_data_ref that take &WSFrame references instead of consuming ownership - Update editor_window.rs to wrap WSFrame in Arc before sending - Update screenshot_editor.rs to wrap WSFrame in Arc before sending - Broadcast-based create_frame_ws (camera_legacy) unchanged as broadcast handles cloning differently This eliminates ~1.3MB of allocation per frame in the WS handler, reducing GC pressure and improving frame delivery throughput. 
--- apps/desktop/src-tauri/src/editor_window.rs | 32 ++++---- apps/desktop/src-tauri/src/frame_ws.rs | 81 ++++++++++++++++--- .../src-tauri/src/screenshot_editor.rs | 4 +- 3 files changed, 92 insertions(+), 25 deletions(-) diff --git a/apps/desktop/src-tauri/src/editor_window.rs b/apps/desktop/src-tauri/src/editor_window.rs index 17288ff82eb..7af0230041b 100644 --- a/apps/desktop/src-tauri/src/editor_window.rs +++ b/apps/desktop/src-tauri/src/editor_window.rs @@ -28,13 +28,15 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { &app, path, Box::new(move |frame| { - let _ = frame_tx.send(Some(WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.padded_bytes_per_row, - frame.frame_number, - frame.target_time_ns, + let _ = frame_tx.send(Some(std::sync::Arc::new( + WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.padded_bytes_per_row, + frame.frame_number, + frame.target_time_ns, + ), ))); }), ) @@ -218,13 +220,15 @@ impl EditorInstances { window.app_handle(), path, Box::new(move |frame| { - let _ = frame_tx.send(Some(WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.padded_bytes_per_row, - frame.frame_number, - frame.target_time_ns, + let _ = frame_tx.send(Some(std::sync::Arc::new( + WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.padded_bytes_per_row, + frame.frame_number, + frame.target_time_ns, + ), ))); }), ) diff --git a/apps/desktop/src-tauri/src/frame_ws.rs b/apps/desktop/src-tauri/src/frame_ws.rs index cff525e823d..f31f8e8e018 100644 --- a/apps/desktop/src-tauri/src/frame_ws.rs +++ b/apps/desktop/src-tauri/src/frame_ws.rs @@ -114,6 +114,45 @@ fn pack_frame_data( data } +fn pack_nv12_frame_ref( + data: &[u8], + width: u32, + height: u32, + frame_number: u32, + target_time_ns: u64, +) -> Vec { + let y_stride = width; + let metadata_size = 28; + let mut output = Vec::with_capacity(data.len() + 
metadata_size); + output.extend_from_slice(data); + output.extend_from_slice(&y_stride.to_le_bytes()); + output.extend_from_slice(&height.to_le_bytes()); + output.extend_from_slice(&width.to_le_bytes()); + output.extend_from_slice(&frame_number.to_le_bytes()); + output.extend_from_slice(&target_time_ns.to_le_bytes()); + output.extend_from_slice(&NV12_FORMAT_MAGIC.to_le_bytes()); + output +} + +fn pack_frame_data_ref( + data: &[u8], + stride: u32, + height: u32, + width: u32, + frame_number: u32, + target_time_ns: u64, +) -> Vec { + let metadata_size = 24; + let mut output = Vec::with_capacity(data.len() + metadata_size); + output.extend_from_slice(data); + output.extend_from_slice(&stride.to_le_bytes()); + output.extend_from_slice(&height.to_le_bytes()); + output.extend_from_slice(&width.to_le_bytes()); + output.extend_from_slice(&frame_number.to_le_bytes()); + output.extend_from_slice(&target_time_ns.to_le_bytes()); + output +} + #[derive(Clone, Copy, PartialEq, Eq)] pub enum WSFrameFormat { Rgba, @@ -176,8 +215,28 @@ fn pack_ws_frame(frame: WSFrame) -> Vec { } } +fn pack_ws_frame_ref(frame: &WSFrame) -> Vec { + match frame.format { + WSFrameFormat::Nv12 => pack_nv12_frame_ref( + &frame.data, + frame.width, + frame.height, + frame.frame_number, + frame.target_time_ns, + ), + WSFrameFormat::Rgba => pack_frame_data_ref( + &frame.data, + frame.stride, + frame.height, + frame.width, + frame.frame_number, + frame.target_time_ns, + ), + } +} + pub async fn create_watch_frame_ws( - frame_rx: watch::Receiver>, + frame_rx: watch::Receiver>>, ) -> (u16, CancellationToken) { use axum::{ extract::{ @@ -188,7 +247,7 @@ pub async fn create_watch_frame_ws( routing::get, }; - type RouterState = watch::Receiver>; + type RouterState = watch::Receiver>>; #[axum::debug_handler] async fn ws_handler( @@ -198,15 +257,19 @@ pub async fn create_watch_frame_ws( ws.on_upgrade(move |socket| handle_socket(socket, state)) } - async fn handle_socket(mut socket: WebSocket, mut camera_rx: 
watch::Receiver>) { + async fn handle_socket( + mut socket: WebSocket, + mut camera_rx: watch::Receiver>>, + ) { tracing::info!("Socket connection established"); let now = std::time::Instant::now(); { - let frame_opt = camera_rx.borrow().clone(); - if let Some(frame) = frame_opt { - let packed = pack_ws_frame(frame); + let borrowed = camera_rx.borrow(); + if let Some(frame) = borrowed.as_deref() { + let packed = pack_ws_frame_ref(frame); + drop(borrowed); if let Err(e) = socket.send(Message::Binary(packed)).await { tracing::error!("Failed to send initial frame to socket: {:?}", e); return; @@ -230,8 +293,8 @@ pub async fn create_watch_frame_ws( } }, _ = camera_rx.changed() => { - let frame_opt = camera_rx.borrow_and_update().clone(); - if let Some(frame) = frame_opt { + let frame_arc = camera_rx.borrow_and_update().clone(); + if let Some(ref frame) = frame_arc { let width = frame.width; let height = frame.height; let format_label = match frame.format { @@ -239,7 +302,7 @@ pub async fn create_watch_frame_ws( WSFrameFormat::Rgba => "RGBA", }; - let packed = pack_ws_frame(frame); + let packed = pack_ws_frame_ref(frame); let packed_len = packed.len(); match socket.send(Message::Binary(packed)).await { diff --git a/apps/desktop/src-tauri/src/screenshot_editor.rs b/apps/desktop/src-tauri/src/screenshot_editor.rs index 01af75df033..c95624c59b6 100644 --- a/apps/desktop/src-tauri/src/screenshot_editor.rs +++ b/apps/desktop/src-tauri/src/screenshot_editor.rs @@ -381,7 +381,7 @@ impl ScreenshotEditorInstances { match rendered_frame { Ok(frame) => { - let _ = frame_tx.send(Some(WSFrame { + let _ = frame_tx.send(Some(std::sync::Arc::new(WSFrame { data: frame.data, width: frame.width, height: frame.height, @@ -390,7 +390,7 @@ impl ScreenshotEditorInstances { target_time_ns: frame.target_time_ns, format: crate::frame_ws::WSFrameFormat::Rgba, created_at: Instant::now(), - })); + }))); } Err(e) => { tracing::error!("Failed to render screenshot frame: {e}"); From 
7dc69e46f358ff49080c01133ef4ed054ed405d2 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 19:22:00 +0000 Subject: [PATCH 35/54] =?UTF-8?q?feat:=20add=20GPU=20RGBA=E2=86=92NV12=20c?= =?UTF-8?q?onverter=20shader=20and=20pipeline=20infrastructure?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add compute shader and Rust pipeline for converting rendered RGBA frames to NV12 format on the GPU before readback. This will reduce readback data size by 62% (from width*height*4 to width*height*1.5 bytes). New components: - shaders/rgba_to_nv12.wgsl: Compute shader that processes 4x2 pixel blocks, writing complete u32 words to avoid data races. Each thread produces 4 Y values (2 rows) and 2 UV pairs. - RgbaToNv12Converter: Creates compute pipeline, manages storage and readback buffers, dispatches conversion compute pass - PendingNv12Readback: Async readback with same progressive backoff poll loop as RGBA readback - Nv12RenderedFrame: Output frame with NV12 data + metadata Not yet wired into the render pipeline - infrastructure only. 
Co-authored-by: Richie McIlroy --- crates/rendering/src/frame_pipeline.rs | 286 ++++++++++++++++++ .../rendering/src/shaders/rgba_to_nv12.wgsl | 92 ++++++ 2 files changed, 378 insertions(+) create mode 100644 crates/rendering/src/shaders/rgba_to_nv12.wgsl diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index 13e3c472eb1..ac4492927b8 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -7,6 +7,292 @@ use crate::{ProjectUniforms, RenderingError}; const GPU_BUFFER_WAIT_TIMEOUT_SECS: u64 = 10; +pub struct RgbaToNv12Converter { + pipeline: wgpu::ComputePipeline, + bind_group_layout: wgpu::BindGroupLayout, + params_buffer: wgpu::Buffer, + nv12_buffer: Option, + readback_buffer: Option>, + cached_width: u32, + cached_height: u32, +} + +#[repr(C)] +#[derive(Copy, Clone, bytemuck::Pod, bytemuck::Zeroable)] +struct Nv12Params { + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, +} + +impl RgbaToNv12Converter { + pub fn new(device: &wgpu::Device) -> Self { + let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: Some("RGBA to NV12 Converter"), + source: wgpu::ShaderSource::Wgsl(std::borrow::Cow::Borrowed(include_str!( + "shaders/rgba_to_nv12.wgsl" + ))), + }); + + let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: Some("RGBA to NV12 Bind Group Layout"), + entries: &[ + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Texture { + sample_type: wgpu::TextureSampleType::Float { filterable: false }, + view_dimension: wgpu::TextureViewDimension::D2, + multisampled: false, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: 
None, + }, + wgpu::BindGroupLayoutEntry { + binding: 2, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Uniform, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + }); + + let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: Some("RGBA to NV12 Pipeline Layout"), + bind_group_layouts: &[&bind_group_layout], + push_constant_ranges: &[], + }); + + let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some("RGBA to NV12 Pipeline"), + layout: Some(&pipeline_layout), + module: &shader, + entry_point: Some("main"), + compilation_options: Default::default(), + cache: None, + }); + + let params_buffer = device.create_buffer(&wgpu::BufferDescriptor { + label: Some("NV12 Params Buffer"), + size: std::mem::size_of::() as u64, + usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + + Self { + pipeline, + bind_group_layout, + params_buffer, + nv12_buffer: None, + readback_buffer: None, + cached_width: 0, + cached_height: 0, + } + } + + fn nv12_size(width: u32, height: u32) -> u64 { + let y_size = (width as u64) * (height as u64); + let uv_size = (width as u64) * (height as u64 / 2); + y_size + uv_size + } + + fn ensure_buffers(&mut self, device: &wgpu::Device, width: u32, height: u32) { + if self.cached_width == width && self.cached_height == height { + return; + } + + let nv12_size = Self::nv12_size(width, height); + let aligned_size = ((nv12_size + 3) / 4) * 4; + + self.nv12_buffer = Some(device.create_buffer(&wgpu::BufferDescriptor { + label: Some("NV12 Storage Buffer"), + size: aligned_size, + usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC, + mapped_at_creation: false, + })); + + self.readback_buffer = Some(Arc::new(device.create_buffer(&wgpu::BufferDescriptor { + label: Some("NV12 Readback Buffer"), + size: nv12_size, + usage: 
wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, + mapped_at_creation: false, + }))); + + self.cached_width = width; + self.cached_height = height; + } + + pub fn convert_and_readback( + &mut self, + device: &wgpu::Device, + queue: &wgpu::Queue, + encoder: &mut wgpu::CommandEncoder, + source_texture: &wgpu::Texture, + width: u32, + height: u32, + frame_number: u32, + frame_rate: u32, + ) -> Option { + if width == 0 || height == 0 || width % 4 != 0 || height % 2 != 0 { + return None; + } + + self.ensure_buffers(device, width, height); + + let nv12_buffer = self.nv12_buffer.as_ref()?; + let readback_buffer = self.readback_buffer.as_ref()?.clone(); + + let y_stride = width; + let uv_stride = width; + + let params = Nv12Params { + width, + height, + y_stride, + uv_stride, + }; + queue.write_buffer(&self.params_buffer, 0, bytemuck::cast_slice(&[params])); + + let source_view = source_texture.create_view(&Default::default()); + + let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("RGBA to NV12 Bind Group"), + layout: &self.bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: wgpu::BindingResource::TextureView(&source_view), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: nv12_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 2, + resource: self.params_buffer.as_entire_binding(), + }, + ], + }); + + { + let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("RGBA to NV12 Conversion"), + ..Default::default() + }); + pass.set_pipeline(&self.pipeline); + pass.set_bind_group(0, &bind_group, &[]); + pass.dispatch_workgroups(width.div_ceil(4 * 8), height.div_ceil(2 * 8), 1); + } + + let nv12_size = Self::nv12_size(width, height); + encoder.copy_buffer_to_buffer(nv12_buffer, 0, &readback_buffer, 0, nv12_size); + + let (tx, rx) = oneshot::channel(); + readback_buffer + .slice(..) 
+ .map_async(wgpu::MapMode::Read, move |result| { + let _ = tx.send(result); + }); + + Some(PendingNv12Readback { + rx, + buffer: readback_buffer, + width, + height, + y_stride, + frame_number, + frame_rate, + }) + } +} + +pub struct PendingNv12Readback { + rx: oneshot::Receiver>, + buffer: Arc, + pub width: u32, + pub height: u32, + pub y_stride: u32, + pub frame_number: u32, + pub frame_rate: u32, +} + +impl PendingNv12Readback { + pub async fn wait(mut self, device: &wgpu::Device) -> Result { + let mut poll_count = 0u32; + let start_time = Instant::now(); + let timeout_duration = std::time::Duration::from_secs(GPU_BUFFER_WAIT_TIMEOUT_SECS); + + loop { + if start_time.elapsed() > timeout_duration { + return Err(RenderingError::BufferMapWaitingFailed); + } + + match self.rx.try_recv() { + Ok(result) => { + result?; + break; + } + Err(oneshot::error::TryRecvError::Empty) => { + device.poll(wgpu::PollType::Poll)?; + poll_count += 1; + if poll_count < 10 { + tokio::task::yield_now().await; + } else if poll_count < 100 { + tokio::time::sleep(std::time::Duration::from_micros(100)).await; + } else { + tokio::time::sleep(std::time::Duration::from_millis(1)).await; + } + } + Err(oneshot::error::TryRecvError::Closed) => { + return Err(RenderingError::BufferMapWaitingFailed); + } + } + } + + let buffer_slice = self.buffer.slice(..); + let data = buffer_slice.get_mapped_range(); + let nv12_data = data.to_vec(); + + drop(data); + self.buffer.unmap(); + + let target_time_ns = + (self.frame_number as u64 * 1_000_000_000) / self.frame_rate.max(1) as u64; + + Ok(Nv12RenderedFrame { + data: nv12_data, + width: self.width, + height: self.height, + y_stride: self.y_stride, + frame_number: self.frame_number, + target_time_ns, + }) + } +} + +pub struct Nv12RenderedFrame { + pub data: Vec, + pub width: u32, + pub height: u32, + pub y_stride: u32, + pub frame_number: u32, + pub target_time_ns: u64, +} + pub struct PendingReadback { rx: oneshot::Receiver>, buffer: Arc, diff --git 
a/crates/rendering/src/shaders/rgba_to_nv12.wgsl b/crates/rendering/src/shaders/rgba_to_nv12.wgsl new file mode 100644 index 00000000000..444805da2f6 --- /dev/null +++ b/crates/rendering/src/shaders/rgba_to_nv12.wgsl @@ -0,0 +1,92 @@ +struct Params { + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, +} + +@group(0) @binding(0) var input: texture_2d; +@group(0) @binding(1) var nv12_output: array; +@group(0) @binding(2) var params: Params; + +fn rgb_to_y(r: f32, g: f32, b: f32) -> u32 { + return u32(clamp(16.0 + 65.481 * r + 128.553 * g + 24.966 * b, 0.0, 255.0)); +} + +fn rgb_to_u(r: f32, g: f32, b: f32) -> u32 { + return u32(clamp(128.0 - 37.797 * r - 74.203 * g + 112.0 * b, 0.0, 255.0)); +} + +fn rgb_to_v(r: f32, g: f32, b: f32) -> u32 { + return u32(clamp(128.0 + 112.0 * r - 93.786 * g - 18.214 * b, 0.0, 255.0)); +} + +fn safe_load(coord: vec2, dims: vec2) -> vec4 { + let c = min(coord, dims - vec2(1u, 1u)); + return textureLoad(input, c, 0); +} + +@compute @workgroup_size(8, 8) +fn main(@builtin(global_invocation_id) global_id: vec3) { + let width = params.width; + let height = params.height; + let y_stride = params.y_stride; + let dims = vec2(width, height); + + let px = global_id.x * 4u; + let py = global_id.y * 2u; + + if (px >= width || py >= height) { + return; + } + + let p0 = safe_load(vec2(px, py), dims); + let p1 = safe_load(vec2(px + 1u, py), dims); + let p2 = safe_load(vec2(px + 2u, py), dims); + let p3 = safe_load(vec2(px + 3u, py), dims); + + let p4 = safe_load(vec2(px, py + 1u), dims); + let p5 = safe_load(vec2(px + 1u, py + 1u), dims); + let p6 = safe_load(vec2(px + 2u, py + 1u), dims); + let p7 = safe_load(vec2(px + 3u, py + 1u), dims); + + let y0 = rgb_to_y(p0.r, p0.g, p0.b); + let y1 = rgb_to_y(p1.r, p1.g, p1.b); + let y2 = rgb_to_y(p2.r, p2.g, p2.b); + let y3 = rgb_to_y(p3.r, p3.g, p3.b); + let y4 = rgb_to_y(p4.r, p4.g, p4.b); + let y5 = rgb_to_y(p5.r, p5.g, p5.b); + let y6 = rgb_to_y(p6.r, p6.g, p6.b); + let y7 = rgb_to_y(p7.r, 
p7.g, p7.b); + + let y_row0_word = y0 | (y1 << 8u) | (y2 << 16u) | (y3 << 24u); + let y_row0_idx = (py * y_stride + px) / 4u; + nv12_output[y_row0_idx] = y_row0_word; + + if (py + 1u < height) { + let y_row1_word = y4 | (y5 << 8u) | (y6 << 16u) | (y7 << 24u); + let y_row1_idx = ((py + 1u) * y_stride + px) / 4u; + nv12_output[y_row1_idx] = y_row1_word; + } + + let y_plane_size = y_stride * height; + + let avg_r_left = (p0.r + p1.r + p4.r + p5.r) * 0.25; + let avg_g_left = (p0.g + p1.g + p4.g + p5.g) * 0.25; + let avg_b_left = (p0.b + p1.b + p4.b + p5.b) * 0.25; + + let avg_r_right = (p2.r + p3.r + p6.r + p7.r) * 0.25; + let avg_g_right = (p2.g + p3.g + p6.g + p7.g) * 0.25; + let avg_b_right = (p2.b + p3.b + p6.b + p7.b) * 0.25; + + let u_left = rgb_to_u(avg_r_left, avg_g_left, avg_b_left); + let v_left = rgb_to_v(avg_r_left, avg_g_left, avg_b_left); + let u_right = rgb_to_u(avg_r_right, avg_g_right, avg_b_right); + let v_right = rgb_to_v(avg_r_right, avg_g_right, avg_b_right); + + let uv_word = u_left | (v_left << 8u) | (u_right << 16u) | (v_right << 24u); + let uv_row = global_id.y; + let uv_offset = y_plane_size + uv_row * params.uv_stride + px; + let uv_idx = uv_offset / 4u; + nv12_output[uv_idx] = uv_word; +} From 47c5f0e738d9f627b88e9aa9a190593d3a85dd59 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 19:22:00 +0000 Subject: [PATCH 36/54] =?UTF-8?q?feat:=20add=20GPU=20RGBA=E2=86=92NV12=20c?= =?UTF-8?q?onverter=20shader=20and=20pipeline=20infrastructure?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add compute shader and Rust pipeline for converting rendered RGBA frames to NV12 format on the GPU before readback. This will reduce readback data size by 62% (from width*height*4 to width*height*1.5 bytes). New components: - shaders/rgba_to_nv12.wgsl: Compute shader that processes 4x2 pixel blocks, writing complete u32 words to avoid data races. 
Each thread produces 4 Y values (2 rows) and 2 UV pairs. - RgbaToNv12Converter: Creates compute pipeline, manages storage and readback buffers, dispatches conversion compute pass - PendingNv12Readback: Async readback with same progressive backoff poll loop as RGBA readback - Nv12RenderedFrame: Output frame with NV12 data + metadata Not yet wired into the render pipeline - infrastructure only. --- crates/rendering/src/frame_pipeline.rs | 286 ++++++++++++++++++ .../rendering/src/shaders/rgba_to_nv12.wgsl | 92 ++++++ 2 files changed, 378 insertions(+) create mode 100644 crates/rendering/src/shaders/rgba_to_nv12.wgsl diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index 13e3c472eb1..ac4492927b8 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -7,6 +7,292 @@ use crate::{ProjectUniforms, RenderingError}; const GPU_BUFFER_WAIT_TIMEOUT_SECS: u64 = 10; +pub struct RgbaToNv12Converter { + pipeline: wgpu::ComputePipeline, + bind_group_layout: wgpu::BindGroupLayout, + params_buffer: wgpu::Buffer, + nv12_buffer: Option, + readback_buffer: Option>, + cached_width: u32, + cached_height: u32, +} + +#[repr(C)] +#[derive(Copy, Clone, bytemuck::Pod, bytemuck::Zeroable)] +struct Nv12Params { + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, +} + +impl RgbaToNv12Converter { + pub fn new(device: &wgpu::Device) -> Self { + let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: Some("RGBA to NV12 Converter"), + source: wgpu::ShaderSource::Wgsl(std::borrow::Cow::Borrowed(include_str!( + "shaders/rgba_to_nv12.wgsl" + ))), + }); + + let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: Some("RGBA to NV12 Bind Group Layout"), + entries: &[ + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Texture { + sample_type: wgpu::TextureSampleType::Float { 
filterable: false }, + view_dimension: wgpu::TextureViewDimension::D2, + multisampled: false, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 2, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Uniform, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + }); + + let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: Some("RGBA to NV12 Pipeline Layout"), + bind_group_layouts: &[&bind_group_layout], + push_constant_ranges: &[], + }); + + let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some("RGBA to NV12 Pipeline"), + layout: Some(&pipeline_layout), + module: &shader, + entry_point: Some("main"), + compilation_options: Default::default(), + cache: None, + }); + + let params_buffer = device.create_buffer(&wgpu::BufferDescriptor { + label: Some("NV12 Params Buffer"), + size: std::mem::size_of::() as u64, + usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + + Self { + pipeline, + bind_group_layout, + params_buffer, + nv12_buffer: None, + readback_buffer: None, + cached_width: 0, + cached_height: 0, + } + } + + fn nv12_size(width: u32, height: u32) -> u64 { + let y_size = (width as u64) * (height as u64); + let uv_size = (width as u64) * (height as u64 / 2); + y_size + uv_size + } + + fn ensure_buffers(&mut self, device: &wgpu::Device, width: u32, height: u32) { + if self.cached_width == width && self.cached_height == height { + return; + } + + let nv12_size = Self::nv12_size(width, height); + let aligned_size = ((nv12_size + 3) / 4) * 4; + + self.nv12_buffer = 
Some(device.create_buffer(&wgpu::BufferDescriptor { + label: Some("NV12 Storage Buffer"), + size: aligned_size, + usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC, + mapped_at_creation: false, + })); + + self.readback_buffer = Some(Arc::new(device.create_buffer(&wgpu::BufferDescriptor { + label: Some("NV12 Readback Buffer"), + size: nv12_size, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, + mapped_at_creation: false, + }))); + + self.cached_width = width; + self.cached_height = height; + } + + pub fn convert_and_readback( + &mut self, + device: &wgpu::Device, + queue: &wgpu::Queue, + encoder: &mut wgpu::CommandEncoder, + source_texture: &wgpu::Texture, + width: u32, + height: u32, + frame_number: u32, + frame_rate: u32, + ) -> Option { + if width == 0 || height == 0 || width % 4 != 0 || height % 2 != 0 { + return None; + } + + self.ensure_buffers(device, width, height); + + let nv12_buffer = self.nv12_buffer.as_ref()?; + let readback_buffer = self.readback_buffer.as_ref()?.clone(); + + let y_stride = width; + let uv_stride = width; + + let params = Nv12Params { + width, + height, + y_stride, + uv_stride, + }; + queue.write_buffer(&self.params_buffer, 0, bytemuck::cast_slice(&[params])); + + let source_view = source_texture.create_view(&Default::default()); + + let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("RGBA to NV12 Bind Group"), + layout: &self.bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: wgpu::BindingResource::TextureView(&source_view), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: nv12_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 2, + resource: self.params_buffer.as_entire_binding(), + }, + ], + }); + + { + let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("RGBA to NV12 Conversion"), + ..Default::default() + }); + pass.set_pipeline(&self.pipeline); + pass.set_bind_group(0, 
&bind_group, &[]); + pass.dispatch_workgroups(width.div_ceil(4 * 8), height.div_ceil(2 * 8), 1); + } + + let nv12_size = Self::nv12_size(width, height); + encoder.copy_buffer_to_buffer(nv12_buffer, 0, &readback_buffer, 0, nv12_size); + + let (tx, rx) = oneshot::channel(); + readback_buffer + .slice(..) + .map_async(wgpu::MapMode::Read, move |result| { + let _ = tx.send(result); + }); + + Some(PendingNv12Readback { + rx, + buffer: readback_buffer, + width, + height, + y_stride, + frame_number, + frame_rate, + }) + } +} + +pub struct PendingNv12Readback { + rx: oneshot::Receiver>, + buffer: Arc, + pub width: u32, + pub height: u32, + pub y_stride: u32, + pub frame_number: u32, + pub frame_rate: u32, +} + +impl PendingNv12Readback { + pub async fn wait(mut self, device: &wgpu::Device) -> Result { + let mut poll_count = 0u32; + let start_time = Instant::now(); + let timeout_duration = std::time::Duration::from_secs(GPU_BUFFER_WAIT_TIMEOUT_SECS); + + loop { + if start_time.elapsed() > timeout_duration { + return Err(RenderingError::BufferMapWaitingFailed); + } + + match self.rx.try_recv() { + Ok(result) => { + result?; + break; + } + Err(oneshot::error::TryRecvError::Empty) => { + device.poll(wgpu::PollType::Poll)?; + poll_count += 1; + if poll_count < 10 { + tokio::task::yield_now().await; + } else if poll_count < 100 { + tokio::time::sleep(std::time::Duration::from_micros(100)).await; + } else { + tokio::time::sleep(std::time::Duration::from_millis(1)).await; + } + } + Err(oneshot::error::TryRecvError::Closed) => { + return Err(RenderingError::BufferMapWaitingFailed); + } + } + } + + let buffer_slice = self.buffer.slice(..); + let data = buffer_slice.get_mapped_range(); + let nv12_data = data.to_vec(); + + drop(data); + self.buffer.unmap(); + + let target_time_ns = + (self.frame_number as u64 * 1_000_000_000) / self.frame_rate.max(1) as u64; + + Ok(Nv12RenderedFrame { + data: nv12_data, + width: self.width, + height: self.height, + y_stride: self.y_stride, + 
frame_number: self.frame_number, + target_time_ns, + }) + } +} + +pub struct Nv12RenderedFrame { + pub data: Vec, + pub width: u32, + pub height: u32, + pub y_stride: u32, + pub frame_number: u32, + pub target_time_ns: u64, +} + pub struct PendingReadback { rx: oneshot::Receiver>, buffer: Arc, diff --git a/crates/rendering/src/shaders/rgba_to_nv12.wgsl b/crates/rendering/src/shaders/rgba_to_nv12.wgsl new file mode 100644 index 00000000000..444805da2f6 --- /dev/null +++ b/crates/rendering/src/shaders/rgba_to_nv12.wgsl @@ -0,0 +1,92 @@ +struct Params { + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, +} + +@group(0) @binding(0) var input: texture_2d; +@group(0) @binding(1) var nv12_output: array; +@group(0) @binding(2) var params: Params; + +fn rgb_to_y(r: f32, g: f32, b: f32) -> u32 { + return u32(clamp(16.0 + 65.481 * r + 128.553 * g + 24.966 * b, 0.0, 255.0)); +} + +fn rgb_to_u(r: f32, g: f32, b: f32) -> u32 { + return u32(clamp(128.0 - 37.797 * r - 74.203 * g + 112.0 * b, 0.0, 255.0)); +} + +fn rgb_to_v(r: f32, g: f32, b: f32) -> u32 { + return u32(clamp(128.0 + 112.0 * r - 93.786 * g - 18.214 * b, 0.0, 255.0)); +} + +fn safe_load(coord: vec2, dims: vec2) -> vec4 { + let c = min(coord, dims - vec2(1u, 1u)); + return textureLoad(input, c, 0); +} + +@compute @workgroup_size(8, 8) +fn main(@builtin(global_invocation_id) global_id: vec3) { + let width = params.width; + let height = params.height; + let y_stride = params.y_stride; + let dims = vec2(width, height); + + let px = global_id.x * 4u; + let py = global_id.y * 2u; + + if (px >= width || py >= height) { + return; + } + + let p0 = safe_load(vec2(px, py), dims); + let p1 = safe_load(vec2(px + 1u, py), dims); + let p2 = safe_load(vec2(px + 2u, py), dims); + let p3 = safe_load(vec2(px + 3u, py), dims); + + let p4 = safe_load(vec2(px, py + 1u), dims); + let p5 = safe_load(vec2(px + 1u, py + 1u), dims); + let p6 = safe_load(vec2(px + 2u, py + 1u), dims); + let p7 = safe_load(vec2(px + 3u, py + 1u), 
dims); + + let y0 = rgb_to_y(p0.r, p0.g, p0.b); + let y1 = rgb_to_y(p1.r, p1.g, p1.b); + let y2 = rgb_to_y(p2.r, p2.g, p2.b); + let y3 = rgb_to_y(p3.r, p3.g, p3.b); + let y4 = rgb_to_y(p4.r, p4.g, p4.b); + let y5 = rgb_to_y(p5.r, p5.g, p5.b); + let y6 = rgb_to_y(p6.r, p6.g, p6.b); + let y7 = rgb_to_y(p7.r, p7.g, p7.b); + + let y_row0_word = y0 | (y1 << 8u) | (y2 << 16u) | (y3 << 24u); + let y_row0_idx = (py * y_stride + px) / 4u; + nv12_output[y_row0_idx] = y_row0_word; + + if (py + 1u < height) { + let y_row1_word = y4 | (y5 << 8u) | (y6 << 16u) | (y7 << 24u); + let y_row1_idx = ((py + 1u) * y_stride + px) / 4u; + nv12_output[y_row1_idx] = y_row1_word; + } + + let y_plane_size = y_stride * height; + + let avg_r_left = (p0.r + p1.r + p4.r + p5.r) * 0.25; + let avg_g_left = (p0.g + p1.g + p4.g + p5.g) * 0.25; + let avg_b_left = (p0.b + p1.b + p4.b + p5.b) * 0.25; + + let avg_r_right = (p2.r + p3.r + p6.r + p7.r) * 0.25; + let avg_g_right = (p2.g + p3.g + p6.g + p7.g) * 0.25; + let avg_b_right = (p2.b + p3.b + p6.b + p7.b) * 0.25; + + let u_left = rgb_to_u(avg_r_left, avg_g_left, avg_b_left); + let v_left = rgb_to_v(avg_r_left, avg_g_left, avg_b_left); + let u_right = rgb_to_u(avg_r_right, avg_g_right, avg_b_right); + let v_right = rgb_to_v(avg_r_right, avg_g_right, avg_b_right); + + let uv_word = u_left | (v_left << 8u) | (u_right << 16u) | (v_right << 24u); + let uv_row = global_id.y; + let uv_offset = y_plane_size + uv_row * params.uv_stride + px; + let uv_idx = uv_offset / 4u; + nv12_output[uv_idx] = uv_word; +} From d5b0d17a8416ce7212f2262cea516488f6f8024a Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 19:29:41 +0000 Subject: [PATCH 37/54] =?UTF-8?q?improve:=20GPU-side=20RGBA=E2=86=92NV12?= =?UTF-8?q?=20conversion=20eliminates=20CPU=20conversion=20and=20reduces?= =?UTF-8?q?=20readback=20by=2062%?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major optimization: instead of reading back full RGBA 
from GPU (width*height*4) and converting to NV12 on CPU, the render pipeline now converts RGBA→NV12 on GPU using a compute shader and reads back only NV12 data (width*height*1.5). This eliminates two bottlenecks simultaneously: 1. GPU readback size reduced by 62% (e.g., 3.5MB → 1.3MB at half-res) 2. CPU RGBA→NV12 conversion (~1-2ms per frame) eliminated from render thread Pipeline flow change: Before: GPU render RGBA → readback RGBA → CPU convert NV12 → WS send NV12 After: GPU render RGBA → GPU convert NV12 → readback NV12 → WS send NV12 Implementation: - Add rgba_to_nv12.wgsl compute shader (processes 4x2 pixel blocks, writes complete u32 words to avoid data races) - Add RgbaToNv12Converter with compute pipeline, storage/readback buffers - Add finish_encoder_nv12() for NV12 readback path - Add FrameRenderer::render_nv12() method - Add EditorFrameOutput enum (Rgba/Nv12) for frame callback - Editor renderer now produces NV12 frames directly - Frame callback receives NV12 data without CPU conversion - Export path unchanged (still uses RGBA readback) Co-authored-by: Richie McIlroy --- apps/desktop/src-tauri/src/editor_window.rs | 72 ++++++++++++++------- apps/desktop/src-tauri/src/lib.rs | 2 +- crates/editor/src/editor.rs | 15 +++-- crates/editor/src/editor_instance.rs | 4 +- crates/editor/src/lib.rs | 2 +- crates/rendering/src/frame_pipeline.rs | 47 +++++++++++++- crates/rendering/src/lib.rs | 66 ++++++++++++++++++- 7 files changed, 173 insertions(+), 35 deletions(-) diff --git a/apps/desktop/src-tauri/src/editor_window.rs b/apps/desktop/src-tauri/src/editor_window.rs index 7af0230041b..35da7c3c4b3 100644 --- a/apps/desktop/src-tauri/src/editor_window.rs +++ b/apps/desktop/src-tauri/src/editor_window.rs @@ -5,7 +5,7 @@ use tokio_util::sync::CancellationToken; use crate::{ create_editor_instance_impl, - frame_ws::{WSFrame, create_watch_frame_ws}, + frame_ws::{WSFrame, WSFrameFormat, create_watch_frame_ws}, }; pub struct EditorInstance { @@ -27,17 +27,30 @@ async fn 
do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { let inner = create_editor_instance_impl( &app, path, - Box::new(move |frame| { - let _ = frame_tx.send(Some(std::sync::Arc::new( - WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.padded_bytes_per_row, - frame.frame_number, - frame.target_time_ns, - ), - ))); + Box::new(move |output| { + let ws_frame = match output { + cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.y_stride, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + }, + cap_editor::editor::EditorFrameOutput::Rgba(frame) => { + WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.padded_bytes_per_row, + frame.frame_number, + frame.target_time_ns, + ) + } + }; + let _ = frame_tx.send(Some(std::sync::Arc::new(ws_frame))); }), ) .await?; @@ -219,17 +232,30 @@ impl EditorInstances { let inner = create_editor_instance_impl( window.app_handle(), path, - Box::new(move |frame| { - let _ = frame_tx.send(Some(std::sync::Arc::new( - WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.padded_bytes_per_row, - frame.frame_number, - frame.target_time_ns, - ), - ))); + Box::new(move |output| { + let ws_frame = match output { + cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.y_stride, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + }, + cap_editor::editor::EditorFrameOutput::Rgba(frame) => { + WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.padded_bytes_per_row, + frame.frame_number, + frame.target_time_ns, + ) + } + }; + let _ = 
frame_tx.send(Some(std::sync::Arc::new(ws_frame))); }), ) .await?; diff --git a/apps/desktop/src-tauri/src/lib.rs b/apps/desktop/src-tauri/src/lib.rs index 00dc6903b16..6a24f81a82a 100644 --- a/apps/desktop/src-tauri/src/lib.rs +++ b/apps/desktop/src-tauri/src/lib.rs @@ -3853,7 +3853,7 @@ async fn resume_uploads(app: AppHandle) -> Result<(), String> { async fn create_editor_instance_impl( app: &AppHandle, path: PathBuf, - frame_cb: Box, + frame_cb: Box, ) -> Result, String> { let app = app.clone(); diff --git a/crates/editor/src/editor.rs b/crates/editor/src/editor.rs index a80d18b3992..71cee1d0fb7 100644 --- a/crates/editor/src/editor.rs +++ b/crates/editor/src/editor.rs @@ -3,7 +3,7 @@ use std::time::Instant; use cap_project::{CursorEvents, RecordingMeta, StudioRecordingMeta}; use cap_rendering::{ - DecodedSegmentFrames, FrameRenderer, ProjectRecordingsMeta, ProjectUniforms, + DecodedSegmentFrames, FrameRenderer, Nv12RenderedFrame, ProjectRecordingsMeta, ProjectUniforms, RenderVideoConstants, RenderedFrame, RendererLayers, }; use tokio::sync::{mpsc, oneshot}; @@ -21,9 +21,14 @@ pub enum RendererMessage { }, } +pub enum EditorFrameOutput { + Rgba(RenderedFrame), + Nv12(Nv12RenderedFrame), +} + pub struct Renderer { rx: mpsc::Receiver, - frame_cb: Box, + frame_cb: Box, render_constants: Arc, #[allow(unused)] total_frames: u32, @@ -36,7 +41,7 @@ pub struct RendererHandle { impl Renderer { pub fn spawn( render_constants: Arc, - frame_cb: Box, + frame_cb: Box, recording_meta: &RecordingMeta, meta: &StudioRecordingMeta, ) -> Result { @@ -143,7 +148,7 @@ impl Renderer { } } match frame_renderer - .render( + .render_nv12( current.segment_frames, current.uniforms, ¤t.cursor, @@ -152,7 +157,7 @@ impl Renderer { .await { Ok(frame) => { - (self.frame_cb)(frame); + (self.frame_cb)(EditorFrameOutput::Nv12(frame)); } Err(e) => { tracing::error!(error = %e, "Failed to render frame in editor"); diff --git a/crates/editor/src/editor_instance.rs 
b/crates/editor/src/editor_instance.rs index a48f65c0632..4879fadc26b 100644 --- a/crates/editor/src/editor_instance.rs +++ b/crates/editor/src/editor_instance.rs @@ -8,7 +8,7 @@ use cap_project::{ }; use cap_rendering::{ ProjectRecordingsMeta, ProjectUniforms, RecordingSegmentDecoders, RenderVideoConstants, - RenderedFrame, SegmentVideoPaths, Video, ZoomFocusInterpolator, get_duration, + SegmentVideoPaths, Video, ZoomFocusInterpolator, get_duration, spring_mass_damper::SpringMassDamperSimulationConfig, }; use std::{ @@ -94,7 +94,7 @@ impl EditorInstance { pub async fn new( project_path: PathBuf, on_state_change: impl Fn(&EditorState) + Send + Sync + 'static, - frame_cb: Box, + frame_cb: Box, ) -> Result, String> { if !project_path.exists() { return Err(format!("Video path {} not found!", project_path.display())); diff --git a/crates/editor/src/lib.rs b/crates/editor/src/lib.rs index 89af32ea2ab..a08a7afe29c 100644 --- a/crates/editor/src/lib.rs +++ b/crates/editor/src/lib.rs @@ -1,5 +1,5 @@ mod audio; -mod editor; +pub mod editor; mod editor_instance; mod playback; mod segments; diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index ac4492927b8..f4ca4ca5ced 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -231,7 +231,10 @@ pub struct PendingNv12Readback { } impl PendingNv12Readback { - pub async fn wait(mut self, device: &wgpu::Device) -> Result { + pub async fn wait( + mut self, + device: &wgpu::Device, + ) -> Result { let mut poll_count = 0u32; let start_time = Instant::now(); let timeout_duration = std::time::Duration::from_secs(GPU_BUFFER_WAIT_TIMEOUT_SECS); @@ -748,6 +751,48 @@ pub async fn finish_encoder( pending.wait(device).await } +pub async fn finish_encoder_nv12( + session: &mut RenderSession, + nv12_converter: &mut RgbaToNv12Converter, + device: &wgpu::Device, + queue: &wgpu::Queue, + uniforms: &ProjectUniforms, + mut encoder: wgpu::CommandEncoder, +) -> 
Result { + let width = uniforms.output_size.0; + let height = uniforms.output_size.1; + + let texture = if session.current_is_left { + &session.textures.0 + } else { + &session.textures.1 + }; + + if let Some(pending) = nv12_converter.convert_and_readback( + device, + queue, + &mut encoder, + texture, + width, + height, + uniforms.frame_number, + uniforms.frame_rate, + ) { + queue.submit(std::iter::once(encoder.finish())); + pending.wait(device).await + } else { + let rgba_frame = finish_encoder(session, device, queue, uniforms, encoder).await?; + Ok(Nv12RenderedFrame { + data: rgba_frame.data, + width: rgba_frame.width, + height: rgba_frame.height, + y_stride: rgba_frame.padded_bytes_per_row, + frame_number: rgba_frame.frame_number, + target_time_ns: rgba_frame.target_time_ns, + }) + } +} + pub async fn flush_pending_readback( session: &mut RenderSession, device: &wgpu::Device, diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index 1bd37aa807b..ac75b938494 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -7,7 +7,7 @@ use composite_frame::CompositeVideoFrameUniforms; use core::f64; use cursor_interpolation::{InterpolatedCursorPosition, interpolate_cursor}; use decoder::{AsyncVideoDecoderHandle, spawn_decoder}; -use frame_pipeline::{RenderSession, finish_encoder, flush_pending_readback}; +use frame_pipeline::{RenderSession, finish_encoder, finish_encoder_nv12, flush_pending_readback}; use futures::FutureExt; use futures::future::OptionFuture; use layers::{ @@ -42,7 +42,7 @@ pub mod zoom_focus_interpolation; pub use coord::*; pub use decoder::{DecodedFrame, DecoderStatus, DecoderType, PixelFormat}; -pub use frame_pipeline::RenderedFrame; +pub use frame_pipeline::{Nv12RenderedFrame, RenderedFrame}; pub use project_recordings::{ProjectRecordingsMeta, SegmentRecordings, Video}; use mask::interpolate_masks; @@ -1799,6 +1799,7 @@ pub struct DecodedSegmentFrames { pub struct FrameRenderer<'a> { constants: &'a 
RenderVideoConstants, session: Option, + nv12_converter: Option, } impl<'a> FrameRenderer<'a> { @@ -1808,6 +1809,7 @@ impl<'a> FrameRenderer<'a> { Self { constants, session: None, + nv12_converter: None, } } @@ -1892,6 +1894,66 @@ impl<'a> FrameRenderer<'a> { None } } + + pub async fn render_nv12( + &mut self, + segment_frames: DecodedSegmentFrames, + uniforms: ProjectUniforms, + cursor: &CursorEvents, + layers: &mut RendererLayers, + ) -> Result { + let session = self.session.get_or_insert_with(|| { + RenderSession::new( + &self.constants.device, + uniforms.output_size.0, + uniforms.output_size.1, + ) + }); + + session.update_texture_size( + &self.constants.device, + uniforms.output_size.0, + uniforms.output_size.1, + ); + + let nv12_converter = self.nv12_converter.get_or_insert_with(|| { + frame_pipeline::RgbaToNv12Converter::new(&self.constants.device) + }); + + let mut encoder = self.constants.device.create_command_encoder( + &(wgpu::CommandEncoderDescriptor { + label: Some("Render Encoder (NV12)"), + }), + ); + + layers + .prepare_with_encoder( + self.constants, + &uniforms, + &segment_frames, + cursor, + &mut encoder, + ) + .await?; + + layers.render( + &self.constants.device, + &self.constants.queue, + &mut encoder, + session, + &uniforms, + ); + + finish_encoder_nv12( + session, + nv12_converter, + &self.constants.device, + &self.constants.queue, + &uniforms, + encoder, + ) + .await + } } pub struct RendererLayers { From 570da29333e7d017eb50f8b735fff57f5be051d4 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 19:29:41 +0000 Subject: [PATCH 38/54] =?UTF-8?q?improve:=20GPU-side=20RGBA=E2=86=92NV12?= =?UTF-8?q?=20conversion=20eliminates=20CPU=20conversion=20and=20reduces?= =?UTF-8?q?=20readback=20by=2062%?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major optimization: instead of reading back full RGBA from GPU (width*height*4) and converting to 
NV12 on CPU, the render pipeline now converts RGBA→NV12 on GPU using a compute shader and reads back only NV12 data (width*height*1.5). This eliminates two bottlenecks simultaneously: 1. GPU readback size reduced by 62% (e.g., 3.5MB → 1.3MB at half-res) 2. CPU RGBA→NV12 conversion (~1-2ms per frame) eliminated from render thread Pipeline flow change: Before: GPU render RGBA → readback RGBA → CPU convert NV12 → WS send NV12 After: GPU render RGBA → GPU convert NV12 → readback NV12 → WS send NV12 Implementation: - Add rgba_to_nv12.wgsl compute shader (processes 4x2 pixel blocks, writes complete u32 words to avoid data races) - Add RgbaToNv12Converter with compute pipeline, storage/readback buffers - Add finish_encoder_nv12() for NV12 readback path - Add FrameRenderer::render_nv12() method - Add EditorFrameOutput enum (Rgba/Nv12) for frame callback - Editor renderer now produces NV12 frames directly - Frame callback receives NV12 data without CPU conversion - Export path unchanged (still uses RGBA readback) --- apps/desktop/src-tauri/src/editor_window.rs | 72 ++++++++++++++------- apps/desktop/src-tauri/src/lib.rs | 2 +- crates/editor/src/editor.rs | 15 +++-- crates/editor/src/editor_instance.rs | 4 +- crates/editor/src/lib.rs | 2 +- crates/rendering/src/frame_pipeline.rs | 47 +++++++++++++- crates/rendering/src/lib.rs | 66 ++++++++++++++++++- 7 files changed, 173 insertions(+), 35 deletions(-) diff --git a/apps/desktop/src-tauri/src/editor_window.rs b/apps/desktop/src-tauri/src/editor_window.rs index 7af0230041b..35da7c3c4b3 100644 --- a/apps/desktop/src-tauri/src/editor_window.rs +++ b/apps/desktop/src-tauri/src/editor_window.rs @@ -5,7 +5,7 @@ use tokio_util::sync::CancellationToken; use crate::{ create_editor_instance_impl, - frame_ws::{WSFrame, create_watch_frame_ws}, + frame_ws::{WSFrame, WSFrameFormat, create_watch_frame_ws}, }; pub struct EditorInstance { @@ -27,17 +27,30 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { let inner = 
create_editor_instance_impl( &app, path, - Box::new(move |frame| { - let _ = frame_tx.send(Some(std::sync::Arc::new( - WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.padded_bytes_per_row, - frame.frame_number, - frame.target_time_ns, - ), - ))); + Box::new(move |output| { + let ws_frame = match output { + cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.y_stride, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + }, + cap_editor::editor::EditorFrameOutput::Rgba(frame) => { + WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.padded_bytes_per_row, + frame.frame_number, + frame.target_time_ns, + ) + } + }; + let _ = frame_tx.send(Some(std::sync::Arc::new(ws_frame))); }), ) .await?; @@ -219,17 +232,30 @@ impl EditorInstances { let inner = create_editor_instance_impl( window.app_handle(), path, - Box::new(move |frame| { - let _ = frame_tx.send(Some(std::sync::Arc::new( - WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.padded_bytes_per_row, - frame.frame_number, - frame.target_time_ns, - ), - ))); + Box::new(move |output| { + let ws_frame = match output { + cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.y_stride, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + }, + cap_editor::editor::EditorFrameOutput::Rgba(frame) => { + WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.padded_bytes_per_row, + frame.frame_number, + frame.target_time_ns, + ) + } + }; + let _ = frame_tx.send(Some(std::sync::Arc::new(ws_frame))); }), ) .await?; diff --git 
a/apps/desktop/src-tauri/src/lib.rs b/apps/desktop/src-tauri/src/lib.rs index 00dc6903b16..6a24f81a82a 100644 --- a/apps/desktop/src-tauri/src/lib.rs +++ b/apps/desktop/src-tauri/src/lib.rs @@ -3853,7 +3853,7 @@ async fn resume_uploads(app: AppHandle) -> Result<(), String> { async fn create_editor_instance_impl( app: &AppHandle, path: PathBuf, - frame_cb: Box, + frame_cb: Box, ) -> Result, String> { let app = app.clone(); diff --git a/crates/editor/src/editor.rs b/crates/editor/src/editor.rs index a80d18b3992..71cee1d0fb7 100644 --- a/crates/editor/src/editor.rs +++ b/crates/editor/src/editor.rs @@ -3,7 +3,7 @@ use std::time::Instant; use cap_project::{CursorEvents, RecordingMeta, StudioRecordingMeta}; use cap_rendering::{ - DecodedSegmentFrames, FrameRenderer, ProjectRecordingsMeta, ProjectUniforms, + DecodedSegmentFrames, FrameRenderer, Nv12RenderedFrame, ProjectRecordingsMeta, ProjectUniforms, RenderVideoConstants, RenderedFrame, RendererLayers, }; use tokio::sync::{mpsc, oneshot}; @@ -21,9 +21,14 @@ pub enum RendererMessage { }, } +pub enum EditorFrameOutput { + Rgba(RenderedFrame), + Nv12(Nv12RenderedFrame), +} + pub struct Renderer { rx: mpsc::Receiver, - frame_cb: Box, + frame_cb: Box, render_constants: Arc, #[allow(unused)] total_frames: u32, @@ -36,7 +41,7 @@ pub struct RendererHandle { impl Renderer { pub fn spawn( render_constants: Arc, - frame_cb: Box, + frame_cb: Box, recording_meta: &RecordingMeta, meta: &StudioRecordingMeta, ) -> Result { @@ -143,7 +148,7 @@ impl Renderer { } } match frame_renderer - .render( + .render_nv12( current.segment_frames, current.uniforms, ¤t.cursor, @@ -152,7 +157,7 @@ impl Renderer { .await { Ok(frame) => { - (self.frame_cb)(frame); + (self.frame_cb)(EditorFrameOutput::Nv12(frame)); } Err(e) => { tracing::error!(error = %e, "Failed to render frame in editor"); diff --git a/crates/editor/src/editor_instance.rs b/crates/editor/src/editor_instance.rs index a48f65c0632..4879fadc26b 100644 --- 
a/crates/editor/src/editor_instance.rs +++ b/crates/editor/src/editor_instance.rs @@ -8,7 +8,7 @@ use cap_project::{ }; use cap_rendering::{ ProjectRecordingsMeta, ProjectUniforms, RecordingSegmentDecoders, RenderVideoConstants, - RenderedFrame, SegmentVideoPaths, Video, ZoomFocusInterpolator, get_duration, + SegmentVideoPaths, Video, ZoomFocusInterpolator, get_duration, spring_mass_damper::SpringMassDamperSimulationConfig, }; use std::{ @@ -94,7 +94,7 @@ impl EditorInstance { pub async fn new( project_path: PathBuf, on_state_change: impl Fn(&EditorState) + Send + Sync + 'static, - frame_cb: Box, + frame_cb: Box, ) -> Result, String> { if !project_path.exists() { return Err(format!("Video path {} not found!", project_path.display())); diff --git a/crates/editor/src/lib.rs b/crates/editor/src/lib.rs index 89af32ea2ab..a08a7afe29c 100644 --- a/crates/editor/src/lib.rs +++ b/crates/editor/src/lib.rs @@ -1,5 +1,5 @@ mod audio; -mod editor; +pub mod editor; mod editor_instance; mod playback; mod segments; diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index ac4492927b8..f4ca4ca5ced 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -231,7 +231,10 @@ pub struct PendingNv12Readback { } impl PendingNv12Readback { - pub async fn wait(mut self, device: &wgpu::Device) -> Result { + pub async fn wait( + mut self, + device: &wgpu::Device, + ) -> Result { let mut poll_count = 0u32; let start_time = Instant::now(); let timeout_duration = std::time::Duration::from_secs(GPU_BUFFER_WAIT_TIMEOUT_SECS); @@ -748,6 +751,48 @@ pub async fn finish_encoder( pending.wait(device).await } +pub async fn finish_encoder_nv12( + session: &mut RenderSession, + nv12_converter: &mut RgbaToNv12Converter, + device: &wgpu::Device, + queue: &wgpu::Queue, + uniforms: &ProjectUniforms, + mut encoder: wgpu::CommandEncoder, +) -> Result { + let width = uniforms.output_size.0; + let height = uniforms.output_size.1; 
+ + let texture = if session.current_is_left { + &session.textures.0 + } else { + &session.textures.1 + }; + + if let Some(pending) = nv12_converter.convert_and_readback( + device, + queue, + &mut encoder, + texture, + width, + height, + uniforms.frame_number, + uniforms.frame_rate, + ) { + queue.submit(std::iter::once(encoder.finish())); + pending.wait(device).await + } else { + let rgba_frame = finish_encoder(session, device, queue, uniforms, encoder).await?; + Ok(Nv12RenderedFrame { + data: rgba_frame.data, + width: rgba_frame.width, + height: rgba_frame.height, + y_stride: rgba_frame.padded_bytes_per_row, + frame_number: rgba_frame.frame_number, + target_time_ns: rgba_frame.target_time_ns, + }) + } +} + pub async fn flush_pending_readback( session: &mut RenderSession, device: &wgpu::Device, diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index 1bd37aa807b..ac75b938494 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -7,7 +7,7 @@ use composite_frame::CompositeVideoFrameUniforms; use core::f64; use cursor_interpolation::{InterpolatedCursorPosition, interpolate_cursor}; use decoder::{AsyncVideoDecoderHandle, spawn_decoder}; -use frame_pipeline::{RenderSession, finish_encoder, flush_pending_readback}; +use frame_pipeline::{RenderSession, finish_encoder, finish_encoder_nv12, flush_pending_readback}; use futures::FutureExt; use futures::future::OptionFuture; use layers::{ @@ -42,7 +42,7 @@ pub mod zoom_focus_interpolation; pub use coord::*; pub use decoder::{DecodedFrame, DecoderStatus, DecoderType, PixelFormat}; -pub use frame_pipeline::RenderedFrame; +pub use frame_pipeline::{Nv12RenderedFrame, RenderedFrame}; pub use project_recordings::{ProjectRecordingsMeta, SegmentRecordings, Video}; use mask::interpolate_masks; @@ -1799,6 +1799,7 @@ pub struct DecodedSegmentFrames { pub struct FrameRenderer<'a> { constants: &'a RenderVideoConstants, session: Option, + nv12_converter: Option, } impl<'a> FrameRenderer<'a> { @@ 
-1808,6 +1809,7 @@ impl<'a> FrameRenderer<'a> { Self { constants, session: None, + nv12_converter: None, } } @@ -1892,6 +1894,66 @@ impl<'a> FrameRenderer<'a> { None } } + + pub async fn render_nv12( + &mut self, + segment_frames: DecodedSegmentFrames, + uniforms: ProjectUniforms, + cursor: &CursorEvents, + layers: &mut RendererLayers, + ) -> Result { + let session = self.session.get_or_insert_with(|| { + RenderSession::new( + &self.constants.device, + uniforms.output_size.0, + uniforms.output_size.1, + ) + }); + + session.update_texture_size( + &self.constants.device, + uniforms.output_size.0, + uniforms.output_size.1, + ); + + let nv12_converter = self.nv12_converter.get_or_insert_with(|| { + frame_pipeline::RgbaToNv12Converter::new(&self.constants.device) + }); + + let mut encoder = self.constants.device.create_command_encoder( + &(wgpu::CommandEncoderDescriptor { + label: Some("Render Encoder (NV12)"), + }), + ); + + layers + .prepare_with_encoder( + self.constants, + &uniforms, + &segment_frames, + cursor, + &mut encoder, + ) + .await?; + + layers.render( + &self.constants.device, + &self.constants.queue, + &mut encoder, + session, + &uniforms, + ); + + finish_encoder_nv12( + session, + nv12_converter, + &self.constants.device, + &self.constants.queue, + &uniforms, + encoder, + ) + .await + } } pub struct RendererLayers { From 97e2df8b4fb557b3f746baee481e0d2b8b3c374f Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 19:31:16 +0000 Subject: [PATCH 39/54] fix: address clippy warnings in NV12 converter (div_ceil, is_multiple_of) Co-authored-by: Richie McIlroy --- crates/rendering/src/frame_pipeline.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index f4ca4ca5ced..5a3c50aeee6 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -116,7 +116,7 @@ impl RgbaToNv12Converter { } let nv12_size = 
Self::nv12_size(width, height); - let aligned_size = ((nv12_size + 3) / 4) * 4; + let aligned_size = nv12_size.div_ceil(4) * 4; self.nv12_buffer = Some(device.create_buffer(&wgpu::BufferDescriptor { label: Some("NV12 Storage Buffer"), @@ -136,6 +136,7 @@ impl RgbaToNv12Converter { self.cached_height = height; } + #[allow(clippy::too_many_arguments)] pub fn convert_and_readback( &mut self, device: &wgpu::Device, @@ -147,7 +148,7 @@ impl RgbaToNv12Converter { frame_number: u32, frame_rate: u32, ) -> Option { - if width == 0 || height == 0 || width % 4 != 0 || height % 2 != 0 { + if width == 0 || height == 0 || !width.is_multiple_of(4) || !height.is_multiple_of(2) { return None; } From 29fa1354c0fb67ac40d44ae63da0f17e69f9e2f7 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 19:31:16 +0000 Subject: [PATCH 40/54] fix: address clippy warnings in NV12 converter (div_ceil, is_multiple_of) --- crates/rendering/src/frame_pipeline.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index f4ca4ca5ced..5a3c50aeee6 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -116,7 +116,7 @@ impl RgbaToNv12Converter { } let nv12_size = Self::nv12_size(width, height); - let aligned_size = ((nv12_size + 3) / 4) * 4; + let aligned_size = nv12_size.div_ceil(4) * 4; self.nv12_buffer = Some(device.create_buffer(&wgpu::BufferDescriptor { label: Some("NV12 Storage Buffer"), @@ -136,6 +136,7 @@ impl RgbaToNv12Converter { self.cached_height = height; } + #[allow(clippy::too_many_arguments)] pub fn convert_and_readback( &mut self, device: &wgpu::Device, @@ -147,7 +148,7 @@ impl RgbaToNv12Converter { frame_number: u32, frame_rate: u32, ) -> Option { - if width == 0 || height == 0 || width % 4 != 0 || height % 2 != 0 { + if width == 0 || height == 0 || !width.is_multiple_of(4) || 
!height.is_multiple_of(2) { return None; } From af38499769a9ffaa76ffc0fc8fedea8259c1c039 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 19:36:45 +0000 Subject: [PATCH 41/54] improve: add pipelined readback to GPU NV12 conversion path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NV12 readback path was submitting and immediately waiting for each frame, losing the pipelining benefit of the RGBA path. Now it uses the same pattern: return previous frame's result while submitting current frame's readback. Changes: - Add dual readback buffers to RgbaToNv12Converter (alternating) - Refactor convert_and_readback into submit_conversion + start_readback - Store pending readback in converter (take_pending to retrieve) - finish_encoder_nv12 now returns previous frame while current renders - First frame still waits synchronously, subsequent frames pipelined This ensures the NV12 GPU path gets the same pipelining benefit as the RGBA path — GPU readback of frame N overlaps with rendering of frame N+1. 
Co-authored-by: Richie McIlroy --- crates/rendering/src/frame_pipeline.rs | 105 ++++++++++++++++++------- 1 file changed, 78 insertions(+), 27 deletions(-) diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index 5a3c50aeee6..c49d9ba2459 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -12,7 +12,9 @@ pub struct RgbaToNv12Converter { bind_group_layout: wgpu::BindGroupLayout, params_buffer: wgpu::Buffer, nv12_buffer: Option, - readback_buffer: Option>, + readback_buffers: [Option>; 2], + current_readback: usize, + pending: Option, cached_width: u32, cached_height: u32, } @@ -98,7 +100,9 @@ impl RgbaToNv12Converter { bind_group_layout, params_buffer, nv12_buffer: None, - readback_buffer: None, + readback_buffers: [None, None], + current_readback: 0, + pending: None, cached_width: 0, cached_height: 0, } @@ -125,19 +129,23 @@ impl RgbaToNv12Converter { mapped_at_creation: false, })); - self.readback_buffer = Some(Arc::new(device.create_buffer(&wgpu::BufferDescriptor { - label: Some("NV12 Readback Buffer"), - size: nv12_size, - usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, - mapped_at_creation: false, - }))); + let make_readback = || { + Arc::new(device.create_buffer(&wgpu::BufferDescriptor { + label: Some("NV12 Readback Buffer"), + size: nv12_size, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, + mapped_at_creation: false, + })) + }; + self.readback_buffers = [Some(make_readback()), Some(make_readback())]; + self.current_readback = 0; self.cached_width = width; self.cached_height = height; } #[allow(clippy::too_many_arguments)] - pub fn convert_and_readback( + pub fn submit_conversion( &mut self, device: &wgpu::Device, queue: &wgpu::Queue, @@ -147,15 +155,22 @@ impl RgbaToNv12Converter { height: u32, frame_number: u32, frame_rate: u32, - ) -> Option { + ) -> bool { if width == 0 || height == 0 || !width.is_multiple_of(4) || 
!height.is_multiple_of(2) { - return None; + return false; } self.ensure_buffers(device, width, height); - let nv12_buffer = self.nv12_buffer.as_ref()?; - let readback_buffer = self.readback_buffer.as_ref()?.clone(); + let Some(nv12_buffer) = self.nv12_buffer.as_ref() else { + return false; + }; + + let readback_buffer = match self.readback_buffers[self.current_readback].as_ref() { + Some(b) => b.clone(), + None => return false, + }; + self.current_readback = 1 - self.current_readback; let y_stride = width; let uv_stride = width; @@ -202,27 +217,39 @@ impl RgbaToNv12Converter { let nv12_size = Self::nv12_size(width, height); encoder.copy_buffer_to_buffer(nv12_buffer, 0, &readback_buffer, 0, nv12_size); - let (tx, rx) = oneshot::channel(); - readback_buffer - .slice(..) - .map_async(wgpu::MapMode::Read, move |result| { - let _ = tx.send(result); - }); - - Some(PendingNv12Readback { - rx, + self.pending = Some(PendingNv12Readback { + rx: None, buffer: readback_buffer, width, height, y_stride, frame_number, frame_rate, - }) + }); + + true + } + + pub fn start_readback(&mut self) { + if let Some(ref mut pending) = self.pending { + let (tx, rx) = oneshot::channel(); + pending + .buffer + .slice(..) 
+ .map_async(wgpu::MapMode::Read, move |result| { + let _ = tx.send(result); + }); + pending.rx = Some(rx); + } + } + + pub fn take_pending(&mut self) -> Option { + self.pending.take() } } pub struct PendingNv12Readback { - rx: oneshot::Receiver>, + rx: Option>>, buffer: Arc, pub width: u32, pub height: u32, @@ -236,6 +263,10 @@ impl PendingNv12Readback { mut self, device: &wgpu::Device, ) -> Result { + let Some(mut rx) = self.rx.take() else { + return Err(RenderingError::BufferMapWaitingFailed); + }; + let mut poll_count = 0u32; let start_time = Instant::now(); let timeout_duration = std::time::Duration::from_secs(GPU_BUFFER_WAIT_TIMEOUT_SECS); @@ -245,7 +276,7 @@ impl PendingNv12Readback { return Err(RenderingError::BufferMapWaitingFailed); } - match self.rx.try_recv() { + match rx.try_recv() { Ok(result) => { result?; break; @@ -763,13 +794,19 @@ pub async fn finish_encoder_nv12( let width = uniforms.output_size.0; let height = uniforms.output_size.1; + let previous_frame = if let Some(prev) = nv12_converter.take_pending() { + Some(prev.wait(device).await?) 
+ } else { + None + }; + let texture = if session.current_is_left { &session.textures.0 } else { &session.textures.1 }; - if let Some(pending) = nv12_converter.convert_and_readback( + let submitted = nv12_converter.submit_conversion( device, queue, &mut encoder, @@ -778,9 +815,23 @@ pub async fn finish_encoder_nv12( height, uniforms.frame_number, uniforms.frame_rate, - ) { + ); + + if submitted { queue.submit(std::iter::once(encoder.finish())); + nv12_converter.start_readback(); + + if let Some(prev_frame) = previous_frame { + return Ok(prev_frame); + } + + let pending = nv12_converter + .take_pending() + .expect("just submitted a conversion"); pending.wait(device).await + } else if let Some(prev_frame) = previous_frame { + queue.submit(std::iter::once(encoder.finish())); + Ok(prev_frame) } else { let rgba_frame = finish_encoder(session, device, queue, uniforms, encoder).await?; Ok(Nv12RenderedFrame { From 9411b5922b3732917aabfd14f680f1a721d08621 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 19:36:45 +0000 Subject: [PATCH 42/54] improve: add pipelined readback to GPU NV12 conversion path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NV12 readback path was submitting and immediately waiting for each frame, losing the pipelining benefit of the RGBA path. Now it uses the same pattern: return previous frame's result while submitting current frame's readback. Changes: - Add dual readback buffers to RgbaToNv12Converter (alternating) - Refactor convert_and_readback into submit_conversion + start_readback - Store pending readback in converter (take_pending to retrieve) - finish_encoder_nv12 now returns previous frame while current renders - First frame still waits synchronously, subsequent frames pipelined This ensures the NV12 GPU path gets the same pipelining benefit as the RGBA path — GPU readback of frame N overlaps with rendering of frame N+1. 
--- crates/rendering/src/frame_pipeline.rs | 105 ++++++++++++++++++------- 1 file changed, 78 insertions(+), 27 deletions(-) diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index 5a3c50aeee6..c49d9ba2459 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -12,7 +12,9 @@ pub struct RgbaToNv12Converter { bind_group_layout: wgpu::BindGroupLayout, params_buffer: wgpu::Buffer, nv12_buffer: Option, - readback_buffer: Option>, + readback_buffers: [Option>; 2], + current_readback: usize, + pending: Option, cached_width: u32, cached_height: u32, } @@ -98,7 +100,9 @@ impl RgbaToNv12Converter { bind_group_layout, params_buffer, nv12_buffer: None, - readback_buffer: None, + readback_buffers: [None, None], + current_readback: 0, + pending: None, cached_width: 0, cached_height: 0, } @@ -125,19 +129,23 @@ impl RgbaToNv12Converter { mapped_at_creation: false, })); - self.readback_buffer = Some(Arc::new(device.create_buffer(&wgpu::BufferDescriptor { - label: Some("NV12 Readback Buffer"), - size: nv12_size, - usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, - mapped_at_creation: false, - }))); + let make_readback = || { + Arc::new(device.create_buffer(&wgpu::BufferDescriptor { + label: Some("NV12 Readback Buffer"), + size: nv12_size, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, + mapped_at_creation: false, + })) + }; + self.readback_buffers = [Some(make_readback()), Some(make_readback())]; + self.current_readback = 0; self.cached_width = width; self.cached_height = height; } #[allow(clippy::too_many_arguments)] - pub fn convert_and_readback( + pub fn submit_conversion( &mut self, device: &wgpu::Device, queue: &wgpu::Queue, @@ -147,15 +155,22 @@ impl RgbaToNv12Converter { height: u32, frame_number: u32, frame_rate: u32, - ) -> Option { + ) -> bool { if width == 0 || height == 0 || !width.is_multiple_of(4) || !height.is_multiple_of(2) { - return 
None; + return false; } self.ensure_buffers(device, width, height); - let nv12_buffer = self.nv12_buffer.as_ref()?; - let readback_buffer = self.readback_buffer.as_ref()?.clone(); + let Some(nv12_buffer) = self.nv12_buffer.as_ref() else { + return false; + }; + + let readback_buffer = match self.readback_buffers[self.current_readback].as_ref() { + Some(b) => b.clone(), + None => return false, + }; + self.current_readback = 1 - self.current_readback; let y_stride = width; let uv_stride = width; @@ -202,27 +217,39 @@ impl RgbaToNv12Converter { let nv12_size = Self::nv12_size(width, height); encoder.copy_buffer_to_buffer(nv12_buffer, 0, &readback_buffer, 0, nv12_size); - let (tx, rx) = oneshot::channel(); - readback_buffer - .slice(..) - .map_async(wgpu::MapMode::Read, move |result| { - let _ = tx.send(result); - }); - - Some(PendingNv12Readback { - rx, + self.pending = Some(PendingNv12Readback { + rx: None, buffer: readback_buffer, width, height, y_stride, frame_number, frame_rate, - }) + }); + + true + } + + pub fn start_readback(&mut self) { + if let Some(ref mut pending) = self.pending { + let (tx, rx) = oneshot::channel(); + pending + .buffer + .slice(..) 
+ .map_async(wgpu::MapMode::Read, move |result| { + let _ = tx.send(result); + }); + pending.rx = Some(rx); + } + } + + pub fn take_pending(&mut self) -> Option { + self.pending.take() } } pub struct PendingNv12Readback { - rx: oneshot::Receiver>, + rx: Option>>, buffer: Arc, pub width: u32, pub height: u32, @@ -236,6 +263,10 @@ impl PendingNv12Readback { mut self, device: &wgpu::Device, ) -> Result { + let Some(mut rx) = self.rx.take() else { + return Err(RenderingError::BufferMapWaitingFailed); + }; + let mut poll_count = 0u32; let start_time = Instant::now(); let timeout_duration = std::time::Duration::from_secs(GPU_BUFFER_WAIT_TIMEOUT_SECS); @@ -245,7 +276,7 @@ impl PendingNv12Readback { return Err(RenderingError::BufferMapWaitingFailed); } - match self.rx.try_recv() { + match rx.try_recv() { Ok(result) => { result?; break; @@ -763,13 +794,19 @@ pub async fn finish_encoder_nv12( let width = uniforms.output_size.0; let height = uniforms.output_size.1; + let previous_frame = if let Some(prev) = nv12_converter.take_pending() { + Some(prev.wait(device).await?) 
+ } else { + None + }; + let texture = if session.current_is_left { &session.textures.0 } else { &session.textures.1 }; - if let Some(pending) = nv12_converter.convert_and_readback( + let submitted = nv12_converter.submit_conversion( device, queue, &mut encoder, @@ -778,9 +815,23 @@ pub async fn finish_encoder_nv12( height, uniforms.frame_number, uniforms.frame_rate, - ) { + ); + + if submitted { queue.submit(std::iter::once(encoder.finish())); + nv12_converter.start_readback(); + + if let Some(prev_frame) = previous_frame { + return Ok(prev_frame); + } + + let pending = nv12_converter + .take_pending() + .expect("just submitted a conversion"); pending.wait(device).await + } else if let Some(prev_frame) = previous_frame { + queue.submit(std::iter::once(encoder.finish())); + Ok(prev_frame) } else { let rgba_frame = finish_encoder(session, device, queue, uniforms, encoder).await?; Ok(Nv12RenderedFrame { From b5cbf8da444c64c73e72a94394538ff5c3cd2da6 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 19:40:06 +0000 Subject: [PATCH 43/54] fix: add GPU error retry logic to render_nv12 matching RGBA render path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The render_nv12 method was missing the retry-on-GPU-error logic that the RGBA render method has. If a GPU buffer mapping fails (which can happen transiently on some hardware), it now retries up to 3 times with progressive backoff — same as the RGBA path. On retry, both the RenderSession and the NV12 converter are reset to clear any stale GPU state. 
Co-authored-by: Richie McIlroy --- crates/rendering/src/lib.rs | 114 +++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 41 deletions(-) diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index ac75b938494..d74f53b7d8f 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -1902,57 +1902,89 @@ impl<'a> FrameRenderer<'a> { cursor: &CursorEvents, layers: &mut RendererLayers, ) -> Result { - let session = self.session.get_or_insert_with(|| { - RenderSession::new( + let mut last_error = None; + + for attempt in 0..Self::MAX_RENDER_RETRIES { + if attempt > 0 { + tracing::warn!( + frame_number = uniforms.frame_number, + attempt = attempt + 1, + "Retrying NV12 frame render after GPU error" + ); + self.reset_session(); + self.nv12_converter = None; + tokio::time::sleep(std::time::Duration::from_millis(100 * (attempt as u64 + 1))) + .await; + } + + let session = self.session.get_or_insert_with(|| { + RenderSession::new( + &self.constants.device, + uniforms.output_size.0, + uniforms.output_size.1, + ) + }); + + session.update_texture_size( &self.constants.device, uniforms.output_size.0, uniforms.output_size.1, - ) - }); + ); - session.update_texture_size( - &self.constants.device, - uniforms.output_size.0, - uniforms.output_size.1, - ); + let nv12_converter = self.nv12_converter.get_or_insert_with(|| { + frame_pipeline::RgbaToNv12Converter::new(&self.constants.device) + }); - let nv12_converter = self.nv12_converter.get_or_insert_with(|| { - frame_pipeline::RgbaToNv12Converter::new(&self.constants.device) - }); + let mut encoder = self.constants.device.create_command_encoder( + &(wgpu::CommandEncoderDescriptor { + label: Some("Render Encoder (NV12)"), + }), + ); - let mut encoder = self.constants.device.create_command_encoder( - &(wgpu::CommandEncoderDescriptor { - label: Some("Render Encoder (NV12)"), - }), - ); + if let Err(e) = layers + .prepare_with_encoder( + self.constants, + &uniforms, + &segment_frames, + 
cursor, + &mut encoder, + ) + .await + { + last_error = Some(e); + continue; + } - layers - .prepare_with_encoder( - self.constants, - &uniforms, - &segment_frames, - cursor, + layers.render( + &self.constants.device, + &self.constants.queue, &mut encoder, - ) - .await?; + session, + &uniforms, + ); - layers.render( - &self.constants.device, - &self.constants.queue, - &mut encoder, - session, - &uniforms, - ); + match finish_encoder_nv12( + session, + nv12_converter, + &self.constants.device, + &self.constants.queue, + &uniforms, + encoder, + ) + .await + { + Ok(frame) => return Ok(frame), + Err(RenderingError::BufferMapWaitingFailed) => { + last_error = Some(RenderingError::BufferMapWaitingFailed); + } + Err(RenderingError::BufferMapFailed(e)) => { + last_error = Some(RenderingError::BufferMapFailed(e)); + } + Err(e) => return Err(e), + } + } - finish_encoder_nv12( - session, - nv12_converter, - &self.constants.device, - &self.constants.queue, - &uniforms, - encoder, - ) - .await + Err(last_error.unwrap_or(RenderingError::BufferMapWaitingFailed)) } } From 5ad740df44ec77945f3f67d4282532c60592bd41 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 19:40:06 +0000 Subject: [PATCH 44/54] fix: add GPU error retry logic to render_nv12 matching RGBA render path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The render_nv12 method was missing the retry-on-GPU-error logic that the RGBA render method has. If a GPU buffer mapping fails (which can happen transiently on some hardware), it now retries up to 3 times with progressive backoff — same as the RGBA path. On retry, both the RenderSession and the NV12 converter are reset to clear any stale GPU state. 
--- crates/rendering/src/lib.rs | 114 +++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 41 deletions(-) diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index ac75b938494..d74f53b7d8f 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -1902,57 +1902,89 @@ impl<'a> FrameRenderer<'a> { cursor: &CursorEvents, layers: &mut RendererLayers, ) -> Result { - let session = self.session.get_or_insert_with(|| { - RenderSession::new( + let mut last_error = None; + + for attempt in 0..Self::MAX_RENDER_RETRIES { + if attempt > 0 { + tracing::warn!( + frame_number = uniforms.frame_number, + attempt = attempt + 1, + "Retrying NV12 frame render after GPU error" + ); + self.reset_session(); + self.nv12_converter = None; + tokio::time::sleep(std::time::Duration::from_millis(100 * (attempt as u64 + 1))) + .await; + } + + let session = self.session.get_or_insert_with(|| { + RenderSession::new( + &self.constants.device, + uniforms.output_size.0, + uniforms.output_size.1, + ) + }); + + session.update_texture_size( &self.constants.device, uniforms.output_size.0, uniforms.output_size.1, - ) - }); + ); - session.update_texture_size( - &self.constants.device, - uniforms.output_size.0, - uniforms.output_size.1, - ); + let nv12_converter = self.nv12_converter.get_or_insert_with(|| { + frame_pipeline::RgbaToNv12Converter::new(&self.constants.device) + }); - let nv12_converter = self.nv12_converter.get_or_insert_with(|| { - frame_pipeline::RgbaToNv12Converter::new(&self.constants.device) - }); + let mut encoder = self.constants.device.create_command_encoder( + &(wgpu::CommandEncoderDescriptor { + label: Some("Render Encoder (NV12)"), + }), + ); - let mut encoder = self.constants.device.create_command_encoder( - &(wgpu::CommandEncoderDescriptor { - label: Some("Render Encoder (NV12)"), - }), - ); + if let Err(e) = layers + .prepare_with_encoder( + self.constants, + &uniforms, + &segment_frames, + cursor, + &mut encoder, + ) + 
.await + { + last_error = Some(e); + continue; + } - layers - .prepare_with_encoder( - self.constants, - &uniforms, - &segment_frames, - cursor, + layers.render( + &self.constants.device, + &self.constants.queue, &mut encoder, - ) - .await?; + session, + &uniforms, + ); - layers.render( - &self.constants.device, - &self.constants.queue, - &mut encoder, - session, - &uniforms, - ); + match finish_encoder_nv12( + session, + nv12_converter, + &self.constants.device, + &self.constants.queue, + &uniforms, + encoder, + ) + .await + { + Ok(frame) => return Ok(frame), + Err(RenderingError::BufferMapWaitingFailed) => { + last_error = Some(RenderingError::BufferMapWaitingFailed); + } + Err(RenderingError::BufferMapFailed(e)) => { + last_error = Some(RenderingError::BufferMapFailed(e)); + } + Err(e) => return Err(e), + } + } - finish_encoder_nv12( - session, - nv12_converter, - &self.constants.device, - &self.constants.queue, - &uniforms, - encoder, - ) - .await + Err(last_error.unwrap_or(RenderingError::BufferMapWaitingFailed)) } } From 5adfdf95cfcbefe7d3e7602dbb91c7c475f0693b Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 19:44:39 +0000 Subject: [PATCH 45/54] fix: handle RGBA fallback correctly in GPU NV12 conversion path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the GPU NV12 conversion fails (e.g., dimensions not a multiple of 4), the fallback path was wrapping RGBA data in Nv12RenderedFrame and sending it to the frontend tagged as NV12 format. The frontend WebGPU shader would try to decode NV12, producing incorrect colors. 
Fix: - Add GpuOutputFormat enum (Nv12/Rgba) to Nv12RenderedFrame - GPU NV12 path sets format=Nv12, fallback sets format=Rgba - Editor frame callback checks the format field: if Nv12, sends directly; if Rgba, does CPU RGBA→NV12 conversion before sending - Ensures the frontend always receives correctly formatted NV12 data Co-authored-by: Richie McIlroy --- apps/desktop/src-tauri/src/editor_window.rs | 66 ++++++++++++++------- crates/rendering/src/frame_pipeline.rs | 9 +++ crates/rendering/src/lib.rs | 2 +- 3 files changed, 56 insertions(+), 21 deletions(-) diff --git a/apps/desktop/src-tauri/src/editor_window.rs b/apps/desktop/src-tauri/src/editor_window.rs index 35da7c3c4b3..d8042a925f6 100644 --- a/apps/desktop/src-tauri/src/editor_window.rs +++ b/apps/desktop/src-tauri/src/editor_window.rs @@ -29,16 +29,29 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { path, Box::new(move |output| { let ws_frame = match output { - cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { - data: frame.data, - width: frame.width, - height: frame.height, - stride: frame.y_stride, - frame_number: frame.frame_number, - target_time_ns: frame.target_time_ns, - format: WSFrameFormat::Nv12, - created_at: Instant::now(), - }, + cap_editor::editor::EditorFrameOutput::Nv12(frame) => { + if frame.format == cap_rendering::GpuOutputFormat::Nv12 { + WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.y_stride, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + } + } else { + WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.y_stride, + frame.frame_number, + frame.target_time_ns, + ) + } + } cap_editor::editor::EditorFrameOutput::Rgba(frame) => { WSFrame::from_rendered_frame_nv12( frame.data, @@ -234,16 +247,29 @@ impl EditorInstances { path, Box::new(move |output| { let ws_frame = match output { - 
cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { - data: frame.data, - width: frame.width, - height: frame.height, - stride: frame.y_stride, - frame_number: frame.frame_number, - target_time_ns: frame.target_time_ns, - format: WSFrameFormat::Nv12, - created_at: Instant::now(), - }, + cap_editor::editor::EditorFrameOutput::Nv12(frame) => { + if frame.format == cap_rendering::GpuOutputFormat::Nv12 { + WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.y_stride, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + } + } else { + WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.y_stride, + frame.frame_number, + frame.target_time_ns, + ) + } + } cap_editor::editor::EditorFrameOutput::Rgba(frame) => { WSFrame::from_rendered_frame_nv12( frame.data, diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index c49d9ba2459..5b9e207ee24 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -315,10 +315,17 @@ impl PendingNv12Readback { y_stride: self.y_stride, frame_number: self.frame_number, target_time_ns, + format: GpuOutputFormat::Nv12, }) } } +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum GpuOutputFormat { + Nv12, + Rgba, +} + pub struct Nv12RenderedFrame { pub data: Vec, pub width: u32, @@ -326,6 +333,7 @@ pub struct Nv12RenderedFrame { pub y_stride: u32, pub frame_number: u32, pub target_time_ns: u64, + pub format: GpuOutputFormat, } pub struct PendingReadback { @@ -841,6 +849,7 @@ pub async fn finish_encoder_nv12( y_stride: rgba_frame.padded_bytes_per_row, frame_number: rgba_frame.frame_number, target_time_ns: rgba_frame.target_time_ns, + format: GpuOutputFormat::Rgba, }) } } diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index d74f53b7d8f..092465e037d 100644 --- 
a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -42,7 +42,7 @@ pub mod zoom_focus_interpolation; pub use coord::*; pub use decoder::{DecodedFrame, DecoderStatus, DecoderType, PixelFormat}; -pub use frame_pipeline::{Nv12RenderedFrame, RenderedFrame}; +pub use frame_pipeline::{GpuOutputFormat, Nv12RenderedFrame, RenderedFrame}; pub use project_recordings::{ProjectRecordingsMeta, SegmentRecordings, Video}; use mask::interpolate_masks; From 8ec07ac0c132d2e9e4ad70fe6a95e01df5a0512c Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 19:44:39 +0000 Subject: [PATCH 46/54] fix: handle RGBA fallback correctly in GPU NV12 conversion path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the GPU NV12 conversion fails (e.g., dimensions not a multiple of 4), the fallback path was wrapping RGBA data in Nv12RenderedFrame and sending it to the frontend tagged as NV12 format. The frontend WebGPU shader would try to decode NV12, producing incorrect colors. 
Fix: - Add GpuOutputFormat enum (Nv12/Rgba) to Nv12RenderedFrame - GPU NV12 path sets format=Nv12, fallback sets format=Rgba - Editor frame callback checks the format field: if Nv12, sends directly; if Rgba, does CPU RGBA→NV12 conversion before sending - Ensures the frontend always receives correctly formatted NV12 data --- apps/desktop/src-tauri/src/editor_window.rs | 66 ++++++++++++++------- crates/rendering/src/frame_pipeline.rs | 9 +++ crates/rendering/src/lib.rs | 2 +- 3 files changed, 56 insertions(+), 21 deletions(-) diff --git a/apps/desktop/src-tauri/src/editor_window.rs b/apps/desktop/src-tauri/src/editor_window.rs index 35da7c3c4b3..d8042a925f6 100644 --- a/apps/desktop/src-tauri/src/editor_window.rs +++ b/apps/desktop/src-tauri/src/editor_window.rs @@ -29,16 +29,29 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { path, Box::new(move |output| { let ws_frame = match output { - cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { - data: frame.data, - width: frame.width, - height: frame.height, - stride: frame.y_stride, - frame_number: frame.frame_number, - target_time_ns: frame.target_time_ns, - format: WSFrameFormat::Nv12, - created_at: Instant::now(), - }, + cap_editor::editor::EditorFrameOutput::Nv12(frame) => { + if frame.format == cap_rendering::GpuOutputFormat::Nv12 { + WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.y_stride, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + } + } else { + WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.y_stride, + frame.frame_number, + frame.target_time_ns, + ) + } + } cap_editor::editor::EditorFrameOutput::Rgba(frame) => { WSFrame::from_rendered_frame_nv12( frame.data, @@ -234,16 +247,29 @@ impl EditorInstances { path, Box::new(move |output| { let ws_frame = match output { - 
cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { - data: frame.data, - width: frame.width, - height: frame.height, - stride: frame.y_stride, - frame_number: frame.frame_number, - target_time_ns: frame.target_time_ns, - format: WSFrameFormat::Nv12, - created_at: Instant::now(), - }, + cap_editor::editor::EditorFrameOutput::Nv12(frame) => { + if frame.format == cap_rendering::GpuOutputFormat::Nv12 { + WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.y_stride, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + } + } else { + WSFrame::from_rendered_frame_nv12( + frame.data, + frame.width, + frame.height, + frame.y_stride, + frame.frame_number, + frame.target_time_ns, + ) + } + } cap_editor::editor::EditorFrameOutput::Rgba(frame) => { WSFrame::from_rendered_frame_nv12( frame.data, diff --git a/crates/rendering/src/frame_pipeline.rs b/crates/rendering/src/frame_pipeline.rs index c49d9ba2459..5b9e207ee24 100644 --- a/crates/rendering/src/frame_pipeline.rs +++ b/crates/rendering/src/frame_pipeline.rs @@ -315,10 +315,17 @@ impl PendingNv12Readback { y_stride: self.y_stride, frame_number: self.frame_number, target_time_ns, + format: GpuOutputFormat::Nv12, }) } } +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum GpuOutputFormat { + Nv12, + Rgba, +} + pub struct Nv12RenderedFrame { pub data: Vec, pub width: u32, @@ -326,6 +333,7 @@ pub struct Nv12RenderedFrame { pub y_stride: u32, pub frame_number: u32, pub target_time_ns: u64, + pub format: GpuOutputFormat, } pub struct PendingReadback { @@ -841,6 +849,7 @@ pub async fn finish_encoder_nv12( y_stride: rgba_frame.padded_bytes_per_row, frame_number: rgba_frame.frame_number, target_time_ns: rgba_frame.target_time_ns, + format: GpuOutputFormat::Rgba, }) } } diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index d74f53b7d8f..092465e037d 100644 --- 
a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -42,7 +42,7 @@ pub mod zoom_focus_interpolation; pub use coord::*; pub use decoder::{DecodedFrame, DecoderStatus, DecoderType, PixelFormat}; -pub use frame_pipeline::{Nv12RenderedFrame, RenderedFrame}; +pub use frame_pipeline::{GpuOutputFormat, Nv12RenderedFrame, RenderedFrame}; pub use project_recordings::{ProjectRecordingsMeta, SegmentRecordings, Video}; use mask::interpolate_masks; From 4a292eef40a164028f207b3987f2e8fc47549b5a Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 14 Feb 2026 23:54:36 +0000 Subject: [PATCH 47/54] fix: revert to RGBA GPU rendering to fix full-resolution playback regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GPU RGBA→NV12 compute shader was adding too much GPU-side work at full resolution (1920x1080), causing playback to drop from ~60fps to ~24-30fps on M4 Max. The GPU compute overhead exceeded the readback bandwidth savings. Changes: - Revert editor renderer from render_nv12 back to render (RGBA output) - Restore renderer channel capacity from 4 back to 8 - CPU NV12 conversion still happens in the frame callback for WS bandwidth savings (~1-2ms, well within frame budget) The GPU NV12 infrastructure (shader, pipeline, pipelined readback) is preserved for future use when it can be made resolution-adaptive, but the editor now uses the standard RGBA render path that works well at all resolutions. 
All other optimizations remain active: - Batched GPU command submissions (YUV→RGBA + render in single encoder) - Pipelined RGBA readback (previous frame returned while current renders) - NV12 over WebSocket (CPU conversion, 62% bandwidth reduction) - Arc (no deep clone) - Prefetch/decode parallelism (6-8 tasks, 90-frame cache) - Audio sync improvements - Performance instrumentation Co-authored-by: Richie McIlroy --- crates/editor/src/editor.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/editor/src/editor.rs b/crates/editor/src/editor.rs index 71cee1d0fb7..d3f5d911cd7 100644 --- a/crates/editor/src/editor.rs +++ b/crates/editor/src/editor.rs @@ -60,7 +60,7 @@ impl Renderer { let total_frames = (30_f64 * max_duration).ceil() as u32; - let (tx, rx) = mpsc::channel(4); + let (tx, rx) = mpsc::channel(8); let this = Self { rx, @@ -148,7 +148,7 @@ impl Renderer { } } match frame_renderer - .render_nv12( + .render( current.segment_frames, current.uniforms, &current.cursor, @@ -157,7 +157,7 @@ impl Renderer { .await { Ok(frame) => { - (self.frame_cb)(EditorFrameOutput::Nv12(frame)); + (self.frame_cb)(EditorFrameOutput::Rgba(frame)); } Err(e) => { tracing::error!(error = %e, "Failed to render frame in editor"); From acd4c32ed6702a824b67f27a2ebea73960d879f4 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sat, 14 Feb 2026 23:54:36 +0000 Subject: [PATCH 48/54] fix: revert to RGBA GPU rendering to fix full-resolution playback regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GPU RGBA→NV12 compute shader was adding too much GPU-side work at full resolution (1920x1080), causing playback to drop from ~60fps to ~24-30fps on M4 Max. The GPU compute overhead exceeded the readback bandwidth savings.
Changes: - Revert editor renderer from render_nv12 back to render (RGBA output) - Restore renderer channel capacity from 4 back to 8 - CPU NV12 conversion still happens in the frame callback for WS bandwidth savings (~1-2ms, well within frame budget) The GPU NV12 infrastructure (shader, pipeline, pipelined readback) is preserved for future use when it can be made resolution-adaptive, but the editor now uses the standard RGBA render path that works well at all resolutions. All other optimizations remain active: - Batched GPU command submissions (YUV→RGBA + render in single encoder) - Pipelined RGBA readback (previous frame returned while current renders) - NV12 over WebSocket (CPU conversion, 62% bandwidth reduction) - Arc (no deep clone) - Prefetch/decode parallelism (6-8 tasks, 90-frame cache) - Audio sync improvements - Performance instrumentation --- crates/editor/src/editor.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/editor/src/editor.rs b/crates/editor/src/editor.rs index 71cee1d0fb7..d3f5d911cd7 100644 --- a/crates/editor/src/editor.rs +++ b/crates/editor/src/editor.rs @@ -60,7 +60,7 @@ impl Renderer { let total_frames = (30_f64 * max_duration).ceil() as u32; - let (tx, rx) = mpsc::channel(4); + let (tx, rx) = mpsc::channel(8); let this = Self { rx, @@ -148,7 +148,7 @@ impl Renderer { } } match frame_renderer - .render_nv12( + .render( current.segment_frames, current.uniforms, &current.cursor, @@ -157,7 +157,7 @@ impl Renderer { .await { Ok(frame) => { - (self.frame_cb)(EditorFrameOutput::Nv12(frame)); + (self.frame_cb)(EditorFrameOutput::Rgba(frame)); } Err(e) => { tracing::error!(error = %e, "Failed to render frame in editor"); From 0811353c88814f6c022b521a23ba45fbcfbd05f9 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sun, 15 Feb 2026 00:10:46 +0000 Subject: [PATCH 49/54] Simplify WSFrame construction and borrow scope --- apps/desktop/src-tauri/src/editor_window.rs | 106
++++++++------------ apps/desktop/src-tauri/src/frame_ws.rs | 10 +- apps/desktop/src-tauri/src/lib.rs | 2 +- 3 files changed, 46 insertions(+), 72 deletions(-) diff --git a/apps/desktop/src-tauri/src/editor_window.rs b/apps/desktop/src-tauri/src/editor_window.rs index d8042a925f6..1c79a324182 100644 --- a/apps/desktop/src-tauri/src/editor_window.rs +++ b/apps/desktop/src-tauri/src/editor_window.rs @@ -29,39 +29,26 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { path, Box::new(move |output| { let ws_frame = match output { - cap_editor::editor::EditorFrameOutput::Nv12(frame) => { - if frame.format == cap_rendering::GpuOutputFormat::Nv12 { - WSFrame { - data: frame.data, - width: frame.width, - height: frame.height, - stride: frame.y_stride, - frame_number: frame.frame_number, - target_time_ns: frame.target_time_ns, - format: WSFrameFormat::Nv12, - created_at: Instant::now(), - } - } else { - WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.y_stride, - frame.frame_number, - frame.target_time_ns, - ) - } - } - cap_editor::editor::EditorFrameOutput::Rgba(frame) => { - WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.padded_bytes_per_row, - frame.frame_number, - frame.target_time_ns, - ) - } + cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.y_stride, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + }, + cap_editor::editor::EditorFrameOutput::Rgba(frame) => WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.padded_bytes_per_row, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Rgba, + created_at: Instant::now(), + }, }; let _ = frame_tx.send(Some(std::sync::Arc::new(ws_frame))); }), @@ -247,39 
+234,26 @@ impl EditorInstances { path, Box::new(move |output| { let ws_frame = match output { - cap_editor::editor::EditorFrameOutput::Nv12(frame) => { - if frame.format == cap_rendering::GpuOutputFormat::Nv12 { - WSFrame { - data: frame.data, - width: frame.width, - height: frame.height, - stride: frame.y_stride, - frame_number: frame.frame_number, - target_time_ns: frame.target_time_ns, - format: WSFrameFormat::Nv12, - created_at: Instant::now(), - } - } else { - WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.y_stride, - frame.frame_number, - frame.target_time_ns, - ) - } - } - cap_editor::editor::EditorFrameOutput::Rgba(frame) => { - WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.padded_bytes_per_row, - frame.frame_number, - frame.target_time_ns, - ) - } + cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.y_stride, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + }, + cap_editor::editor::EditorFrameOutput::Rgba(frame) => WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.padded_bytes_per_row, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Rgba, + created_at: Instant::now(), + }, }; let _ = frame_tx.send(Some(std::sync::Arc::new(ws_frame))); }), diff --git a/apps/desktop/src-tauri/src/frame_ws.rs b/apps/desktop/src-tauri/src/frame_ws.rs index f31f8e8e018..f3fe9d7ec56 100644 --- a/apps/desktop/src-tauri/src/frame_ws.rs +++ b/apps/desktop/src-tauri/src/frame_ws.rs @@ -265,11 +265,11 @@ pub async fn create_watch_frame_ws( let now = std::time::Instant::now(); { - let borrowed = camera_rx.borrow(); - if let Some(frame) = borrowed.as_deref() { - let packed = pack_ws_frame_ref(frame); - - drop(borrowed); + let 
packed = { + let borrowed = camera_rx.borrow(); + borrowed.as_deref().map(pack_ws_frame_ref) + }; + if let Some(packed) = packed { if let Err(e) = socket.send(Message::Binary(packed)).await { tracing::error!("Failed to send initial frame to socket: {:?}", e); return; diff --git a/apps/desktop/src-tauri/src/lib.rs b/apps/desktop/src-tauri/src/lib.rs index 6a24f81a82a..2cdc32d5d4a 100644 --- a/apps/desktop/src-tauri/src/lib.rs +++ b/apps/desktop/src-tauri/src/lib.rs @@ -54,7 +54,7 @@ use cap_recording::{ }, sources::screen_capture::ScreenCaptureTarget, }; -use cap_rendering::{ProjectRecordingsMeta, RenderedFrame}; +use cap_rendering::ProjectRecordingsMeta; use clipboard_rs::common::RustImage; use clipboard_rs::{Clipboard, ClipboardContext}; use cpal::StreamError; From f478b4d7a0b286f82b959dcd8318604572f06d91 Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sun, 15 Feb 2026 00:10:46 +0000 Subject: [PATCH 50/54] Simplify WSFrame construction and borrow scope --- apps/desktop/src-tauri/src/editor_window.rs | 106 ++++++++------------ apps/desktop/src-tauri/src/frame_ws.rs | 10 +- apps/desktop/src-tauri/src/lib.rs | 2 +- 3 files changed, 46 insertions(+), 72 deletions(-) diff --git a/apps/desktop/src-tauri/src/editor_window.rs b/apps/desktop/src-tauri/src/editor_window.rs index d8042a925f6..1c79a324182 100644 --- a/apps/desktop/src-tauri/src/editor_window.rs +++ b/apps/desktop/src-tauri/src/editor_window.rs @@ -29,39 +29,26 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { path, Box::new(move |output| { let ws_frame = match output { - cap_editor::editor::EditorFrameOutput::Nv12(frame) => { - if frame.format == cap_rendering::GpuOutputFormat::Nv12 { - WSFrame { - data: frame.data, - width: frame.width, - height: frame.height, - stride: frame.y_stride, - frame_number: frame.frame_number, - target_time_ns: frame.target_time_ns, - format: WSFrameFormat::Nv12, - created_at: Instant::now(), - } - } else 
{ - WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.y_stride, - frame.frame_number, - frame.target_time_ns, - ) - } - } - cap_editor::editor::EditorFrameOutput::Rgba(frame) => { - WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.padded_bytes_per_row, - frame.frame_number, - frame.target_time_ns, - ) - } + cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.y_stride, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + }, + cap_editor::editor::EditorFrameOutput::Rgba(frame) => WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.padded_bytes_per_row, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Rgba, + created_at: Instant::now(), + }, }; let _ = frame_tx.send(Some(std::sync::Arc::new(ws_frame))); }), @@ -247,39 +234,26 @@ impl EditorInstances { path, Box::new(move |output| { let ws_frame = match output { - cap_editor::editor::EditorFrameOutput::Nv12(frame) => { - if frame.format == cap_rendering::GpuOutputFormat::Nv12 { - WSFrame { - data: frame.data, - width: frame.width, - height: frame.height, - stride: frame.y_stride, - frame_number: frame.frame_number, - target_time_ns: frame.target_time_ns, - format: WSFrameFormat::Nv12, - created_at: Instant::now(), - } - } else { - WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.y_stride, - frame.frame_number, - frame.target_time_ns, - ) - } - } - cap_editor::editor::EditorFrameOutput::Rgba(frame) => { - WSFrame::from_rendered_frame_nv12( - frame.data, - frame.width, - frame.height, - frame.padded_bytes_per_row, - frame.frame_number, - frame.target_time_ns, - ) - } + cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { + data: 
frame.data, + width: frame.width, + height: frame.height, + stride: frame.y_stride, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Nv12, + created_at: Instant::now(), + }, + cap_editor::editor::EditorFrameOutput::Rgba(frame) => WSFrame { + data: frame.data, + width: frame.width, + height: frame.height, + stride: frame.padded_bytes_per_row, + frame_number: frame.frame_number, + target_time_ns: frame.target_time_ns, + format: WSFrameFormat::Rgba, + created_at: Instant::now(), + }, }; let _ = frame_tx.send(Some(std::sync::Arc::new(ws_frame))); }), diff --git a/apps/desktop/src-tauri/src/frame_ws.rs b/apps/desktop/src-tauri/src/frame_ws.rs index f31f8e8e018..f3fe9d7ec56 100644 --- a/apps/desktop/src-tauri/src/frame_ws.rs +++ b/apps/desktop/src-tauri/src/frame_ws.rs @@ -265,11 +265,11 @@ pub async fn create_watch_frame_ws( let now = std::time::Instant::now(); { - let borrowed = camera_rx.borrow(); - if let Some(frame) = borrowed.as_deref() { - let packed = pack_ws_frame_ref(frame); - - drop(borrowed); + let packed = { + let borrowed = camera_rx.borrow(); + borrowed.as_deref().map(pack_ws_frame_ref) + }; + if let Some(packed) = packed { if let Err(e) = socket.send(Message::Binary(packed)).await { tracing::error!("Failed to send initial frame to socket: {:?}", e); return; diff --git a/apps/desktop/src-tauri/src/lib.rs b/apps/desktop/src-tauri/src/lib.rs index 6a24f81a82a..2cdc32d5d4a 100644 --- a/apps/desktop/src-tauri/src/lib.rs +++ b/apps/desktop/src-tauri/src/lib.rs @@ -54,7 +54,7 @@ use cap_recording::{ }, sources::screen_capture::ScreenCaptureTarget, }; -use cap_rendering::{ProjectRecordingsMeta, RenderedFrame}; +use cap_rendering::ProjectRecordingsMeta; use clipboard_rs::common::RustImage; use clipboard_rs::{Clipboard, ClipboardContext}; use cpal::StreamError; From f842737542413460b9d6a41bd953de11230afbdb Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sun, 15 
Feb 2026 00:17:00 +0000 Subject: [PATCH 51/54] cleanup --- apps/desktop/src-tauri/src/editor_window.rs | 8 +- apps/desktop/src-tauri/src/frame_ws.rs | 137 +------------------- apps/desktop/src-tauri/src/lib.rs | 2 +- crates/editor/src/lib.rs | 3 +- crates/editor/src/playback.rs | 5 +- crates/rendering/src/lib.rs | 7 +- 6 files changed, 15 insertions(+), 147 deletions(-) diff --git a/apps/desktop/src-tauri/src/editor_window.rs b/apps/desktop/src-tauri/src/editor_window.rs index 1c79a324182..33a2d56f8c2 100644 --- a/apps/desktop/src-tauri/src/editor_window.rs +++ b/apps/desktop/src-tauri/src/editor_window.rs @@ -29,7 +29,7 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { path, Box::new(move |output| { let ws_frame = match output { - cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { + cap_editor::EditorFrameOutput::Nv12(frame) => WSFrame { data: frame.data, width: frame.width, height: frame.height, @@ -39,7 +39,7 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { format: WSFrameFormat::Nv12, created_at: Instant::now(), }, - cap_editor::editor::EditorFrameOutput::Rgba(frame) => WSFrame { + cap_editor::EditorFrameOutput::Rgba(frame) => WSFrame { data: frame.data, width: frame.width, height: frame.height, @@ -234,7 +234,7 @@ impl EditorInstances { path, Box::new(move |output| { let ws_frame = match output { - cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { + cap_editor::EditorFrameOutput::Nv12(frame) => WSFrame { data: frame.data, width: frame.width, height: frame.height, @@ -244,7 +244,7 @@ impl EditorInstances { format: WSFrameFormat::Nv12, created_at: Instant::now(), }, - cap_editor::editor::EditorFrameOutput::Rgba(frame) => WSFrame { + cap_editor::EditorFrameOutput::Rgba(frame) => WSFrame { data: frame.data, width: frame.width, height: frame.height, diff --git a/apps/desktop/src-tauri/src/frame_ws.rs b/apps/desktop/src-tauri/src/frame_ws.rs index f3fe9d7ec56..3d959e4a926 100644 --- 
a/apps/desktop/src-tauri/src/frame_ws.rs +++ b/apps/desktop/src-tauri/src/frame_ws.rs @@ -9,94 +9,6 @@ static LAST_LOG_TIME: AtomicU64 = AtomicU64::new(0); const NV12_FORMAT_MAGIC: u32 = 0x4e563132; -fn convert_to_nv12(data: &[u8], width: u32, height: u32, stride: u32) -> Vec { - let width = (width & !1) as usize; - let height = (height & !1) as usize; - - if width == 0 || height == 0 { - return Vec::new(); - } - - let y_size = width * height; - let uv_size = width * (height / 2); - let stride = stride as usize; - - let mut output = vec![0u8; y_size + uv_size]; - let (y_plane, uv_plane) = output.split_at_mut(y_size); - - for row in 0..height { - let src_offset = row * stride; - let y_offset = row * width; - - if src_offset + width * 4 > data.len() { - continue; - } - - let src_row = &data[src_offset..]; - let y_row = &mut y_plane[y_offset..y_offset + width]; - - for x in 0..width { - let px = x * 4; - let r = src_row[px] as i32; - let g = src_row[px + 1] as i32; - let b = src_row[px + 2] as i32; - y_row[x] = (((66 * r + 129 * g + 25 * b + 128) >> 8) + 16).min(255) as u8; - } - - if row % 2 == 0 { - let uv_offset = (row / 2) * width; - let uv_row = &mut uv_plane[uv_offset..uv_offset + width]; - - let mut x = 0; - while x < width { - let px0 = x * 4; - let px1 = (x + 1) * 4; - - let r0 = src_row[px0] as i32; - let g0 = src_row[px0 + 1] as i32; - let b0 = src_row[px0 + 2] as i32; - let r1 = src_row[px1] as i32; - let g1 = src_row[px1 + 1] as i32; - let b1 = src_row[px1 + 2] as i32; - - let avg_r = (r0 + r1) >> 1; - let avg_g = (g0 + g1) >> 1; - let avg_b = (b0 + b1) >> 1; - - uv_row[x] = (((-38 * avg_r - 74 * avg_g + 112 * avg_b + 128) >> 8) + 128) - .clamp(0, 255) as u8; - uv_row[x + 1] = (((112 * avg_r - 94 * avg_g - 18 * avg_b + 128) >> 8) + 128) - .clamp(0, 255) as u8; - - x += 2; - } - } - } - - output -} - -fn pack_nv12_frame( - data: Vec, - width: u32, - height: u32, - frame_number: u32, - target_time_ns: u64, -) -> Vec { - let y_stride = width; - let 
metadata_size = 28; - let mut output = Vec::with_capacity(data.len() + metadata_size); - output.extend_from_slice(&data); - output.extend_from_slice(&y_stride.to_le_bytes()); - output.extend_from_slice(&height.to_le_bytes()); - output.extend_from_slice(&width.to_le_bytes()); - output.extend_from_slice(&frame_number.to_le_bytes()); - output.extend_from_slice(&target_time_ns.to_le_bytes()); - output.extend_from_slice(&NV12_FORMAT_MAGIC.to_le_bytes()); - - output -} - fn pack_frame_data( mut data: Vec, stride: u32, @@ -172,49 +84,6 @@ pub struct WSFrame { pub created_at: Instant, } -impl WSFrame { - pub fn from_rendered_frame_nv12( - data: Vec, - width: u32, - height: u32, - stride: u32, - frame_number: u32, - target_time_ns: u64, - ) -> Self { - let nv12_data = convert_to_nv12(&data, width, height, stride); - Self { - data: nv12_data, - width: width & !1, - height: height & !1, - stride: width & !1, - frame_number, - target_time_ns, - format: WSFrameFormat::Nv12, - created_at: Instant::now(), - } - } -} - -fn pack_ws_frame(frame: WSFrame) -> Vec { - match frame.format { - WSFrameFormat::Nv12 => pack_nv12_frame( - frame.data, - frame.width, - frame.height, - frame.frame_number, - frame.target_time_ns, - ), - WSFrameFormat::Rgba => pack_frame_data( - frame.data, - frame.stride, - frame.height, - frame.width, - frame.frame_number, - frame.target_time_ns, - ), - } -} - fn pack_ws_frame_ref(frame: &WSFrame) -> Vec { match frame.format { WSFrameFormat::Nv12 => pack_nv12_frame_ref( @@ -355,7 +224,7 @@ pub async fn create_watch_frame_ws( tokio::select! 
{ _ = server => {}, _ = cancel_token.cancelled() => { - println!("WebSocket server shutting down"); + tracing::info!("WebSocket server shutting down"); } } }); @@ -385,7 +254,6 @@ pub async fn create_frame_ws(frame_tx: broadcast::Sender) -> (u16, Canc } async fn handle_socket(mut socket: WebSocket, mut camera_rx: broadcast::Receiver) { - println!("socket connection established"); tracing::info!("Socket connection established"); let now = std::time::Instant::now(); @@ -438,7 +306,6 @@ pub async fn create_frame_ws(frame_tx: broadcast::Sender) -> (u16, Canc } let elapsed = now.elapsed(); - println!("Websocket closing after {elapsed:.2?}"); tracing::info!("Websocket closing after {elapsed:.2?}"); } @@ -457,7 +324,7 @@ pub async fn create_frame_ws(frame_tx: broadcast::Sender) -> (u16, Canc tokio::select! { _ = server => {}, _ = cancel_token.cancelled() => { - println!("WebSocket server shutting down"); + tracing::info!("WebSocket server shutting down"); } } }); diff --git a/apps/desktop/src-tauri/src/lib.rs b/apps/desktop/src-tauri/src/lib.rs index 2cdc32d5d4a..23cdbed7f7f 100644 --- a/apps/desktop/src-tauri/src/lib.rs +++ b/apps/desktop/src-tauri/src/lib.rs @@ -3853,7 +3853,7 @@ async fn resume_uploads(app: AppHandle) -> Result<(), String> { async fn create_editor_instance_impl( app: &AppHandle, path: PathBuf, - frame_cb: Box, + frame_cb: Box, ) -> Result, String> { let app = app.clone(); diff --git a/crates/editor/src/lib.rs b/crates/editor/src/lib.rs index a08a7afe29c..0d37d6e87df 100644 --- a/crates/editor/src/lib.rs +++ b/crates/editor/src/lib.rs @@ -1,9 +1,10 @@ mod audio; -pub mod editor; +mod editor; mod editor_instance; mod playback; mod segments; pub use audio::AudioRenderer; +pub use editor::EditorFrameOutput; pub use editor_instance::{EditorInstance, EditorState, SegmentMedia, create_segments}; pub use segments::get_audio_segments; diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index dbedf9f8fd0..87a1776a3ce 100644 --- 
a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -716,12 +716,11 @@ impl Playback { if last_stats_time.elapsed() >= stats_interval { let effective_fps = total_frames_rendered as f64 / start.elapsed().as_secs_f64().max(0.001); - let recent_rendered = total_frames_rendered; let buffer_len = prefetch_buffer.len(); info!( effective_fps = format!("{:.1}", effective_fps), - rendered = recent_rendered, - skipped = total_frames_skipped, + total_rendered = total_frames_rendered, + total_skipped = total_frames_skipped, cache_hits = cache_hits, prefetch_hits = prefetch_hits, sync_decodes = sync_decodes, diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index 092465e037d..0d693f35fd6 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -542,9 +542,10 @@ pub async fn render_video_to_channel( } let total_time = start_time.elapsed(); - println!( - "Render complete. Processed {frame_number} frames in {:?} seconds", - total_time.as_secs_f32() + tracing::info!( + frames = frame_number, + elapsed_secs = format!("{:.2}", total_time.as_secs_f32()), + "Render complete" ); Ok(()) From 5c6fe47f8c8e0c3c71e0ade2cd605810dc0da92f Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sun, 15 Feb 2026 00:17:00 +0000 Subject: [PATCH 52/54] cleanup --- apps/desktop/src-tauri/src/editor_window.rs | 8 +- apps/desktop/src-tauri/src/frame_ws.rs | 137 +------------------- apps/desktop/src-tauri/src/lib.rs | 2 +- crates/editor/src/lib.rs | 3 +- crates/editor/src/playback.rs | 5 +- crates/rendering/src/lib.rs | 7 +- 6 files changed, 15 insertions(+), 147 deletions(-) diff --git a/apps/desktop/src-tauri/src/editor_window.rs b/apps/desktop/src-tauri/src/editor_window.rs index 1c79a324182..33a2d56f8c2 100644 --- a/apps/desktop/src-tauri/src/editor_window.rs +++ b/apps/desktop/src-tauri/src/editor_window.rs @@ -29,7 +29,7 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { 
path, Box::new(move |output| { let ws_frame = match output { - cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { + cap_editor::EditorFrameOutput::Nv12(frame) => WSFrame { data: frame.data, width: frame.width, height: frame.height, @@ -39,7 +39,7 @@ async fn do_prewarm(app: AppHandle, path: PathBuf) -> PendingResult { format: WSFrameFormat::Nv12, created_at: Instant::now(), }, - cap_editor::editor::EditorFrameOutput::Rgba(frame) => WSFrame { + cap_editor::EditorFrameOutput::Rgba(frame) => WSFrame { data: frame.data, width: frame.width, height: frame.height, @@ -234,7 +234,7 @@ impl EditorInstances { path, Box::new(move |output| { let ws_frame = match output { - cap_editor::editor::EditorFrameOutput::Nv12(frame) => WSFrame { + cap_editor::EditorFrameOutput::Nv12(frame) => WSFrame { data: frame.data, width: frame.width, height: frame.height, @@ -244,7 +244,7 @@ impl EditorInstances { format: WSFrameFormat::Nv12, created_at: Instant::now(), }, - cap_editor::editor::EditorFrameOutput::Rgba(frame) => WSFrame { + cap_editor::EditorFrameOutput::Rgba(frame) => WSFrame { data: frame.data, width: frame.width, height: frame.height, diff --git a/apps/desktop/src-tauri/src/frame_ws.rs b/apps/desktop/src-tauri/src/frame_ws.rs index f3fe9d7ec56..3d959e4a926 100644 --- a/apps/desktop/src-tauri/src/frame_ws.rs +++ b/apps/desktop/src-tauri/src/frame_ws.rs @@ -9,94 +9,6 @@ static LAST_LOG_TIME: AtomicU64 = AtomicU64::new(0); const NV12_FORMAT_MAGIC: u32 = 0x4e563132; -fn convert_to_nv12(data: &[u8], width: u32, height: u32, stride: u32) -> Vec { - let width = (width & !1) as usize; - let height = (height & !1) as usize; - - if width == 0 || height == 0 { - return Vec::new(); - } - - let y_size = width * height; - let uv_size = width * (height / 2); - let stride = stride as usize; - - let mut output = vec![0u8; y_size + uv_size]; - let (y_plane, uv_plane) = output.split_at_mut(y_size); - - for row in 0..height { - let src_offset = row * stride; - let y_offset = row * 
width; - - if src_offset + width * 4 > data.len() { - continue; - } - - let src_row = &data[src_offset..]; - let y_row = &mut y_plane[y_offset..y_offset + width]; - - for x in 0..width { - let px = x * 4; - let r = src_row[px] as i32; - let g = src_row[px + 1] as i32; - let b = src_row[px + 2] as i32; - y_row[x] = (((66 * r + 129 * g + 25 * b + 128) >> 8) + 16).min(255) as u8; - } - - if row % 2 == 0 { - let uv_offset = (row / 2) * width; - let uv_row = &mut uv_plane[uv_offset..uv_offset + width]; - - let mut x = 0; - while x < width { - let px0 = x * 4; - let px1 = (x + 1) * 4; - - let r0 = src_row[px0] as i32; - let g0 = src_row[px0 + 1] as i32; - let b0 = src_row[px0 + 2] as i32; - let r1 = src_row[px1] as i32; - let g1 = src_row[px1 + 1] as i32; - let b1 = src_row[px1 + 2] as i32; - - let avg_r = (r0 + r1) >> 1; - let avg_g = (g0 + g1) >> 1; - let avg_b = (b0 + b1) >> 1; - - uv_row[x] = (((-38 * avg_r - 74 * avg_g + 112 * avg_b + 128) >> 8) + 128) - .clamp(0, 255) as u8; - uv_row[x + 1] = (((112 * avg_r - 94 * avg_g - 18 * avg_b + 128) >> 8) + 128) - .clamp(0, 255) as u8; - - x += 2; - } - } - } - - output -} - -fn pack_nv12_frame( - data: Vec, - width: u32, - height: u32, - frame_number: u32, - target_time_ns: u64, -) -> Vec { - let y_stride = width; - let metadata_size = 28; - let mut output = Vec::with_capacity(data.len() + metadata_size); - output.extend_from_slice(&data); - output.extend_from_slice(&y_stride.to_le_bytes()); - output.extend_from_slice(&height.to_le_bytes()); - output.extend_from_slice(&width.to_le_bytes()); - output.extend_from_slice(&frame_number.to_le_bytes()); - output.extend_from_slice(&target_time_ns.to_le_bytes()); - output.extend_from_slice(&NV12_FORMAT_MAGIC.to_le_bytes()); - - output -} - fn pack_frame_data( mut data: Vec, stride: u32, @@ -172,49 +84,6 @@ pub struct WSFrame { pub created_at: Instant, } -impl WSFrame { - pub fn from_rendered_frame_nv12( - data: Vec, - width: u32, - height: u32, - stride: u32, - frame_number: u32, - 
target_time_ns: u64, - ) -> Self { - let nv12_data = convert_to_nv12(&data, width, height, stride); - Self { - data: nv12_data, - width: width & !1, - height: height & !1, - stride: width & !1, - frame_number, - target_time_ns, - format: WSFrameFormat::Nv12, - created_at: Instant::now(), - } - } -} - -fn pack_ws_frame(frame: WSFrame) -> Vec { - match frame.format { - WSFrameFormat::Nv12 => pack_nv12_frame( - frame.data, - frame.width, - frame.height, - frame.frame_number, - frame.target_time_ns, - ), - WSFrameFormat::Rgba => pack_frame_data( - frame.data, - frame.stride, - frame.height, - frame.width, - frame.frame_number, - frame.target_time_ns, - ), - } -} - fn pack_ws_frame_ref(frame: &WSFrame) -> Vec { match frame.format { WSFrameFormat::Nv12 => pack_nv12_frame_ref( @@ -355,7 +224,7 @@ pub async fn create_watch_frame_ws( tokio::select! { _ = server => {}, _ = cancel_token.cancelled() => { - println!("WebSocket server shutting down"); + tracing::info!("WebSocket server shutting down"); } } }); @@ -385,7 +254,6 @@ pub async fn create_frame_ws(frame_tx: broadcast::Sender) -> (u16, Canc } async fn handle_socket(mut socket: WebSocket, mut camera_rx: broadcast::Receiver) { - println!("socket connection established"); tracing::info!("Socket connection established"); let now = std::time::Instant::now(); @@ -438,7 +306,6 @@ pub async fn create_frame_ws(frame_tx: broadcast::Sender) -> (u16, Canc } let elapsed = now.elapsed(); - println!("Websocket closing after {elapsed:.2?}"); tracing::info!("Websocket closing after {elapsed:.2?}"); } @@ -457,7 +324,7 @@ pub async fn create_frame_ws(frame_tx: broadcast::Sender) -> (u16, Canc tokio::select! 
{ _ = server => {}, _ = cancel_token.cancelled() => { - println!("WebSocket server shutting down"); + tracing::info!("WebSocket server shutting down"); } } }); diff --git a/apps/desktop/src-tauri/src/lib.rs b/apps/desktop/src-tauri/src/lib.rs index 2cdc32d5d4a..23cdbed7f7f 100644 --- a/apps/desktop/src-tauri/src/lib.rs +++ b/apps/desktop/src-tauri/src/lib.rs @@ -3853,7 +3853,7 @@ async fn resume_uploads(app: AppHandle) -> Result<(), String> { async fn create_editor_instance_impl( app: &AppHandle, path: PathBuf, - frame_cb: Box, + frame_cb: Box, ) -> Result, String> { let app = app.clone(); diff --git a/crates/editor/src/lib.rs b/crates/editor/src/lib.rs index a08a7afe29c..0d37d6e87df 100644 --- a/crates/editor/src/lib.rs +++ b/crates/editor/src/lib.rs @@ -1,9 +1,10 @@ mod audio; -pub mod editor; +mod editor; mod editor_instance; mod playback; mod segments; pub use audio::AudioRenderer; +pub use editor::EditorFrameOutput; pub use editor_instance::{EditorInstance, EditorState, SegmentMedia, create_segments}; pub use segments::get_audio_segments; diff --git a/crates/editor/src/playback.rs b/crates/editor/src/playback.rs index dbedf9f8fd0..87a1776a3ce 100644 --- a/crates/editor/src/playback.rs +++ b/crates/editor/src/playback.rs @@ -716,12 +716,11 @@ impl Playback { if last_stats_time.elapsed() >= stats_interval { let effective_fps = total_frames_rendered as f64 / start.elapsed().as_secs_f64().max(0.001); - let recent_rendered = total_frames_rendered; let buffer_len = prefetch_buffer.len(); info!( effective_fps = format!("{:.1}", effective_fps), - rendered = recent_rendered, - skipped = total_frames_skipped, + total_rendered = total_frames_rendered, + total_skipped = total_frames_skipped, cache_hits = cache_hits, prefetch_hits = prefetch_hits, sync_decodes = sync_decodes, diff --git a/crates/rendering/src/lib.rs b/crates/rendering/src/lib.rs index 092465e037d..0d693f35fd6 100644 --- a/crates/rendering/src/lib.rs +++ b/crates/rendering/src/lib.rs @@ -542,9 +542,10 @@ 
pub async fn render_video_to_channel( } let total_time = start_time.elapsed(); - println!( - "Render complete. Processed {frame_number} frames in {:?} seconds", - total_time.as_secs_f32() + tracing::info!( + frames = frame_number, + elapsed_secs = format!("{:.2}", total_time.as_secs_f32()), + "Render complete" ); Ok(()) From 6d1d3c1e1b40bb74c6e9239f4676dacc73e821aa Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sun, 15 Feb 2026 00:27:00 +0000 Subject: [PATCH 53/54] clippy --- crates/recording/src/output_pipeline/core.rs | 2 +- crates/recording/src/output_pipeline/macos.rs | 2 +- crates/recording/src/sources/camera.rs | 2 +- crates/recording/src/sources/microphone.rs | 10 ++++------ 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/crates/recording/src/output_pipeline/core.rs b/crates/recording/src/output_pipeline/core.rs index 737cb368a3c..b184cc0beae 100644 --- a/crates/recording/src/output_pipeline/core.rs +++ b/crates/recording/src/output_pipeline/core.rs @@ -419,7 +419,7 @@ impl TimestampAnomalyTracker { jump_secs: f64, now: Instant, ) -> Result { - let wall_clock_confirmed = self.last_valid_wall_clock.map_or(false, |last_wc| { + let wall_clock_confirmed = self.last_valid_wall_clock.is_some_and(|last_wc| { let wall_clock_gap_secs = now.duration_since(last_wc).as_secs_f64(); wall_clock_gap_secs >= jump_secs * 0.5 }); diff --git a/crates/recording/src/output_pipeline/macos.rs b/crates/recording/src/output_pipeline/macos.rs index 5a190257acf..dc03b7a51bd 100644 --- a/crates/recording/src/output_pipeline/macos.rs +++ b/crates/recording/src/output_pipeline/macos.rs @@ -34,7 +34,7 @@ fn get_available_disk_space_mb(path: &std::path::Path) -> Option { if result != 0 { return None; } - Some((stat.f_bavail as u64).saturating_mul(stat.f_frsize as u64) / (1024 * 1024)) + Some((stat.f_bavail as u64).saturating_mul(stat.f_frsize) / (1024 * 1024)) } fn get_mp4_muxer_buffer_size(instant_mode: bool) -> usize { 
diff --git a/crates/recording/src/sources/camera.rs b/crates/recording/src/sources/camera.rs index db0fa42b9b5..aede5e52d5a 100644 --- a/crates/recording/src/sources/camera.rs +++ b/crates/recording/src/sources/camera.rs @@ -133,7 +133,7 @@ impl VideoSource for Camera { if frame_width != original_width || frame_height != original_height { let needs_new_scaler = scaler .as_ref() - .map_or(true, |s| !s.matches_source(frame_width, frame_height)); + .is_none_or(|s| !s.matches_source(frame_width, frame_height)); if needs_new_scaler { let frame_format = frame.inner.format(); diff --git a/crates/recording/src/sources/microphone.rs b/crates/recording/src/sources/microphone.rs index 3e8b6247f3a..4bdd47522aa 100644 --- a/crates/recording/src/sources/microphone.rs +++ b/crates/recording/src/sources/microphone.rs @@ -299,12 +299,10 @@ impl AudioSource for Microphone { silence_counter.fetch_add(1, Ordering::Relaxed); - match tokio::time::timeout(send_timeout, audio_tx.send(audio_frame)) - .await - { - Ok(Ok(())) => {} - _ => {} - } + if let Ok(Ok(())) = + tokio::time::timeout(send_timeout, audio_tx.send(audio_frame)) + .await + {} } } } From 672eac256538c383062a07c832651e639ddd0ffe Mon Sep 17 00:00:00 2001 From: Richie McIlroy <33632126+richiemcilroy@users.noreply.github.com> Date: Sun, 15 Feb 2026 00:27:00 +0000 Subject: [PATCH 54/54] clippy --- crates/recording/src/output_pipeline/core.rs | 2 +- crates/recording/src/output_pipeline/macos.rs | 2 +- crates/recording/src/sources/camera.rs | 2 +- crates/recording/src/sources/microphone.rs | 10 ++++------ 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/crates/recording/src/output_pipeline/core.rs b/crates/recording/src/output_pipeline/core.rs index 737cb368a3c..b184cc0beae 100644 --- a/crates/recording/src/output_pipeline/core.rs +++ b/crates/recording/src/output_pipeline/core.rs @@ -419,7 +419,7 @@ impl TimestampAnomalyTracker { jump_secs: f64, now: Instant, ) -> Result { - let wall_clock_confirmed = 
self.last_valid_wall_clock.map_or(false, |last_wc| { + let wall_clock_confirmed = self.last_valid_wall_clock.is_some_and(|last_wc| { let wall_clock_gap_secs = now.duration_since(last_wc).as_secs_f64(); wall_clock_gap_secs >= jump_secs * 0.5 }); diff --git a/crates/recording/src/output_pipeline/macos.rs b/crates/recording/src/output_pipeline/macos.rs index 5a190257acf..dc03b7a51bd 100644 --- a/crates/recording/src/output_pipeline/macos.rs +++ b/crates/recording/src/output_pipeline/macos.rs @@ -34,7 +34,7 @@ fn get_available_disk_space_mb(path: &std::path::Path) -> Option { if result != 0 { return None; } - Some((stat.f_bavail as u64).saturating_mul(stat.f_frsize as u64) / (1024 * 1024)) + Some((stat.f_bavail as u64).saturating_mul(stat.f_frsize) / (1024 * 1024)) } fn get_mp4_muxer_buffer_size(instant_mode: bool) -> usize { diff --git a/crates/recording/src/sources/camera.rs b/crates/recording/src/sources/camera.rs index db0fa42b9b5..aede5e52d5a 100644 --- a/crates/recording/src/sources/camera.rs +++ b/crates/recording/src/sources/camera.rs @@ -133,7 +133,7 @@ impl VideoSource for Camera { if frame_width != original_width || frame_height != original_height { let needs_new_scaler = scaler .as_ref() - .map_or(true, |s| !s.matches_source(frame_width, frame_height)); + .is_none_or(|s| !s.matches_source(frame_width, frame_height)); if needs_new_scaler { let frame_format = frame.inner.format(); diff --git a/crates/recording/src/sources/microphone.rs b/crates/recording/src/sources/microphone.rs index 3e8b6247f3a..4bdd47522aa 100644 --- a/crates/recording/src/sources/microphone.rs +++ b/crates/recording/src/sources/microphone.rs @@ -299,12 +299,10 @@ impl AudioSource for Microphone { silence_counter.fetch_add(1, Ordering::Relaxed); - match tokio::time::timeout(send_timeout, audio_tx.send(audio_frame)) - .await - { - Ok(Ok(())) => {} - _ => {} - } + if let Ok(Ok(())) = + tokio::time::timeout(send_timeout, audio_tx.send(audio_frame)) + .await + {} } } }