From 2846aca6303f3ec6d1de243cb4f840b391054ba1 Mon Sep 17 00:00:00 2001 From: Alexander Rulkens Date: Wed, 20 May 2026 01:30:05 +0200 Subject: [PATCH] perf(rendering): VSOut shrink, invisibility cull, volume step cut MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At large tier (M-series, fully zoomed out): 24ms point-sprites + 8ms volume → 17.5ms point-sprites + 3.5ms volume. ~11ms total saved. Points pipeline - Drop unused dMpc and camDistMpc VSOut fields. - Mark tint, intensity, axisRatio as @interpolate(flat). They're per-instance constants — smooth interpolation was wasted rasterizer work. - Fold depthFade × schechterRatio × angularDensityWeight into intensity at the vertex stage. Removes three varyings and three per-pixel multiplies; mathematically identical because all four factors are per-instance constants. - Cull invisible billboards (folded intensity < 0.005) via degenerate clip-space output, same pattern as the existing Malmquist gate. The selected galaxy bypasses the cull so the selection halo never vanishes on a faint pick. Pick fragment shares the vertex stage, so culled galaxies become non-pickable — acceptable since they're visually undetectable. Volume raymarch - STEP_COUNT 192 → 128. Texture-cache locality at higher counts gave sub-linear scaling, so the per-step cost curve flattens above this knee; 128 is the sweet spot before banding becomes perceptible in sparse fields. GPU adapter - Request powerPreference: 'high-performance' so multi-GPU systems pick the discrete card. No-op on single-GPU machines. DebugPanel - Add scalar-volume to DISPLAY_SLOT_ORDER (was instrumented but missing from the panel's display list). - Use the 60-frame rolling average for the header total instead of the single-frame sum, so the readout is internally consistent with the per-row values and stops jumping frame-to-frame. 
Co-Authored-By: Claude Opus 4.7 --- .../DebugPanel/GpuTimingsSection.tsx | 22 ++--- src/services/gpu/device.ts | 6 +- .../gpu/shaders/points/colorFragment.wesl | 32 +------ src/services/gpu/shaders/points/io.wesl | 43 +++------ src/services/gpu/shaders/points/vertex.wesl | 91 ++++++++----------- .../gpu/shaders/scalarVolume/fragment.wesl | 2 +- 6 files changed, 70 insertions(+), 126 deletions(-) diff --git a/src/components/DebugPanel/GpuTimingsSection.tsx b/src/components/DebugPanel/GpuTimingsSection.tsx index d06011e3..2a6854c7 100644 --- a/src/components/DebugPanel/GpuTimingsSection.tsx +++ b/src/components/DebugPanel/GpuTimingsSection.tsx @@ -50,14 +50,12 @@ import type { TimingSlotName } from '../../@types/gpu/timing/TimingSlotName'; import { HDR_PASSES } from '../../services/engine/frame/passes'; import { Sparkline } from './Sparkline'; -// Row display order: HDR_PASSES (one timing slot per pass), then the -// three out-of-HDR passes in render order — `tone-map` (HDR→swap- -// chain blit), `ui-overlay` (marker-lines + labels combined; see -// `services/engine/frame/uiOverlay.ts` for why they share one slot), -// and `pick` (its own encoder, submitted by the pick renderer). -// Reordering passes in `passes/index.ts` automatically reorders the -// timing UI for the HDR portion. +// Row order matches encoder draw order. `scalar-volume` runs in +// `encodeVolumes` before the HDR loop, so it's listed explicitly; the +// HDR_PASSES spread covers the loop interior. Reorders in +// `passes/index.ts` propagate here automatically. const DISPLAY_SLOT_ORDER: readonly TimingSlotName[] = [ + 'scalar-volume', ...HDR_PASSES.map((p) => p.name as TimingSlotName), 'tone-map', 'ui-overlay', @@ -135,20 +133,20 @@ export function GpuTimingsSection({ service }: GpuTimingsSectionProps): ReactEle // ── Branch 3: live data ─────────────────────────────────────────── const stats = statsRef.current; - // Sum of CURRENT-FRAME timings for the header. 
Only count slots that - // actually ran this frame (staleFrames === 0); idle slots' last - // value is no longer the live frame's cost. + // Header sums per-slot AVG_WINDOW averages, matching the visible + // row values. Stale slots excluded so the total reflects current + // GPU work, not a gated-off subsystem's last cost. let frameTotalMs = 0; for (const [, row] of stats) { if (row.staleFrames === 0 && row.recent.length > 0) { - frameTotalMs += row.recent[row.recent.length - 1]!; + frameTotalMs += row.recent.reduce((a, b) => a + b, 0) / row.recent.length; } } return (
- GPU Timings (last frame: {frameTotalMs.toFixed(1)} ms) + GPU Timings (avg {AVG_WINDOW}f: {frameTotalMs.toFixed(1)} ms)
{/* diff --git a/src/services/gpu/device.ts b/src/services/gpu/device.ts index 599a8e05..05df3ccd 100644 --- a/src/services/gpu/device.ts +++ b/src/services/gpu/device.ts @@ -55,11 +55,15 @@ export async function initGpu(canvas: HTMLCanvasElement): Promise { if (!navigator.gpu) throw new Error('WebGPU not supported in this browser.'); // Step 1 — Request an adapter. + // `powerPreference: 'high-performance'` asks the browser to pick the + // discrete GPU on multi-GPU systems (dual-GPU MacBook Pros, desktops + // with both integrated and dedicated GPUs). It's a no-op on single-GPU + // machines (Apple Silicon, most laptops without a discrete card). // `requestAdapter()` returns null when the browser has no usable GPU // (e.g. headless test environments, or a machine whose GPU is blocked by // a corporate driver policy). We treat that as a hard stop. // See: https://www.w3.org/TR/webgpu/#dom-gpu-requestadapter - const adapter = await navigator.gpu.requestAdapter(); + const adapter = await navigator.gpu.requestAdapter({ powerPreference: 'high-performance' }); if (!adapter) throw new Error('No WebGPU adapter available.'); // Step 2 — Request a device, opting into `timestamp-query` when the diff --git a/src/services/gpu/shaders/points/colorFragment.wesl b/src/services/gpu/shaders/points/colorFragment.wesl index 59268844..70887dc8 100644 --- a/src/services/gpu/shaders/points/colorFragment.wesl +++ b/src/services/gpu/shaders/points/colorFragment.wesl @@ -184,37 +184,11 @@ fn fs(in: VSOut) -> @location(0) vec4 { if (r2 > 1.0) { discard; } // Gaussian-like falloff: bright at centre (r²=0 → e⁰=1), fading to - // e⁻⁴ ≈ 0.018 at the edge (r²=1). + // e⁻⁴ ≈ 0.018 at the edge (r²=1). The per-instance modulators + // (Schechter, angular reweight, depth fade) are folded into + // 'in.intensity' by the vertex stage — see vertex.wesl. 
var alpha = exp(-r2 * 4.0); - // ── Schechter density correction (mode 3) ──────────────────────────── - // - // Modulate alpha by the per-galaxy ratio 'clamp(N_ref / n(d), 0, 10)' - // baked at upload time into 'in.schechterRatio'. Originally a 200-step - // trapezoidal integral evaluated PER FRAGMENT; now a single multiply. - let schechterAlpha_ = select(1.0, in.schechterRatio, u.biasMode == 3u); - alpha = alpha * schechterAlpha_; - - // ── HEALPix angular re-weight (mode 4) ──────────────────────────── - // - // Modulate alpha by the per-galaxy ratio - // 'clamp(medianCellCount / localCellCount, 0.1, 10)' baked at toggle - // time into 'in.angularDensityWeight'. Down-weights galaxies in - // over-dense angular cells and up-weights galaxies in sparse cells, - // flattening radial pencil-beam-jet artefacts. - let angWeight = select(1.0, in.angularDensityWeight, u.biasMode == 4u); - alpha = alpha * angWeight; - - // ── Camera-distance depth fade ─────────────────────────────────────────── - // - // Every line through the catalog origin under additive billboards - // accumulates hundreds of overlapping galaxies in a single screen - // pixel. The depth-fade multiplier is pre-computed in the vertex - // stage and flat-interpolated as 'in.depthFade' (the vertex stage - // already handles the 'u.depthFadeEnabled' gate, so this is - // unconditionally a multiply). - alpha = alpha * in.depthFade; - // ── Procedural-disk crossfade-OUT ──────────────────────────────────────── // // The thumbnail subsystem's procedural-disk pass fades IN across diff --git a/src/services/gpu/shaders/points/io.wesl b/src/services/gpu/shaders/points/io.wesl index 856b8dd9..f6268a0c 100644 --- a/src/services/gpu/shaders/points/io.wesl +++ b/src/services/gpu/shaders/points/io.wesl @@ -276,14 +276,16 @@ struct VSOut { @location(0) uv: vec2, // Pre-computed colour for this point (from the colourIndex ramp). 
- // Interpolated across the quad by the rasteriser — but since all 6 - // vertices of one instance share the same tint, there is no visible - // interpolation. - @location(1) tint: vec3<f32>, + // Flat-interpolated — all 6 vertices of an instance share the value, + // so smooth interpolation would do work for an identical result. + @location(1) @interpolate(flat) tint: vec3<f32>, - // Combined brightness: magnitude-based intensity × global brightness - // knob × per-instance bias-mode alpha (1/V_max). - @location(2) intensity: f32, + // Per-instance brightness with every per-instance modulator folded in: + // magnitude-based intensity × brightness slider × vMax (mode 2) × + // Schechter (mode 3) × angular reweight (mode 4) × depth-fade + // (camera-distance falloff). Fragment multiplies in only per-pixel + // terms (Gaussian falloff, procedural-disk fade, source fade). + @location(2) @interpolate(flat) intensity: f32, // Packed (source, localIdx) identity used by 'fsPick' to write the // pick texture. Flat-interpolated because integers can't be linearly @@ -296,8 +298,8 @@ struct VSOut { // Forwarded 'abs(axisRatio)' so the fragment stage's elliptical mask // uses the unsigned magnitude. Sign bit was the fallback flag (now - // extracted into 'isFallback'). - @location(5) axisRatio: f32, + // extracted into 'isFallback'). Per-instance constant. + @location(5) @interpolate(flat) axisRatio: f32, // Pre-computed cos/sin of the position-angle rotation. Computed once // per primitive in 'vs' instead of per fragment, saving millions of @@ -310,29 +312,6 @@ struct VSOut { // measurements. @location(7) @interpolate(flat) isFallback: u32, - // Origin-relative distance in Mpc, forwarded for future distance- - // dependent fragment effects. Currently unused in the fragment stage - // but kept as plumbed. - @location(8) @interpolate(flat) dMpc: f32, - - // Per-galaxy Schechter density-correction ratio. Read in 'fs' only - // when 'u.biasMode == 3u'. 
- @location(9) @interpolate(flat) schechterRatio: f32, - - // Per-galaxy HEALPix angular re-weight. Read in 'fs' only when - // 'u.biasMode == 4u'. - @location(10) @interpolate(flat) angularDensityWeight: f32, - - // Distance from the camera to this galaxy in Mpc. Forwarded for - // potential per-fragment depth-driven effects. - @location(11) @interpolate(flat) camDistMpc: f32, - - // Pre-computed depth-fade multiplier '1 / (1 + (camDist/FALLOFF_HALF)²)', - // gated by 'u.depthFadeEnabled' (passes through 1.0 when off). - // Per-instance constant — flat-interpolated for one mul per primitive - // instead of per fragment. - @location(12) @interpolate(flat) depthFade: f32, - // Per-instance billboard radius in screen-space pixels. Used by the // fragment stage to fade points-pass alpha across the procedural- // disk crossfade band. All 6 vertices share the same value. diff --git a/src/services/gpu/shaders/points/vertex.wesl b/src/services/gpu/shaders/points/vertex.wesl index 3ee7a53c..8d7bd3f7 100644 --- a/src/services/gpu/shaders/points/vertex.wesl +++ b/src/services/gpu/shaders/points/vertex.wesl @@ -121,11 +121,6 @@ fn vs( earlyOut.paCs = 1.0; earlyOut.paSn = 0.0; earlyOut.isFallback = 0u; - earlyOut.dMpc = dMpc; - earlyOut.schechterRatio = 0.0; - earlyOut.angularDensityWeight = 1.0; - earlyOut.camDistMpc = 0.0; - earlyOut.depthFade = 1.0; earlyOut.sizePx = 0.0; return earlyOut; } @@ -207,29 +202,49 @@ fn vs( // Look up the colour for this point's *rest-frame* colour index. 
out.tint = ramp(restColorIndex); - // ── MAGNITUDE → INTENSITY ──────────────────────────────────────────────── + // ── MAGNITUDE → INTENSITY, with every per-instance modulator folded in ── // - // intensity = (22 - magnitude) / 8 + // intensity = clamp((22 - magnitude) / 8, 0.05, 1.0) // mag 14 → 1.0, + // mag 22 → 0.05 + // × u.brightness // global slider + // × vMaxWeight (mode 2: 1/V_max) + // × schechterRatio (mode 3: Schechter LF) + // × angularReweight (mode 4: HEALPix) + // × depthFade (camera-distance falloff) // - // magnitude 14 → 1.0 (brightest) - // magnitude 22 → 0.0 (faint limit) - // - // We clamp to [0.05, 1.0] rather than [0, 1] so that even the faintest - // objects remain *barely* visible — a hard zero would create gaps in - // the distribution. - // - // ── 1/V_max alpha modulation ───────────────────────────────────────────── - // - // When 'u.biasMode == 2u' (BiasMode.VMax), multiply the intensity by - // 'p.vMaxWeight'. This dims intrinsically-bright galaxies whose - // detectability volume V_max exceeds the reference volume V_ref — they - // visible across a much larger slice of space than their faint - // companions, so without the down-weighting they'd over-represent - // themselves visually. The 'select(1.0, p.vMaxWeight, …)' keeps the - // OTHER three modes (None, VolumeLimited, Schechter, AngularReweight) - // unchanged. + // Folding the four mode/distance multipliers in here means the fragment + // multiplies only per-pixel terms (Gaussian + crossfade + source fade). + // Mathematically identical to applying them per-pixel because all five + // factors are per-instance constants. 
let vMaxAlpha = select(1.0, p.vMaxWeight, u.biasMode == 2u); - out.intensity = clamp((22.0 - p.magnitude) / 8.0, 0.05, 1.0) * u.brightness * vMaxAlpha; + let schechterMult = select(1.0, p.schechterRatio, u.biasMode == 3u); + let angularMult = select(1.0, p.angularDensityWeight, u.biasMode == 4u); + + // Depth-fade: 1 / (1 + (camDist/FALLOFF_HALF)²), gated by depthFadeEnabled. + let FALLOFF_HALF_MPC = 1000.0; + let camDistRel = distanceMpc / FALLOFF_HALF_MPC; + let depthFadeRaw = 1.0 / (1.0 + camDistRel * camDistRel); + let depthFadeMult = select(1.0, depthFadeRaw, u.depthFadeEnabled == 1u); + + out.intensity = clamp((22.0 - p.magnitude) / 8.0, 0.05, 1.0) + * u.brightness + * vMaxAlpha + * schechterMult + * angularMult + * depthFadeMult; + + // Invisibility cull: galaxies whose folded intensity falls below this + // threshold contribute imperceptibly to the additive HDR target, so + // we emit a clip position outside the NDC volume (x, y in [-1, 1], z in [0, 1]) + // and let the rasteriser drop the primitive before any fragment work. + // Selected galaxies bypass the cull so the selection halo never + // vanishes on a faint pick. Pick fragment shares this vertex stage, + // so culled galaxies also become non-pickable — acceptable since + // they were never visible. + let INVISIBILITY_THRESHOLD = 0.005; + if (out.intensity < INVISIBILITY_THRESHOLD && !isSelected) { + out.clip = vec4(2.0, 2.0, 2.0, 1.0); + } // Forward the per-instance packed identity to 'fsPick'. // The visual 'fs' ignores this field. @@ -256,32 +271,6 @@ fn vs( out.paCs = cos(paRad); out.paSn = sin(paRad); - // Forward origin-relative distance for future distance-dependent - // fragment effects (currently unused in 'fs'). - out.dMpc = dMpc; - - // Forward the per-galaxy Schechter density ratio. The intensity above - // already folded it into 'out.intensity' for mode 3 — forwarding it - // through VSOut keeps the attribute available to the fragment in case - // future tweaks (e.g. 
tint modulation) want to read it. With - // @interpolate(flat) the GPU writes the value once per primitive. - out.schechterRatio = p.schechterRatio; - - // Forward the per-galaxy HEALPix angular re-weight (default 1.0). - out.angularDensityWeight = p.angularDensityWeight; - - // Forward camera-relative distance for fragment-stage depth effects. - out.camDistMpc = distanceMpc; - - // Pre-compute the depth-fade multiplier here so the fragment doesn't - // re-derive it per pixel. Curve: '1 / (1 + (camDist / FALLOFF_HALF)²)'. - // The 1000 Mpc half-distance + the 'u.depthFadeEnabled' gate are - // resolved here so the fragment stage gets a single multiplier. - let FALLOFF_HALF_MPC = 1000.0; - let camDistRel = distanceMpc / FALLOFF_HALF_MPC; - let depthFadeRaw = 1.0 / (1.0 + camDistRel * camDistRel); - out.depthFade = select(1.0, depthFadeRaw, u.depthFadeEnabled == 1u); - // Forward the per-instance billboard radius in screen-pixels so the // fragment stage can fade points-pass alpha across the procedural- // disk crossfade band. diff --git a/src/services/gpu/shaders/scalarVolume/fragment.wesl b/src/services/gpu/shaders/scalarVolume/fragment.wesl index 56b108b8..08fb61f5 100644 --- a/src/services/gpu/shaders/scalarVolume/fragment.wesl +++ b/src/services/gpu/shaders/scalarVolume/fragment.wesl @@ -105,7 +105,7 @@ struct VolumeUniforms { @group(1) @binding(0) var<uniform> fade: FadeUniforms; -const STEP_COUNT: i32 = 192; +const STEP_COUNT: i32 = 128; const SATURATION_THRESHOLD: f32 = 0.99; struct FsIn {