diff --git a/.github/workflows/build-pull-request.yaml b/.github/workflows/build-pull-request.yaml
index 2a8b63fe7..b04e065b2 100644
--- a/.github/workflows/build-pull-request.yaml
+++ b/.github/workflows/build-pull-request.yaml
@@ -79,3 +79,4 @@ jobs:
           docker run --rm --gpus 'device=1' ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }} tests/test_GPURaytrace.sh
           docker run --rm --gpus 'device=1' ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }} tests/test_GPUPhotonFileSource.sh
           docker run --rm --gpus 'device=1' ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }} tests/test_GPUPhotonSource_8x8SiPM.sh
+          docker run --rm --gpus 'device=1' ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }} tests/test_wavelength_shifting.sh
diff --git a/README.md b/README.md
index b1933482c..9f02b0cb7 100644
--- a/README.md
+++ b/README.md
@@ -155,6 +155,7 @@ EIC-Opticks provides several examples demonstrating GPU-accelerated optical phot
 | `GPUPhotonSource` | Optical photons (torch) | Any GDML | G4 + GPU side-by-side validation |
 | `GPUPhotonSourceMinimal` | Optical photons (torch) | Any GDML | GPU-only test |
 | `GPUPhotonFileSource` | Optical photons (text file) | Any GDML | GPU-only, user-defined photons from file |
+| WLS test | Wavelength shifting | WLS sphere + detector shell | Validate GPU WLS physics |
 
 ### Example 1: GPUCerenkov (Cerenkov Only)
 
@@ -297,6 +298,55 @@ GPUPhotonFileSource -g tests/geom/opticks_raindrop.gdml -p my_photons.txt -m run
 
 **Source files:** `src/GPUPhotonFileSource.cpp`, `src/GPUPhotonFileSource.h`
 
+### Example 6: Wavelength Shifting (WLS) Test
+
+This test validates the GPU wavelength shifting implementation using a dedicated
+geometry with a WLS sphere surrounded by a detector shell:
+
+```
+Geometry: wls_test.gdml
+├── Air world (r=200 mm)
+│   ├── WLS sphere (r=20 mm) ← Absorbs UV, re-emits visible
+│   └── Glass detector shell (r=28-30 mm) ← 100% detection efficiency
+```
+
+The WLS material absorbs UV photons (350 nm) and re-emits them isotropically at
+longer wavelengths (peak ~481 nm) with a 0.5 ns exponential time delay. The test
+fires 1000 monochromatic 350 nm photons from the origin into the WLS sphere.
+
+```bash
+GPUPhotonSourceMinimal -g tests/geom/wls_test.gdml -c wls_test -m tests/run.mac -s 42
+```
+
+**Expected results:**
+- ~990/1000 photons detected (the remaining ~10 are absorbed after failing the energy-conservation check)
+- All hits wavelength-shifted from 350 nm to mean ~487 nm
+- Energy conservation: no hits with wavelength < 350 nm
+- Isotropic re-emission: mean momentum direction near zero
+- Time delay: mean ~0.6 ns (propagation + 0.5 ns exponential WLS decay)
+
+**GDML WLS properties required** (same syntax for G4 10.x and 11.x):
+```xml
+<define>
+    <matrix coldim="2" name="WLSABSLENGTH" values="1.77e-06 10000.0 ... 4.13e-06 0.01"/>
+    <matrix coldim="2" name="WLSCOMPONENT" values="1.77e-06 0.00 ... 3.10e-06 0.00"/>
+    <matrix coldim="1" name="WLSTIMECONSTANT" values="0.5"/>
+</define>
+<materials>
+    <material name="WLSMaterial">
+        <property name="WLSABSLENGTH" ref="WLSABSLENGTH"/>
+        <property name="WLSCOMPONENT" ref="WLSCOMPONENT"/>
+        <property name="WLSTIMECONSTANT" ref="WLSTIMECONSTANT"/>
+    </material>
+</materials>
+```
+
+Unlike scintillation properties, WLS property names are the same in both Geant4
+10.x and 11.x — no dual-naming is needed.
+
+- **Test files:** `tests/geom/wls_test.gdml`, `config/wls_test.json`
+- **Implementation docs:** `docs/WLS_IMPLEMENTATION.md`
+
 ### Torch configuration
 
 `GPUPhotonSource` and `GPUPhotonSourceMinimal` read photon source parameters from a
@@ -317,6 +367,7 @@ JSON config file (default `config/dev.json`). Key fields:
 |---------|-------------|-------------|-----------------|----------------------|---------------------|
 | Cerenkov genstep collection | ✓ | ✓ | ✗ | ✗ | ✗ |
 | Scintillation genstep collection | ✗ | ✓ | ✗ | ✗ | ✗ |
+| Wavelength shifting (WLS) | ✓ | ✓ | ✓ | ✓ | ✓ |
 | Torch photon generation | ✗ | ✗ | ✓ | ✓ | ✗ |
 | Photon input from text file | ✗ | ✗ | ✗ | ✗ | ✓ |
 | G4 optical photon tracking | ✓ | ✓ | ✓ | ✗ | ✗ |
diff --git a/config/wls_100k.json b/config/wls_100k.json
new file mode 100644
index 000000000..26166e47b
--- /dev/null
+++ b/config/wls_100k.json
@@ -0,0 +1,30 @@
+{
+  "torch": {
+    "gentype": "TORCH",
+    "trackid": 0,
+    "matline": 0,
+    "numphoton": 100000,
+
+    "pos": [0.0, 0.0, 0.0],
+    "time": 0.0,
+
+    "mom": [0.0, 0.0, 1.0],
+    "weight": 0.0,
+
+    "pol": [1.0, 0.0, 0.0],
+    "wavelength": 350.0,
+
+    "zenith":  [0.0, 1.0],
+    "azimuth": [0.0, 1.0],
+
+    "radius": 0.0,
+    "distance": 0.0,
+    "mode": 255,
+    "type": "disc"
+  },
+
+  "event": {
+    "mode": "DebugLite",
+    "maxslot": 10000000
+  }
+}
diff --git a/config/wls_scatter_viz.json b/config/wls_scatter_viz.json
new file mode 100644
index 000000000..76bd9ece2
--- /dev/null
+++ b/config/wls_scatter_viz.json
@@ -0,0 +1,30 @@
+{
+  "torch": {
+    "gentype": "TORCH",
+    "trackid": 0,
+    "matline": 0,
+    "numphoton": 10000,
+
+    "pos": [0.0, 0.0, -25.0],
+    "time": 0.0,
+
+    "mom": [0.0, 0.0, 1.0],
+    "weight": 0.0,
+
+    "pol": [1.0, 0.0, 0.0],
+    "wavelength": 350.0,
+
+    "zenith":  [0.0, 0.3],
+    "azimuth": [0.0, 1.0],
+
+    "radius": 0.0,
+    "distance": 0.0,
+    "mode": 255,
+    "type": "disc"
+  },
+
+  "event": {
+    "mode": "HitPhoton",
+    "maxslot": 100000
+  }
+}
diff --git a/config/wls_slab.json b/config/wls_slab.json
new file mode 100644
index 000000000..bfd412305
--- /dev/null
+++ b/config/wls_slab.json
@@ -0,0 +1,30 @@
+{
+  "torch": {
+    "gentype": "TORCH",
+    "trackid": 0,
+    "matline": 0,
+    "numphoton": 100000,
+
+    "pos": [0.0, 0.0, -50.0],
+    "time": 0.0,
+
+    "mom": [0.0, 0.0, 1.0],
+    "weight": 0.0,
+
+    "pol": [1.0, 0.0, 0.0],
+    "wavelength": 400.0,
+
+    "zenith":  [0.0, 0.0],
+    "azimuth": [0.0, 1.0],
+
+    "radius": 0.0,
+    "distance": 0.0,
+    "mode": 255,
+    "type": "disc"
+  },
+
+  "event": {
+    "mode": "DebugLite",
+    "maxslot": 10000000
+  }
+}
diff --git a/config/wls_test.json b/config/wls_test.json
new file mode 100644
index 000000000..b8572b5b9
--- /dev/null
+++ b/config/wls_test.json
@@ -0,0 +1,30 @@
+{
+  "torch": {
+    "gentype": "TORCH",
+    "trackid": 0,
+    "matline": 0,
+    "numphoton": 1000,
+
+    "pos": [0.0, 0.0, 0.0],
+    "time": 0.0,
+
+    "mom": [0.0, 0.0, 1.0],
+    "weight": 0.0,
+
+    "pol": [1.0, 0.0, 0.0],
+    "wavelength": 350.0,
+
+    "zenith":  [0.0, 1.0],
+    "azimuth": [0.0, 1.0],
+
+    "radius": 0.0,
+    "distance": 0.0,
+    "mode": 255,
+    "type": "disc"
+  },
+
+  "event": {
+    "mode": "DebugLite",
+    "maxslot": 1000000
+  }
+}
diff --git a/examples/benchmark_apex.sh b/examples/benchmark_apex.sh
new file mode 100755
index 000000000..2a4de6f83
--- /dev/null
+++ b/examples/benchmark_apex.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+# benchmark_apex.sh — Measure GPU vs G4 speedup on apex.gdml
+#
+# Usage:
+#   ./examples/benchmark_apex.sh
+#
+# Runs GPURaytrace once on apex.gdml, parses timing and hit counts from its
+# log, prints a speedup summary, and, if gpu_hits.npy and g4_hits.npy
+# are present in the current directory, generates comparison plots.
+
+GDML="apex.gdml"
+MACRO="tests/run.mac"
+EPS="0.00001"
+EPS0="0.0006"
+OUTDIR="plots"
+CONFIG="det_debug"
+
+if [ ! -f "$GDML" ]; then
+    echo "ERROR: $GDML not found. Run from the eic-opticks root directory."
+    exit 1
+fi
+
+echo "=== apex.gdml Benchmark ==="
+echo "eps=$EPS, eps0=$EPS0"
+echo "Running..."
+
+LOGFILE=$(mktemp /tmp/bench_XXXXXX.txt)
+# Remove the temporary log on every exit path, including error exits.
+trap 'rm -f "$LOGFILE"' EXIT
+
+# A non-zero exit status is tolerated here; a failed run is detected below
+# when the expected fields cannot be parsed from the log.
+OPTICKS_MAX_BOUNCE=1000 \
+OPTICKS_PROPAGATE_EPSILON=$EPS \
+OPTICKS_PROPAGATE_EPSILON0=$EPS0 \
+GPURaytrace -g "$GDML" -m "$MACRO" -c "$CONFIG" &> "$LOGFILE" || true
+
+# Take the last match of each field so that repeated log lines cannot yield
+# multi-line values that break the numeric conversions below.
+GPU_TIME=$(grep "Simulation time:" "$LOGFILE" | tail -1 | awk '{print $3}')
+G4_LINE=$(grep "^  User=" "$LOGFILE" | tail -1)
+G4_CPU=$(echo "$G4_LINE" | grep -oP 'User=\K[0-9.]+')
+G4_WALL=$(echo "$G4_LINE" | grep -oP 'Real=\K[0-9.]+')
+NPHOTONS=$(grep "NumCollected:" "$LOGFILE" | tail -1 | awk '{print $NF}')
+GPU_HITS=$(grep "Opticks: NumHits:" "$LOGFILE" | tail -1 | awk '{print $NF}')
+G4_HITS=$(grep "Geant4: NumHits:" "$LOGFILE" | tail -1 | awk '{print $NF}')
+
+# Every field is consumed by the Python summary, so all of them must be present.
+for field in GPU_TIME G4_CPU G4_WALL NPHOTONS GPU_HITS G4_HITS; do
+    if [ -z "${!field}" ]; then
+        echo "ERROR: Could not parse $field from output"
+        tail -30 "$LOGFILE"
+        exit 1
+    fi
+done
+
+python3 -c "
+gpu = float('$GPU_TIME')
+g4_cpu = float('$G4_CPU')
+g4_wall = float('$G4_WALL')
+nphotons = int('$NPHOTONS')
+gpu_hits = int('$GPU_HITS')
+g4_hits = int('$G4_HITS')
+hit_diff = (gpu_hits - g4_hits) / g4_hits * 100 if g4_hits > 0 else 0
+
+print()
+print(f'Photons:        {nphotons:>10,}')
+print(f'GPU sim time:   {gpu:>10.4f} s')
+print(f'G4 CPU time:    {g4_cpu:>10.2f} s')
+print(f'G4 wall time:   {g4_wall:>10.2f} s')
+print()
+print(f'Speedup (CPU):  {g4_cpu/gpu:>10.0f}x')
+print(f'Speedup (wall): {g4_wall/gpu:>10.0f}x')
+print()
+print(f'GPU rate:       {nphotons/gpu/1e6:>10.1f} M photons/s')
+print(f'G4 rate:        {nphotons/g4_cpu/1e3:>10.1f} k photons/s')
+print()
+print(f'GPU hits:       {gpu_hits:>10}')
+print(f'G4 hits:        {g4_hits:>10}')
+print(f'Hit diff:       {hit_diff:>+9.1f}%')
+"
+
+# Generate comparison plots if hit files exist
+if [ -f "gpu_hits.npy" ] && [ -f "g4_hits.npy" ]; then
+    echo ""
+    echo "=== Generating comparison plots ==="
+    python3 optiphy/ana/run_and_compare.py --gpu-hits gpu_hits.npy --g4-hits g4_hits.npy --outdir "$OUTDIR" 2>&1 | tail -15
+    echo "Plots saved to $OUTDIR/"
+fi
diff --git a/optiphy/ana/compare_aligned.py b/optiphy/ana/compare_aligned.py
new file mode 100644
index 000000000..b044ce6f2
--- /dev/null
+++ b/optiphy/ana/compare_aligned.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+"""
+compare_aligned.py - Photon-by-photon comparison of GPU vs G4 aligned simulations.
+
+Usage:
+    python compare_aligned.py <gpu_photon.npy> <g4_photon.npy>
+
+Performs:
+  1. Per-photon flag comparison (exact match rate)
+  2. Position comparison at multiple thresholds
+  3. Chi-squared test on flag distributions (gold-standard validation metric)
+  4. Glancing-angle photon identification (normal sign ambiguity)
+  5. Divergent photon listing
+"""
+import sys
+import numpy as np
+
+FLAG_NAMES = {
+    0x0004: "TORCH", 0x0008: "BULK_ABSORB", 0x0010: "BULK_REEMIT",
+    0x0020: "BULK_SCATTER", 0x0040: "SURFACE_DETECT", 0x0080: "SURFACE_ABSORB",
+    0x0100: "SURFACE_DREFLECT", 0x0200: "SURFACE_SREFLECT",
+    0x0400: "BOUNDARY_REFLECT", 0x0800: "BOUNDARY_TRANSMIT", 0x8000: "MISS",
+}
+
+def flag_name(f):
+    return FLAG_NAMES.get(f, f"0x{f:04x}")
+
+def extract_flag(photon):
+    """Extract flag from q3.x (orient_boundary_flag) - lower 16 bits."""
+    q3 = photon.view(np.uint32).reshape(-1, 4, 4)
+    return q3[:, 3, 0] & 0xFFFF
+
+def chi2_flag_distribution(gpu_flags, g4_flags):
+    """
+    Chi-squared comparison of flag distributions.
+
+    Compares the frequency of each flag value between GPU and G4.
+    This is the opticks gold-standard validation metric.
+
+    Returns (chi2, ndof, flags_used, gpu_counts, g4_counts).
+    """
+    all_flags = sorted(set(gpu_flags) | set(g4_flags))
+    gpu_counts = np.array([(gpu_flags == f).sum() for f in all_flags], dtype=float)
+    g4_counts  = np.array([(g4_flags == f).sum() for f in all_flags], dtype=float)
+
+    total = gpu_counts + g4_counts
+    mask = total > 0
+    gpu_c = gpu_counts[mask]
+    g4_c  = g4_counts[mask]
+    tot   = total[mask]
+    flags_used = [f for f, m in zip(all_flags, mask) if m]
+
+    n_gpu = gpu_c.sum()
+    n_g4  = g4_c.sum()
+    expected_gpu = tot * n_gpu / (n_gpu + n_g4)
+    expected_g4  = tot * n_g4  / (n_gpu + n_g4)
+
+    chi2 = 0.0
+    for i in range(len(flags_used)):
+        if expected_gpu[i] > 0:
+            chi2 += (gpu_c[i] - expected_gpu[i])**2 / expected_gpu[i]
+        if expected_g4[i] > 0:
+            chi2 += (g4_c[i] - expected_g4[i])**2 / expected_g4[i]
+
+    ndof = max(len(flags_used) - 1, 1)
+    return chi2, ndof, flags_used, gpu_c, g4_c
+
+def identify_glancing(gpu, g4):
+    """
+    Identify glancing-angle photons where the normal sign ambiguity
+    causes momentum negation between GPU and G4.
+
+    At glancing incidence cos(theta) ~ 0, float32 vs float64 can produce
+    opposite normal signs, reflecting the photon in the opposite direction.
+    These photons have matching flags but very different positions.
+
+    Returns (glancing, mom_dot): boolean mask of glancing photons and the per-photon momentum dot product.
+    """
+    gpu_mom = gpu[:, 1, :3]
+    g4_mom  = g4[:, 1, :3]
+
+    # Normalize momenta (should already be unit vectors, but be safe)
+    gpu_norm = np.linalg.norm(gpu_mom, axis=1, keepdims=True)
+    g4_norm  = np.linalg.norm(g4_mom, axis=1, keepdims=True)
+    gpu_norm[gpu_norm == 0] = 1
+    g4_norm[g4_norm == 0] = 1
+
+    gpu_hat = gpu_mom / gpu_norm
+    g4_hat  = g4_mom / g4_norm
+
+    # Dot product of momentum directions: -1 = fully negated (normal flip)
+    mom_dot = np.sum(gpu_hat * g4_hat, axis=1)
+
+    # Glancing: momentum vectors are nearly anti-parallel (dot ~ -1)
+    glancing = mom_dot < -0.5
+    return glancing, mom_dot
+
+def main():
+    if len(sys.argv) < 3:
+        print(f"Usage: {sys.argv[0]} <gpu_photon.npy> <g4_photon.npy>")
+        sys.exit(1)
+
+    gpu = np.load(sys.argv[1])
+    g4  = np.load(sys.argv[2])
+
+    print(f"GPU shape: {gpu.shape}")
+    print(f"G4  shape: {g4.shape}")
+
+    n = min(len(gpu), len(g4))
+    gpu = gpu[:n]
+    g4  = g4[:n]
+
+    gpu_flags = extract_flag(gpu)
+    g4_flags  = extract_flag(g4)
+
+    # ---- 1. Per-photon flag comparison ----
+    match = gpu_flags == g4_flags
+    n_match = match.sum()
+    n_diff = n - n_match
+    print(f"\n{'='*60}")
+    print(f"FLAG COMPARISON ({n} photons)")
+    print(f"{'='*60}")
+    print(f"  Matching: {n_match} ({100*n_match/n:.1f}%)")
+    print(f"  Differ:   {n_diff} ({100*n_diff/n:.1f}%)")
+
+    # ---- 2. Position comparison ----
+    gpu_pos = gpu[:, 0, :3]
+    g4_pos  = g4[:, 0, :3]
+    pos_diff = np.linalg.norm(gpu_pos - g4_pos, axis=1)
+    zero_g4 = np.all(g4_pos == 0, axis=1)
+
+    valid = ~zero_g4
+    n_valid = valid.sum()
+    print(f"\n{'='*60}")
+    print(f"POSITION COMPARISON ({n_valid} valid, {zero_g4.sum()} unrecorded)")
+    print(f"{'='*60}")
+    if n_valid > 0:
+        vdiff = pos_diff[valid]
+        print(f"  Mean dist:   {vdiff.mean():.4f} mm")
+        print(f"  Max dist:    {vdiff.max():.4f} mm")
+        print(f"  < 0.01 mm:   {(vdiff < 0.01).sum()} ({100*(vdiff < 0.01).sum()/n_valid:.1f}%)")
+        print(f"  < 0.1 mm:    {(vdiff < 0.1).sum()} ({100*(vdiff < 0.1).sum()/n_valid:.1f}%)")
+        print(f"  < 1.0 mm:    {(vdiff < 1.0).sum()} ({100*(vdiff < 1.0).sum()/n_valid:.1f}%)")
+
+    # ---- 3. Chi-squared test on flag distributions ----
+    print(f"\n{'='*60}")
+    print(f"CHI-SQUARED TEST (flag distribution)")
+    print(f"{'='*60}")
+
+    chi2_val, ndof, flags_used, gpu_c, g4_c = chi2_flag_distribution(gpu_flags, g4_flags)
+
+    print(f"  {'Flag':<20s} {'GPU':>8s} {'G4':>8s} {'Diff':>8s}")
+    print(f"  {'-'*20} {'-'*8} {'-'*8} {'-'*8}")
+    for i, f in enumerate(flags_used):
+        diff = int(gpu_c[i] - g4_c[i])
+        sign = "+" if diff > 0 else ""
+        print(f"  {flag_name(f):<20s} {int(gpu_c[i]):>8d} {int(g4_c[i]):>8d} {sign}{diff:>7d}")
+
+    deviant_frac = 100 * n_diff / n if n > 0 else 0
+    print(f"\n  chi2/ndof = {chi2_val:.2f}/{ndof} = {chi2_val/ndof:.2f}")
+    print(f"  deviant fraction: {deviant_frac:.2f}% ({n_diff}/{n})")
+
+    # ---- 4. Glancing-angle analysis ----
+    print(f"\n{'='*60}")
+    print(f"GLANCING-ANGLE ANALYSIS (normal sign ambiguity)")
+    print(f"{'='*60}")
+
+    glancing, mom_dot = identify_glancing(gpu, g4)
+    n_glancing = glancing.sum()
+
+    # Among matching-flag photons, how many are glancing with large pos diff?
+    match_glancing = match & glancing
+    match_large_pos = match & (pos_diff > 1.0)
+    match_glancing_large = match & glancing & (pos_diff > 1.0)
+
+    print(f"  Glancing photons (mom dot < -0.5):  {n_glancing}")
+    print(f"  Matching flag + pos diff > 1mm:      {match_large_pos.sum()}")
+    print(f"  Of those, glancing:                  {match_glancing_large.sum()}")
+    if match_large_pos.sum() > 0:
+        frac = 100 * match_glancing_large.sum() / match_large_pos.sum()
+        print(f"  Fraction explained by glancing:      {frac:.0f}%")
+
+    # Position stats excluding glancing photons
+    non_glancing_match = match & ~glancing & valid
+    if non_glancing_match.sum() > 0:
+        ng_diff = pos_diff[non_glancing_match]
+        print(f"\n  Position (matching, non-glancing, {non_glancing_match.sum()} photons):")
+        print(f"    Max dist:  {ng_diff.max():.6f} mm")
+        print(f"    Mean dist: {ng_diff.mean():.6f} mm")
+        print(f"    < 0.01 mm: {(ng_diff < 0.01).sum()} ({100*(ng_diff < 0.01).sum()/non_glancing_match.sum():.1f}%)")
+
+    # ---- 5. Divergent photon listing ----
+    if n_diff > 0:
+        div_idx = np.where(~match)[0]
+        print(f"\n{'='*60}")
+        print(f"DIVERGENT PHOTONS (first 10 of {n_diff})")
+        print(f"{'='*60}")
+        for i in div_idx[:10]:
+            gf = flag_name(gpu_flags[i])
+            cf = flag_name(g4_flags[i])
+            gp = gpu_pos[i]
+            cp = g4_pos[i]
+            print(f"  [{i:5d}] GPU: {gf:20s} pos=({gp[0]:8.2f},{gp[1]:8.2f},{gp[2]:8.2f})")
+            print(f"          G4:  {cf:20s} pos=({cp[0]:8.2f},{cp[1]:8.2f},{cp[2]:8.2f})")
+
+if __name__ == "__main__":
+    main()
diff --git a/optiphy/ana/compare_gpu_g4.py b/optiphy/ana/compare_gpu_g4.py
new file mode 100755
index 000000000..87fe210e6
--- /dev/null
+++ b/optiphy/ana/compare_gpu_g4.py
@@ -0,0 +1,286 @@
+#!/usr/bin/env python
+"""
+compare_gpu_g4.py : Compare GPU (opticks) vs G4 (standalone) simulation hits
+=============================================================================
+
+Reads GPU hit/photon arrays from an opticks event folder and G4 hits from
+g4_hits.npy, then prints a side-by-side comparison table.
+
+Usage::
+
+    python optiphy/ana/compare_gpu_g4.py <gpu_event_folder> <g4_hits.npy>
+
+    # Auto-resolves A000 subfolder:
+    python optiphy/ana/compare_gpu_g4.py /tmp/$USER/opticks/GEOM/GEOM/GPUPhotonSourceMinimal/ALL0_no_opticks_event_name g4_hits.npy
+"""
+import sys
+import os
+import argparse
+import numpy as np
+
+FLAG_ENUM = {
+    0x0004: "TORCH", 0x0008: "BULK_ABSORB", 0x0010: "BULK_REEMIT",
+    0x0020: "BULK_SCATTER", 0x0040: "SURFACE_DETECT", 0x0080: "SURFACE_ABSORB",
+    0x0100: "SURFACE_DREFLECT", 0x0200: "SURFACE_SREFLECT",
+    0x0400: "BOUNDARY_REFLECT", 0x0800: "BOUNDARY_TRANSMIT",
+    0x1000: "NAN_ABORT", 0x2000: "EFFICIENCY_COLLECT", 0x8000: "MISS",
+}
+
+
+def resolve_event_path(path):
+    if os.path.exists(os.path.join(path, "photon.npy")):
+        return path
+    a000 = os.path.join(path, "A000")
+    if os.path.exists(os.path.join(a000, "photon.npy")):
+        return a000
+    if os.path.isdir(path):
+        for d in sorted(os.listdir(path)):
+            dp = os.path.join(path, d)
+            if os.path.isdir(dp) and os.path.exists(os.path.join(dp, "photon.npy")):
+                return dp
+    return path
+
+
+def hit_stats(hits, label):
+    """Compute statistics dict from a (N, 4, 4) hit array."""
+    n = len(hits)
+    if n == 0:
+        return dict(label=label, n=0)
+    wl = hits[:, 2, 3]
+    t = hits[:, 0, 3]
+    pos = hits[:, 0, :3]
+    r = np.sqrt(np.sum(pos ** 2, axis=1))
+    return dict(
+        label=label, n=n,
+        wl_min=wl.min(), wl_max=wl.max(), wl_mean=wl.mean(), wl_std=wl.std(),
+        t_min=t.min(), t_max=t.max(), t_mean=t.mean(), t_std=t.std(),
+        r_min=r.min(), r_max=r.max(), r_mean=r.mean(),
+        x_mean=pos[:, 0].mean(), y_mean=pos[:, 1].mean(), z_mean=pos[:, 2].mean(),
+    )
+
+
+def print_comparison_table(gpu, g4, n_photons):
+    """Print side-by-side comparison."""
+    w = 14  # column width
+
+    print("=" * 70)
+    print("GPU vs G4 COMPARISON")
+    print("=" * 70)
+
+    print(f"\n  {'':30s} {'GPU':>{w}s} {'G4':>{w}s} {'Diff':>{w}s}")
+    print(f"  {'-'*30} {'-'*w} {'-'*w} {'-'*w}")
+
+    def row(name, gv, cv, fmt=".1f", diff_fmt=None):
+        if diff_fmt is None:
+            diff_fmt = fmt
+        gs = f"{gv:{fmt}}" if gv is not None else "—"
+        cs = f"{cv:{fmt}}" if cv is not None else "—"
+        if gv is not None and cv is not None:
+            d = cv - gv
+            ds = f"{d:{diff_fmt}}"
+        else:
+            ds = "—"
+        print(f"  {name:30s} {gs:>{w}s} {cs:>{w}s} {ds:>{w}s}")
+
+    row("Hits", gpu["n"], g4["n"], "d")
+    if n_photons and n_photons > 0:
+        row("Hit rate (%)", 100.0 * gpu["n"] / n_photons, 100.0 * g4["n"] / n_photons, ".2f")
+
+    if gpu["n"] > 0 and g4["n"] > 0:
+        ratio = g4["n"] / gpu["n"]
+        print(f"  {'Ratio G4/GPU':30s} {'':>{w}s} {'':>{w}s} {ratio:>{w}.3f}")
+
+    if gpu["n"] == 0 or g4["n"] == 0:
+        print("\n  Cannot compare distributions — one side has zero hits.")
+        return
+
+    print()
+    row("Wavelength min (nm)", gpu["wl_min"], g4["wl_min"])
+    row("Wavelength max (nm)", gpu["wl_max"], g4["wl_max"])
+    row("Wavelength mean (nm)", gpu["wl_mean"], g4["wl_mean"])
+    row("Wavelength std (nm)", gpu["wl_std"], g4["wl_std"])
+
+    print()
+    row("Time min (ns)", gpu["t_min"], g4["t_min"], ".3f")
+    row("Time max (ns)", gpu["t_max"], g4["t_max"], ".3f")
+    row("Time mean (ns)", gpu["t_mean"], g4["t_mean"], ".3f")
+    row("Time std (ns)", gpu["t_std"], g4["t_std"], ".3f")
+
+    print()
+    row("Radius min (mm)", gpu["r_min"], g4["r_min"], ".2f")
+    row("Radius max (mm)", gpu["r_max"], g4["r_max"], ".2f")
+    row("Radius mean (mm)", gpu["r_mean"], g4["r_mean"], ".2f")
+
+    print()
+    row("Mean X (mm)", gpu["x_mean"], g4["x_mean"], ".2f")
+    row("Mean Y (mm)", gpu["y_mean"], g4["y_mean"], ".2f")
+    row("Mean Z (mm)", gpu["z_mean"], g4["z_mean"], ".2f")
+
+    # Statistical significance
+    print()
+    if n_photons and n_photons > 0:
+        p_pool = (gpu["n"] + g4["n"]) / (2 * n_photons)
+        std = np.sqrt(p_pool * (1 - p_pool) / n_photons)
+        if std > 0:
+            z = abs(gpu["n"] / n_photons - g4["n"] / n_photons) / (std * np.sqrt(2))
+            expected_fluct = std * np.sqrt(2) * n_photons
+            print(f"  {'Z-score (hit count)':30s} {z:>{w}.1f}")
+            print(f"  {'Expected 1σ fluctuation':30s} {expected_fluct:>{w}.0f} hits")
+            if z > 3:
+                print(f"  ** Statistically significant difference (>{3}σ) **")
+            else:
+                print(f"  Within statistical expectations (<3σ)")
+    print()
+
+
+def print_gpu_outcomes(photon):
+    """Print GPU photon outcome summary."""
+    q3 = photon[:, 3, :].view(np.uint32)
+    flag = q3[:, 0] & 0xFFFF
+
+    print("=" * 70)
+    print("GPU PHOTON OUTCOMES")
+    print("=" * 70)
+
+    n = len(flag)
+    vals, counts = np.unique(flag, return_counts=True)
+    order = np.argsort(-counts)
+
+    print(f"\n  {'Flag':<22s} {'Count':>8s} {'%':>7s}")
+    print(f"  {'-'*22} {'-'*8} {'-'*7}")
+    for idx in order:
+        f = vals[idx]
+        c = counts[idx]
+        name = FLAG_ENUM.get(f, f"0x{f:04x}")
+        print(f"  {name:<22s} {c:8d} {100*c/n:6.1f}%")
+    print()
+
+
+def print_wavelength_histograms(gpu_hits, g4_hits):
+    """Print overlaid wavelength histograms."""
+    if len(gpu_hits) == 0 or len(g4_hits) == 0:
+        return
+
+    gpu_wl = gpu_hits[:, 2, 3]
+    g4_wl = g4_hits[:, 2, 3]
+
+    wl_min = min(gpu_wl.min(), g4_wl.min())
+    wl_max = max(gpu_wl.max(), g4_wl.max())
+    bins = np.arange(max(100, np.floor(wl_min / 25) * 25),
+                     min(800, np.ceil(wl_max / 25) * 25 + 25), 25)
+
+    gpu_counts, _ = np.histogram(gpu_wl, bins=bins)
+    g4_counts, _ = np.histogram(g4_wl, bins=bins)
+
+    # Normalize to same total for shape comparison
+    gpu_norm = gpu_counts / len(gpu_hits) * 1000
+    g4_norm = g4_counts / len(g4_hits) * 1000
+
+    print("=" * 70)
+    print("WAVELENGTH DISTRIBUTION (per 1000 hits)")
+    print("=" * 70)
+    print(f"\n  {'Bin (nm)':<14s} {'GPU':>8s} {'G4':>8s} {'GPU':^20s} {'G4':^20s}")
+    print(f"  {'-'*14} {'-'*8} {'-'*8} {'-'*20} {'-'*20}")
+
+    max_bar = 20
+    scale = max(gpu_norm.max(), g4_norm.max())
+    if scale == 0:
+        scale = 1
+
+    for i in range(len(bins) - 1):
+        if gpu_counts[i] == 0 and g4_counts[i] == 0:
+            continue
+        gpu_bar = "#" * int(gpu_norm[i] / scale * max_bar)
+        g4_bar = "#" * int(g4_norm[i] / scale * max_bar)
+        print(f"  {bins[i]:5.0f}-{bins[i+1]:5.0f}   {gpu_norm[i]:8.1f} {g4_norm[i]:8.1f}"
+              f" {gpu_bar:<20s} {g4_bar:<20s}")
+    print()
+
+
+def print_time_histograms(gpu_hits, g4_hits):
+    """Print overlaid time histograms."""
+    if len(gpu_hits) == 0 or len(g4_hits) == 0:
+        return
+
+    gpu_t = gpu_hits[:, 0, 3]
+    g4_t = g4_hits[:, 0, 3]
+
+    t_max = max(gpu_t.max(), g4_t.max())
+    bin_size = max(1.0, np.ceil(t_max / 15))
+    bins = np.arange(0, t_max + bin_size, bin_size)
+
+    gpu_counts, _ = np.histogram(gpu_t, bins=bins)
+    g4_counts, _ = np.histogram(g4_t, bins=bins)
+
+    gpu_norm = gpu_counts / len(gpu_hits) * 1000
+    g4_norm = g4_counts / len(g4_hits) * 1000
+
+    print("=" * 70)
+    print("TIME DISTRIBUTION (per 1000 hits)")
+    print("=" * 70)
+    print(f"\n  {'Bin (ns)':<14s} {'GPU':>8s} {'G4':>8s} {'GPU':^20s} {'G4':^20s}")
+    print(f"  {'-'*14} {'-'*8} {'-'*8} {'-'*20} {'-'*20}")
+
+    max_bar = 20
+    scale = max(gpu_norm.max(), g4_norm.max())
+    if scale == 0:
+        scale = 1
+
+    for i in range(len(bins) - 1):
+        if gpu_counts[i] == 0 and g4_counts[i] == 0:
+            continue
+        gpu_bar = "#" * int(gpu_norm[i] / scale * max_bar)
+        g4_bar = "#" * int(g4_norm[i] / scale * max_bar)
+        print(f"  {bins[i]:5.1f}-{bins[i+1]:5.1f}   {gpu_norm[i]:8.1f} {g4_norm[i]:8.1f}"
+              f" {gpu_bar:<20s} {g4_bar:<20s}")
+    print()
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Compare GPU (opticks) vs G4 (standalone) simulation hits",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+    parser.add_argument("gpu_path", help="Path to GPU opticks event folder")
+    parser.add_argument("g4_hits", help="Path to G4 hits file (g4_hits.npy)")
+    parser.add_argument("--histograms", action="store_true",
+                        help="Show wavelength and time distribution histograms")
+
+    args = parser.parse_args()
+
+    gpu_path = resolve_event_path(args.gpu_path)
+    if not os.path.exists(os.path.join(gpu_path, "photon.npy")):
+        print(f"Error: photon.npy not found in {gpu_path}")
+        sys.exit(1)
+    if not os.path.exists(args.g4_hits):
+        print(f"Error: {args.g4_hits} not found")
+        sys.exit(1)
+
+    # Load GPU arrays
+    gpu_hits = np.load(os.path.join(gpu_path, "hit.npy")) if os.path.exists(os.path.join(gpu_path, "hit.npy")) else np.zeros((0, 4, 4), dtype=np.float32)
+    gpu_photon = np.load(os.path.join(gpu_path, "photon.npy"))
+    n_photons = len(gpu_photon)
+
+    # Load G4 hits
+    g4_hits = np.load(args.g4_hits)
+
+    print(f"\nGPU event: {gpu_path}")
+    print(f"G4 hits:   {args.g4_hits}")
+    print(f"Total photons: {n_photons}\n")
+
+    # Compute stats
+    gpu_stats = hit_stats(gpu_hits, "GPU")
+    g4_stats = hit_stats(g4_hits, "G4")
+
+    # Print tables
+    print_comparison_table(gpu_stats, g4_stats, n_photons)
+    print_gpu_outcomes(gpu_photon)
+
+    if args.histograms:
+        print_wavelength_histograms(gpu_hits, g4_hits)
+        print_time_histograms(gpu_hits, g4_hits)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/optiphy/ana/plot_photon_paths.py b/optiphy/ana/plot_photon_paths.py
new file mode 100644
index 000000000..bf6a09245
--- /dev/null
+++ b/optiphy/ana/plot_photon_paths.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+"""Plot GPU photon paths colored by wavelength from record.npy.
+
+Usage:
+    python optiphy/ana/plot_photon_paths.py <event_dir> [photon_indices] [--output path.png]
+
+Examples:
+    # Plot first 10 hit photons
+    python optiphy/ana/plot_photon_paths.py /tmp/$USER/opticks/GEOM/GEOM/GPUPhotonSourceMinimal/ALL0_no_opticks_event_name/A000
+
+    # Plot specific photons by index
+    python optiphy/ana/plot_photon_paths.py /tmp/$USER/opticks/.../A000 2,19,6
+
+    # Custom output path
+    python optiphy/ana/plot_photon_paths.py /tmp/$USER/opticks/.../A000 2,19,6 --output my_plot.png
+"""
+import argparse
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+import matplotlib.colors as mcolors
+from matplotlib.cm import ScalarMappable
+
+
+def wl_to_rgb(wl):
+    """Convert wavelength (nm) to an RGB tuple clamped to [0, 1]; outside 300-780 nm returns gray (0.3, 0.3, 0.3)."""
+    r = g = b = 0.0
+    if 300 <= wl < 380:
+        t = (wl - 300) / (380 - 300)
+        r = 0.4 * (1 - t) + 0.5 * t
+        g = 0
+        b = 0.4 * (1 - t) + 1.0 * t
+    elif 380 <= wl < 440:
+        r = -(wl - 440) / (440 - 380); g = 0; b = 1
+    elif 440 <= wl < 490:
+        r = 0; g = (wl - 440) / (490 - 440); b = 1
+    elif 490 <= wl < 510:
+        r = 0; g = 1; b = -(wl - 510) / (510 - 490)
+    elif 510 <= wl < 580:
+        r = (wl - 510) / (580 - 510); g = 1; b = 0
+    elif 580 <= wl < 645:
+        r = 1; g = -(wl - 645) / (645 - 580); b = 0
+    elif 645 <= wl <= 780:
+        r = 1; g = 0; b = 0
+    else:
+        r = g = b = 0.3
+    return (max(0, min(1, r)), max(0, min(1, g)), max(0, min(1, b)))
+
+
+def get_steps(record, pidx):
+    """Return the number of steps of photon pidx that contain any non-zero value."""
+    rec_p = record[pidx]
+    rf = rec_p.reshape(rec_p.shape[0], -1)
+    return int(np.sum(np.any(rf != 0, axis=1)))
+
+
+def plot_photon_paths(event_dir, photon_indices=None, output="photon_paths.png",
+                      sphere_radii=None, title=None, lim=None):
+    record = np.load(f"{event_dir}/record.npy")
+    photon = np.load(f"{event_dir}/photon.npy")
+
+    q3 = photon[:, 3, :].copy().view(np.uint32)
+    flags = q3[:, 0] & 0xFFFF
+    hit_idx = np.where(flags == 0x40)[0]
+
+    if photon_indices is None:
+        photon_indices = hit_idx[:10]
+
+    fig = plt.figure(figsize=(12, 10))
+    ax = fig.add_subplot(111, projection='3d')
+
+    wl_min, wl_max = 800, 300
+    for pidx in photon_indices:
+        ns = get_steps(record, pidx)
+        if ns < 2:
+            continue
+        rec_p = record[pidx]
+        x = rec_p[:ns, 0, 0]
+        y = rec_p[:ns, 0, 1]
+        z = rec_p[:ns, 0, 2]
+        wl = rec_p[:ns, 2, 3]
+
+        wl_min = min(wl_min, wl.min())
+        wl_max = max(wl_max, wl.max())
+
+        for s in range(ns - 1):
+            color = wl_to_rgb(float(wl[s]))
+            ax.plot([x[s], x[s + 1]], [y[s], y[s + 1]], [z[s], z[s + 1]],
+                    color=color, alpha=0.9, linewidth=2.5)
+
+        ax.scatter(x[0], y[0], z[0], c=[wl_to_rgb(float(wl[0]))], s=60,
+                   marker='o', edgecolors='black', linewidths=0.8, zorder=5)
+        ax.scatter(x[-1], y[-1], z[-1], c='red', s=100, marker='*', zorder=5)
+
+    # Draw spheres if requested
+    if sphere_radii:
+        u = np.linspace(0, 2 * np.pi, 60)
+        v = np.linspace(0, np.pi, 30)
+        sphere_colors = ['mediumpurple', 'lightgreen', 'lightyellow', 'lightcoral']
+        sphere_alphas = [0.1, 0.05, 0.05, 0.05]
+        for i, r in enumerate(sphere_radii):
+            xs = r * np.outer(np.cos(u), np.sin(v))
+            ys = r * np.outer(np.sin(u), np.sin(v))
+            zs = r * np.outer(np.ones_like(u), np.cos(v))
+            ci = min(i, len(sphere_colors) - 1)
+            ax.plot_surface(xs, ys, zs, alpha=sphere_alphas[ci], color=sphere_colors[ci])
+
+    # Wavelength colorbar
+    wl_range = np.linspace(wl_min, wl_max, 256)
+    colors = [wl_to_rgb(w) for w in wl_range]
+    cmap = mcolors.ListedColormap(colors)
+    norm = mcolors.Normalize(vmin=wl_min, vmax=wl_max)
+    sm = ScalarMappable(cmap=cmap, norm=norm)
+    sm.set_array([])
+    plt.colorbar(sm, ax=ax, shrink=0.5, pad=0.08, label='Wavelength (nm)')
+
+    ax.set_xlabel('X (mm)')
+    ax.set_ylabel('Y (mm)')
+    ax.set_zlabel('Z (mm)')
+    if title:
+        ax.set_title(title)
+    if lim:
+        ax.set_xlim(-lim, lim)
+        ax.set_ylim(-lim, lim)
+        ax.set_zlim(-lim, lim)
+    ax.view_init(elev=20, azim=135)
+    plt.tight_layout()
+    plt.savefig(output, dpi=180)
+    print(f"Saved {output}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description=__doc__,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("event_dir", help="Path to opticks event folder containing record.npy")
+    parser.add_argument("indices", nargs='?', default=None,
+                        help="Comma-separated photon indices (default: first 10 hits)")
+    parser.add_argument("--output", "-o", default="photon_paths.png", help="Output image path")
+    parser.add_argument("--spheres", default=None,
+                        help="Comma-separated sphere radii to draw (e.g. 10,30)")
+    parser.add_argument("--title", "-t", default=None, help="Plot title")
+    parser.add_argument("--lim", type=float, default=None,
+                        help="Axis limit in mm (symmetric)")
+    args = parser.parse_args()
+
+    indices = None
+    if args.indices:
+        indices = [int(x) for x in args.indices.split(',')]
+
+    spheres = None
+    if args.spheres:
+        spheres = [float(x) for x in args.spheres.split(',')]
+
+    plot_photon_paths(args.event_dir, indices, args.output,
+                      sphere_radii=spheres, title=args.title, lim=args.lim)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/optiphy/ana/run_and_compare.py b/optiphy/ana/run_and_compare.py
new file mode 100755
index 000000000..d44980234
--- /dev/null
+++ b/optiphy/ana/run_and_compare.py
@@ -0,0 +1,314 @@
+#!/usr/bin/env python3
+"""Run GPU and G4 simulations and compare hit distributions.
+
+Runs GPURaytrace with a given GDML and config, then plots:
+  1. Hit count with sqrt(N) error bars
+  2. WLS-shifted wavelength distribution
+  3. Full wavelength distribution
+  4. Arrival time (bulk, truncated)
+  5. Arrival time (full range, no overflow)
+  6. 3D hit position scatter for GPU and G4 (a 7th plot, step_count.png, is added when step data is found)
+
+Usage:
+    python optiphy/ana/run_and_compare.py -g apex.gdml -s 42 [--outdir plots]
+
+    # Skip simulation, use existing .npy files:
+    python optiphy/ana/run_and_compare.py --gpu-hits gpu_hits.npy --g4-hits g4_hits.npy
+"""
+import argparse
+import os
+import subprocess
+import sys
+import math
+
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+
+
+def run_simulation(gdml, config, macro, seed):
+    """Run GPURaytrace and return (gpu_hits_path, g4_hits_path, gpu_nhits, g4_nhits); warns on stderr if it exits non-zero."""
+    print("NOTE: For step count plots, set \"mode\": \"DebugLite\" in the config JSON file.")
+    print("      For hit-only analysis, \"HitPhoton\" is sufficient.")
+    print()
+    cmd = ["/opt/eic-opticks/bin/GPURaytrace",
+           "-g", gdml, "-m", macro, "-s", str(seed)]
+    cmd += ["-c", config] if config else []  # -c is only passed when a config name was given
+    print(f"Running: {' '.join(cmd)}")
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
+    output = result.stdout + result.stderr
+    if result.returncode != 0:
+        # Surface failures: otherwise stale gpu_hits.npy/g4_hits.npy in the cwd get plotted silently.
+        print(f"WARNING: GPURaytrace exited with code {result.returncode}; last output lines:", file=sys.stderr)
+        print('\n'.join(output.strip().split('\n')[-10:]), file=sys.stderr)
+
+    gpu_nhits = g4_nhits = 0  # stay 0 when the corresponding log line is absent
+    for line in output.split('\n'):  # hit counts are scraped from the tool's log output
+        if 'Opticks: NumHits:' in line:
+            gpu_nhits = int(line.strip().split()[-1])
+        if 'Geant4: NumHits:' in line:
+            g4_nhits = int(line.strip().split()[-1])
+    print(f"  GPU: {gpu_nhits} hits, G4: {g4_nhits} hits")
+    return "gpu_hits.npy", "g4_hits.npy", gpu_nhits, g4_nhits
+
+
+def load_hits(path, expected_cols=None):
+    """Load hits; a 2-D (N, 4*k) array becomes (N, k, 4), other ranks pass through. expected_cols is currently unused."""
+    a = np.load(path)
+    if a.ndim == 2 and a.shape[1] % 4 != 0:  # e.g. (2, 6) would otherwise be silently scrambled into (3, 1, 4)
+        raise ValueError(f"{path}: row width {a.shape[1]} is not a multiple of 4")
+    a = a.reshape(-1, a.shape[1] // 4, 4) if a.ndim == 2 else a
+    return a
+
+
+def plot_with_errors(ax, data1, data2, bins, label1, label2, xlabel):
+    """Plot two histograms as points with sqrt(N) error bars."""
+    h1, edges = np.histogram(data1, bins=bins)
+    h2, _ = np.histogram(data2, bins=bins)  # same bins, so the edges of the first call are reused
+    centers = (edges[:-1] + edges[1:]) / 2
+    width = (edges[1] - edges[0]) * 0.35  # horizontal offset, taken from the first bin's width
+    ax.errorbar(centers - width / 2, h1, yerr=np.sqrt(np.maximum(h1, 1)),  # error floor of 1 for empty bins
+                fmt='o', color='dodgerblue', markersize=4, capsize=2,
+                linewidth=1, label=label1)
+    ax.errorbar(centers + width / 2, h2, yerr=np.sqrt(np.maximum(h2, 1)),  # shifted right so the two series do not overlap
+                fmt='s', color='orangered', markersize=4, capsize=2,
+                linewidth=1, label=label2)
+    ax.set_xlabel(xlabel)
+    ax.set_ylabel('Counts')
+    ax.legend()
+
+
+def make_plots(gpu, g4, outdir, title_extra="", g4_full=None, g4_raw_shape=None):
+    """Write GPU-vs-G4 comparison plots (PNG) into outdir and print a text summary.
+
+    Both arrays are read as [:, 0, :3] position, [:, 0, 3] time and [:, 2, 3] wavelength;
+    g4[:, 3, 3] is taken as a per-hit step count when any entry is > 0. title_extra, if set,
+    is prepended to every plot title. g4_full and g4_raw_shape are accepted but not used.
+    If either array is empty only hits.png is written, since the other plots need data.
+    """
+    os.makedirs(outdir, exist_ok=True)
+
+    gpu_wl = gpu[:, 2, 3]
+    g4_wl = g4[:, 2, 3]
+    gpu_t = gpu[:, 0, 3]
+    g4_t = g4[:, 0, 3]
+    gpu_pos = gpu[:, 0, :3]
+    g4_pos = g4[:, 0, :3]
+
+    diff = 100 * (len(gpu) / len(g4) - 1) if len(g4) > 0 else 0
+    z_score = (len(gpu) - len(g4)) / math.sqrt(len(gpu) + len(g4)) if (len(gpu) + len(g4)) > 0 else 0
+    header = f"GPU={len(gpu)} G4={len(g4)} ({diff:+.1f}%, {z_score:+.1f}σ)"
+    if title_extra:
+        header = f"{title_extra}\n{header}"
+
+    # 1. Hit count
+    fig, ax = plt.subplots(figsize=(6, 5))
+    vals = [len(gpu), len(g4)]
+    errs = [math.sqrt(v) for v in vals]
+    ax.errorbar([0], [vals[0]], yerr=[errs[0]], fmt='o', markersize=12,
+                capsize=8, linewidth=2, color='dodgerblue', label='GPU')
+    ax.errorbar([1], [vals[1]], yerr=[errs[1]], fmt='s', markersize=12,
+                capsize=8, linewidth=2, color='orangered', label='G4')
+    ax.set_xticks([0, 1])
+    ax.set_xticklabels(['GPU', 'G4'])
+    ax.set_ylabel('Hits')
+    ax.set_title(f'Hit Count\n{header}')
+    ax.set_xlim(-0.5, 1.5)
+    ax.legend()
+    for i, (v, e) in enumerate(zip(vals, errs)):
+        ax.text(i + 0.15, v, f'{v}±{e:.0f}', va='center', fontsize=10)
+    plt.tight_layout()
+    plt.savefig(f'{outdir}/hits.png', dpi=150)
+    plt.close()
+
+    if len(gpu) == 0 or len(g4) == 0:
+        print(f"\nSummary: {header}\n  One side has zero hits; only {outdir}/hits.png was written.")
+        return
+
+    # 2. WLS-shifted wavelength
+    fig, ax = plt.subplots(figsize=(8, 5))
+    gpu_s = gpu_wl[gpu_wl > 380]
+    g4_s = g4_wl[g4_wl > 380]
+    plot_with_errors(ax, gpu_s, g4_s, np.arange(380, 550, 15),
+                     f'GPU ({len(gpu_s)})', f'G4 ({len(g4_s)})',
+                     'Wavelength (nm)')
+    ax.set_title(f'WLS-shifted Wavelength (>380nm)\n{header}')
+    plt.tight_layout()
+    plt.savefig(f'{outdir}/wavelength_shifted.png', dpi=150)
+    plt.close()
+
+    # 3. Full wavelength
+    fig, ax = plt.subplots(figsize=(8, 5))
+    plot_with_errors(ax, gpu_wl, g4_wl, np.arange(330, 550, 15),
+                     f'GPU ({len(gpu)})', f'G4 ({len(g4)})',
+                     'Wavelength (nm)')
+    ax.set_title(f'Full Wavelength\n{header}')
+    plt.tight_layout()
+    plt.savefig(f'{outdir}/wavelength_full.png', dpi=150)
+    plt.close()
+
+    # 4. Arrival time (bulk, truncated at 99th percentile)
+    fig, ax = plt.subplots(figsize=(8, 5))
+    t_cut = max(np.percentile(gpu_t, 99), np.percentile(g4_t, 99))
+    t_bins = np.linspace(0, t_cut, 30)
+    gpu_over = (gpu_t > t_cut).sum()
+    g4_over = (g4_t > t_cut).sum()
+    plot_with_errors(ax, gpu_t[gpu_t <= t_cut], g4_t[g4_t <= t_cut], t_bins,
+                     f'GPU (overflow={gpu_over})', f'G4 (overflow={g4_over})',
+                     'Time (ns)')
+    ax.set_title(f'Arrival Time (t < {t_cut:.0f}ns)\n{header}')
+    ax.set_yscale('log')
+    ax.set_ylim(bottom=0.5)
+    plt.tight_layout()
+    plt.savefig(f'{outdir}/time_bulk.png', dpi=150)
+    plt.close()
+
+    # 5. Arrival time (full range, no overflow)
+    fig, ax = plt.subplots(figsize=(8, 5))
+    t_max = max(gpu_t.max(), g4_t.max()) * 1.05
+    t_bins_full = np.linspace(0, t_max, 50)
+    plot_with_errors(ax, gpu_t, g4_t, t_bins_full,
+                     f'GPU ({len(gpu)})', f'G4 ({len(g4)})',
+                     'Time (ns)')
+    ax.set_title(f'Arrival Time (full range)\n{header}')
+    ax.set_yscale('log')
+    ax.set_ylim(bottom=0.5)
+    plt.tight_layout()
+    plt.savefig(f'{outdir}/time_full.png', dpi=150)
+    plt.close()
+
+    # 6. 3D hit positions
+    fig = plt.figure(figsize=(14, 6))
+
+    ax1 = fig.add_subplot(121, projection='3d')
+    ax1.scatter(gpu_pos[:, 0], gpu_pos[:, 1], gpu_pos[:, 2],
+                c='dodgerblue', s=3, alpha=0.5)
+    ax1.set_xlabel('X (mm)')
+    ax1.set_ylabel('Y (mm)')
+    ax1.set_zlabel('Z (mm)')
+    ax1.set_title(f'GPU hit positions ({len(gpu)})')
+
+    ax2 = fig.add_subplot(122, projection='3d')
+    ax2.scatter(g4_pos[:, 0], g4_pos[:, 1], g4_pos[:, 2],
+                c='orangered', s=3, alpha=0.5)
+    ax2.set_xlabel('X (mm)')
+    ax2.set_ylabel('Y (mm)')
+    ax2.set_zlabel('Z (mm)')
+    ax2.set_title(f'G4 hit positions ({len(g4)})')
+
+    plt.suptitle(f'3D Hit Positions\n{header}')
+    plt.tight_layout()
+    plt.savefig(f'{outdir}/positions_3d.png', dpi=150)
+    plt.close()
+
+    # 7. Step count distribution (GPU from record.npy, G4 from extended hit array)
+    gpu_steps = g4_steps = None
+    # GPU: try to load record.npy and count steps per hit
+    record_path = os.path.join(os.path.dirname(os.environ.get('OPTICKS_EVTDIR', '')),
+                               'record.npy')
+    # Fallback locations under /tmp/<USER>/opticks; MISSING_USER stands in when $USER is unset.
+    evt_base = f"/tmp/{os.environ.get('USER', 'MISSING_USER')}/opticks/GEOM/GEOM"
+    for rpath in [record_path,
+                  f'{evt_base}/GPURaytrace/ALL0_no_opticks_event_name/A000/record.npy',
+                  f'{evt_base}/GPUPhotonSourceMinimal/ALL0_no_opticks_event_name/A000/record.npy']:
+        if os.path.exists(rpath):
+            try:
+                record = np.load(rpath)
+                photon_all = np.load(rpath.replace('record.npy', 'photon.npy'))
+                q3_all = photon_all[:, 3, :].copy().view(np.uint32)
+                flags_all = q3_all[:, 0] & 0xFFFF
+                hit_mask = (flags_all == 0x40)
+                hit_indices = np.where(hit_mask)[0]
+                rec_flat = record[hit_indices].reshape(len(hit_indices), record.shape[1], -1)
+                gpu_steps = np.sum(np.any(rec_flat != 0, axis=2), axis=1)
+                print(f"  GPU step counts loaded from {rpath}")
+            except Exception as exc:
+                print(f"  WARNING: could not read GPU step counts from {rpath}: {exc}")
+            break
+
+    # G4: step count stored in row 3, col 3 (last float of the 4x4 array)
+    g4_q3w = g4[:, 3, 3]
+    if np.any(g4_q3w > 0):
+        g4_steps = g4_q3w.astype(int)
+        print(f"  G4 step counts loaded from hit array row 3 col 3")
+
+    if gpu_steps is not None or g4_steps is not None:
+        fig, ax = plt.subplots(figsize=(8, 5))
+        s_max = max(0, *(np.percentile(s, 99) for s in (gpu_steps, g4_steps) if s is not None))
+        s_bins = np.linspace(0, s_max * 1.1, 30)
+
+        if gpu_steps is not None and g4_steps is not None:
+            plot_with_errors(ax, gpu_steps, g4_steps, s_bins,
+                             f'GPU ({len(gpu_steps)})', f'G4 ({len(g4_steps)})',
+                             'Steps to detection')
+        else:
+            steps, fmt, color, tag = ((gpu_steps, 'o', 'dodgerblue', 'GPU') if gpu_steps is not None
+                                      else (g4_steps, 's', 'orangered', 'G4'))  # only one side has step data
+            h, edges = np.histogram(steps, bins=s_bins)
+            centers = (edges[:-1] + edges[1:]) / 2
+            ax.errorbar(centers, h, yerr=np.sqrt(np.maximum(h, 1)),
+                         fmt=fmt, color=color, markersize=4, capsize=2,
+                         label=f'{tag} ({len(steps)})')
+            ax.set_xlabel('Steps to detection')
+            ax.set_ylabel('Counts')
+            ax.legend()
+
+        ax.set_title(f'Steps to Detection\n{header}')
+        plt.tight_layout()
+        plt.savefig(f'{outdir}/step_count.png', dpi=150)
+        plt.close()
+
+    # Print summary
+    print(f"\nSummary: {header}")
+    print(f"  Wavelength: GPU mean={gpu_wl.mean():.1f}nm  G4 mean={g4_wl.mean():.1f}nm")
+    print(f"  Time:       GPU mean={gpu_t.mean():.2f}ns  G4 mean={g4_t.mean():.2f}ns")
+    print(f"  WLS shifted: GPU {100*(gpu_wl>380).mean():.1f}%  G4 {100*(g4_wl>380).mean():.1f}%")
+    if gpu_steps is not None:
+        print(f"  GPU steps:  mean={gpu_steps.mean():.0f}  median={np.median(gpu_steps):.0f}  max={gpu_steps.max()}")
+    if g4_steps is not None:
+        print(f"  G4  steps:  mean={g4_steps.mean():.0f}  median={np.median(g4_steps):.0f}  max={g4_steps.max()}")
+    print(f"\nPlots saved to {outdir}/:")
+    for f in ['hits.png', 'wavelength_shifted.png', 'wavelength_full.png',
+              'time_bulk.png', 'time_full.png', 'positions_3d.png'] + (
+                  ['step_count.png'] if (gpu_steps is not None or g4_steps is not None) else []):
+        print(f"  {f}")
+
+
+def main():  # CLI entry point: run GPURaytrace (or reuse existing .npy hit files), then write the plots
+    parser = argparse.ArgumentParser(description=__doc__,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("-g", "--gdml", default="apex.gdml", help="GDML geometry file")
+    parser.add_argument("-c", "--config", default=None, help="Config name (e.g. det_debug)")
+    parser.add_argument("-m", "--macro", default="tests/run_genstep.mac", help="G4 macro file")
+    parser.add_argument("-s", "--seed", type=int, default=42, help="Random seed")
+    parser.add_argument("--outdir", default="plots", help="Output directory for plots")
+    parser.add_argument("--title", default="", help="Extra title text")
+    parser.add_argument("--gpu-hits", default=None, help="Skip sim, use existing GPU hits .npy")
+    parser.add_argument("--g4-hits", default=None, help="Skip sim, use existing G4 hits .npy")
+    args = parser.parse_args()
+
+    if bool(args.gpu_hits) != bool(args.g4_hits):  # a lone path used to be silently discarded and the sim re-run
+        parser.error("--gpu-hits and --g4-hits must be given together")
+    if args.gpu_hits and args.g4_hits:
+        print(f"Using existing files: {args.gpu_hits}, {args.g4_hits}")
+    else:
+        args.gpu_hits, args.g4_hits, _, _ = run_simulation(args.gdml, args.config, args.macro, args.seed)
+
+    gpu = load_hits(args.gpu_hits)
+    g4_raw = load_hits(args.g4_hits)
+    g4_raw_shape = g4_raw.shape
+
+    # Keep full G4 array for step count extraction
+    g4_full = g4_raw.copy() if g4_raw.shape[1] >= 5 else None
+
+    # Normalize to (N, 4, 4) — take first 4 rows if more
+    gpu = gpu[:, :4, :] if gpu.shape[1] > 4 else gpu
+    g4 = g4_raw[:, :4, :] if g4_raw.shape[1] > 4 else g4_raw
+
+    make_plots(gpu, g4, args.outdir, title_extra=args.title,
+               g4_full=g4_full, g4_raw_shape=g4_raw_shape)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/optiphy/ana/run_genstep_comparison.py b/optiphy/ana/run_genstep_comparison.py
new file mode 100644
index 000000000..321ae6085
--- /dev/null
+++ b/optiphy/ana/run_genstep_comparison.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+"""
+run_genstep_comparison.py
+==========================
+
+Runs GPU (simg4ox) and G4 (G4ValidationGenstep) simulations with the same
+electron primary, then compares the optical photon hit distributions.
+
+Usage:
+    python run_genstep_comparison.py [--gdml apex.gdml] [--energy 1.0] [--nevents 10] [--seed 42]
+"""
+import os
+import sys
+import subprocess
+import argparse
+import numpy as np
+from pathlib import Path
+
+def find_gpu_hits():
+    """Find the most recent GPU hit.npy output; returns its path as str, or None when none exists."""
+    base = Path(f"/tmp/{os.environ.get('USER','MISSING_USER')}/opticks")  # MISSING_USER is used when $USER is unset
+    candidates = sorted(base.rglob("hit.npy"), key=lambda p: p.stat().st_mtime, reverse=True)  # newest first, any depth
+    return str(candidates[0]) if candidates else None
+
+def run_g4(gdml, energy, nevents, seed, pos, direction):
+    """Run pure G4 simulation with electron primary; returns the hit count parsed from stdout (0 if not found)."""
+    cmd = [
+        "G4ValidationGenstep",  # resolved via PATH
+        "-g", gdml,
+        "-e", str(energy),
+        "-n", str(nevents),
+        "-s", str(seed),
+        "--pos", pos,
+        "--dir", direction,
+    ]
+    print(f"=== Running G4: {' '.join(cmd)}")
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)  # TimeoutExpired after 10 minutes
+
+    # Extract hit count from output
+    g4_hits = 0
+    for line in result.stdout.split('\n'):
+        if "Total hits:" in line:
+            g4_hits = int(line.split("Total hits:")[-1].strip())  # the last matching line wins
+
+    print(f"G4: {g4_hits} hits")
+    if result.returncode != 0:  # a failure is reported but not raised; the parsed count is still returned
+        print(f"G4 STDERR (last 5 lines):")
+        for line in result.stderr.strip().split('\n')[-5:]:
+            print(f"  {line}")
+    return g4_hits
+
+def run_gpu(gdml, config, macro, seed):  # NOTE(review): seed is accepted but never passed to simg4ox
+    """Run GPU simulation via simg4ox; returns the number of rows in the newest hit.npy, or 0 on failure."""
+    env = os.environ.copy()
+    env["OPTICKS_INTEGRATION_MODE"] = "1"  # Minimal mode: G4 tracks electron, GPU propagates optical
+
+    cmd = [
+        "simg4ox",  # resolved via PATH
+        "-g", gdml,
+        "-c", config,
+        "-m", macro,
+    ]
+    print(f"\n=== Running GPU: {' '.join(cmd)}")
+    print(f"    OPTICKS_INTEGRATION_MODE=1 (Minimal)")
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env)
+
+    if result.returncode != 0:  # on failure show the stderr tail and report zero hits
+        print(f"GPU STDERR (last 10 lines):")
+        for line in result.stderr.strip().split('\n')[-10:]:
+            print(f"  {line}")
+        return 0
+
+    # Find hit output
+    hit_path = find_gpu_hits()  # newest hit.npy on disk; NOTE(review): not checked to belong to this run
+    if hit_path and os.path.exists(hit_path):
+        hits = np.load(hit_path)
+        print(f"GPU: {len(hits)} hits (from {hit_path})")
+        return len(hits)
+    else:
+        print("GPU: no hit.npy found")
+        return 0
+
+def compare_hits(g4_path, gpu_path):
+    """Compare G4 and GPU hit arrays: prints counts, position ranges, wavelength and time statistics; returns None."""
+    if not os.path.exists(g4_path):
+        print(f"G4 hits not found: {g4_path}")
+        return
+    if not gpu_path or not os.path.exists(gpu_path):  # gpu_path may be None
+        print(f"GPU hits not found")
+        return
+
+    g4 = np.load(g4_path)
+    gpu = np.load(gpu_path)
+
+    print(f"\n{'='*60}")
+    print(f"HIT COMPARISON")
+    print(f"{'='*60}")
+    print(f"  G4 hits:  {len(g4)}")
+    print(f"  GPU hits: {len(gpu)}")
+
+    if len(g4) > 0 and len(gpu) > 0:
+        diff = len(gpu) - len(g4)
+        pct = 100 * diff / len(g4) if len(g4) > 0 else 0  # relative to the G4 count
+        sign = "+" if diff > 0 else ""  # negative values already carry their own sign
+        print(f"  Diff:     {sign}{diff} ({sign}{pct:.1f}%)")
+
+    # Position distributions
+    if len(g4) > 0:
+        g4_pos = g4[:, 0, :3]  # row 0, first three columns are read as x, y, z
+        print(f"\n  G4 hit positions:")
+        print(f"    x: [{g4_pos[:,0].min():.1f}, {g4_pos[:,0].max():.1f}] mm")
+        print(f"    y: [{g4_pos[:,1].min():.1f}, {g4_pos[:,1].max():.1f}] mm")
+        print(f"    z: [{g4_pos[:,2].min():.1f}, {g4_pos[:,2].max():.1f}] mm")
+
+    if len(gpu) > 0:
+        gpu_pos = gpu[:, 0, :3]
+        print(f"\n  GPU hit positions:")
+        print(f"    x: [{gpu_pos[:,0].min():.1f}, {gpu_pos[:,0].max():.1f}] mm")
+        print(f"    y: [{gpu_pos[:,1].min():.1f}, {gpu_pos[:,1].max():.1f}] mm")
+        print(f"    z: [{gpu_pos[:,2].min():.1f}, {gpu_pos[:,2].max():.1f}] mm")
+
+    # Wavelength distributions
+    if len(g4) > 0:
+        g4_wl = g4[:, 2, 3]  # row 2, last column is read as the wavelength
+        print(f"\n  G4 wavelength:  mean={g4_wl.mean():.1f} std={g4_wl.std():.1f} nm")
+    if len(gpu) > 0:
+        gpu_wl = gpu[:, 2, 3]
+        print(f"  GPU wavelength: mean={gpu_wl.mean():.1f} std={gpu_wl.std():.1f} nm")
+
+    # Time distributions
+    if len(g4) > 0:
+        g4_t = g4[:, 0, 3]  # row 0, last column is read as the time
+        print(f"\n  G4 time:  mean={g4_t.mean():.2f} max={g4_t.max():.2f} ns")
+    if len(gpu) > 0:
+        gpu_t = gpu[:, 0, 3]
+        print(f"  GPU time: mean={gpu_t.mean():.2f} max={gpu_t.max():.2f} ns")
+
+
+def main():  # CLI entry point: run the G4 side and print a summary of g4_genstep_hits.npy from the cwd
+    parser = argparse.ArgumentParser(description="Compare GPU vs G4 electron genstep simulation")
+    parser.add_argument("--gdml", default="apex.gdml", help="GDML geometry file")
+    parser.add_argument("--energy", type=float, default=1.0, help="Electron energy in MeV")
+    parser.add_argument("--nevents", type=int, default=10, help="Number of events")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    parser.add_argument("--pos", default="0,0,100", help="Electron position x,y,z mm")
+    parser.add_argument("--dir", default="0,0,1", help="Electron direction x,y,z")
+    args = parser.parse_args()
+
+    # Run G4
+    g4_hits = run_g4(args.gdml, args.energy, args.nevents, args.seed, args.pos, args.dir)
+
+    # Compare (NOTE(review): gpu_path is looked up but neither run_gpu nor compare_hits is called here)
+    g4_path = "g4_genstep_hits.npy"
+    gpu_path = find_gpu_hits()
+
+    if os.path.exists(g4_path):
+        g4 = np.load(g4_path)
+        print(f"\n{'='*60}")
+        print(f"G4 RESULTS ({args.nevents} events, {args.energy} MeV electron)")
+        print(f"{'='*60}")
+        print(f"  Total hits: {len(g4)}")
+        print(f"  Hits/event: {len(g4)/max(args.nevents, 1):.1f}")  # max() avoids ZeroDivisionError for --nevents 0
+        if len(g4) > 0:
+            g4_wl = g4[:, 2, 3]
+            g4_pos = g4[:, 0, :3]
+            print(f"  Wavelength: mean={g4_wl.mean():.1f} nm")
+            print(f"  Hit y range: [{g4_pos[:,1].min():.1f}, {g4_pos[:,1].max():.1f}] mm")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/optiphy/ana/wls_diagnostic.py b/optiphy/ana/wls_diagnostic.py
new file mode 100644
index 000000000..6975983cb
--- /dev/null
+++ b/optiphy/ana/wls_diagnostic.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+"""
+wls_diagnostic.py : Detailed WLS wavelength distribution comparison GPU vs G4
+==============================================================================
+
+Compares wavelength and time distributions from GPU (opticks) and G4 hits,
+performs KS test, and checks per-pass WLS conversion probability.
+
+Usage::
+
+    python ana/wls_diagnostic.py <gpu_event_folder> <g4_hits.npy> [--input-wavelength 350]
+"""
+import sys
+import os
+import argparse
+import numpy as np
+
+
+def resolve_event_path(path):  # Return the directory that holds photon.npy, searching path, path/A000, then subdirs
+    if os.path.exists(os.path.join(path, "photon.npy")):
+        return path
+    a000 = os.path.join(path, "A000")
+    if os.path.exists(os.path.join(a000, "photon.npy")):
+        return a000
+    if os.path.isdir(path):
+        for d in sorted(os.listdir(path)):  # first match in sorted name order
+            dp = os.path.join(path, d)
+            if os.path.isdir(dp) and os.path.exists(os.path.join(dp, "photon.npy")):
+                return dp
+    return path  # nothing found: hand back the input unchanged and let the caller report it
+
+
+FLAG_ENUM = {  # flag value -> display name; print_gpu_outcomes looks up the low 16 bits of photon[:, 3, 0] here
+    0x0004: "TORCH", 0x0008: "BULK_ABSORB", 0x0010: "BULK_REEMIT",
+    0x0020: "BULK_SCATTER", 0x0040: "SURFACE_DETECT", 0x0080: "SURFACE_ABSORB",
+    0x0100: "SURFACE_DREFLECT", 0x0200: "SURFACE_SREFLECT",
+    0x0400: "BOUNDARY_REFLECT", 0x0800: "BOUNDARY_TRANSMIT",
+    0x1000: "NAN_ABORT", 0x2000: "EFFICIENCY_COLLECT", 0x8000: "MISS",
+}  # values missing from this table are printed as hex by print_gpu_outcomes
+
+
+def ks_test_2sample(a, b):
+    """Two-sample Kolmogorov-Smirnov test (no scipy dependency); returns (D statistic, asymptotic p-value)."""
+    na, nb = len(a), len(b)
+    a_sorted = np.sort(a)
+    b_sorted = np.sort(b)
+    all_vals = np.sort(np.concatenate([a_sorted, b_sorted]))
+
+    cdf_a = np.searchsorted(a_sorted, all_vals, side='right') / na  # empirical CDFs evaluated at every sample point
+    cdf_b = np.searchsorted(b_sorted, all_vals, side='right') / nb
+    d_stat = np.max(np.abs(cdf_a - cdf_b))
+
+    # Approximate p-value (asymptotic)
+    n_eff = np.sqrt(na * nb / (na + nb))
+    lam = (n_eff + 0.12 + 0.11 / n_eff) * d_stat
+    # Q_KS(lam) = 2 * sum_{j>=1} (-1)^(j-1) * exp(-2 j^2 lam^2); the first term alone overstates p for lam < ~1
+    if lam < 0.3:  # Q_KS is within 1e-5 of 1 here, and the series converges slowly as lam -> 0
+        p_val = 1.0
+    else:
+        j = np.arange(1, 101)  # terms decay like exp(-0.18 j^2) or faster, so 100 is ample
+        p_val = float(np.clip(2.0 * np.sum((-1.0) ** (j - 1) * np.exp(-2.0 * (j * lam) ** 2)), 0.0, 1.0))
+    return d_stat, p_val
+
+
+def print_header(title):  # Print a blank line, then title framed by two 74-character '=' rules
+    print()
+    print("=" * 74)
+    print(f"  {title}")
+    print("=" * 74)
+
+
+def print_hit_summary(gpu_hits, g4_hits, n_photons, input_wl):  # Print hit counts, efficiencies and a z-score
+    print_header("HIT COUNT SUMMARY")
+    ng, nc = len(gpu_hits), len(g4_hits)
+    print(f"  Input photons:     {n_photons:>10d}   (wavelength = {input_wl:.0f} nm)")
+    print(f"  GPU hits:          {ng:>10d}   ({100*ng/n_photons:.2f}%)")  # ZeroDivisionError if n_photons == 0
+    print(f"  G4  hits:          {nc:>10d}   ({100*nc/n_photons:.2f}%)")  # G4 rate uses the same n_photons as the GPU rate
+    if ng > 0 and nc > 0:
+        ratio = nc / ng
+        # Significance
+        p_pool = (ng + nc) / (2 * n_photons)  # pooled hit fraction over both samples
+        se = np.sqrt(2 * p_pool * (1 - p_pool) / n_photons)  # standard error of the difference of two proportions
+        z = abs(ng/n_photons - nc/n_photons) / se if se > 0 else 0
+        print(f"  Ratio G4/GPU:      {ratio:>10.4f}")
+        print(f"  Z-score:           {z:>10.1f}   {'** SIGNIFICANT **' if z > 3 else '(within noise)'}")
+    print()
+
+
+def print_wavelength_comparison(gpu_wl, g4_wl):  # Print summary statistics, percentiles and a KS test for both samples
+    print_header("WAVELENGTH DISTRIBUTION COMPARISON")
+
+    print(f"\n  {'Statistic':<25s} {'GPU':>12s} {'G4':>12s} {'Diff':>12s}")
+    print(f"  {'-'*25} {'-'*12} {'-'*12} {'-'*12}")
+
+    for name, fn in [("Mean (nm)", np.mean), ("Std (nm)", np.std),
+                     ("Median (nm)", np.median), ("Min (nm)", np.min),
+                     ("Max (nm)", np.max)]:
+        gv, cv = fn(gpu_wl), fn(g4_wl)
+        print(f"  {name:<25s} {gv:12.2f} {cv:12.2f} {cv-gv:12.2f}")  # Diff column is G4 minus GPU
+
+    # Percentiles
+    print()
+    for pct in [5, 25, 75, 95]:
+        gv = np.percentile(gpu_wl, pct)
+        cv = np.percentile(g4_wl, pct)
+        print(f"  {'P%d (nm)' % pct:<25s} {gv:12.2f} {cv:12.2f} {cv-gv:12.2f}")
+
+    # KS test
+    d_stat, p_val = ks_test_2sample(gpu_wl, g4_wl)
+    print(f"\n  KS statistic:      {d_stat:.6f}")
+    print(f"  KS p-value:        {p_val:.2e}")
+    if p_val < 0.01:  # 1% significance threshold
+        print("  ** Wavelength distributions are SIGNIFICANTLY DIFFERENT **")
+    else:
+        print("  Wavelength distributions are statistically compatible")
+    print()
+
+
+def print_fine_histogram(gpu_wl, g4_wl, bin_width=10):  # Print a side-by-side text histogram of both wavelength samples
+    print_header(f"WAVELENGTH HISTOGRAM (bin={bin_width}nm)")
+
+    lo = min(gpu_wl.min(), g4_wl.min())
+    hi = max(gpu_wl.max(), g4_wl.max())
+    bins = np.arange(np.floor(lo / bin_width) * bin_width,  # edges snapped to multiples of bin_width
+                     np.ceil(hi / bin_width) * bin_width + bin_width, bin_width)
+
+    gc, _ = np.histogram(gpu_wl, bins=bins)
+    cc, _ = np.histogram(g4_wl, bins=bins)
+
+    # Normalize to density (per nm per photon)
+    gpu_dens = gc / (len(gpu_wl) * bin_width)
+    g4_dens = cc / (len(g4_wl) * bin_width)
+
+    max_dens = max(gpu_dens.max(), g4_dens.max())  # both bar columns share this scale
+    bar_w = 25  # bar length in characters at max_dens
+
+    print(f"\n  {'Bin (nm)':<14s} {'GPU':>8s} {'G4':>8s} {'GPU/G4':>7s}  GPU|G4")
+    print(f"  {'-'*14} {'-'*8} {'-'*8} {'-'*7}  {'-'*51}")
+
+    for i in range(len(bins) - 1):
+        if gc[i] == 0 and cc[i] == 0:  # rows that are empty on both sides are omitted
+            continue
+        ratio_str = f"{gc[i]/cc[i]:.2f}" if cc[i] > 0 else "  inf"
+        gb = "#" * int(gpu_dens[i] / max_dens * bar_w) if max_dens > 0 else ""
+        cb = "=" * int(g4_dens[i] / max_dens * bar_w) if max_dens > 0 else ""
+        print(f"  {bins[i]:5.0f}-{bins[i+1]:5.0f}   {gc[i]:8d} {cc[i]:8d} {ratio_str:>7s}  {gb:<25s}|{cb:<25s}")
+    print()
+
+
+def print_time_comparison(gpu_t, g4_t):  # Print summary statistics and a KS test for the two time samples
+    print_header("TIME DISTRIBUTION COMPARISON")
+
+    print(f"\n  {'Statistic':<25s} {'GPU':>12s} {'G4':>12s} {'Diff':>12s}")
+    print(f"  {'-'*25} {'-'*12} {'-'*12} {'-'*12}")
+
+    for name, fn in [("Mean (ns)", np.mean), ("Std (ns)", np.std),
+                     ("Median (ns)", np.median), ("Min (ns)", np.min),
+                     ("Max (ns)", np.max)]:
+        gv, cv = fn(gpu_t), fn(g4_t)
+        print(f"  {name:<25s} {gv:12.4f} {cv:12.4f} {cv-gv:12.4f}")  # Diff column is G4 minus GPU
+
+    d_stat, p_val = ks_test_2sample(gpu_t, g4_t)
+    print(f"\n  KS statistic:      {d_stat:.6f}")
+    print(f"  KS p-value:        {p_val:.2e}")
+    if p_val < 0.01:  # 1% significance threshold
+        print("  ** Time distributions are SIGNIFICANTLY DIFFERENT **")
+    else:
+        print("  Time distributions are statistically compatible")
+    print()
+
+
+def print_gpu_outcomes(photon):  # Print how many photons ended with each flag value, most frequent first
+    print_header("GPU PHOTON OUTCOMES (all photons)")
+    q3 = photon[:, 3, :].view(np.uint32)  # reinterpret row 3 as unsigned ints (assumes 32-bit float storage — TODO confirm)
+    flag = q3[:, 0] & 0xFFFF  # low 16 bits of the first word
+    n = len(flag)
+    vals, counts = np.unique(flag, return_counts=True)
+    order = np.argsort(-counts)  # descending by count
+
+    print(f"\n  {'Flag':<22s} {'Count':>8s} {'%':>7s}")
+    print(f"  {'-'*22} {'-'*8} {'-'*7}")
+    for idx in order:
+        f = vals[idx]
+        c = counts[idx]
+        name = FLAG_ENUM.get(f, f"0x{f:04x}")  # unknown values are shown as hex
+        print(f"  {name:<22s} {c:8d} {100*c/n:6.2f}%")
+    print()
+
+
+def print_position_comparison(gpu_hits, g4_hits):  # Print mean radius and mean x/y/z of the hit positions for both samples
+    print_header("SPATIAL DISTRIBUTION")
+
+    gpu_pos = gpu_hits[:, 0, :3]  # row 0, first three columns are read as x, y, z
+    g4_pos = g4_hits[:, 0, :3]
+    gpu_r = np.sqrt(np.sum(gpu_pos**2, axis=1))  # distance from the origin
+    g4_r = np.sqrt(np.sum(g4_pos**2, axis=1))
+
+    print(f"\n  {'Statistic':<25s} {'GPU':>12s} {'G4':>12s} {'Diff':>12s}")
+    print(f"  {'-'*25} {'-'*12} {'-'*12} {'-'*12}")
+
+    for name, gv, cv in [
+        ("Mean radius (mm)", gpu_r.mean(), g4_r.mean()),
+        ("Mean X (mm)", gpu_pos[:, 0].mean(), g4_pos[:, 0].mean()),
+        ("Mean Y (mm)", gpu_pos[:, 1].mean(), g4_pos[:, 1].mean()),
+        ("Mean Z (mm)", gpu_pos[:, 2].mean(), g4_pos[:, 2].mean()),
+    ]:
+        print(f"  {name:<25s} {gv:12.3f} {cv:12.3f} {cv-gv:12.3f}")  # Diff column is G4 minus GPU
+    print()
+
+
+def print_energy_conservation(gpu_wl, g4_wl, input_wl):  # Count and report hits whose wavelength is below input_wl
+    print_header("ENERGY CONSERVATION CHECK")
+    gpu_viol = np.sum(gpu_wl < input_wl)  # a shorter wavelength means a higher photon energy than the input
+    g4_viol = np.sum(g4_wl < input_wl)
+    print(f"  Input wavelength:          {input_wl:.0f} nm")
+    print(f"  GPU hits with wl < input:  {gpu_viol} / {len(gpu_wl)}")
+    print(f"  G4  hits with wl < input:  {g4_viol} / {len(g4_wl)}")
+    if gpu_viol == 0 and g4_viol == 0:
+        print("  ALL PASS: energy conservation satisfied")
+    else:
+        if gpu_viol > 0:
+            bad = gpu_wl[gpu_wl < input_wl]
+            print(f"  GPU violations: min={bad.min():.1f}nm, mean={bad.mean():.1f}nm")
+        if g4_viol > 0:
+            bad = g4_wl[g4_wl < input_wl]
+            print(f"  G4  violations: min={bad.min():.1f}nm, mean={bad.mean():.1f}nm")
+    print()
+
+
+def main():  # CLI entry point: load the GPU event and the G4 hits, then print every comparison section
+    parser = argparse.ArgumentParser(description=__doc__,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("gpu_path", help="GPU opticks event folder")
+    parser.add_argument("g4_hits", help="G4 hits file (g4_hits.npy)")
+    parser.add_argument("--input-wavelength", type=float, default=350.0,
+                        help="Input photon wavelength in nm (default: 350)")
+    parser.add_argument("--bin-width", type=float, default=5.0,
+                        help="Histogram bin width in nm (default: 5)")
+    args = parser.parse_args()
+
+    gpu_path = resolve_event_path(args.gpu_path)  # accepts the event dir itself or a parent of it
+    hit_path = os.path.join(gpu_path, "hit.npy")
+    photon_path = os.path.join(gpu_path, "photon.npy")
+
+    if not os.path.exists(photon_path):
+        print(f"Error: photon.npy not found in {gpu_path}")
+        sys.exit(1)
+    if not os.path.exists(args.g4_hits):
+        print(f"Error: {args.g4_hits} not found")
+        sys.exit(1)
+
+    gpu_photon = np.load(photon_path)
+    gpu_hits = np.load(hit_path) if os.path.exists(hit_path) else np.zeros((0, 4, 4), dtype=np.float32)  # a missing hit.npy counts as zero hits
+    g4_hits = np.load(args.g4_hits)
+    n_photons = len(gpu_photon)  # number of rows in photon.npy is used as the input photon count
+
+    print(f"\n  GPU event path: {gpu_path}")
+    print(f"  G4 hits file:   {args.g4_hits}")
+
+    # Hit summary
+    print_hit_summary(gpu_hits, g4_hits, n_photons, args.input_wavelength)
+
+    if len(gpu_hits) == 0 or len(g4_hits) == 0:  # the statistics below need data on both sides
+        print("  Cannot compare — one side has zero hits.")
+        return
+
+    gpu_wl = gpu_hits[:, 2, 3]  # row 2, last column is read as the wavelength
+    g4_wl = g4_hits[:, 2, 3]
+    gpu_t = gpu_hits[:, 0, 3]  # row 0, last column is read as the time
+    g4_t = g4_hits[:, 0, 3]
+
+    # GPU outcomes
+    print_gpu_outcomes(gpu_photon)
+
+    # Energy conservation
+    print_energy_conservation(gpu_wl, g4_wl, args.input_wavelength)
+
+    # Wavelength comparison
+    print_wavelength_comparison(gpu_wl, g4_wl)
+    print_fine_histogram(gpu_wl, g4_wl, bin_width=args.bin_width)
+
+    # Time comparison
+    print_time_comparison(gpu_t, g4_t)
+
+    # Spatial
+    print_position_comparison(gpu_hits, g4_hits)
+
+if __name__ == "__main__":
+    main()
diff --git a/qudarap/CMakeLists.txt b/qudarap/CMakeLists.txt
index 1529e1999..8a7228499 100644
--- a/qudarap/CMakeLists.txt
+++ b/qudarap/CMakeLists.txt
@@ -51,6 +51,8 @@ set(SOURCES
     QScint.cc
     QScint.cu
 
+    QWls.cc
+
     QCerenkovIntegral.cc
     QCerenkov.cc
     QCerenkov.cu 
@@ -120,6 +122,9 @@ SET(HEADERS
     QScint.hh
     qscint.h
 
+    QWls.hh
+    qwls.h
+
     QCerenkovIntegral.hh
     QCerenkov.hh
     qcerenkov.h
diff --git a/qudarap/QSim.cc b/qudarap/QSim.cc
index bf8f77156..d3ed3879a 100644
--- a/qudarap/QSim.cc
+++ b/qudarap/QSim.cc
@@ -3,73 +3,71 @@
 
 #include "SLOG.hh"
 
-#include "ssys.h"
-#include "sstamp.h"
-#include "spath.h"
 #include "SProf.hh"
+#include "spath.h"
+#include "sstamp.h"
+#include "ssys.h"
 
 #include "SComp.h"
+#include "SEvent.hh"
+#include "SEventConfig.hh"
 #include "SEvt.hh"
 #include "SSim.hh"
+#include "salloc.h"
 #include "scuda.h"
 #include "squad.h"
-#include "salloc.h"
-#include "SEvent.hh"
-#include "SEventConfig.hh"
 
-//#include "SCSGOptiX.h"
+// #include "SCSGOptiX.h"
 #include "SSimulator.h"
 
 #include "SGenstep.h"
 #include "sslice.h"
 
 #include "NP.hh"
-#include "QUDA_CHECK.h"
 #include "QU.hh"
+#include "QUDA_CHECK.h"
 
+#include "qdebug.h"
 #include "qrng.h"
 #include "qsim.h"
-#include "qdebug.h"
 
 #include "QBase.hh"
-#include "QEvt.hh"
-#include "QRng.hh"
-#include "QTex.hh"
-#include "QScint.hh"
-#include "QCerenkov.hh"
 #include "QBnd.hh"
-#include "QProp.hh"
-#include "QMultiFilm.hh"
+#include "QCerenkov.hh"
+#include "QDebug.hh"
 #include "QEvt.hh"
+#include "QMultiFilm.hh"
 #include "QOptical.hh"
-#include "QSimLaunch.hh"
-#include "QDebug.hh"
 #include "QPMT.hh"
+#include "QProp.hh"
+#include "QRng.hh"
+#include "QScint.hh"
+#include "QSimLaunch.hh"
+#include "QTex.hh"
+#include "QWls.hh"
 
 #include "QSim.hh"
 
 const plog::Severity QSim::LEVEL = SLOG::EnvLevel("QSim", "DEBUG");
 
-const bool  QSim::REQUIRE_PMT = ssys::getenvbool(_QSim__REQUIRE_PMT);
-const int   QSim::SAVE_IGS_EVENTID = ssys::getenvint(_QSim__SAVE_IGS_EVENTID,-1) ;
-const char* QSim::SAVE_IGS_PATH = ssys::getenvvar(_QSim__SAVE_IGS_PATH, "$TMP/.opticks/igs.npy");
-const bool  QSim::CONCAT = ssys::getenvbool(_QSim__CONCAT);
-const bool  QSim::ALLOC  = ssys::getenvbool(_QSim__ALLOC);
+const bool QSim::REQUIRE_PMT = ssys::getenvbool(_QSim__REQUIRE_PMT);
+const int QSim::SAVE_IGS_EVENTID = ssys::getenvint(_QSim__SAVE_IGS_EVENTID, -1);
+const char *QSim::SAVE_IGS_PATH = ssys::getenvvar(_QSim__SAVE_IGS_PATH, "$TMP/.opticks/igs.npy");
+const bool QSim::CONCAT = ssys::getenvbool(_QSim__CONCAT);
+const bool QSim::ALLOC = ssys::getenvbool(_QSim__ALLOC);
 
-
-
-QSim* QSim::INSTANCE = nullptr ;
-QSim* QSim::Get(){ return INSTANCE ; }
-
-QSim* QSim::Create()
+QSim *QSim::INSTANCE = nullptr;
+QSim *QSim::Get()
 {
-    LOG_IF(fatal, INSTANCE != nullptr) << " a QSim INSTANCE already exists " ;
-    assert( INSTANCE == nullptr ) ;
-    return new QSim  ;
+    return INSTANCE;
 }
 
-
-
+QSim *QSim::Create()
+{
+    LOG_IF(fatal, INSTANCE != nullptr) << " a QSim INSTANCE already exists ";
+    assert(INSTANCE == nullptr);
+    return new QSim;
+}
 
 /**
 QSim::UploadComponents
@@ -115,137 +113,135 @@ This structure is used to allow separate testing.
 
 **/
 
-void QSim::UploadComponents( const SSim* ssim  )
+void QSim::UploadComponents(const SSim *ssim)
 {
-    LOG(LEVEL) << "[ ssim " << ssim ;
-    if(getenv("QSim__UploadComponents_SIGINT")) std::raise(SIGINT);
-
-    LOG(LEVEL) << "[ new QBase" ;
-    QBase* base = new QBase ;
-    LOG(LEVEL) << "] new QBase : latency here of about 0.3s from first device access, if latency of >1s need to start nvidia-persistenced " ;
+    LOG(LEVEL) << "[ ssim " << ssim;
+    if (getenv("QSim__UploadComponents_SIGINT"))
+        std::raise(SIGINT);
+
+    LOG(LEVEL) << "[ new QBase";
+    QBase *base = new QBase;
+    LOG(LEVEL) << "] new QBase : latency here of about 0.3s from first device access, if latency of >1s need to start "
+                  "nvidia-persistenced ";
     LOG(LEVEL) << base->desc();
 
-
-    unsigned skipahead_event_offset = SEventConfig::EventSkipahead()  ;
-    LOG(LEVEL) << "[ new QRng skipahead_event_offset : " << skipahead_event_offset << " " << SEventConfig::kEventSkipahead ;
-    QRng* rng = new QRng(skipahead_event_offset)  ;  // loads and uploads RNG
-    LOG(LEVEL) << "] new QRng " << rng->desc()  ;
+    unsigned skipahead_event_offset = SEventConfig::EventSkipahead();
+    LOG(LEVEL) << "[ new QRng skipahead_event_offset : " << skipahead_event_offset << " "
+               << SEventConfig::kEventSkipahead;
+    QRng *rng = new QRng(skipahead_event_offset); // loads and uploads RNG
+    LOG(LEVEL) << "] new QRng " << rng->desc();
 
     LOG(LEVEL) << rng->desc();
 
-    const NP* optical = ssim->get(snam::OPTICAL);
-    const NP* bnd = ssim->get(snam::BND);
+    const NP *optical = ssim->get(snam::OPTICAL);
+    const NP *bnd = ssim->get(snam::BND);
 
-    if( optical == nullptr && bnd == nullptr )
+    if (optical == nullptr && bnd == nullptr)
     {
-        LOG(error) << " optical and bnd null  snam::OPTICAL " << snam::OPTICAL << " snam::BND " << snam::BND  ;
+        LOG(error) << " optical and bnd null  snam::OPTICAL " << snam::OPTICAL << " snam::BND " << snam::BND;
     }
     else
     {
-       // note that QOptical and QBnd are tightly coupled, perhaps add constraints to tie them together
-        QOptical* qopt = new QOptical(optical);
+        // note that QOptical and QBnd are tightly coupled, perhaps add constraints to tie them together
+        QOptical *qopt = new QOptical(optical);
         LOG(LEVEL) << qopt->desc();
 
-        QBnd* qbnd = new QBnd(bnd); // boundary texture with standard domain, used for standard fast property lookup
+        QBnd *qbnd = new QBnd(bnd); // boundary texture with standard domain, used for standard fast property lookup
         LOG(LEVEL) << qbnd->desc();
     }
 
-    QDebug* debug_ = new QDebug ;
-    LOG(LEVEL) << debug_->desc() ;
+    QDebug *debug_ = new QDebug;
+    LOG(LEVEL) << debug_->desc();
 
-    const NP* propcom = ssim->get(snam::PROPCOM);
-    if( propcom )
+    const NP *propcom = ssim->get(snam::PROPCOM);
+    if (propcom)
     {
-        LOG(LEVEL) << "[ QProp " ;
-        QProp<float>* prop = new QProp<float>(propcom) ;
+        LOG(LEVEL) << "[ QProp ";
+        QProp<float> *prop = new QProp<float>(propcom);
         // property interpolation with per-property domains, eg used for Cerenkov RINDEX sampling
-        LOG(LEVEL) << "] QProp " ;
+        LOG(LEVEL) << "] QProp ";
         LOG(LEVEL) << prop->desc();
     }
     else
     {
-        LOG(LEVEL) << "  propcom null, snam::PROPCOM " <<  snam::PROPCOM ;
+        LOG(LEVEL) << "  propcom null, snam::PROPCOM " << snam::PROPCOM;
     }
 
-
-    const NP* icdf = ssim->get(snam::ICDF);
-    if( icdf == nullptr )
+    const NP *icdf = ssim->get(snam::ICDF);
+    if (icdf == nullptr)
     {
-        LOG(error) << " icdf null, snam::ICDF " << snam::ICDF ;
+        LOG(error) << " icdf null, snam::ICDF " << snam::ICDF;
     }
     else
     {
-        unsigned hd_factor = 20u ;  // 0,10,20
-        QScint* scint = new QScint( icdf, hd_factor); // custom high-definition inverse CDF for scintillation generation
+        unsigned hd_factor = 20u;                    // 0,10,20
+        QScint *scint = new QScint(icdf, hd_factor); // custom high-definition inverse CDF for scintillation generation
         LOG(LEVEL) << scint->desc();
     }
 
+    const NP *wls_icdf = ssim->get(snam::WLS_ICDF);
+    const NP *wls_mat_map = ssim->get(snam::WLS_MAT_MAP);
+    if (wls_icdf == nullptr || wls_mat_map == nullptr)
+    {
+        LOG(LEVEL) << " wls_icdf or wls_mat_map null — no WLS materials in geometry ";
+    }
+    else
+    {
+        const NP *wls_tc = ssim->get(snam::WLS_TIME_CONSTANTS);
+        if (wls_tc)
+        {
+            unsigned hd_factor = 20u;
+            QWls *qwls_ = new QWls(wls_icdf, wls_mat_map, wls_tc, hd_factor);
+            LOG(LEVEL) << qwls_->desc();
+        }
+        else
+        {
+            LOG(error) << " wls_icdf and wls_mat_map present but wls_time_constants missing ";
+        }
+    }
 
     // TODO: make this more like the others : acting on the available inputs rather than the mode
-    bool is_simtrace = SEventConfig::IsRGModeSimtrace() ;
-    if(is_simtrace == false )
+    bool is_simtrace = SEventConfig::IsRGModeSimtrace();
+    if (is_simtrace == false)
     {
-        QCerenkov* cerenkov = new QCerenkov  ;
+        QCerenkov *cerenkov = new QCerenkov;
         LOG(LEVEL) << cerenkov->desc();
     }
     else
     {
-        LOG(LEVEL) << " skip QCerenkov for simtrace running " ;
+        LOG(LEVEL) << " skip QCerenkov for simtrace running ";
     }
 
+    const NPFold *spmt_f = ssim->get_spmt_f();
+    QPMT<float> *qpmt = spmt_f ? new QPMT<float>(spmt_f) : nullptr;
 
+    bool has_PMT = spmt_f != nullptr && qpmt != nullptr;
+    bool MISSING_PMT = REQUIRE_PMT == true && has_PMT == false;
 
+    LOG_IF(fatal, MISSING_PMT) << " MISSING_PMT " << " has_PMT " << (has_PMT ? "YES" : "NO ") << " REQUIRE_PMT "
+                               << (REQUIRE_PMT ? "YES" : "NO ") << " MISSING_PMT " << (MISSING_PMT ? "YES" : "NO ")
+                               << " spmt_f " << (spmt_f ? "YES" : "NO ") << " qpmt " << (qpmt ? "YES" : "NO ");
 
-    const NPFold* spmt_f = ssim->get_spmt_f() ;
-    QPMT<float>* qpmt = spmt_f ? new QPMT<float>(spmt_f) : nullptr ;
+    assert(MISSING_PMT == false);
+    if (MISSING_PMT)
+        std::raise(SIGINT);
 
-    bool has_PMT = spmt_f != nullptr && qpmt != nullptr ;
-    bool MISSING_PMT = REQUIRE_PMT == true && has_PMT == false ;
+    LOG(LEVEL) << QPMT<float>::Desc() << std::endl
+               << " spmt_f " << (spmt_f ? "YES" : "NO ") << " qpmt " << (qpmt ? "YES" : "NO ");
 
-    LOG_IF(fatal, MISSING_PMT )
-        << " MISSING_PMT "
-        << " has_PMT " << ( has_PMT ? "YES" : "NO " )
-        << " REQUIRE_PMT " << ( REQUIRE_PMT ? "YES" : "NO " )
-        << " MISSING_PMT " << ( MISSING_PMT ? "YES" : "NO " )
-        << " spmt_f " << ( spmt_f ? "YES" : "NO " )
-        << " qpmt " << ( qpmt ? "YES" : "NO " )
-        ;
-
-    assert(MISSING_PMT == false) ;
-    if(MISSING_PMT)  std::raise(SIGINT);
-
-
-
-    LOG(LEVEL)
-        << QPMT<float>::Desc()
-        << std::endl
-        << " spmt_f " << ( spmt_f ? "YES" : "NO " )
-        << " qpmt " << ( qpmt ? "YES" : "NO " )
-        ;
-
-
-
-    const NP* multifilm = ssim->get_extra(snam::MULTIFILM);
-    if(multifilm == nullptr)
+    const NP *multifilm = ssim->get_extra(snam::MULTIFILM);
+    if (multifilm == nullptr)
     {
-        LOG(LEVEL) << " multifilm null, snam::MULTIFILM " << snam::MULTIFILM ;
+        LOG(LEVEL) << " multifilm null, snam::MULTIFILM " << snam::MULTIFILM;
     }
     else
     {
-        QMultiFilm* mul = new QMultiFilm( multifilm );
+        QMultiFilm *mul = new QMultiFilm(multifilm);
         LOG(LEVEL) << mul->desc();
     }
-    LOG(LEVEL) << "] ssim " << ssim ;
-
-
-
+    LOG(LEVEL) << "] ssim " << ssim;
 }
 
-
-
-
-
-
 /**
 QSim:::QSim
 -------------
@@ -261,29 +257,15 @@ singleton components.
 **/
 
 QSim::QSim()
-    :
-    base(QBase::Get()),
-    qev(new QEvt),
-    sev(qev->sev),
-    rng(QRng::Get()),
-    scint(QScint::Get()),
-    cerenkov(QCerenkov::Get()),
-    bnd(QBnd::Get()),
-    debug_(QDebug::Get()),
-    prop(QProp<float>::Get()),
-    pmt(QPMT<float>::Get()),
-    multifilm(QMultiFilm::Get()),
-    sim(nullptr),
-    d_sim(nullptr),
-    dbg(debug_ ? debug_->dbg : nullptr),
-    d_dbg(debug_ ? debug_->d_dbg : nullptr),
-    cx(nullptr)
+    : base(QBase::Get()), qev(new QEvt), sev(qev->sev), rng(QRng::Get()), scint(QScint::Get()), qwls(QWls::Get()),
+      cerenkov(QCerenkov::Get()), bnd(QBnd::Get()), debug_(QDebug::Get()), prop(QProp<float>::Get()),
+      pmt(QPMT<float>::Get()), multifilm(QMultiFilm::Get()), sim(nullptr), d_sim(nullptr),
+      dbg(debug_ ? debug_->dbg : nullptr), d_dbg(debug_ ? debug_->d_dbg : nullptr), cx(nullptr)
 {
-    LOG(LEVEL) << desc() ;
+    LOG(LEVEL) << desc();
     init();
 }
 
-
 /**
 QSim::init
 ------------
@@ -303,51 +285,43 @@ place (qsim.h) to add GPU side functionality.
 
 **/
 
-
 void QSim::init()
 {
-    sim = new qsim ;
-    sim->base = base ? base->d_base : nullptr ;
-    sim->evt = qev ? qev->getDevicePtr() : nullptr ;
-    //sim->rng_state = rng ? rng->qr->uploaded_states : nullptr ;
-    sim->rng = rng ? rng->d_qr : nullptr ;
+    sim = new qsim;
+    sim->base = base ? base->d_base : nullptr;
+    sim->evt = qev ? qev->getDevicePtr() : nullptr;
+    // sim->rng_state = rng ? rng->qr->uploaded_states : nullptr ;
+    sim->rng = rng ? rng->d_qr : nullptr;
+
+    sim->bnd = bnd ? bnd->d_qb : nullptr;
+    sim->multifilm = multifilm ? multifilm->d_multifilm : nullptr;
+    sim->cerenkov = cerenkov ? cerenkov->d_cerenkov : nullptr;
+    sim->scint = scint ? scint->d_scint : nullptr;
+    sim->wls = qwls ? qwls->d_wls : nullptr;
+    sim->pmt = pmt ? pmt->d_pmt : nullptr;
+
+    bool has_PMT = pmt != nullptr && sim->pmt != nullptr;
+    bool REQUIRE_PMT = ssys::getenvbool(_QSim__REQUIRE_PMT);
+    bool MISSING_PMT = REQUIRE_PMT == true && has_PMT == false;
 
-    sim->bnd = bnd ? bnd->d_qb : nullptr ;
-    sim->multifilm = multifilm ? multifilm->d_multifilm : nullptr ;
-    sim->cerenkov = cerenkov ? cerenkov->d_cerenkov : nullptr ;
-    sim->scint = scint ? scint->d_scint : nullptr ;
-    sim->pmt = pmt ? pmt->d_pmt : nullptr ;
+    LOG(LEVEL) << " MISSING_PMT " << (MISSING_PMT ? "YES" : "NO ") << " has_PMT " << (has_PMT ? "YES" : "NO ")
+               << " QSim::pmt " << (pmt ? "YES" : "NO ") << " QSim::pmt->d_pmt " << (sim->pmt ? "YES" : "NO ") << " ["
+               << _QSim__REQUIRE_PMT << "] " << (REQUIRE_PMT ? "YES" : "NO ");
 
+    LOG_IF(fatal, MISSING_PMT) << " MISSING_PMT ABORT " << " MISSING_PMT " << (MISSING_PMT ? "YES" : "NO ")
+                               << " has_PMT " << (has_PMT ? "YES" : "NO ") << " QSim::pmt " << (pmt ? "YES" : "NO ")
+                               << " QSim::pmt->d_pmt " << (sim->pmt ? "YES" : "NO ") << " [" << _QSim__REQUIRE_PMT
+                               << "] " << (REQUIRE_PMT ? "YES" : "NO ");
 
-    bool has_PMT = pmt != nullptr && sim->pmt != nullptr ;
-    bool REQUIRE_PMT = ssys::getenvbool(_QSim__REQUIRE_PMT);
-    bool MISSING_PMT = REQUIRE_PMT == true && has_PMT == false ;
-
-    LOG(LEVEL)
-        << " MISSING_PMT " << ( MISSING_PMT ? "YES" : "NO " )
-        << " has_PMT " << ( has_PMT ? "YES" : "NO " )
-        << " QSim::pmt " << ( pmt ? "YES" : "NO " )
-        << " QSim::pmt->d_pmt " << ( sim->pmt ? "YES" : "NO " )
-        << " [" << _QSim__REQUIRE_PMT << "] " << ( REQUIRE_PMT ? "YES" : "NO " )
-        ;
-
-    LOG_IF(fatal, MISSING_PMT )
-        << " MISSING_PMT ABORT "
-        << " MISSING_PMT " << ( MISSING_PMT ? "YES" : "NO " )
-        << " has_PMT " << ( has_PMT ? "YES" : "NO " )
-        << " QSim::pmt " << ( pmt ? "YES" : "NO " )
-        << " QSim::pmt->d_pmt " << ( sim->pmt ? "YES" : "NO " )
-        << " [" << _QSim__REQUIRE_PMT << "] " << ( REQUIRE_PMT ? "YES" : "NO " )
-        ;
-
-    assert(MISSING_PMT == false) ;
-    if(MISSING_PMT)  std::raise(SIGINT);
-
-    d_sim = QU::UploadArray<qsim>(sim, 1, "QSim::init.sim" );
-
-    INSTANCE = this ;
-    LOG(LEVEL) << desc() ;
-    LOG(LEVEL) << descComponents() ;
+    assert(MISSING_PMT == false);
+    if (MISSING_PMT)
+        std::raise(SIGINT);
+
+    d_sim = QU::UploadArray<qsim>(sim, 1, "QSim::init.sim");
+
+    INSTANCE = this;
+    LOG(LEVEL) << desc();
+    LOG(LEVEL) << descComponents();
 }
 
 /**
@@ -357,12 +331,11 @@ QSim::setLauncher
 Formerly used SCSGOptiX
 
 **/
-void QSim::setLauncher(SSimulator* cx_ )
+void QSim::setLauncher(SSimulator *cx_)
 {
-    cx = cx_ ;
+    cx = cx_;
 }
 
-
 /**
 QSim::post_launch
 --------------------
@@ -378,7 +351,6 @@ void QSim::post_launch()
 }
 **/
 
-
 /**
 QSim::simulate
 ---------------
@@ -427,190 +399,159 @@ bool QSim::KEEP_SUBFOLD = ssys::getenvbool(QSim__simulate_KEEP_SUBFOLD);
 
 double QSim::simulate(int eventID, bool reset_)
 {
-    SProf::SetTag(eventID, "A%0.3d_" ) ;
+    SProf::SetTag(eventID, "A%0.3d_");
 
-    assert( SEventConfig::IsRGModeSimulate() );
+    assert(SEventConfig::IsRGModeSimulate());
 
-    //cudaStream_t stream ;  cudaStreamCreate(&stream);
-    cudaStream_t stream = 0 ;
+    // cudaStream_t stream ;  cudaStreamCreate(&stream);
+    cudaStream_t stream = 0;
 
+    int64_t tot_ph = 0;
 
-    int64_t tot_ph = 0 ;
+    double tot_dt = 0.;
 
-    double tot_dt = 0. ;
-
-    int64_t tot_idt = 0 ;
-    int64_t tot_gdt = 0 ;
+    int64_t tot_idt = 0;
+    int64_t tot_gdt = 0;
 
     int64_t t_HEAD = SProf::Add("QSim__simulate_HEAD");
 
-    LOG_IF(info, SEvt::LIFECYCLE) << "[ eventID " << eventID ;
-    if( qev == nullptr ) return -1. ;
-
+    LOG_IF(info, SEvt::LIFECYCLE) << "[ eventID " << eventID;
+    if (qev == nullptr)
+        return -1.;
 
-    sev->beginOfEvent(eventID);  // set SEvt index and tees up frame gensteps for simtrace and input photon simulate running
+    sev->beginOfEvent(
+        eventID); // set SEvt index and tees up frame gensteps for simtrace and input photon simulate running
 
-    NP* igs = sev->makeGenstepArrayFromVector();
+    NP *igs = sev->makeGenstepArrayFromVector();
 
     MaybeSaveIGS(eventID, igs);
 
-    std::vector<sslice> igs_slice ;
-    int64_t tot_ph_0 = SGenstep::GetGenstepSlices( igs_slice, igs, SEventConfig::MaxSlot() );
+    std::vector<sslice> igs_slice;
+    int64_t tot_ph_0 = SGenstep::GetGenstepSlices(igs_slice, igs, SEventConfig::MaxSlot());
 
-    //bool xxl = tot_ph_0 > SGenstep::MAX_SLOT_PER_SLICE ;
-    bool xxl = tot_ph_0 > 100*M ;
+    // bool xxl = tot_ph_0 > SGenstep::MAX_SLOT_PER_SLICE ;
+    bool xxl = tot_ph_0 > 100 * M;
 
     int num_slice = igs_slice.size();
 
-    LOG(xxl ? info : LEVEL)
-        << " eventID " << std::setw(6) << eventID
-        << " igs " << ( igs ? igs->sstr() : "-" )
-        << " tot_ph_0 " << tot_ph_0
-        << " tot_ph_0/M " << tot_ph_0/M
-        << " xxl " << ( xxl ? "YES" : "NO " )
-        << " MaxSlot " << SEventConfig::MaxSlot()
-        << " MaxSlot/M " << SEventConfig::MaxSlot()/M
-        << " sslice::Desc(igs_slice)\n"
-        << sslice::Desc(igs_slice)
-        << " num_slice " << num_slice
-        ;
-
+    LOG(xxl ? info : LEVEL) << " eventID " << std::setw(6) << eventID << " igs " << (igs ? igs->sstr() : "-")
+                            << " tot_ph_0 " << tot_ph_0 << " tot_ph_0/M " << tot_ph_0 / M << " xxl "
+                            << (xxl ? "YES" : "NO ") << " MaxSlot " << SEventConfig::MaxSlot() << " MaxSlot/M "
+                            << SEventConfig::MaxSlot() / M << " sslice::Desc(igs_slice)\n"
+                            << sslice::Desc(igs_slice) << " num_slice " << num_slice;
 
     int64_t t_LBEG = SProf::Add("QSim__simulate_LBEG");
 
-    for(int i=0 ; i < num_slice ; i++)
+    for (int i = 0; i < num_slice; i++)
     {
         SProf::Add("QSim__simulate_PRUP");
 
-        const sslice& sl = igs_slice[i] ;
+        const sslice &sl = igs_slice[i];
 
-        LOG(LEVEL) << sl.idx_desc(i) ;
+        LOG(LEVEL) << sl.idx_desc(i);
 
-        int rc = qev->setGenstepUpload_NP(igs, &sl ) ;
-        LOG_IF(error, rc != 0) << " QEvt::setGenstep ERROR : have qev but no gensteps collected : will skip cx.simulate " ;
-
-        LOG_IF(info, ALLOC)
-            << " [" << _QSim__ALLOC << "] "
-            << " i " << std::setw(5) << i
-            << " SEventConfig::ALLOC " << ( SEventConfig::ALLOC  ? "YES" : "NO " )
-            << ( SEventConfig::ALLOC ? SEventConfig::ALLOC->desc() : "-" )
-            ;
+        int rc = qev->setGenstepUpload_NP(igs, &sl);
+        LOG_IF(error, rc != 0)
+            << " QEvt::setGenstep ERROR : have qev but no gensteps collected : will skip cx.simulate ";
 
+        LOG_IF(info, ALLOC) << " [" << _QSim__ALLOC << "] " << " i " << std::setw(5) << i << " SEventConfig::ALLOC "
+                            << (SEventConfig::ALLOC ? "YES" : "NO ")
+                            << (SEventConfig::ALLOC ? SEventConfig::ALLOC->desc() : "-");
 
         SProf::Add("QSim__simulate_PREL");
 
-        sev->t_PreLaunch = sstamp::Now() ;
+        sev->t_PreLaunch = sstamp::Now();
 
-        double dt = rc == 0 && cx != nullptr ? cx->simulate_launch() : -1. ;  //SSimulator protocol
+        double dt = rc == 0 && cx != nullptr ? cx->simulate_launch() : -1.; // SSimulator protocol
 
-        sev->t_PostLaunch = sstamp::Now() ;
-        sev->t_Launch = dt ;
+        sev->t_PostLaunch = sstamp::Now();
+        sev->t_Launch = dt;
 
-        tot_idt += ( sev->t_PostLaunch - sev->t_PreLaunch ) ;
-        tot_dt += dt ;
-        tot_ph += sl.ph_count ;
+        tot_idt += (sev->t_PostLaunch - sev->t_PreLaunch);
+        tot_dt += dt;
+        tot_ph += sl.ph_count;
 
-        LOG( xxl ? info : LEVEL )
-            << " eventID " << eventID
-            << " xxl " << ( xxl ? "YES" : "NO " )
-            << " i " << std::setw(4) << i
-            << " dt " << std::setw(11) << std::fixed << std::setprecision(6) << dt
-            << " slice " << sl.idx_desc(i)
-            ;
+        LOG(xxl ? info : LEVEL) << " eventID " << eventID << " xxl " << (xxl ? "YES" : "NO ") << " i " << std::setw(4)
+                                << i << " dt " << std::setw(11) << std::fixed << std::setprecision(6) << dt << " slice "
+                                << sl.idx_desc(i);
 
         int64_t t_POST = SProf::Add("QSim__simulate_POST");
 
-        sev->gather();  // gather into *fold* just added to *topfold*
+        sev->gather(); // gather into *fold* just added to *topfold*
 
         int64_t t_DOWN = SProf::Add("QSim__simulate_DOWN");
 
-        tot_gdt += ( t_DOWN - t_POST ) ;
+        tot_gdt += (t_DOWN - t_POST);
     }
 
-
-    size_t max_slot_M = SEventConfig::MaxSlot()/M;
-    std::string anno = SProf::Annotation("slice",num_slice, "max_slot_M", max_slot_M);
+    size_t max_slot_M = SEventConfig::MaxSlot() / M;
+    std::string anno = SProf::Annotation("slice", num_slice, "max_slot_M", max_slot_M);
     int64_t t_LEND = SProf::Add("QSim__simulate_LEND", anno.c_str());
 
-    std::stringstream ss ;
-    std::ostream* out = CONCAT ? &ss : nullptr ;
+    std::stringstream ss;
+    std::ostream *out = CONCAT ? &ss : nullptr;
     int concat_rc = sev->topfold->concat(out);
 
-    LOG_IF(info, CONCAT) << ss.str() ;
-    LOG_IF(fatal, concat_rc != 0) << " sev->topfold->concat FAILED " ;
+    LOG_IF(info, CONCAT) << ss.str();
+    LOG_IF(fatal, concat_rc != 0) << " sev->topfold->concat FAILED ";
     assert(concat_rc == 0);
 
     bool has_hlm = sev->topfold->has_key(SComp::HITLITEMERGED_);
-    bool has_hm  = sev->topfold->has_key(SComp::HITMERGED_);
-    bool do_final_merge = num_slice > 1 && ( has_hlm || has_hm ) ;
-    LOG(LEVEL)
-         << " num_slice " << num_slice
-         << " has_hm " << ( has_hm ? "YES" : "NO " )
-         << " has_hlm " << ( has_hlm ? "YES" : "NO " )
-         << " do_final_merge " << ( do_final_merge ? "YES" : "NO " )
-         ;
-    if(do_final_merge) simulate_final_merge(tot_ph, stream);
+    bool has_hm = sev->topfold->has_key(SComp::HITMERGED_);
+    bool do_final_merge = num_slice > 1 && (has_hlm || has_hm);
+    LOG(LEVEL) << " num_slice " << num_slice << " has_hm " << (has_hm ? "YES" : "NO ") << " has_hlm "
+               << (has_hlm ? "YES" : "NO ") << " do_final_merge " << (do_final_merge ? "YES" : "NO ");
+    if (do_final_merge)
+        simulate_final_merge(tot_ph, stream);
 
-
-    if(!KEEP_SUBFOLD) sev->topfold->clear_subfold();
+    if (!KEEP_SUBFOLD)
+        sev->topfold->clear_subfold();
 
     int64_t t_PCAT = SProf::Add("QSim__simulate_PCAT");
 
-    int tot_ht = sev->getNumHit() ;  // NB from fold, so requires hits array gathering to be configured to get non-zero
-    std::string counts = sev->getCounts();  // collect counts before reset
-
-    LOG_IF(info, SEvt::MINIMAL)
-        << " eventID " << eventID
-        << " tot_dt " << std::setw(11) << std::fixed << std::setprecision(6) << tot_dt
-        << " tot_ph " << std::setw(10) << tot_ph
-        << " tot_ph/M " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_ph)/float(M)
-        << " tot_ht " << std::setw(10) << tot_ht
-        << " tot_ht/M " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_ht)/float(M)
-        << " tot_ht/tot_ph " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_ht)/float(tot_ph)
-        << " reset_ " << ( reset_ ? "YES" : "NO " )
-        ;
+    int tot_ht = sev->getNumHit(); // NB from fold, so requires hits array gathering to be configured to get non-zero
+    std::string counts = sev->getCounts(); // collect counts before reset
 
+    LOG_IF(info, SEvt::MINIMAL) << " eventID " << eventID << " tot_dt " << std::setw(11) << std::fixed
+                                << std::setprecision(6) << tot_dt << " tot_ph " << std::setw(10) << tot_ph
+                                << " tot_ph/M " << std::setw(10) << std::fixed << std::setprecision(6)
+                                << float(tot_ph) / float(M) << " tot_ht " << std::setw(10) << tot_ht << " tot_ht/M "
+                                << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_ht) / float(M)
+                                << " tot_ht/tot_ph " << std::setw(10) << std::fixed << std::setprecision(6)
+                                << float(tot_ht) / float(tot_ph) << " reset_ " << (reset_ ? "YES" : "NO ");
 
-    assert( tot_ph == tot_ph_0 );
+    assert(tot_ph == tot_ph_0);
 
-    int64_t t_BRES  = SProf::Add("QSim__simulate_BRES", counts.c_str() );
-    if(reset_) reset(eventID) ;
+    int64_t t_BRES = SProf::Add("QSim__simulate_BRES", counts.c_str());
+    if (reset_)
+        reset(eventID);
 
-    int64_t t_TAIL  = SProf::Add("QSim__simulate_TAIL");
+    int64_t t_TAIL = SProf::Add("QSim__simulate_TAIL");
 
     SProf::Write(); // per-event write, so have something in case of crash
 
-    LOG_IF(info, SEvt::MINTIME) << "\n"
-        << SEvt::SEvt__MINTIME
-        << "\n"
-        << " (TAIL - HEAD)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float( t_TAIL - t_HEAD )/M
-        << " (head to tail of QSim::simulate method) "
-        << "\n"
-        << " (LEND - LBEG)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float( t_LEND - t_LBEG )/M
-        << " (multilaunch loop begin to end) "
-        << "\n"
-        << " (PCAT - LEND)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float( t_PCAT - t_LEND )/M
-        << " (topfold concat and clear subfold) "
+    LOG_IF(info, SEvt::MINTIME)
         << "\n"
-        << " (TAIL - BRES)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float( t_TAIL - t_BRES )/M
-        << " (QSim::reset which saves hits) "
-        << "\n"
-        << " tot_idt/M       " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_idt)/M
-        << " (sum of kernel execution int64_t stamp differences in microseconds)"
-        << "\n"
-        << " tot_dt          " << std::setw(10) << std::fixed << std::setprecision(6) << tot_dt
-        << " int(tot_dt*M)   " << std::setw(10) << int64_t(tot_dt*M)
-        << " (sum of kernel execution double chrono stamp differences in seconds, and scaled to ms) "
-        << "\n"
-        << " tot_gdt/M       " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_gdt)/M
-        << " (sum of SEvt::gather int64_t stamp differences in microseconds)"
-        << "\n"
-        ;
-
-    return tot_dt ;
+        << SEvt::SEvt__MINTIME << "\n"
+        << " (TAIL - HEAD)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float(t_TAIL - t_HEAD) / M
+        << " (head to tail of QSim::simulate method) " << "\n"
+        << " (LEND - LBEG)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float(t_LEND - t_LBEG) / M
+        << " (multilaunch loop begin to end) " << "\n"
+        << " (PCAT - LEND)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float(t_PCAT - t_LEND) / M
+        << " (topfold concat and clear subfold) " << "\n"
+        << " (TAIL - BRES)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float(t_TAIL - t_BRES) / M
+        << " (QSim::reset which saves hits) " << "\n"
+        << " tot_idt/M       " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_idt) / M
+        << " (sum of kernel execution int64_t stamp differences in microseconds)" << "\n"
+        << " tot_dt          " << std::setw(10) << std::fixed << std::setprecision(6) << tot_dt << " int(tot_dt*M)   "
+        << std::setw(10) << int64_t(tot_dt * M)
+        << " (sum of kernel execution double chrono stamp differences in seconds, and scaled to ms) " << "\n"
+        << " tot_gdt/M       " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_gdt) / M
+        << " (sum of SEvt::gather int64_t stamp differences in microseconds)" << "\n";
+
+    return tot_dt;
 }
 
-
 /**
 QSim::simulate_final_merge
 ---------------------------
@@ -633,58 +574,49 @@ TODO: use QEvt::FinalMerge_async once that makes sense
 void QSim::simulate_final_merge(int64_t tot_ph, cudaStream_t stream)
 {
     bool has_hlm = sev->topfold->has_key(SComp::HITLITEMERGED_);
-    bool has_hm  = sev->topfold->has_key(SComp::HITMERGED_);
+    bool has_hm = sev->topfold->has_key(SComp::HITMERGED_);
 
-    if( has_hlm )
+    if (has_hlm)
     {
-        const NP* hlm = sev->topfold->get(SComp::HITLITEMERGED_);
-        NP*       fin = QEvt::FinalMerge<sphotonlite>(hlm, stream);
+        const NP *hlm = sev->topfold->get(SComp::HITLITEMERGED_);
+        NP *fin = QEvt::FinalMerge<sphotonlite>(hlm, stream);
 
-        float     hlm_frac = float(hlm->num_items())/float(tot_ph) ;
-        float     fin_frac = float(fin->num_items())/float(hlm->num_items()) ;
+        float hlm_frac = float(hlm->num_items()) / float(tot_ph);
+        float fin_frac = float(fin->num_items()) / float(hlm->num_items());
 
-        std::stringstream ss ;
-        ss
-            << " tot_ph " << tot_ph
-            << " hlm " << ( hlm ? hlm->sstr() : "-" )
-            << " fin " << ( fin ? fin->sstr() : "-" )
-            << " hlm/tot " << std::setw(7) << std::fixed << std::setprecision(4) << hlm_frac
-            << " fin/hlm " << std::setw(7) << std::fixed << std::setprecision(4) << fin_frac
-            ;
+        std::stringstream ss;
+        ss << " tot_ph " << tot_ph << " hlm " << (hlm ? hlm->sstr() : "-") << " fin " << (fin ? fin->sstr() : "-")
+           << " hlm/tot " << std::setw(7) << std::fixed << std::setprecision(4) << hlm_frac << " fin/hlm "
+           << std::setw(7) << std::fixed << std::setprecision(4) << fin_frac;
 
         std::string note = ss.str();
-        fin->set_meta<std::string>("QSim__simulate_final_merge", note );
+        fin->set_meta<std::string>("QSim__simulate_final_merge", note);
 
-        sev->topfold->set(SComp::HITLITEMERGED_, fin );
+        sev->topfold->set(SComp::HITLITEMERGED_, fin);
 
-        LOG(info) << note ;
+        LOG(info) << note;
     }
-    if( has_hm )
+    if (has_hm)
     {
-        const NP* hm = sev->topfold->get(SComp::HITMERGED_);
-        NP*       fi = QEvt::FinalMerge<sphoton>(hm, stream);
+        const NP *hm = sev->topfold->get(SComp::HITMERGED_);
+        NP *fi = QEvt::FinalMerge<sphoton>(hm, stream);
 
-        float     hm_frac = float(hm->num_items())/float(tot_ph) ;
-        float     fi_frac = float(fi->num_items())/float(hm->num_items()) ;
+        float hm_frac = float(hm->num_items()) / float(tot_ph);
+        float fi_frac = float(fi->num_items()) / float(hm->num_items());
 
-        std::stringstream ss ;
-        ss
-            << " tot_ph " << tot_ph
-            << " hm " << ( hm ? hm->sstr() : "-" )
-            << " fi " << ( fi ? fi->sstr() : "-" )
-            << " hm/tot " << std::setw(7) << std::fixed << std::setprecision(4) << hm_frac
-            << " fi/hm "  << std::setw(7) << std::fixed << std::setprecision(4) << fi_frac
-            ;
+        std::stringstream ss;
+        ss << " tot_ph " << tot_ph << " hm " << (hm ? hm->sstr() : "-") << " fi " << (fi ? fi->sstr() : "-")
+           << " hm/tot " << std::setw(7) << std::fixed << std::setprecision(4) << hm_frac << " fi/hm " << std::setw(7)
+           << std::fixed << std::setprecision(4) << fi_frac;
 
         std::string note = ss.str();
-        fi->set_meta<std::string>("QSim__simulate_final_merge", note );
+        fi->set_meta<std::string>("QSim__simulate_final_merge", note);
 
-        sev->topfold->set(SComp::HITMERGED_, fi );
-        LOG(info) << note ;
+        sev->topfold->set(SComp::HITMERGED_, fi);
+        LOG(info) << note;
     }
 }
 
-
 /**
 QSim::simulate
 ----------------
@@ -707,36 +639,30 @@ Thus is used from language crossing stack::
 
 **/
 
-
-NP* QSim::simulate(const NP* gs, int eventID )
+NP *QSim::simulate(const NP *gs, int eventID)
 {
     bool eventID_expected = eventID > -1;
-    if(!eventID_expected) std::cerr << "QSim::simulate gs lacks needed eventID metadata [" << eventID << "]\n" ;
+    if (!eventID_expected)
+        std::cerr << "QSim::simulate gs lacks needed eventID metadata [" << eventID << "]\n";
     assert(eventID_expected);
 
-    assert( sev == SEvt::Get_EGPU() );
+    assert(sev == SEvt::Get_EGPU());
     sev->addGenstep(gs);
 
-    bool reset_ = false ;
+    bool reset_ = false;
     double tot_dt = simulate(eventID, reset_);
 
-    const NP* _ht = sev->getHit();
-    NP* ht = _ht ? _ht->copy() : nullptr ;  // copy global hits from SEvt before reset
+    const NP *_ht = sev->getHit();
+    NP *ht = _ht ? _ht->copy() : nullptr; // copy global hits from SEvt before reset
     ht->set_meta<double>("QSim__simulate_tot_dt", tot_dt);
 
-    LOG(info)
-        << " eventID " << std::setw(6) << eventID
-        << " gs " << ( gs ? gs->sstr() : "-" )
-        << " ht " << ( ht ? ht->sstr() : "-" )
-        << " tot_dt " << std::fixed << std::setw(10) << std::setprecision(6) << tot_dt
-        ;
+    LOG(info) << " eventID " << std::setw(6) << eventID << " gs " << (gs ? gs->sstr() : "-") << " ht "
+              << (ht ? ht->sstr() : "-") << " tot_dt " << std::fixed << std::setw(10) << std::setprecision(6) << tot_dt;
     reset(eventID);
 
-    return ht ;
+    return ht;
 }
 
-
-
 /**
 QSim::MaybeSaveIGS
 --------------------
@@ -765,27 +691,21 @@ Try manually reducing slots to see if memory limits are the cause::
 
 **/
 
-void QSim::MaybeSaveIGS(int eventID, NP* igs) // static
+void QSim::MaybeSaveIGS(int eventID, NP *igs) // static
 {
-    bool igs_null = igs == nullptr ;
-    const char* igs_path = SAVE_IGS_PATH ? spath::Resolve(SAVE_IGS_PATH) : nullptr ;
-    bool save_igs = igs && SAVE_IGS_EVENTID == eventID && igs_path ;
-    LOG(LEVEL)
-        << " eventID " << eventID
-        << " igs " << ( igs ? igs->sstr() : "-" )
-        << " igs_null " << ( igs_null ? "YES" : "NO " )
-        << " [" << _QSim__SAVE_IGS_EVENTID << "] " <<  SAVE_IGS_EVENTID
-        << " [" << _QSim__SAVE_IGS_PATH    << "] " << ( SAVE_IGS_PATH ? SAVE_IGS_PATH : "-" )
-        << " igs_path [" << ( igs_path ? igs_path : "-" ) << "]"
-        << " save_igs " << ( save_igs ? "YES" : "NO " )
-        ;
-
-    if(!save_igs) return ;
+    bool igs_null = igs == nullptr;
+    const char *igs_path = SAVE_IGS_PATH ? spath::Resolve(SAVE_IGS_PATH) : nullptr;
+    bool save_igs = igs && SAVE_IGS_EVENTID == eventID && igs_path;
+    LOG(LEVEL) << " eventID " << eventID << " igs " << (igs ? igs->sstr() : "-") << " igs_null "
+               << (igs_null ? "YES" : "NO ") << " [" << _QSim__SAVE_IGS_EVENTID << "] " << SAVE_IGS_EVENTID << " ["
+               << _QSim__SAVE_IGS_PATH << "] " << (SAVE_IGS_PATH ? SAVE_IGS_PATH : "-") << " igs_path ["
+               << (igs_path ? igs_path : "-") << "]" << " save_igs " << (save_igs ? "YES" : "NO ");
+
+    if (!save_igs)
+        return;
     igs->save(igs_path);
 }
 
-
-
 /**
 QSim::getPhotonSlotOffset
 ---------------------------
@@ -810,14 +730,11 @@ or equal the number of states uploaded.
 
 **/
 
-
-
 unsigned long long QSim::get_photon_slot_offset() const
 {
-    return qev->get_photon_slot_offset() ;
+    return qev->get_photon_slot_offset();
 }
 
-
 /**
 QSim::reset
 ------------
@@ -838,12 +755,10 @@ void QSim::reset(int eventID)
     SProf::Add("QSim__reset_HEAD");
     qev->clear();
     sev->endOfEvent(eventID);
-    LOG_IF(info, SEvt::LIFECYCLE) << "] eventID " << eventID ;
+    LOG_IF(info, SEvt::LIFECYCLE) << "] eventID " << eventID;
     SProf::Add("QSim__reset_TAIL");
 }
 
-
-
 /**
 QSim::simtrace
 ---------------
@@ -853,30 +768,26 @@ Collected genstep are uploaded and the CSGOptiX kernel is launched to generate a
 
 **/
 
-
 double QSim::simtrace(int eventID)
 {
-    assert( SEventConfig::IsRGModeSimtrace() );
-
+    assert(SEventConfig::IsRGModeSimtrace());
 
     sev->beginOfEvent(eventID);
 
-    NP* igs = sev->makeGenstepArrayFromVector();
+    NP *igs = sev->makeGenstepArrayFromVector();
 
-    LOG_IF(fatal, igs==nullptr)
-         << " igs NULL "
-         << " sev.descGenstepArrayFromVector " << sev->descGenstepArrayFromVector()
-         ;
+    LOG_IF(fatal, igs == nullptr) << " igs NULL " << " sev.descGenstepArrayFromVector "
+                                  << sev->descGenstepArrayFromVector();
 
     assert(igs);
-    int rc = qev->setGenstepUpload_NP(igs) ;
+    int rc = qev->setGenstepUpload_NP(igs);
 
-    LOG_IF(error, rc != 0) << " QEvt::setGenstep ERROR : no gensteps collected : will skip cx.simtrace " ;
+    LOG_IF(error, rc != 0) << " QEvt::setGenstep ERROR : no gensteps collected : will skip cx.simtrace ";
 
-    sev->t_PreLaunch = sstamp::Now() ;
-    double dt = rc == 0 && cx != nullptr ? cx->simtrace_launch() : -1. ;
-    sev->t_PostLaunch = sstamp::Now() ;
-    sev->t_Launch = dt ;
+    sev->t_PreLaunch = sstamp::Now();
+    double dt = rc == 0 && cx != nullptr ? cx->simtrace_launch() : -1.;
+    sev->t_PostLaunch = sstamp::Now();
+    sev->t_Launch = dt;
 
     // see ~/o/notes/issues/cxt_min_simtrace_revival.rst
     sev->gather();
@@ -886,94 +797,80 @@ double QSim::simtrace(int eventID)
 
     sev->endOfEvent(eventID);
 
-    return dt ;
+    return dt;
 }
 
-
-qsim* QSim::getDevicePtr() const
+qsim *QSim::getDevicePtr() const
 {
-    return d_sim ;
+    return d_sim;
 }
 
-
 char QSim::getScintTexFilterMode() const
 {
-    return scint->tex->getFilterMode() ;
+    return scint->tex->getFilterMode();
 }
 
 std::string QSim::desc() const
 {
-    std::stringstream ss ;
-    ss << "QSim::desc"
-       << std::endl
-       << " this 0x"            << std::hex << std::uint64_t(this)     << std::dec
-       << " INSTANCE 0x"        << std::hex << std::uint64_t(INSTANCE) << std::dec
-       << " QEvt.hh:qev 0x" << std::hex << std::uint64_t(qev)    << std::dec
-       << " qsim.h:sim 0x"      << std::hex << std::uint64_t(sim)      << std::dec
-       ;
+    std::stringstream ss;
+    ss << "QSim::desc" << std::endl
+       << " this 0x" << std::hex << std::uint64_t(this) << std::dec << " INSTANCE 0x" << std::hex
+       << std::uint64_t(INSTANCE) << std::dec << " QEvt.hh:qev 0x" << std::hex << std::uint64_t(qev) << std::dec
+       << " qsim.h:sim 0x" << std::hex << std::uint64_t(sim) << std::dec;
     std::string s = ss.str();
-    return s ;
+    return s;
 }
 
 std::string QSim::descFull() const
 {
-    std::stringstream ss ;
-    ss
-       << std::endl
-       << "QSim::descFull"
-       << std::endl
-       << " this 0x"            << std::hex << std::uint64_t(this)     << std::dec
-       << " INSTANCE 0x"        << std::hex << std::uint64_t(INSTANCE) << std::dec
-       << " QEvt.hh:qev 0x" << std::hex << std::uint64_t(qev)    << std::dec
-       << " qsim.h:sim 0x"      << std::hex << std::uint64_t(sim)      << std::dec
-       << " qsim.h:d_sim 0x"    << std::hex << std::uint64_t(d_sim)    << std::dec
-       //<< " sim->rng_state 0x"   << std::hex << std::uint64_t(sim->rng_state) << std::dec  // tending to SEGV on some systems
-       << " sim->base 0x"       << std::hex << std::uint64_t(sim->base)  << std::dec
-       << " sim->bnd 0x"        << std::hex << std::uint64_t(sim->bnd)   << std::dec
-       << " sim->scint 0x"      << std::hex << std::uint64_t(sim->scint) << std::dec
-       << " sim->cerenkov 0x"   << std::hex << std::uint64_t(sim->cerenkov) << std::dec
-       ;
+    std::stringstream ss;
+    ss << std::endl
+       << "QSim::descFull" << std::endl
+       << " this 0x" << std::hex << std::uint64_t(this) << std::dec << " INSTANCE 0x" << std::hex
+       << std::uint64_t(INSTANCE) << std::dec << " QEvt.hh:qev 0x" << std::hex << std::uint64_t(qev) << std::dec
+       << " qsim.h:sim 0x" << std::hex << std::uint64_t(sim) << std::dec << " qsim.h:d_sim 0x" << std::hex
+       << std::uint64_t(d_sim)
+       << std::dec
+       //<< " sim->rng_state 0x"   << std::hex << std::uint64_t(sim->rng_state) << std::dec  // tending to SEGV on some
+       // systems
+       << " sim->base 0x" << std::hex << std::uint64_t(sim->base) << std::dec << " sim->bnd 0x" << std::hex
+       << std::uint64_t(sim->bnd) << std::dec << " sim->scint 0x" << std::hex << std::uint64_t(sim->scint) << std::dec
+       << " sim->cerenkov 0x" << std::hex << std::uint64_t(sim->cerenkov) << std::dec;
     std::string s = ss.str();
-    return s ;
+    return s;
 }
 
 std::string QSim::descComponents() const
 {
-    std::stringstream ss ;
+    std::stringstream ss;
     ss << std::endl
-       << "QSim::descComponents"
-       << std::endl
-       << " (QBase)base             " << ( base      ? "YES" : "NO " )  << std::endl
-       << " (QEvt)qev           " << ( qev     ? "YES" : "NO " )  << std::endl
-       << " (SEvt)sev               " << ( sev       ? "YES" : "NO " )  << std::endl
-       << " (QRng)rng               " << ( rng       ? "YES" : "NO " )  << std::endl
-       << " (QScint)scint           " << ( scint     ? "YES" : "NO " )  << std::endl
-       << " (QCerenkov)cerenkov     " << ( cerenkov  ? "YES" : "NO " )  << std::endl
-       << " (QBnd)bnd               " << ( bnd       ? "YES" : "NO " )  << std::endl
-       << " (QOptical)optical       " << ( optical   ? "YES" : "NO " )  << std::endl
-       << " (QDebug)debug_          " << ( debug_    ? "YES" : "NO " )  << std::endl
-       << " (QProp)prop             " << ( prop      ? "YES" : "NO " )  << std::endl
-       << " (QPMT)pmt               " << ( pmt       ? "YES" : "NO " )  << std::endl
-       << " (QMultiFilm)multifilm   " << ( multifilm ? "YES" : "NO " )  << std::endl
-       << " (qsim)sim               " << ( sim       ? "YES" : "NO " )  << std::endl
-       << " (qsim)d_sim             " << ( d_sim     ? "YES" : "NO " )  << std::endl
-       << " (qdebug)dbg             " << ( dbg       ? "YES" : "NO " )  << std::endl
-       << " (qdebug)d_dbg           " << ( d_dbg     ? "YES" : "NO " )  << std::endl
-       ;
+       << "QSim::descComponents" << std::endl
+       << " (QBase)base             " << (base ? "YES" : "NO ") << std::endl
+       << " (QEvt)qev           " << (qev ? "YES" : "NO ") << std::endl
+       << " (SEvt)sev               " << (sev ? "YES" : "NO ") << std::endl
+       << " (QRng)rng               " << (rng ? "YES" : "NO ") << std::endl
+       << " (QScint)scint           " << (scint ? "YES" : "NO ") << std::endl
+       << " (QCerenkov)cerenkov     " << (cerenkov ? "YES" : "NO ") << std::endl
+       << " (QBnd)bnd               " << (bnd ? "YES" : "NO ") << std::endl
+       << " (QOptical)optical       " << (optical ? "YES" : "NO ") << std::endl
+       << " (QDebug)debug_          " << (debug_ ? "YES" : "NO ") << std::endl
+       << " (QProp)prop             " << (prop ? "YES" : "NO ") << std::endl
+       << " (QPMT)pmt               " << (pmt ? "YES" : "NO ") << std::endl
+       << " (QMultiFilm)multifilm   " << (multifilm ? "YES" : "NO ") << std::endl
+       << " (qsim)sim               " << (sim ? "YES" : "NO ") << std::endl
+       << " (qsim)d_sim             " << (d_sim ? "YES" : "NO ") << std::endl
+       << " (qdebug)dbg             " << (dbg ? "YES" : "NO ") << std::endl
+       << " (qdebug)d_dbg           " << (d_dbg ? "YES" : "NO ") << std::endl;
     std::string s = ss.str();
-    return s ;
+    return s;
 }
 
-
-
-
-
-void QSim::configureLaunch(unsigned width, unsigned height )
+void QSim::configureLaunch(unsigned width, unsigned height)
 {
     QU::ConfigureLaunch(numBlocks, threadsPerBlock, width, height);
 }
 
-void QSim::configureLaunch2D(unsigned width, unsigned height )
+void QSim::configureLaunch2D(unsigned width, unsigned height)
 {
     QU::ConfigureLaunch2D(numBlocks, threadsPerBlock, width, height);
 }
@@ -988,20 +885,11 @@ void QSim::configureLaunch1D(unsigned num, unsigned threads_per_block)
     QU::ConfigureLaunch1D(numBlocks, threadsPerBlock, num, threads_per_block);
 }
 
-
 std::string QSim::descLaunch() const
 {
     return QU::DescLaunch(numBlocks, threadsPerBlock);
 }
 
-
-
-
-
-
-
-
-
 /**
 QSim::rng_sequence mass production with multiple launches...
 --------------------------------------------------------------
@@ -1009,9 +897,8 @@ QSim::rng_sequence mass production with multiple launches...
 The output files are split too::
 
     epsilon:opticks blyth$ np.py *.npy
-    a :                                            TRngBufTest_0.npy :      (10000, 16, 16) : 8f9b27c9416a0121574730baa742b5c9 : 20210715-1227
-    epsilon:opticks blyth$ du -h TRngBufTest_0.npy
-     20M	TRngBufTest_0.npy
+    a :                                            TRngBufTest_0.npy :      (10000, 16, 16) :
+8f9b27c9416a0121574730baa742b5c9 : 20210715-1227 epsilon:opticks blyth$ du -h TRngBufTest_0.npy 20M	TRngBufTest_0.npy
 
     In [6]: (16*16*4*2*10000)/1e6
     Out[6]: 20.48
@@ -1024,10 +911,9 @@ Upping to 1M would be 100x 20M = 2000M  2GB
 
 **/
 
-
 template <typename T>
-extern void QSim_rng_sequence(  dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, T* seq, unsigned ni, unsigned nj, unsigned id_offset );
-
+extern void QSim_rng_sequence(dim3 numBlocks, dim3 threadsPerBlock, qsim *d_sim, T *seq, unsigned ni, unsigned nj,
+                              unsigned id_offset);
 
 /**
 QSim::rng_sequence generate randoms in single CUDA launch
@@ -1047,25 +933,22 @@ skipahead : used curand skipahead offsets depending on sim->evt->index and OPTIC
 
 **/
 
-template <typename T>
-void QSim::rng_sequence( T* seq, unsigned ni_tranche, unsigned nv, unsigned id_offset )
+template <typename T> void QSim::rng_sequence(T *seq, unsigned ni_tranche, unsigned nv, unsigned id_offset)
 {
-    configureLaunch(ni_tranche, 1 );
+    configureLaunch(ni_tranche, 1);
 
-    unsigned num_rng = ni_tranche*nv ;
+    unsigned num_rng = ni_tranche * nv;
 
-    const char* label = "QSim::rng_sequence:num_rng" ;
+    const char *label = "QSim::rng_sequence:num_rng";
 
-    T* d_seq = QU::device_alloc<T>(num_rng, label );
+    T *d_seq = QU::device_alloc<T>(num_rng, label);
 
-    QSim_rng_sequence<T>( numBlocks, threadsPerBlock, d_sim, d_seq, ni_tranche, nv, id_offset );
+    QSim_rng_sequence<T>(numBlocks, threadsPerBlock, d_sim, d_seq, ni_tranche, nv, id_offset);
 
-    QU::copy_device_to_host_and_free<T>( seq, d_seq, num_rng, label );
+    QU::copy_device_to_host_and_free<T>(seq, d_seq, num_rng, label);
 }
 
-
-
-const char* QSim::PREFIX = "rng_sequence" ;
+const char *QSim::PREFIX = "rng_sequence";
 
 /**
 QSim::rng_sequence
@@ -1085,77 +968,47 @@ Default *dir* is $TMP/QSimTest/rng_sequence leading to npy paths like::
 **/
 
 template <typename T>
-void QSim::rng_sequence( const char* dir, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size )
+void QSim::rng_sequence(const char *dir, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size)
 {
-    assert( ni >= ni_tranche_size && ni % ni_tranche_size == 0 );   // total size *ni* must be integral multiple of *ni_tranche_size*
-    unsigned num_tranche = ni/ni_tranche_size ;
-    unsigned nv = nj*nk ;
-
-    unsigned size = ni_tranche_size*nv ;   // number of randoms to be generated in each launch
-    std::string reldir = QU::rng_sequence_reldir<T>(PREFIX, ni, nj, nk, ni_tranche_size  ) ;
-
-    LOG(info)
-        << " ni " << ni
-        << " ni_tranche_size " << ni_tranche_size
-        << " num_tranche " << num_tranche
-        << " reldir " << reldir.c_str()
-        << " nj " << nj
-        << " nk " << nk
-        << " nv(nj*nk) " << nv
-        << " size(ni_tranche_size*nv) " << size
-        << " typecode " << QU::typecode<T>()
-        ;
+    assert(ni >= ni_tranche_size &&
+           ni % ni_tranche_size == 0); // total size *ni* must be integral multiple of *ni_tranche_size*
+    unsigned num_tranche = ni / ni_tranche_size;
+    unsigned nv = nj * nk;
+
+    unsigned size = ni_tranche_size * nv; // number of randoms to be generated in each launch
+    std::string reldir = QU::rng_sequence_reldir<T>(PREFIX, ni, nj, nk, ni_tranche_size);
 
+    LOG(info) << " ni " << ni << " ni_tranche_size " << ni_tranche_size << " num_tranche " << num_tranche << " reldir "
+              << reldir.c_str() << " nj " << nj << " nk " << nk << " nv(nj*nk) " << nv << " size(ni_tranche_size*nv) "
+              << size << " typecode " << QU::typecode<T>();
 
     // NB seq array memory gets reused for each launch and saved to different paths
-    NP* seq = NP::Make<T>(ni_tranche_size, nj, nk) ;
-    T* seq_values = seq->values<T>();
+    NP *seq = NP::Make<T>(ni_tranche_size, nj, nk);
+    T *seq_values = seq->values<T>();
     NP::INT seq_nv = seq->num_values();
 
+    LOG(info) << " seq " << (seq ? seq->sstr() : "-") << " seq_values " << seq_values << " seq_nv " << seq_nv
+              << " seq_values[0] " << seq_values[0] << " seq_values[seq_nv-1] " << seq_values[seq_nv - 1];
 
-    LOG(info)
-        << " seq " << ( seq ? seq->sstr() : "-" )
-        << " seq_values " << seq_values
-        << " seq_nv " << seq_nv
-        << " seq_values[0] " << seq_values[0]
-        << " seq_values[seq_nv-1] " << seq_values[seq_nv-1]
-        ;
-
-
-
-    for(unsigned t=0 ; t < num_tranche ; t++)
+    for (unsigned t = 0; t < num_tranche; t++)
     {
         // *id_offset* controls which rng_state/RNG to use
-        unsigned id_offset = ni_tranche_size*t ;
-        std::string name = QU::rng_sequence_name<T>(PREFIX, ni_tranche_size, nj, nk, id_offset ) ;
+        unsigned id_offset = ni_tranche_size * t;
+        std::string name = QU::rng_sequence_name<T>(PREFIX, ni_tranche_size, nj, nk, id_offset);
 
-        std::cout
-            << std::setw(3) << t
-            << std::setw(10) << id_offset
-            << std::setw(100) << name.c_str()
-            << std::endl
-            ;
+        std::cout << std::setw(3) << t << std::setw(10) << id_offset << std::setw(100) << name.c_str() << std::endl;
 
-        rng_sequence( seq_values, ni_tranche_size, nv, id_offset );
+        rng_sequence(seq_values, ni_tranche_size, nv, id_offset);
 
-        const char* path = spath::Resolve(dir, reldir.c_str(), name.c_str() );
+        const char *path = spath::Resolve(dir, reldir.c_str(), name.c_str());
         seq->save(path);
     }
 }
 
-
-
-template void QSim::rng_sequence<float>(  const char* dir, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size );
-template void QSim::rng_sequence<double>( const char* dir, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size );
-
-
-
-
-
-
-
-
-
+template void QSim::rng_sequence<float>(const char *dir, unsigned ni, unsigned nj, unsigned nk,
+                                        unsigned ni_tranche_size);
+template void QSim::rng_sequence<double>(const char *dir, unsigned ni, unsigned nj, unsigned nk,
+                                         unsigned ni_tranche_size);
 
 /**
 QSim::scint_wavelength
@@ -1167,95 +1020,87 @@ the typical values of 10 or 20 which depend on the buffer creation.
 
 **/
 
-extern void QSim_scint_wavelength(   dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, float* wavelength, unsigned num_wavelength );
+extern void QSim_scint_wavelength(dim3 numBlocks, dim3 threadsPerBlock, qsim *d_sim, float *wavelength,
+                                  unsigned num_wavelength);
 
-NP* QSim::scint_wavelength(unsigned num_wavelength, unsigned& hd_factor )
+NP *QSim::scint_wavelength(unsigned num_wavelength, unsigned &hd_factor)
 {
 
     bool qsim_disable_hd = ssys::getenvbool("QSIM_DISABLE_HD");
-    hd_factor = qsim_disable_hd ? 0u : scint->tex->getHDFactor() ;
+    hd_factor = qsim_disable_hd ? 0u : scint->tex->getHDFactor();
     // HMM: perhaps get this from sim rather than occupying an argument slot
-    LOG(LEVEL) << "[" << " qsim_disable_hd " << qsim_disable_hd << " hd_factor " << hd_factor ;
+    LOG(LEVEL) << "[" << " qsim_disable_hd " << qsim_disable_hd << " hd_factor " << hd_factor;
 
-    configureLaunch(num_wavelength, 1 );
+    configureLaunch(num_wavelength, 1);
 
-    float* d_wavelength = QU::device_alloc<float>(num_wavelength, "QSim::scint_wavelength/num_wavelength");
+    float *d_wavelength = QU::device_alloc<float>(num_wavelength, "QSim::scint_wavelength/num_wavelength");
 
-    QSim_scint_wavelength(numBlocks, threadsPerBlock, d_sim, d_wavelength, num_wavelength );
+    QSim_scint_wavelength(numBlocks, threadsPerBlock, d_sim, d_wavelength, num_wavelength);
 
-    NP* w = NP::Make<float>(num_wavelength) ;
+    NP *w = NP::Make<float>(num_wavelength);
 
-    QU::copy_device_to_host_and_free<float>( (float*)w->bytes(), d_wavelength, num_wavelength, "QSim::scint_wavelength" );
+    QU::copy_device_to_host_and_free<float>((float *)w->bytes(), d_wavelength, num_wavelength,
+                                            "QSim::scint_wavelength");
 
-    LOG(LEVEL) << "]" ;
+    LOG(LEVEL) << "]";
 
-    return w ;
+    return w;
 }
 
+extern void QSim_RandGaussQ_shoot(dim3 numBlocks, dim3 threadsPerBlock, qsim *d_sim, float *v, unsigned num_v);
 
-extern void QSim_RandGaussQ_shoot(  dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, float* v, unsigned num_v );
-
-NP* QSim::RandGaussQ_shoot(unsigned num_v )
+NP *QSim::RandGaussQ_shoot(unsigned num_v)
 {
-    const char* label = "QSim::RandGaussQ_shoot/num" ;
-    configureLaunch(num_v, 1 );
+    const char *label = "QSim::RandGaussQ_shoot/num";
+    configureLaunch(num_v, 1);
     std::cout << label << " " << num_v << std::endl;
 
-    float* d_v = QU::device_alloc<float>(num_v, label );
+    float *d_v = QU::device_alloc<float>(num_v, label);
 
-    QSim_RandGaussQ_shoot(numBlocks, threadsPerBlock, d_sim, d_v, num_v );
+    QSim_RandGaussQ_shoot(numBlocks, threadsPerBlock, d_sim, d_v, num_v);
 
     cudaDeviceSynchronize();
 
-    NP* v = NP::Make<float>(num_v) ;
-    QU::copy_device_to_host_and_free<float>( (float*)v->bytes(), d_v, num_v, label );
+    NP *v = NP::Make<float>(num_v);
+    QU::copy_device_to_host_and_free<float>((float *)v->bytes(), d_v, num_v, label);
 
-    return v ;
+    return v;
 }
 
-
-
-
-void QSim::dump_wavelength( float* wavelength, unsigned num_wavelength, unsigned edgeitems )
+void QSim::dump_wavelength(float *wavelength, unsigned num_wavelength, unsigned edgeitems)
 {
     LOG(LEVEL);
-    for(unsigned i=0 ; i < num_wavelength ; i++)
+    for (unsigned i = 0; i < num_wavelength; i++)
     {
-        if( i < edgeitems || i > num_wavelength - edgeitems)
+        if (i < edgeitems || i > num_wavelength - edgeitems)
         {
-            std::cout
-                << std::setw(10) << i
-                << std::setw(10) << std::fixed << std::setprecision(3) << wavelength[i]
-                << std::endl
-                ;
+            std::cout << std::setw(10) << i << std::setw(10) << std::fixed << std::setprecision(3) << wavelength[i]
+                      << std::endl;
         }
     }
 }
 
+extern void QSim_dbg_gs_generate(dim3 numBlocks, dim3 threadsPerBlock, qsim *sim, qdebug *dbg, sphoton *photon,
+                                 unsigned num_photon, unsigned type);
 
-extern void QSim_dbg_gs_generate(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, qdebug* dbg, sphoton* photon, unsigned num_photon, unsigned type ) ;
-
-
-NP* QSim::dbg_gs_generate(unsigned num_photon, unsigned type )
+NP *QSim::dbg_gs_generate(unsigned num_photon, unsigned type)
 {
-    assert( type == SCINT_GENERATE || type == CERENKOV_GENERATE );
+    assert(type == SCINT_GENERATE || type == CERENKOV_GENERATE);
 
-    configureLaunch( num_photon, 1 );
-    sphoton* d_photon = QU::device_alloc<sphoton>(num_photon, "QSim::dbg_gs_generate:num_photon") ;
+    configureLaunch(num_photon, 1);
+    sphoton *d_photon = QU::device_alloc<sphoton>(num_photon, "QSim::dbg_gs_generate:num_photon");
     QU::device_memset<sphoton>(d_photon, 0, num_photon);
 
-    QSim_dbg_gs_generate(numBlocks, threadsPerBlock, d_sim, d_dbg, d_photon, num_photon, type );
+    QSim_dbg_gs_generate(numBlocks, threadsPerBlock, d_sim, d_dbg, d_photon, num_photon, type);
 
-    NP* p = NP::Make<float>(num_photon, 4, 4);
-    const char* label = "QSim::dbg_gs_generate" ;
+    NP *p = NP::Make<float>(num_photon, 4, 4);
+    const char *label = "QSim::dbg_gs_generate";
 
-    QU::copy_device_to_host_and_free<sphoton>( (sphoton*)p->bytes(), d_photon, num_photon, label );
-    return p ;
+    QU::copy_device_to_host_and_free<sphoton>((sphoton *)p->bytes(), d_photon, num_photon, label);
+    return p;
 }
 
-
-
-extern void QSim_generate_photon(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim )  ;
+extern void QSim_generate_photon(dim3 numBlocks, dim3 threadsPerBlock, qsim *sim);
 
 /**
 QSim::generate_photon
@@ -1263,92 +1108,71 @@ QSim::generate_photon
 
 **/
 
-
 void QSim::generate_photon()
 {
-    LOG(LEVEL) << "[" ;
+    LOG(LEVEL) << "[";
 
-    unsigned num_photon = qev->getNumPhoton() ;
-    LOG(info) << " num_photon " << num_photon ;
+    unsigned num_photon = qev->getNumPhoton();
+    LOG(info) << " num_photon " << num_photon;
 
-    LOG_IF(fatal, num_photon == 0 )
-        << " num_photon zero : MUST QEvt::setGenstep before QSim::generate_photon "
-        ;
+    LOG_IF(fatal, num_photon == 0) << " num_photon zero : MUST QEvt::setGenstep before QSim::generate_photon ";
 
-    assert( num_photon > 0 );
-    assert( d_sim );
+    assert(num_photon > 0);
+    assert(d_sim);
 
-    configureLaunch( num_photon, 1 );
+    configureLaunch(num_photon, 1);
 
-    LOG(info) << "QSim_generate_photon... " ;
+    LOG(info) << "QSim_generate_photon... ";
 
-    QSim_generate_photon(numBlocks, threadsPerBlock, d_sim );
+    QSim_generate_photon(numBlocks, threadsPerBlock, d_sim);
 
-    LOG(LEVEL) << "]" ;
+    LOG(LEVEL) << "]";
 }
 
+extern void QSim_fill_state_0(dim3 numBlocks, dim3 threadsPerBlock, qsim *sim, quad6 *state, unsigned num_state,
+                              qdebug *dbg);
 
-
-
-
-
-extern void QSim_fill_state_0(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, quad6* state, unsigned num_state, qdebug* dbg );
-
-void QSim::fill_state_0(quad6* state, unsigned num_state)
+void QSim::fill_state_0(quad6 *state, unsigned num_state)
 {
-    assert( d_sim );
-    assert( d_dbg );
-
-    quad6* d_state = QU::device_alloc<quad6>(num_state, "QSim::fill_state_0:num_state") ;
+    assert(d_sim);
+    assert(d_dbg);
 
+    quad6 *d_state = QU::device_alloc<quad6>(num_state, "QSim::fill_state_0:num_state");
 
-    unsigned threads_per_block = 32 ;
-    configureLaunch1D( num_state, threads_per_block );
+    unsigned threads_per_block = 32;
+    configureLaunch1D(num_state, threads_per_block);
 
-    LOG(info)
-         << " num_state " << num_state
-         << " threads_per_block  " << threads_per_block
-         << " descLaunch " << descLaunch()
-         ;
+    LOG(info) << " num_state " << num_state << " threads_per_block  " << threads_per_block << " descLaunch "
+              << descLaunch();
 
-    QSim_fill_state_0(numBlocks, threadsPerBlock, d_sim, d_state, num_state, d_dbg  );
+    QSim_fill_state_0(numBlocks, threadsPerBlock, d_sim, d_state, num_state, d_dbg);
 
-    const char* label = "QSim::fill_state_0" ;
-    QU::copy_device_to_host_and_free<quad6>( state, d_state, num_state, label );
+    const char *label = "QSim::fill_state_0";
+    QU::copy_device_to_host_and_free<quad6>(state, d_state, num_state, label);
 }
 
+extern void QSim_fill_state_1(dim3 numBlocks, dim3 threadsPerBlock, qsim *sim, sstate *state, unsigned num_state,
+                              qdebug *dbg);
 
-extern void QSim_fill_state_1(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, sstate* state, unsigned num_state, qdebug* dbg );
-
-void QSim::fill_state_1(sstate* state, unsigned num_state)
+void QSim::fill_state_1(sstate *state, unsigned num_state)
 {
-    assert( d_sim );
-    assert( d_dbg );
+    assert(d_sim);
+    assert(d_dbg);
 
-    sstate* d_state = QU::device_alloc<sstate>(num_state, "QSim::fill_state_1:num_state") ;
+    sstate *d_state = QU::device_alloc<sstate>(num_state, "QSim::fill_state_1:num_state");
 
-    unsigned threads_per_block = 64 ;
-    configureLaunch1D( num_state, threads_per_block );
+    unsigned threads_per_block = 64;
+    configureLaunch1D(num_state, threads_per_block);
 
-    LOG(info)
-         << " num_state " << num_state
-         << " threads_per_block  " << threads_per_block
-         << " descLaunch " << descLaunch()
-         ;
+    LOG(info) << " num_state " << num_state << " threads_per_block  " << threads_per_block << " descLaunch "
+              << descLaunch();
 
-    QSim_fill_state_1(numBlocks, threadsPerBlock, d_sim, d_state, num_state, d_dbg );
+    QSim_fill_state_1(numBlocks, threadsPerBlock, d_sim, d_state, num_state, d_dbg);
 
-    const char* label = "QSim::fill_state_1" ;
-    QU::copy_device_to_host_and_free<sstate>( state, d_state, num_state, label );
+    const char *label = "QSim::fill_state_1";
+    QU::copy_device_to_host_and_free<sstate>(state, d_state, num_state, label);
 }
 
-
-
-
-
-
-
-
 /**
 extern QSim_quad_launch
 --------------------------
@@ -1357,43 +1181,39 @@ This function is implemented in QSim.cu and it used by *quad_launch_generate*
 
 **/
 
-extern void QSim_quad_launch(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, quad* q, unsigned num_quad, qdebug* dbg, unsigned type  );
-
-
+extern void QSim_quad_launch(dim3 numBlocks, dim3 threadsPerBlock, qsim *sim, quad *q, unsigned num_quad, qdebug *dbg,
+                             unsigned type);
 
-NP* QSim::quad_launch_generate(unsigned num_quad, unsigned type )
+NP *QSim::quad_launch_generate(unsigned num_quad, unsigned type)
 {
-    assert( d_sim );
-    assert( d_dbg );
+    assert(d_sim);
+    assert(d_dbg);
 
-    const char* label = "QSim::quad_launch_generate:num_quad" ;
+    const char *label = "QSim::quad_launch_generate:num_quad";
 
-    quad* d_q = QU::device_alloc<quad>(num_quad, label ) ;
+    quad *d_q = QU::device_alloc<quad>(num_quad, label);
 
-    unsigned threads_per_block = 512 ;
-    configureLaunch1D( num_quad, threads_per_block );
+    unsigned threads_per_block = 512;
+    configureLaunch1D(num_quad, threads_per_block);
 
-    QSim_quad_launch(numBlocks, threadsPerBlock, d_sim, d_q, num_quad, d_dbg, type );
+    QSim_quad_launch(numBlocks, threadsPerBlock, d_sim, d_q, num_quad, d_dbg, type);
 
-    NP* q = NP::Make<float>( num_quad, 4 );
-    quad* qq = (quad*)q->bytes();
+    NP *q = NP::Make<float>(num_quad, 4);
+    quad *qq = (quad *)q->bytes();
 
-    QU::copy_device_to_host_and_free<quad>( qq, d_q, num_quad, label );
+    QU::copy_device_to_host_and_free<quad>(qq, d_q, num_quad, label);
 
-    if( type == QGEN_SMEAR_NORMAL_SIGMA_ALPHA || type == QGEN_SMEAR_NORMAL_POLISH )
+    if (type == QGEN_SMEAR_NORMAL_SIGMA_ALPHA || type == QGEN_SMEAR_NORMAL_POLISH)
     {
-        q->set_meta<std::string>("normal", scuda::serialize(dbg->normal) );
-        q->set_meta<std::string>("direction", scuda::serialize(dbg->direction) );
-        q->set_meta<float>("value", dbg->value );
-        q->set_meta<std::string>("valuename", type == QGEN_SMEAR_NORMAL_SIGMA_ALPHA ? "sigma_alpha" : "polish" );
+        q->set_meta<std::string>("normal", scuda::serialize(dbg->normal));
+        q->set_meta<std::string>("direction", scuda::serialize(dbg->direction));
+        q->set_meta<float>("value", dbg->value);
+        q->set_meta<std::string>("valuename", type == QGEN_SMEAR_NORMAL_SIGMA_ALPHA ? "sigma_alpha" : "polish");
     }
 
-    return q ;
+    return q;
 }
 
-
-
-
 /**
 extern QSim_photon_launch
 --------------------------
@@ -1402,8 +1222,8 @@ This function is implemented in QSim.cu and it used by BOTH *photon_launch_gener
 
 **/
 
-extern void QSim_photon_launch(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, sphoton* photon, unsigned num_photon, qdebug* dbg, unsigned type  );
-
+extern void QSim_photon_launch(dim3 numBlocks, dim3 threadsPerBlock, qsim *sim, sphoton *photon, unsigned num_photon,
+                               qdebug *dbg, unsigned type);
 
 /**
 QSim::photon_launch_generate
@@ -1414,32 +1234,29 @@ then downloads the generated photons into the host array. Contrast with *photon_
 
 **/
 
-NP* QSim::photon_launch_generate(unsigned num_photon, unsigned type )
+NP *QSim::photon_launch_generate(unsigned num_photon, unsigned type)
 {
-    assert( d_sim );
-    assert( d_dbg );
+    assert(d_sim);
+    assert(d_dbg);
 
-    const char* label = "QSim::photon_launch_generate:num_photon" ;
+    const char *label = "QSim::photon_launch_generate:num_photon";
 
-    sphoton* d_photon = QU::device_alloc<sphoton>(num_photon, label ) ;
+    sphoton *d_photon = QU::device_alloc<sphoton>(num_photon, label);
     QU::device_memset<sphoton>(d_photon, 0, num_photon);
 
-    unsigned threads_per_block = 512 ;
-    configureLaunch1D( num_photon, threads_per_block );
+    unsigned threads_per_block = 512;
+    configureLaunch1D(num_photon, threads_per_block);
 
-    QSim_photon_launch(numBlocks, threadsPerBlock, d_sim, d_photon, num_photon, d_dbg, type );
+    QSim_photon_launch(numBlocks, threadsPerBlock, d_sim, d_photon, num_photon, d_dbg, type);
 
-    NP* p = NP::Make<float>(num_photon, 4, 4);
-    sphoton* photon = (sphoton*)p->bytes() ;
+    NP *p = NP::Make<float>(num_photon, 4, 4);
+    sphoton *photon = (sphoton *)p->bytes();
 
-    QU::copy_device_to_host_and_free<sphoton>( photon, d_photon, num_photon, label );
+    QU::copy_device_to_host_and_free<sphoton>(photon, d_photon, num_photon, label);
 
-    return p ;
+    return p;
 }
 
-
-
-
 /**
 QSim::photon_launch_mutate
 ---------------------------
@@ -1448,45 +1265,35 @@ This uploads the photon array provided, mutates it and then downloads the change
 
 **/
 
-void QSim::photon_launch_mutate(sphoton* photon, unsigned num_photon, unsigned type )
+void QSim::photon_launch_mutate(sphoton *photon, unsigned num_photon, unsigned type)
 {
-    assert( d_sim );
-    assert( d_dbg );
+    assert(d_sim);
+    assert(d_dbg);
 
-    const char* label_0 = "QSim::photon_launch_mutate/d_photon" ;
-    sphoton* d_photon = QU::UploadArray<sphoton>(photon, num_photon, label_0 );
+    const char *label_0 = "QSim::photon_launch_mutate/d_photon";
+    sphoton *d_photon = QU::UploadArray<sphoton>(photon, num_photon, label_0);
 
-    unsigned DEBUG_NUM_PHOTON = ssys::getenvunsigned(_QSim__photon_launch_mutate_DEBUG_NUM_PHOTON, 0 );
-    bool DEBUG_NUM_PHOTON_valid = DEBUG_NUM_PHOTON > 0 && DEBUG_NUM_PHOTON <= num_photon ;
-    unsigned u_num_photon = DEBUG_NUM_PHOTON_valid ? DEBUG_NUM_PHOTON  : num_photon ;
-    bool SKIP_LAUNCH = ssys::getenvbool(_QSim__photon_launch_mutate_SKIP_LAUNCH) ;
+    unsigned DEBUG_NUM_PHOTON = ssys::getenvunsigned(_QSim__photon_launch_mutate_DEBUG_NUM_PHOTON, 0);
+    bool DEBUG_NUM_PHOTON_valid = DEBUG_NUM_PHOTON > 0 && DEBUG_NUM_PHOTON <= num_photon;
+    unsigned u_num_photon = DEBUG_NUM_PHOTON_valid ? DEBUG_NUM_PHOTON : num_photon;
+    bool SKIP_LAUNCH = ssys::getenvbool(_QSim__photon_launch_mutate_SKIP_LAUNCH);
 
-    LOG_IF( error, DEBUG_NUM_PHOTON_valid || true )
-        << _QSim__photon_launch_mutate_DEBUG_NUM_PHOTON
-        << " DEBUG_NUM_PHOTON " << DEBUG_NUM_PHOTON
-        << " num_photon " << num_photon
-        << " u_num_photon " << u_num_photon
-        << _QSim__photon_launch_mutate_SKIP_LAUNCH
-        << " " << ( SKIP_LAUNCH ? "YES" : "NO " )
-        ;
+    LOG_IF(error, DEBUG_NUM_PHOTON_valid || true) // "|| true" makes the condition always hold: logged on every call
+        << _QSim__photon_launch_mutate_DEBUG_NUM_PHOTON << " DEBUG_NUM_PHOTON " << DEBUG_NUM_PHOTON << " num_photon "
+        << num_photon << " u_num_photon " << u_num_photon << " " << _QSim__photon_launch_mutate_SKIP_LAUNCH << " "
+        << (SKIP_LAUNCH ? "YES" : "NO ");
 
-
-    if( SKIP_LAUNCH == false )
+    if (SKIP_LAUNCH == false)
     {
-        unsigned threads_per_block = 512 ;
-        configureLaunch1D( u_num_photon, threads_per_block );
-        QSim_photon_launch(numBlocks, threadsPerBlock, d_sim, d_photon, u_num_photon, d_dbg, type );
+        unsigned threads_per_block = 512;
+        configureLaunch1D(u_num_photon, threads_per_block);
+        QSim_photon_launch(numBlocks, threadsPerBlock, d_sim, d_photon, u_num_photon, d_dbg, type);
     }
 
-
-    const char* label_1 = "QSim::photon_launch_mutate" ;
-    QU::copy_device_to_host_and_free<sphoton>( photon, d_photon, u_num_photon, label_1 );
+    const char *label_1 = "QSim::photon_launch_mutate";
+    QU::copy_device_to_host_and_free<sphoton>(photon, d_photon, u_num_photon, label_1);
 }
 
-
-
-
-
 /**
 QSim::UploadFakePRD (formerly "UploadMockPRD" )
 ----------------------------------------------------
@@ -1494,33 +1301,28 @@ QSim::UploadFakePRD (formerly "UploadMockPRD" )
 Caution this returns a device pointer.
 **/
 
-quad2* QSim::UploadFakePRD(const NP* ip, const NP* prd) // static
+quad2 *QSim::UploadFakePRD(const NP *ip, const NP *prd) // static
 {
     assert(ip);
-    int num_ip = ip->shape[0] ;
-    assert( num_ip > 0 );
+    int num_ip = ip->shape[0];
+    assert(num_ip > 0);
 
-    assert( prd->has_shape( num_ip, -1, 2, 4 ) );    // TODO: evt->max_record checking
-    assert( prd->shape.size() == 4 && prd->shape[2] == 2 && prd->shape[3] == 4 );
-    int num_prd = prd->shape[0]*prd->shape[1] ;
+    assert(prd->has_shape(num_ip, -1, 2, 4)); // TODO: evt->max_record checking
+    assert(prd->shape.size() == 4 && prd->shape[2] == 2 && prd->shape[3] == 4);
+    int num_prd = prd->shape[0] * prd->shape[1];
 
-    LOG(LEVEL)
-         << "["
-         << " num_ip " << num_ip
-         << " num_prd " << num_prd
-         << " prd " << prd->sstr()
-         ;
+    LOG(LEVEL) << "[" << " num_ip " << num_ip << " num_prd " << num_prd << " prd " << prd->sstr();
 
-    const char* label = "QSim::UploadFakePRD/d_prd" ;
-    quad2* d_prd = QU::UploadArray<quad2>( (quad2*)prd->bytes(), num_prd, label );
+    const char *label = "QSim::UploadFakePRD/d_prd";
+    quad2 *d_prd = QU::UploadArray<quad2>((quad2 *)prd->bytes(), num_prd, label);
 
     // prd is non-standard so it is appropriate to adhoc upload here
 
-    return d_prd ;
+    return d_prd;
 }
 
 #if !defined(PRODUCTION)
-extern void QSim_fake_propagate_launch(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, quad2* prd );
+extern void QSim_fake_propagate_launch(dim3 numBlocks, dim3 threadsPerBlock, qsim *sim, quad2 *prd);
 #endif
 
 /**
@@ -1543,7 +1345,7 @@ using common QEvt functionality
 
 **/
 
-void QSim::fake_propagate( const NP* prd, unsigned type )
+void QSim::fake_propagate(const NP *prd, unsigned type)
 {
 #if defined(PRODUCTION)
     (void)prd;
@@ -1551,126 +1353,106 @@ void QSim::fake_propagate( const NP* prd, unsigned type )
     LOG(fatal) << "QSim::fake_propagate is disabled in PRODUCTION builds";
     std::raise(SIGINT);
 #else
-    const NP* ip = sev->getInputPhoton();
-    int num_ip = ip ? ip->shape[0] : 0 ;
-    assert( num_ip > 0 );
+    const NP *ip = sev->getInputPhoton();
+    int num_ip = ip ? ip->shape[0] : 0;
+    assert(num_ip > 0);
 
-    quad2* d_prd = UploadFakePRD(ip, prd) ;
+    quad2 *d_prd = UploadFakePRD(ip, prd);
 
-    NP* igs = sev->makeGenstepArrayFromVector();
+    NP *igs = sev->makeGenstepArrayFromVector();
 
     int rc = qev->setGenstepUpload_NP(igs);
-    assert( rc == 0 );
-    if(rc!=0) std::raise(SIGINT);
+    assert(rc == 0);
+    if (rc != 0)
+        std::raise(SIGINT);
 
-    sev->add_array("prd0", prd );
+    sev->add_array("prd0", prd);
     // NB SEvt::beginOfEvent calls SEvt/clear so this addition
     // must be after that to succeed in being added to SEvt saved arrays
 
     int num_photon = qev->getNumPhoton();
-    bool consistent_num_photon = num_photon == num_ip ;
+    bool consistent_num_photon = num_photon == num_ip;
 
     LOG_IF(fatal, !consistent_num_photon)
-         << "["
-         << " num_ip " << num_ip
-         << " QEvt::getNumPhoton " << num_photon
-         << " consistent_num_photon " << ( consistent_num_photon ? "YES" : "NO " )
-         << " prd " << prd->sstr()
-         ;
+        << "[" << " num_ip " << num_ip << " QEvt::getNumPhoton " << num_photon << " consistent_num_photon "
+        << (consistent_num_photon ? "YES" : "NO ") << " prd " << prd->sstr();
     assert(consistent_num_photon);
 
-    assert( qev->upload_count > 0 );
+    assert(qev->upload_count > 0);
 
-    unsigned threads_per_block = 512 ;
-    configureLaunch1D( num_photon, threads_per_block );
+    unsigned threads_per_block = 512;
+    configureLaunch1D(num_photon, threads_per_block);
 
-    QSim_fake_propagate_launch(numBlocks, threadsPerBlock, d_sim, d_prd );
+    QSim_fake_propagate_launch(numBlocks, threadsPerBlock, d_sim, d_prd);
 
     cudaDeviceSynchronize();
+    QU::device_free<quad2>(d_prd); // launch has completed (sync above); free the buffer returned by UploadFakePRD
 
-
-    LOG(LEVEL) << "]" ;
+    LOG(LEVEL) << "]";
 #endif
 }
 
+extern void QSim_boundary_lookup_all(dim3 numBlocks, dim3 threadsPerBlock, qsim *d_sim, quad *lookup, unsigned width,
+                                     unsigned height);
 
-
-extern void QSim_boundary_lookup_all(    dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, quad* lookup, unsigned width, unsigned height );
-
-NP* QSim::boundary_lookup_all(unsigned width, unsigned height )
+NP *QSim::boundary_lookup_all(unsigned width, unsigned height)
 {
-    LOG(LEVEL) << "[" ;
-    assert( bnd );
-    assert( width <= getBoundaryTexWidth()  );
-    assert( height <= getBoundaryTexHeight()  );
+    LOG(LEVEL) << "[";
+    assert(bnd);
+    assert(width <= getBoundaryTexWidth());
+    assert(height <= getBoundaryTexHeight());
 
-    unsigned num_lookup = width*height ;
-    LOG(LEVEL)
-        << " width " << width
-        << " height " << height
-        << " num_lookup " << num_lookup
-        ;
+    unsigned num_lookup = width * height;
+    LOG(LEVEL) << " width " << width << " height " << height << " num_lookup " << num_lookup;
 
+    configureLaunch(width, height);
 
-    configureLaunch(width, height );
+    const char *label = "QSim::boundary_lookup_all:num_lookup";
 
-    const char* label = "QSim::boundary_lookup_all:num_lookup" ;
+    quad *d_lookup = QU::device_alloc<quad>(num_lookup, label);
+    QSim_boundary_lookup_all(numBlocks, threadsPerBlock, d_sim, d_lookup, width, height);
 
-    quad* d_lookup = QU::device_alloc<quad>(num_lookup, label ) ;
-    QSim_boundary_lookup_all(numBlocks, threadsPerBlock, d_sim, d_lookup, width, height );
+    assert(height % 8 == 0);
+    unsigned num_bnd = height / 8;
 
-    assert( height % 8 == 0 );
-    unsigned num_bnd = height/8 ;
+    NP *l = NP::Make<float>(num_bnd, 4, 2, width, 4);
+    QU::copy_device_to_host_and_free<quad>((quad *)l->bytes(), d_lookup, num_lookup, label);
 
-    NP* l = NP::Make<float>( num_bnd, 4, 2, width, 4 );
-    QU::copy_device_to_host_and_free<quad>( (quad*)l->bytes(), d_lookup, num_lookup, label );
-
-    LOG(LEVEL) << "]" ;
-
-    return l ;
+    LOG(LEVEL) << "]";
 
+    return l;
 }
 
-extern void QSim_boundary_lookup_line(    dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, quad* lookup, float* domain, unsigned num_lookup, unsigned line, unsigned k );
+extern void QSim_boundary_lookup_line(dim3 numBlocks, dim3 threadsPerBlock, qsim *d_sim, quad *lookup, float *domain,
+                                      unsigned num_lookup, unsigned line, unsigned k);
 
-
-NP* QSim::boundary_lookup_line( float* domain, unsigned num_lookup, unsigned line, unsigned k )
+NP *QSim::boundary_lookup_line(float *domain, unsigned num_lookup, unsigned line, unsigned k)
 {
-    LOG(LEVEL)
-        << "["
-        << " num_lookup " << num_lookup
-        << " line " << line
-        << " k " << k
-        ;
-
-    configureLaunch(num_lookup, 1  );
+    LOG(LEVEL) << "[" << " num_lookup " << num_lookup << " line " << line << " k " << k;
 
-    float* d_domain = QU::device_alloc<float>(num_lookup, "QSim::boundary_lookup_line:num_lookup") ;
+    configureLaunch(num_lookup, 1);
 
-    QU::copy_host_to_device<float>( d_domain, domain, num_lookup );
+    float *d_domain = QU::device_alloc<float>(num_lookup, "QSim::boundary_lookup_line:num_lookup");
 
-    const char* label = "QSim::boundary_lookup_line:num_lookup" ;
+    QU::copy_host_to_device<float>(d_domain, domain, num_lookup);
 
-    quad* d_lookup = QU::device_alloc<quad>(num_lookup, label ) ;
+    const char *label = "QSim::boundary_lookup_line:num_lookup";
 
-    QSim_boundary_lookup_line(numBlocks, threadsPerBlock, d_sim, d_lookup, d_domain, num_lookup, line, k );
+    quad *d_lookup = QU::device_alloc<quad>(num_lookup, label);
 
+    QSim_boundary_lookup_line(numBlocks, threadsPerBlock, d_sim, d_lookup, d_domain, num_lookup, line, k);
 
-    NP* l = NP::Make<float>( num_lookup, 4 );
+    NP *l = NP::Make<float>(num_lookup, 4);
 
-    QU::copy_device_to_host_and_free<quad>( (quad*)l->bytes(), d_lookup, num_lookup, label  );
+    QU::copy_device_to_host_and_free<quad>((quad *)l->bytes(), d_lookup, num_lookup, label);
 
-    QU::device_free<float>( d_domain );
+    QU::device_free<float>(d_domain);
 
-    LOG(LEVEL) << "]" ;
+    LOG(LEVEL) << "]";
 
-    return l ;
+    return l;
 }
 
-
-
-
-
 /**
 QSim::prop_lookup
 --------------------
@@ -1681,59 +1462,43 @@ below *prop_lookup_onebyone*
 
 **/
 
-
 template <typename T>
-extern void QSim_prop_lookup( dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, T* lookup, const T* domain, unsigned domain_width, unsigned* pids, unsigned num_pids );
+extern void QSim_prop_lookup(dim3 numBlocks, dim3 threadsPerBlock, qsim *d_sim, T *lookup, const T *domain,
+                             unsigned domain_width, unsigned *pids, unsigned num_pids);
 
 template <typename T>
-void QSim::prop_lookup( T* lookup, const T* domain, unsigned domain_width, const std::vector<unsigned>& pids )
+void QSim::prop_lookup(T *lookup, const T *domain, unsigned domain_width, const std::vector<unsigned> &pids)
 {
-    unsigned num_pids = pids.size() ;
-    unsigned num_lookup = num_pids*domain_width ;
-    LOG(LEVEL)
-        << "["
-        << " num_pids " << num_pids
-        << " domain_width " << domain_width
-        << " num_lookup " << num_lookup
-        ;
+    unsigned num_pids = pids.size();
+    unsigned num_lookup = num_pids * domain_width;
+    LOG(LEVEL) << "[" << " num_pids " << num_pids << " domain_width " << domain_width << " num_lookup " << num_lookup;
 
-    configureLaunch(domain_width, num_pids  );
+    configureLaunch(domain_width, num_pids);
 
-    unsigned* d_pids = QU::device_alloc<unsigned>(num_pids, "QSim::prop_lookup:num_pids") ;
-    T* d_domain = QU::device_alloc<T>(domain_width, "QSim::prop_lookup:domain_width") ;
-    T* d_lookup = QU::device_alloc<T>(num_lookup  , "QSim::prop_lookup:num_lookup") ;
+    unsigned *d_pids = QU::device_alloc<unsigned>(num_pids, "QSim::prop_lookup:num_pids");
+    T *d_domain = QU::device_alloc<T>(domain_width, "QSim::prop_lookup:domain_width");
+    T *d_lookup = QU::device_alloc<T>(num_lookup, "QSim::prop_lookup:num_lookup");
 
-    QU::copy_host_to_device<T>( d_domain, domain, domain_width );
-    QU::copy_host_to_device<unsigned>( d_pids, pids.data(), num_pids );
+    QU::copy_host_to_device<T>(d_domain, domain, domain_width);
+    QU::copy_host_to_device<unsigned>(d_pids, pids.data(), num_pids);
 
-    QSim_prop_lookup(numBlocks, threadsPerBlock, d_sim, d_lookup, d_domain, domain_width, d_pids, num_pids );
+    QSim_prop_lookup(numBlocks, threadsPerBlock, d_sim, d_lookup, d_domain, domain_width, d_pids, num_pids);
 
-    QU::copy_device_to_host_and_free<T>( lookup, d_lookup, num_lookup );
-    QU::device_free<T>( d_domain );
-    QU::device_free<unsigned>( d_pids );
+    QU::copy_device_to_host_and_free<T>(lookup, d_lookup, num_lookup);
+    QU::device_free<T>(d_domain);
+    QU::device_free<unsigned>(d_pids);
 
-    LOG(LEVEL) << "]" ;
+    LOG(LEVEL) << "]";
 }
 
-
-
 /**
 Hmm doing lookups like this is a very common pattern, could do with
 a sub context to carry the pieces to simplify doing that.
 **/
 
 template <typename T>
-extern void QSim_prop_lookup_one(
-    dim3 numBlocks,
-    dim3 threadsPerBlock,
-    qsim* sim,
-    T* lookup,
-    const T* domain,
-    unsigned domain_width,
-    unsigned num_pids,
-    unsigned pid,
-    unsigned ipid
-);
+extern void QSim_prop_lookup_one(dim3 numBlocks, dim3 threadsPerBlock, qsim *sim, T *lookup, const T *domain,
+                                 unsigned domain_width, unsigned num_pids, unsigned pid, unsigned ipid);
 
 /**
 QSim::prop_lookup_onebyone
@@ -1749,203 +1514,155 @@ On device uses::
 **/
 
 template <typename T>
-void QSim::prop_lookup_onebyone( T* lookup, const T* domain, unsigned domain_width, const std::vector<unsigned>& pids )
+void QSim::prop_lookup_onebyone(T *lookup, const T *domain, unsigned domain_width, const std::vector<unsigned> &pids)
 {
-    unsigned num_pids = pids.size() ;
-    unsigned num_lookup = num_pids*domain_width ;
-    LOG(LEVEL)
-        << "["
-        << " num_pids " << num_pids
-        << " domain_width " << domain_width
-        << " num_lookup " << num_lookup
-        ;
+    unsigned num_pids = pids.size();
+    unsigned num_lookup = num_pids * domain_width;
+    LOG(LEVEL) << "[" << " num_pids " << num_pids << " domain_width " << domain_width << " num_lookup " << num_lookup;
 
-    configureLaunch(domain_width, 1  );
+    configureLaunch(domain_width, 1);
 
-    T* d_domain = QU::device_alloc<T>(domain_width, "QSim::prop_lookup_onebyone:domain_width") ;
-    QU::copy_host_to_device<T>( d_domain, domain, domain_width );
+    T *d_domain = QU::device_alloc<T>(domain_width, "QSim::prop_lookup_onebyone:domain_width");
+    QU::copy_host_to_device<T>(d_domain, domain, domain_width);
 
-    const char* label = "QSim::prop_lookup_onebyone:num_lookup" ;
+    const char *label = "QSim::prop_lookup_onebyone:num_lookup";
 
-    T* d_lookup = QU::device_alloc<T>(num_lookup, label ) ;
+    T *d_lookup = QU::device_alloc<T>(num_lookup, label);
 
     // separate launches for each pid
-    for(unsigned ipid=0 ; ipid < num_pids ; ipid++)
+    for (unsigned ipid = 0; ipid < num_pids; ipid++)
     {
-        unsigned pid = pids[ipid] ;
-        QSim_prop_lookup_one<T>(numBlocks, threadsPerBlock, d_sim, d_lookup, d_domain, domain_width, num_pids, pid, ipid );
+        unsigned pid = pids[ipid];
+        QSim_prop_lookup_one<T>(numBlocks, threadsPerBlock, d_sim, d_lookup, d_domain, domain_width, num_pids, pid,
+                                ipid);
     }
 
-    QU::copy_device_to_host_and_free<T>( lookup, d_lookup, num_lookup, label  );
+    QU::copy_device_to_host_and_free<T>(lookup, d_lookup, num_lookup, label);
 
-    QU::device_free<T>( d_domain );
+    QU::device_free<T>(d_domain);
 
-    LOG(LEVEL) << "]" ;
+    LOG(LEVEL) << "]";
 }
 
+template void QSim::prop_lookup_onebyone(float *, const float *, unsigned, const std::vector<unsigned> &);
+template void QSim::prop_lookup_onebyone(double *, const double *, unsigned, const std::vector<unsigned> &);
 
-template void QSim::prop_lookup_onebyone( float*, const float* ,   unsigned, const std::vector<unsigned>& );
-template void QSim::prop_lookup_onebyone( double*, const double* , unsigned, const std::vector<unsigned>& );
-
-
+extern void QSim_multifilm_lookup_all(dim3 numBlocks, dim3 threadsPerBlock, qsim *sim, quad2 *sample, quad2 *result,
+                                      unsigned width, unsigned height);
 
-
-
-
-extern void QSim_multifilm_lookup_all(    dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, quad2* sample, quad2* result,  unsigned width, unsigned height );
-
-void QSim::multifilm_lookup_all( quad2 * sample , quad2 * result ,  unsigned width, unsigned height )
+void QSim::multifilm_lookup_all(quad2 *sample, quad2 *result, unsigned width, unsigned height)
 {
-    LOG(LEVEL) << "[" ;
-    unsigned num_lookup = width*height ;
-    unsigned size = num_lookup ;
+    LOG(LEVEL) << "[";
+    unsigned num_lookup = width * height;
+    unsigned size = num_lookup;
 
-    LOG(LEVEL)
-        << " width " << width
-        << " height " << height
-        << " num_lookup " << num_lookup
-        << " size "<<size
-        ;
+    LOG(LEVEL) << " width " << width << " height " << height << " num_lookup " << num_lookup << " size " << size;
 
-    configureLaunch2D(width, height );
+    configureLaunch2D(width, height);
 
-    //const float * c_sample = sample;
-    quad2* d_sample = QU::device_alloc<quad2>(size, "QSim::multifilm_lookup_all:size" ) ;
+    // const float * c_sample = sample;
+    quad2 *d_sample = QU::device_alloc<quad2>(size, "QSim::multifilm_lookup_all:size");
 
-    const char* label = "QSim::multifilm_lookup_all:size" ;
+    const char *label = "QSim::multifilm_lookup_all:size";
 
-    quad2* d_result = QU::device_alloc<quad2>(size, label ) ;
-    LOG(LEVEL)
-       <<" copy_host_to_device<quad2>( d_sample, sample , size) before";
-    QU::copy_host_to_device<quad2>( d_sample, sample , size);
-    LOG(LEVEL)
-       <<" copy_host_to_device<quad2>( d_sample, sample , size) after";
+    quad2 *d_result = QU::device_alloc<quad2>(size, label);
+    LOG(LEVEL) << " copy_host_to_device<quad2>( d_sample, sample , size) before";
+    QU::copy_host_to_device<quad2>(d_sample, sample, size);
+    LOG(LEVEL) << " copy_host_to_device<quad2>( d_sample, sample , size) after";
 
-    QSim_multifilm_lookup_all(numBlocks, threadsPerBlock, d_sim, d_sample, d_result, width, height );
-    QU::copy_device_to_host_and_free<quad2>( result , d_result , size, label );
+    QSim_multifilm_lookup_all(numBlocks, threadsPerBlock, d_sim, d_sample, d_result, width, height);
+    QU::copy_device_to_host_and_free<quad2>(result, d_result, size, label);
     QU::device_free<quad2>(d_sample);
 
     cudaDeviceSynchronize();
-    LOG(LEVEL) << "]" ;
+    LOG(LEVEL) << "]";
 }
 
-
-
-
 unsigned QSim::getBoundaryTexWidth() const
 {
-    return bnd->tex->width ;
+    return bnd->tex->width;
 }
 unsigned QSim::getBoundaryTexHeight() const
 {
-    return bnd->tex->height ;
+    return bnd->tex->height;
 }
-const NP* QSim::getBoundaryTexSrc() const
+const NP *QSim::getBoundaryTexSrc() const
 {
-    return bnd->src ;
+    return bnd->src;
 }
 
-void QSim::dump_photon( quad4* photon, unsigned num_photon, const char* opt_, unsigned edgeitems )
+void QSim::dump_photon(quad4 *photon, unsigned num_photon, const char *opt_, unsigned edgeitems)
 {
     LOG(LEVEL);
 
-    std::string opt = opt_ ;
+    std::string opt = opt_;
 
-    bool f0 = opt.find("f0") != std::string::npos ;
-    bool f1 = opt.find("f1") != std::string::npos ;
-    bool f2 = opt.find("f2") != std::string::npos ;
-    bool f3 = opt.find("f3") != std::string::npos ;
+    bool f0 = opt.find("f0") != std::string::npos;
+    bool f1 = opt.find("f1") != std::string::npos;
+    bool f2 = opt.find("f2") != std::string::npos;
+    bool f3 = opt.find("f3") != std::string::npos;
 
-    bool i0 = opt.find("i0") != std::string::npos ;
-    bool i1 = opt.find("i1") != std::string::npos ;
-    bool i2 = opt.find("i2") != std::string::npos ;
-    bool i3 = opt.find("i3") != std::string::npos ;
+    bool i0 = opt.find("i0") != std::string::npos;
+    bool i1 = opt.find("i1") != std::string::npos;
+    bool i2 = opt.find("i2") != std::string::npos;
+    bool i3 = opt.find("i3") != std::string::npos;
 
-    int wi = 7 ;
-    int pr = 2 ;
+    int wi = 7;
+    int pr = 2;
 
-    for(unsigned i=0 ; i < num_photon ; i++)
+    for (unsigned i = 0; i < num_photon; i++)
     {
-        if( i < edgeitems || i > num_photon - edgeitems)
+        if (i < edgeitems || num_photon - i <= edgeitems) // first and last edgeitems entries; i < num_photon, no wrap
         {
-            const quad4& p = photon[i] ;
-
-            std::cout
-                << std::setw(wi) << i
-                ;
-
-            if(f0) std::cout
-                << " f0 "
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.x
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.y
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.z
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.w
-                ;
-
-            if(f1) std::cout
-                << " f1 "
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.x
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.y
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.z
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.w
-                ;
-
-            if(f2) std::cout
-                << " f2 "
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.x
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.y
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.z
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.w
-                ;
-
-            if(f3) std::cout
-                << " f3 "
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.x
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.y
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.z
-                << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.w
-                ;
-
-            if(i0) std::cout
-                << " i0 "
-                << std::setw(wi) << p.q0.i.x
-                << std::setw(wi) << p.q0.i.y
-                << std::setw(wi) << p.q0.i.z
-                << std::setw(wi) << p.q0.i.w
-                ;
-
-            if(i1) std::cout
-                << " i1 "
-                << std::setw(wi) << p.q1.i.x
-                << std::setw(wi) << p.q1.i.y
-                << std::setw(wi) << p.q1.i.z
-                << std::setw(wi) << p.q1.i.w
-                ;
-
-            if(i2) std::cout
-                << " i2 "
-                << std::setw(wi) << p.q2.i.x
-                << std::setw(wi) << p.q2.i.y
-                << std::setw(wi) << p.q2.i.z
-                << std::setw(wi) << p.q2.i.w
-                ;
-
-            if(i3) std::cout
-                << " i3 "
-                << std::setw(wi) << p.q3.i.x
-                << std::setw(wi) << p.q3.i.y
-                << std::setw(wi) << p.q3.i.z
-                << std::setw(wi) << p.q3.i.w
-                ;
-
-            std::cout
-                << std::endl
-                ;
+            const quad4 &p = photon[i];
+
+            std::cout << std::setw(wi) << i;
+
+            if (f0)
+                std::cout << " f0 " << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.x << std::setw(wi)
+                          << std::fixed << std::setprecision(pr) << p.q0.f.y << std::setw(wi) << std::fixed
+                          << std::setprecision(pr) << p.q0.f.z << std::setw(wi) << std::fixed << std::setprecision(pr)
+                          << p.q0.f.w;
+
+            if (f1)
+                std::cout << " f1 " << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.x << std::setw(wi)
+                          << std::fixed << std::setprecision(pr) << p.q1.f.y << std::setw(wi) << std::fixed
+                          << std::setprecision(pr) << p.q1.f.z << std::setw(wi) << std::fixed << std::setprecision(pr)
+                          << p.q1.f.w;
+
+            if (f2)
+                std::cout << " f2 " << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.x << std::setw(wi)
+                          << std::fixed << std::setprecision(pr) << p.q2.f.y << std::setw(wi) << std::fixed
+                          << std::setprecision(pr) << p.q2.f.z << std::setw(wi) << std::fixed << std::setprecision(pr)
+                          << p.q2.f.w;
+
+            if (f3)
+                std::cout << " f3 " << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.x << std::setw(wi)
+                          << std::fixed << std::setprecision(pr) << p.q3.f.y << std::setw(wi) << std::fixed
+                          << std::setprecision(pr) << p.q3.f.z << std::setw(wi) << std::fixed << std::setprecision(pr)
+                          << p.q3.f.w;
+
+            if (i0)
+                std::cout << " i0 " << std::setw(wi) << p.q0.i.x << std::setw(wi) << p.q0.i.y << std::setw(wi)
+                          << p.q0.i.z << std::setw(wi) << p.q0.i.w;
+
+            if (i1)
+                std::cout << " i1 " << std::setw(wi) << p.q1.i.x << std::setw(wi) << p.q1.i.y << std::setw(wi)
+                          << p.q1.i.z << std::setw(wi) << p.q1.i.w;
+
+            if (i2)
+                std::cout << " i2 " << std::setw(wi) << p.q2.i.x << std::setw(wi) << p.q2.i.y << std::setw(wi)
+                          << p.q2.i.z << std::setw(wi) << p.q2.i.w;
+
+            if (i3)
+                std::cout << " i3 " << std::setw(wi) << p.q3.i.x << std::setw(wi) << p.q3.i.y << std::setw(wi)
+                          << p.q3.i.z << std::setw(wi) << p.q3.i.w;
+
+            std::cout << std::endl;
         }
     }
 }
 
-
 /**
 QSim::Desc
 ------------
@@ -1960,10 +1677,10 @@ Dump flags with::
    ssys_test
 
 **/
-std::string QSim::Desc(char delim)  // static
+std::string QSim::Desc(char delim) // static
 {
-    std::stringstream ss ;
-    ss << ( delim == ',' ? "" : "QSim::Desc\n" )
+    std::stringstream ss;
+    ss << (delim == ',' ? "" : "QSim::Desc\n")
 #ifdef CONFIG_Debug
        << "CONFIG_Debug"
 #else
@@ -2041,17 +1758,12 @@ std::string QSim::Desc(char delim)  // static
 #else
        << "NOT-RNG_PHILITEOX"
 #endif
-       << delim
-       ;
-    std::string str = ss.str() ;
-    return str ;
+       << delim;
+    std::string str = ss.str();
+    return str;
 }
 
-
-
-std::string QSim::Switches()  // static
+std::string QSim::Switches() // static
 {
     return Desc(',');
 }
-
-
diff --git a/qudarap/QSim.hh b/qudarap/QSim.hh
index 5d9c38471..16e0f81a6 100644
--- a/qudarap/QSim.hh
+++ b/qudarap/QSim.hh
@@ -1,8 +1,8 @@
 #pragma once
 
+#include <cstdint>
 #include <string>
 #include <vector>
-#include <cstdint>
 
 #include "QUDARAP_API_EXPORT.hh"
 #include "plog/Severity.h"
@@ -23,194 +23,182 @@ HMM : MOST OF THIS API IS FOR TESTING ONLY  : TODO: Move lots to QSimTest perhap
 
 **/
 
-struct NP ;
-struct SSim ;
-struct SEvt ;
+struct NP;
+struct SSim;
+struct SEvt;
 
-template <typename T> struct QTex ;
-template <typename T> struct QBuf ;
-template <typename T> struct QProp ;
-template <typename T> struct QPMT ;
+template <typename T> struct QTex;
+template <typename T> struct QBuf;
+template <typename T> struct QProp;
+template <typename T> struct QPMT;
 
-struct qsim ;
+struct qsim;
 
-struct QBase ;
-struct QEvt ;
-struct QRng ;
-struct QScint ;
-struct QCerenkov ;
-struct QBnd ;
+struct QBase;
+struct QEvt;
+struct QRng;
+struct QScint;
+struct QWls;
+struct QCerenkov;
+struct QBnd;
 struct QMultiFilm;
-struct QOptical ;
-struct QEvt ;
-struct QDebug ;
+struct QOptical;
+struct QEvt;
+struct QDebug;
 
-struct qdebug ;
-struct sstate ;
+struct qdebug;
+struct sstate;
 
-struct quad4 ;
-struct quad2 ;
-struct sphoton ;
-union  quad ;
+struct quad4;
+struct quad2;
+struct sphoton;
+union quad;
 
-struct SSimulator ;
+struct SSimulator;
 
 struct QUDARAP_API QSim
 {
-    static constexpr const int64_t M = 1000000 ;
-    static constexpr const int64_t G = 1000000000 ;
-
-    static const plog::Severity LEVEL ;
-    static const char* PREFIX ;
-    static QSim* INSTANCE ;
-    static QSim* Get();
-    static QSim* Create();
+    static constexpr const int64_t M = 1000000;
+    static constexpr const int64_t G = 1000000000;
 
-    static void UploadComponents(const SSim* ssim);
+    static const plog::Severity LEVEL;
+    static const char *PREFIX;
+    static QSim *INSTANCE;
+    static QSim *Get();
+    static QSim *Create();
 
-    const QBase*     base ;
-    QEvt*            qev ;
-    SEvt*            sev ;
+    static void UploadComponents(const SSim *ssim);
 
-    const QRng*      rng ;
-    const QScint*    scint ;
-    const QCerenkov* cerenkov ;
-    const QBnd*      bnd ;
-    const QOptical*  optical ;
-    const QDebug*    debug_ ;
+    const QBase *base;
+    QEvt *qev;
+    SEvt *sev;
 
-    const QProp<float>*  prop ;
-    const QPMT<float>*   pmt ;
-    const QMultiFilm*    multifilm ;
+    const QRng *rng;
+    const QScint *scint;
+    const QWls *qwls;
+    const QCerenkov *cerenkov;
+    const QBnd *bnd;
+    const QOptical *optical;
+    const QDebug *debug_;
 
-    qsim*                 sim ;
-    qsim*               d_sim ;
+    const QProp<float> *prop;
+    const QPMT<float> *pmt;
+    const QMultiFilm *multifilm;
 
-    qdebug*           dbg ;
-    qdebug*           d_dbg ;
+    qsim *sim;
+    qsim *d_sim;
 
-    SSimulator*        cx ;
+    qdebug *dbg;
+    qdebug *d_dbg;
 
+    SSimulator *cx;
 
-    dim3 numBlocks ;
-    dim3 threadsPerBlock ;
+    dim3 numBlocks;
+    dim3 threadsPerBlock;
 
-private:
+  private:
     QSim();
     void init();
 
-    static constexpr const char* _QSim__REQUIRE_PMT = "QSim__REQUIRE_PMT" ;
-    static const bool   REQUIRE_PMT;
-
-    static constexpr const char* _QSim__SAVE_IGS_EVENTID = "QSim__SAVE_IGS_EVENTID" ;
-    static const int   SAVE_IGS_EVENTID ;
-
-    static constexpr const char* _QSim__SAVE_IGS_PATH    = "QSim__SAVE_IGS_PATH" ;
-    static const char* SAVE_IGS_PATH ;
+    static constexpr const char *_QSim__REQUIRE_PMT = "QSim__REQUIRE_PMT";
+    static const bool REQUIRE_PMT;
 
-    static constexpr const char* _QSim__CONCAT    = "QSim__CONCAT" ;
-    static const bool CONCAT ;
+    static constexpr const char *_QSim__SAVE_IGS_EVENTID = "QSim__SAVE_IGS_EVENTID";
+    static const int SAVE_IGS_EVENTID;
 
-    static constexpr const char* _QSim__ALLOC    = "QSim__ALLOC" ;
-    static const bool ALLOC ;
+    static constexpr const char *_QSim__SAVE_IGS_PATH = "QSim__SAVE_IGS_PATH";
+    static const char *SAVE_IGS_PATH;
 
+    static constexpr const char *_QSim__CONCAT = "QSim__CONCAT";
+    static const bool CONCAT;
 
-public:
-    void setLauncher(SSimulator* cx_ );
+    static constexpr const char *_QSim__ALLOC = "QSim__ALLOC";
+    static const bool ALLOC;
 
-    static constexpr const char* QSim__simulate_KEEP_SUBFOLD = "QSim__simulate_KEEP_SUBFOLD" ;
-    static bool KEEP_SUBFOLD ;
+  public:
+    void setLauncher(SSimulator *cx_);
 
-    double simulate(int eventID, bool reset_ );      // via cx launch
-    void   simulate_final_merge(int64_t tot_ph, cudaStream_t stream);
+    static constexpr const char *QSim__simulate_KEEP_SUBFOLD = "QSim__simulate_KEEP_SUBFOLD";
+    static bool KEEP_SUBFOLD;
 
+    double simulate(int eventID, bool reset_); // via cx launch
+    void simulate_final_merge(int64_t tot_ph, cudaStream_t stream);
 
+    NP *simulate(const NP *gs, int eventID); // higher level API for use from CSGOptiXService.h
 
-    NP*    simulate(const NP* gs, int eventID );     // higher level API for use from CSGOptiXService.h
+    static void MaybeSaveIGS(int eventID, NP *igs);
 
-    static void MaybeSaveIGS(int eventID, NP* igs);
+    unsigned long long get_photon_slot_offset() const;
 
-    unsigned long long get_photon_slot_offset() const ;
-
-    void   reset( int eventID);
+    void reset(int eventID);
 
     double simtrace(int eventID);
 
-
-    qsim* getDevicePtr() const ;
-    std::string desc() const ;
-    std::string descFull() const ;
-    std::string descComponents() const ;
-
+    qsim *getDevicePtr() const;
+    std::string desc() const;
+    std::string descFull() const;
+    std::string descComponents() const;
 
     // TODO: relocate non-essential methods into tests or elsewhere
 
-    char getScintTexFilterMode() const ;
+    char getScintTexFilterMode() const;
 
     void configureLaunch16();
-    void configureLaunch( unsigned width, unsigned height );
-    void configureLaunch2D( unsigned width, unsigned height );
+    void configureLaunch(unsigned width, unsigned height);
+    void configureLaunch2D(unsigned width, unsigned height);
     void configureLaunch1D(unsigned num, unsigned threads_per_block);
-    std::string descLaunch() const ;
-
+    std::string descLaunch() const;
 
-    template<typename T>
-    void rng_sequence( dim3 numblocks, dim3 threadsPerBlock, qsim* d_sim, T* d_seq, unsigned ni_tranche, unsigned nv, unsigned ioffset );
+    template <typename T>
+    void rng_sequence(dim3 numblocks, dim3 threadsPerBlock, qsim *d_sim, T *d_seq, unsigned ni_tranche, unsigned nv,
+                      unsigned ioffset);
 
-    template<typename T>
-    void rng_sequence( T* seq, unsigned ni, unsigned nj, unsigned ioffset );
+    template <typename T> void rng_sequence(T *seq, unsigned ni, unsigned nj, unsigned ioffset);
 
-    template<typename T>
-    void rng_sequence( const char* dir, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size );
+    template <typename T>
+    void rng_sequence(const char *dir, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size);
 
+    NP *scint_wavelength(unsigned num_wavelength, unsigned &hd_factor);
 
-    NP* scint_wavelength( unsigned num_wavelength, unsigned& hd_factor );
-
-    NP* RandGaussQ_shoot(unsigned num_v );
-
+    NP *RandGaussQ_shoot(unsigned num_v);
 
     // NP* cerenkov_wavelength_rejection_sampled( unsigned num_wavelength );
-    void dump_wavelength(                       float* wavelength, unsigned num_wavelength, unsigned edgeitems=10 );
-
-
-    NP* dbg_gs_generate(unsigned num_photon, unsigned type );
+    void dump_wavelength(float *wavelength, unsigned num_wavelength, unsigned edgeitems = 10);
 
+    NP *dbg_gs_generate(unsigned num_photon, unsigned type);
 
-    void dump_photon(            quad4* photon, unsigned num_photon, const char* opt="f0,f1,f2,i3", unsigned egdeitems=10 );
+    void dump_photon(quad4 *photon, unsigned num_photon, const char *opt = "f0,f1,f2,i3", unsigned edgeitems = 10);
 
     void generate_photon();
-    void fill_state_0(quad6*  state, unsigned num_state);
-    void fill_state_1(sstate* state, unsigned num_state);
+    void fill_state_0(quad6 *state, unsigned num_state);
+    void fill_state_1(sstate *state, unsigned num_state);
 
-    NP* quad_launch_generate(unsigned num_quad, unsigned type );
-    NP* photon_launch_generate(unsigned num_photon, unsigned type );
+    NP *quad_launch_generate(unsigned num_quad, unsigned type);
+    NP *photon_launch_generate(unsigned num_photon, unsigned type);
 
-    static constexpr const char* _QSim__photon_launch_mutate_DEBUG_NUM_PHOTON = "QSim__photon_launch_mutate_DEBUG_NUM_PHOTON" ;
-    static constexpr const char* _QSim__photon_launch_mutate_SKIP_LAUNCH = "QSim__photon_launch_mutate_SKIP_LAUNCH" ;
-    void photon_launch_mutate(   sphoton* photon, unsigned num_photon, unsigned type );
+    static constexpr const char *_QSim__photon_launch_mutate_DEBUG_NUM_PHOTON =
+        "QSim__photon_launch_mutate_DEBUG_NUM_PHOTON";
+    static constexpr const char *_QSim__photon_launch_mutate_SKIP_LAUNCH = "QSim__photon_launch_mutate_SKIP_LAUNCH";
+    void photon_launch_mutate(sphoton *photon, unsigned num_photon, unsigned type);
 
+    static quad2 *UploadFakePRD(const NP *ip, const NP *prd);
+    void fake_propagate(const NP *prd, unsigned type);
 
-    static quad2* UploadFakePRD(const NP* ip, const NP* prd);
-    void fake_propagate(const NP* prd, unsigned type );
+    unsigned getBoundaryTexWidth() const;
+    unsigned getBoundaryTexHeight() const;
+    const NP *getBoundaryTexSrc() const;
 
-    unsigned getBoundaryTexWidth() const ;
-    unsigned getBoundaryTexHeight() const ;
-    const NP* getBoundaryTexSrc() const ;
+    NP *boundary_lookup_all(unsigned width, unsigned height);
+    NP *boundary_lookup_line(float *domain, unsigned num_lookup, unsigned line, unsigned k);
 
-    NP* boundary_lookup_all( unsigned width, unsigned height ) ;
-    NP* boundary_lookup_line( float* domain, unsigned num_lookup, unsigned line, unsigned k ) ;
+    template <typename T>
+    void prop_lookup(T *lookup, const T *domain, unsigned domain_width, const std::vector<unsigned> &pids);
 
+    template <typename T>
+    void prop_lookup_onebyone(T *lookup, const T *domain, unsigned domain_width, const std::vector<unsigned> &pids);
 
-    template<typename T>
-    void prop_lookup(          T* lookup, const T* domain, unsigned domain_width, const std::vector<unsigned>& pids ) ;
+    void multifilm_lookup_all(quad2 *sample, quad2 *result, unsigned width, unsigned height);
 
-    template<typename T>
-    void prop_lookup_onebyone( T* lookup, const T* domain, unsigned domain_width, const std::vector<unsigned>& pids ) ;
-
-    void multifilm_lookup_all( quad2* sample , quad2* result ,  unsigned width, unsigned height );
-
-    static std::string Desc(char delim='\n');
+    static std::string Desc(char delim = '\n');
     static std::string Switches();
 };
-
-
diff --git a/qudarap/QU.cc b/qudarap/QU.cc
index 97aacf985..82fde9e2c 100644
--- a/qudarap/QU.cc
+++ b/qudarap/QU.cc
@@ -3,9 +3,9 @@
 #include "NP.hh"
 #include "SLOG.hh"
 
-#include "spath.h"
-#include "sdirectory.h"
 #include "scuda.h"
+#include "sdirectory.h"
+#include "spath.h"
 #include "squad.h"
 #include "ssys.h"
 
@@ -17,100 +17,91 @@
 #include "sphoton.h"
 #include "sphotonlite.h"
 
-#include "sevent.h"
-#include "salloc.h"
 #include "SEventConfig.hh"
+#include "salloc.h"
+#include "sevent.h"
 
-#include "QUDA_CHECK.h"
 #include "QU.hh"
+#include "QUDA_CHECK.h"
 
 #include "curand_kernel.h"
 #include "qrng.h"
 #include "qsim.h"
 
 #include "qbase.h"
-#include "qprop.h"
-#include "qpmt.h"
-#include "qdebug.h"
-#include "qscint.h"
 #include "qcerenkov.h"
 #include "qcurandwrap.h"
-#include "scurandref.h"
+#include "qdebug.h"
 #include "qmultifilm.h"
+#include "qpmt.h"
+#include "qprop.h"
+#include "qscint.h"
+#include "qwls.h"
+#include "scurandref.h"
 
-
-const plog::Severity QU::LEVEL = SLOG::EnvLevel("QU", "DEBUG") ;
+const plog::Severity QU::LEVEL = SLOG::EnvLevel("QU", "DEBUG");
 bool QU::MEMCHECK = ssys::getenvbool(_MEMCHECK);
 
-salloc* QU::alloc = nullptr ;
-
+salloc *QU::alloc = nullptr;
 
-void QU::alloc_add(const char* label, uint64_t num_items, uint64_t sizeof_item ) // static
+void QU::alloc_add(const char *label, uint64_t num_items, uint64_t sizeof_item) // static
 {
-   if(!alloc) alloc = SEventConfig::ALLOC ;
-   if(alloc ) alloc->add(label, num_items, sizeof_item );
+    if (!alloc)
+        alloc = SEventConfig::ALLOC;
+    if (alloc)
+        alloc->add(label, num_items, sizeof_item);
 }
 
-
-template <typename T>
-char QU::typecode()
+template <typename T> char QU::typecode()
 {
-    char c = '?' ;
-    switch(sizeof(T))
+    char c = '?';
+    switch (sizeof(T))
     {
-        case 4: c = 'f' ; break ;
-        case 8: c = 'd' ; break ;
+    case 4:
+        c = 'f';
+        break;
+    case 8:
+        c = 'd';
+        break;
     }
-    return c ;
+    return c;
 }
 
-template char QU::typecode<float>() ;
-template char QU::typecode<double>() ;
-
+template char QU::typecode<float>();
+template char QU::typecode<double>();
 
 template <typename T>
-std::string QU::rng_sequence_name(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset ) // static
+std::string QU::rng_sequence_name(const char *prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset) // static
 {
-    std::stringstream ss ;
-    ss << prefix
-       << "_" << QU::typecode<T>()
-       << "_ni" << ni
-       << "_nj" << nj
-       << "_nk" << nk
-       << "_ioffset" << std::setw(6) << std::setfill('0') << ioffset
-       << ".npy"
-       ;
+    std::stringstream ss;
+    ss << prefix << "_" << QU::typecode<T>() << "_ni" << ni << "_nj" << nj << "_nk" << nk << "_ioffset" << std::setw(6)
+       << std::setfill('0') << ioffset << ".npy";
 
     std::string name = ss.str();
-    return name ;
+    return name;
 }
 
-template std::string QU::rng_sequence_name<float>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset ) ;
-template std::string QU::rng_sequence_name<double>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset ) ;
-
-
+template std::string QU::rng_sequence_name<float>(const char *prefix, unsigned ni, unsigned nj, unsigned nk,
+                                                  unsigned ioffset);
+template std::string QU::rng_sequence_name<double>(const char *prefix, unsigned ni, unsigned nj, unsigned nk,
+                                                   unsigned ioffset);
 
 template <typename T>
-std::string QU::rng_sequence_reldir(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size ) // static
+std::string QU::rng_sequence_reldir(const char *prefix, unsigned ni, unsigned nj, unsigned nk,
+                                    unsigned ni_tranche_size) // static
 {
-    std::stringstream ss ;
-    ss << prefix
-       << "_" << QU::typecode<T>()
-       << "_ni" << ni
-       << "_nj" << nj
-       << "_nk" << nk
-       << "_tranche" << ni_tranche_size
-       ;
+    std::stringstream ss;
+    ss << prefix << "_" << QU::typecode<T>() << "_ni" << ni << "_nj" << nj << "_nk" << nk << "_tranche"
+       << ni_tranche_size;
 
     std::string reldir = ss.str();
-    return reldir ;
+    return reldir;
 }
 
-template std::string QU::rng_sequence_reldir<float>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size ) ;
-template std::string QU::rng_sequence_reldir<double>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size ) ;
-
-
-
+template std::string QU::rng_sequence_reldir<float>(const char *prefix, unsigned ni, unsigned nj, unsigned nk,
+                                                    unsigned ni_tranche_size);
+template std::string QU::rng_sequence_reldir<double>(const char *prefix, unsigned ni, unsigned nj, unsigned nk,
+                                                     unsigned ni_tranche_size);
 
 /**
 QU::UploadArray
@@ -120,61 +111,52 @@ Allocate on device and copy from host to device
 
 **/
 
-template <typename T>
-T* QU::UploadArray(const T* array, unsigned num_items, const char* label ) // static
+template <typename T> T *QU::UploadArray(const T *array, unsigned num_items, const char *label) // static
 {
-    size_t size = num_items*sizeof(T) ;
-
-    LOG(LEVEL)
-       << " num_items " << num_items
-       << " size " << size
-       << " label " << ( label ? label : "-" )
-       ;
+    size_t size = num_items * sizeof(T);
 
-    LOG_IF(info, MEMCHECK)
-       << " num_items " << num_items
-       << " size " << size
-       << " label " << ( label ? label : "-" )
-       ;
+    LOG(LEVEL) << " num_items " << num_items << " size " << size << " label " << (label ? label : "-");
 
+    LOG_IF(info, MEMCHECK) << " num_items " << num_items << " size " << size << " label " << (label ? label : "-");
 
-    alloc_add( label, num_items, sizeof(T) ) ;
+    alloc_add(label, num_items, sizeof(T));
 
-    T* d_array = nullptr ;
-    QUDA_CHECK( cudaMalloc(reinterpret_cast<void**>( &d_array ), size ));
-    QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( d_array ), array, size, cudaMemcpyHostToDevice ));
-    return d_array ;
+    T *d_array = nullptr;
+    QUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_array), size));
+    QUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_array), array, size, cudaMemcpyHostToDevice));
+    return d_array;
 }
 
-
 // IF NEED THESE FROM REMOVE PKG WILL NEED TO QUDARAP_API
-template float*         QU::UploadArray<float>(const float* array, unsigned num_items, const char* label ) ;
-template double*        QU::UploadArray<double>(const double* array, unsigned num_items, const char* label) ;
-template unsigned*      QU::UploadArray<unsigned>(const unsigned* array, unsigned num_items, const char* label) ;
-template int*           QU::UploadArray<int>(const int* array, unsigned num_items, const char* label) ;
-template quad4*         QU::UploadArray<quad4>(const quad4* array, unsigned num_items, const char* label) ;
-template sphoton*       QU::UploadArray<sphoton>(const sphoton* array, unsigned num_items, const char* label) ;
-template sphotonlite*   QU::UploadArray<sphotonlite>(const sphotonlite* array, unsigned num_items, const char* label) ;
-template quad2*         QU::UploadArray<quad2>(const quad2* array, unsigned num_items, const char* label) ;
-template XORWOW*        QU::UploadArray<XORWOW>(const XORWOW* array, unsigned num_items, const char* label) ;
-template Philox*        QU::UploadArray<Philox>(const Philox* array, unsigned num_items, const char* label) ;
-template qcurandwrap<XORWOW>*   QU::UploadArray<qcurandwrap<XORWOW>>(const qcurandwrap<XORWOW>* array, unsigned num_items, const char* label) ;
-template scurandref<XORWOW>*    QU::UploadArray<scurandref<XORWOW>>(const scurandref<XORWOW>* array, unsigned num_items, const char* label) ;
-template qsim*          QU::UploadArray<qsim>(const qsim* array, unsigned num_items, const char* label) ;
-template qprop<float>*  QU::UploadArray<qprop<float>>(const qprop<float>* array, unsigned num_items, const char* label) ;
-template qprop<double>* QU::UploadArray<qprop<double>>(const qprop<double>* array, unsigned num_items, const char* label) ;
-template qpmt<float>*   QU::UploadArray<qpmt<float>>(const qpmt<float>* array, unsigned num_items, const char* label) ;
-template qpmt<double>*  QU::UploadArray<qpmt<double>>(const qpmt<double>* array, unsigned num_items, const char* label) ;
-template qmultifilm*    QU::UploadArray<qmultifilm>(const qmultifilm* array, unsigned num_items, const char* label) ;
-template qrng<RNG>*     QU::UploadArray<qrng<RNG>>(const qrng<RNG>* array, unsigned num_items, const char* label) ;
-template qbnd*          QU::UploadArray<qbnd>(const qbnd* array, unsigned num_items, const char* label) ;
-template sevent*        QU::UploadArray<sevent>(const sevent* array, unsigned num_items, const char* label) ;
-template qdebug*        QU::UploadArray<qdebug>(const qdebug* array, unsigned num_items, const char* label) ;
-template qscint*        QU::UploadArray<qscint>(const qscint* array, unsigned num_items, const char* label) ;
-template qcerenkov*     QU::UploadArray<qcerenkov>(const qcerenkov* array, unsigned num_items, const char* label) ;
-template qbase*         QU::UploadArray<qbase>(const qbase* array, unsigned num_items, const char* label) ;
-
-
+template float *QU::UploadArray<float>(const float *array, unsigned num_items, const char *label);
+template double *QU::UploadArray<double>(const double *array, unsigned num_items, const char *label);
+template unsigned *QU::UploadArray<unsigned>(const unsigned *array, unsigned num_items, const char *label);
+template int *QU::UploadArray<int>(const int *array, unsigned num_items, const char *label);
+template quad4 *QU::UploadArray<quad4>(const quad4 *array, unsigned num_items, const char *label);
+template sphoton *QU::UploadArray<sphoton>(const sphoton *array, unsigned num_items, const char *label);
+template sphotonlite *QU::UploadArray<sphotonlite>(const sphotonlite *array, unsigned num_items, const char *label);
+template quad2 *QU::UploadArray<quad2>(const quad2 *array, unsigned num_items, const char *label);
+template XORWOW *QU::UploadArray<XORWOW>(const XORWOW *array, unsigned num_items, const char *label);
+template Philox *QU::UploadArray<Philox>(const Philox *array, unsigned num_items, const char *label);
+template qcurandwrap<XORWOW> *QU::UploadArray<qcurandwrap<XORWOW>>(const qcurandwrap<XORWOW> *array, unsigned num_items,
+                                                                   const char *label);
+template scurandref<XORWOW> *QU::UploadArray<scurandref<XORWOW>>(const scurandref<XORWOW> *array, unsigned num_items,
+                                                                 const char *label);
+template qsim *QU::UploadArray<qsim>(const qsim *array, unsigned num_items, const char *label);
+template qprop<float> *QU::UploadArray<qprop<float>>(const qprop<float> *array, unsigned num_items, const char *label);
+template qprop<double> *QU::UploadArray<qprop<double>>(const qprop<double> *array, unsigned num_items,
+                                                       const char *label);
+template qpmt<float> *QU::UploadArray<qpmt<float>>(const qpmt<float> *array, unsigned num_items, const char *label);
+template qpmt<double> *QU::UploadArray<qpmt<double>>(const qpmt<double> *array, unsigned num_items, const char *label);
+template qmultifilm *QU::UploadArray<qmultifilm>(const qmultifilm *array, unsigned num_items, const char *label);
+template qrng<RNG> *QU::UploadArray<qrng<RNG>>(const qrng<RNG> *array, unsigned num_items, const char *label);
+template qbnd *QU::UploadArray<qbnd>(const qbnd *array, unsigned num_items, const char *label);
+template sevent *QU::UploadArray<sevent>(const sevent *array, unsigned num_items, const char *label);
+template qdebug *QU::UploadArray<qdebug>(const qdebug *array, unsigned num_items, const char *label);
+template qscint *QU::UploadArray<qscint>(const qscint *array, unsigned num_items, const char *label);
+template qwls *QU::UploadArray<qwls>(const qwls *array, unsigned num_items, const char *label);
+template qcerenkov *QU::UploadArray<qcerenkov>(const qcerenkov *array, unsigned num_items, const char *label);
+template qbase *QU::UploadArray<qbase>(const qbase *array, unsigned num_items, const char *label);
 
 /**
 QU::DownloadArray
@@ -184,65 +166,57 @@ Allocate on host and copy from device to host
 
 **/
 
-template <typename T>
-T* QU::DownloadArray(const T* d_array, unsigned num_items ) // static
+template <typename T> T *QU::DownloadArray(const T *d_array, unsigned num_items) // static
 {
-    T* array = new T[num_items] ;
-    QUDA_CHECK( cudaMemcpy( array, d_array, sizeof(T)*num_items, cudaMemcpyDeviceToHost ));
-    return array ;
+    T *array = new T[num_items];
+    QUDA_CHECK(cudaMemcpy(array, d_array, sizeof(T) * num_items, cudaMemcpyDeviceToHost));
+    return array;
 }
 
-
-template  float*         QU::DownloadArray<float>(const float* d_array, unsigned num_items) ;
-template  unsigned*      QU::DownloadArray<unsigned>(const unsigned* d_array, unsigned num_items) ;
-template  int*           QU::DownloadArray<int>(const int* d_array, unsigned num_items) ;
-template  quad4*         QU::DownloadArray<quad4>(const quad4* d_array, unsigned num_items) ;
-template  quad2*         QU::DownloadArray<quad2>(const quad2* d_array, unsigned num_items) ;
-template  XORWOW*        QU::DownloadArray<XORWOW>(const XORWOW* d_array, unsigned num_items) ;
-template  Philox*        QU::DownloadArray<Philox>(const Philox* d_array, unsigned num_items) ;
-template  qprop<float>*  QU::DownloadArray<qprop<float>>(const qprop<float>* d_array, unsigned num_items) ;
-template  qprop<double>* QU::DownloadArray<qprop<double>>(const qprop<double>* d_array, unsigned num_items) ;
-
-
-template <typename T>
-void QU::Download(std::vector<T>& vec, const T* d_array, unsigned num_items)  // static
+template float *QU::DownloadArray<float>(const float *d_array, unsigned num_items);
+template unsigned *QU::DownloadArray<unsigned>(const unsigned *d_array, unsigned num_items);
+template int *QU::DownloadArray<int>(const int *d_array, unsigned num_items);
+template quad4 *QU::DownloadArray<quad4>(const quad4 *d_array, unsigned num_items);
+template quad2 *QU::DownloadArray<quad2>(const quad2 *d_array, unsigned num_items);
+template XORWOW *QU::DownloadArray<XORWOW>(const XORWOW *d_array, unsigned num_items);
+template Philox *QU::DownloadArray<Philox>(const Philox *d_array, unsigned num_items);
+template qprop<float> *QU::DownloadArray<qprop<float>>(const qprop<float> *d_array, unsigned num_items);
+template qprop<double> *QU::DownloadArray<qprop<double>>(const qprop<double> *d_array, unsigned num_items);
+
+template <typename T> void QU::Download(std::vector<T> &vec, const T *d_array, unsigned num_items) // static
 {
-    vec.resize( num_items);
-    QUDA_CHECK( cudaMemcpy( static_cast<void*>( vec.data() ), d_array, num_items*sizeof(T), cudaMemcpyDeviceToHost));
+    vec.resize(num_items);
+    QUDA_CHECK(cudaMemcpy(static_cast<void *>(vec.data()), d_array, num_items * sizeof(T), cudaMemcpyDeviceToHost));
 }
 
+template QUDARAP_API void QU::Download<float>(std::vector<float> &vec, const float *d_array, unsigned num_items);
+template QUDARAP_API void QU::Download<unsigned>(std::vector<unsigned> &vec, const unsigned *d_array,
+                                                 unsigned num_items);
+template QUDARAP_API void QU::Download<int>(std::vector<int> &vec, const int *d_array, unsigned num_items);
+template QUDARAP_API void QU::Download<uchar4>(std::vector<uchar4> &vec, const uchar4 *d_array, unsigned num_items);
+template QUDARAP_API void QU::Download<float4>(std::vector<float4> &vec, const float4 *d_array, unsigned num_items);
+template QUDARAP_API void QU::Download<quad4>(std::vector<quad4> &vec, const quad4 *d_array, unsigned num_items);
 
-template QUDARAP_API void QU::Download<float>(   std::vector<float>& vec,    const float* d_array,    unsigned num_items);
-template QUDARAP_API void QU::Download<unsigned>(std::vector<unsigned>& vec, const unsigned* d_array, unsigned num_items);
-template QUDARAP_API void QU::Download<int>(     std::vector<int>& vec,      const int* d_array,      unsigned num_items);
-template QUDARAP_API void QU::Download<uchar4>(  std::vector<uchar4>& vec,   const uchar4* d_array,   unsigned num_items);
-template QUDARAP_API void QU::Download<float4>(  std::vector<float4>& vec,   const float4* d_array,   unsigned num_items);
-template QUDARAP_API void QU::Download<quad4>(   std::vector<quad4>& vec,    const quad4* d_array,    unsigned num_items);
-
-
-
-template<typename T>
-void QU::device_free_and_alloc(T** dd, unsigned num_items ) // dd: pointer-to-device-pointer
+template <typename T> void QU::device_free_and_alloc(T **dd, unsigned num_items) // dd: pointer-to-device-pointer
 {
-    size_t size = num_items*sizeof(T) ;
-    LOG_IF(info, MEMCHECK) << " size " << size << " num_items " << num_items ;
+    size_t size = num_items * sizeof(T);
+    LOG_IF(info, MEMCHECK) << " size " << size << " num_items " << num_items;
 
-    QUDA_CHECK( cudaFree( reinterpret_cast<void*>( *dd ) ) );
-    QUDA_CHECK( cudaMalloc(reinterpret_cast<void**>( dd ), size ));
-    assert( *dd );
+    QUDA_CHECK(cudaFree(reinterpret_cast<void *>(*dd)));
+    QUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(dd), size));
+    assert(*dd);
 }
 
+template QUDARAP_API void QU::device_free_and_alloc<float>(float **dd, unsigned num_items);
+template QUDARAP_API void QU::device_free_and_alloc<double>(double **dd, unsigned num_items);
+template QUDARAP_API void QU::device_free_and_alloc<unsigned>(unsigned **dd, unsigned num_items);
+template QUDARAP_API void QU::device_free_and_alloc<int>(int **dd, unsigned num_items);
+template QUDARAP_API void QU::device_free_and_alloc<quad>(quad **dd, unsigned num_items);
+template QUDARAP_API void QU::device_free_and_alloc<uchar4>(uchar4 **dd, unsigned num_items);
+template QUDARAP_API void QU::device_free_and_alloc<float4>(float4 **dd, unsigned num_items);
+template QUDARAP_API void QU::device_free_and_alloc<quad4>(quad4 **dd, unsigned num_items);
 
-template QUDARAP_API void  QU::device_free_and_alloc<float>(float** dd, unsigned num_items) ;
-template QUDARAP_API void  QU::device_free_and_alloc<double>(double** dd, unsigned num_items) ;
-template QUDARAP_API void  QU::device_free_and_alloc<unsigned>(unsigned** dd, unsigned num_items) ;
-template QUDARAP_API void  QU::device_free_and_alloc<int>(int** dd, unsigned num_items) ;
-template QUDARAP_API void  QU::device_free_and_alloc<quad>(quad** dd, unsigned num_items) ;
-template QUDARAP_API void  QU::device_free_and_alloc<uchar4>(uchar4** dd, unsigned num_items) ;
-template QUDARAP_API void  QU::device_free_and_alloc<float4>(float4** dd, unsigned num_items) ;
-template QUDARAP_API void  QU::device_free_and_alloc<quad4>(quad4** dd, unsigned num_items) ;
-
-const char* QU::_cudaMalloc_OOM_NOTES = R"( ;
+const char *QU::_cudaMalloc_OOM_NOTES = R"( ;
 QU::_cudaMalloc_OOM_NOTES
 ==========================
 
@@ -255,239 +229,190 @@ One million is typically reasonable for debugging::
 
    export OPTICKS_MAX_SLOT=M1
 
-)" ;
-
-
+)";
 
-
-void QU::_cudaMalloc( void** p2p, size_t size, const char* label )
+void QU::_cudaMalloc(void **p2p, size_t size, const char *label) // cudaMalloc wrapper: on failure logs a report and throws
 {
-    cudaError_t err = cudaMalloc(p2p, size ) ;
-    if( err != cudaSuccess )
+    cudaError_t err = cudaMalloc(p2p, size);
+    if (err != cudaSuccess)
     {
-        const char* out = spath::Resolve("$DefaultOutputDir") ;
-        salloc* estimate = SEventConfig::AllocEstimate();
+        const char *out = spath::Resolve("$DefaultOutputDir");
+        salloc *estimate = SEventConfig::AllocEstimate();
 
         std::stringstream ss;
-        ss << "CUDA call (" << label << " ) failed with error: '"
-           << cudaGetErrorString( err )
-           << "' (" __FILE__ << ":" << __LINE__ << ")"
-           << "\n\n"
-           << "[SEventConfig::DescEventMode (use of DebugHeavy/DebugLite EventMode with high stats is typical cause of OOM errors)\n"
+        ss << "CUDA call (" << label << " ) failed with error: '" << cudaGetErrorString(err) << "' (" __FILE__ << ":"
+           << __LINE__ << ")" << "\n\n"
+           << "[SEventConfig::DescEventMode (use of DebugHeavy/DebugLite EventMode with high stats is typical cause of "
+              "OOM errors)\n"
            << SEventConfig::DescEventMode()
-           << "]SEventConfig::DescEventMode (use of DebugHeavy/DebugLite EventMode with high stats is typical cause of OOM errors)\n"
+           << "]SEventConfig::DescEventMode (use of DebugHeavy/DebugLite EventMode with high stats is typical cause of "
+              "OOM errors)\n"
            << "\n\n"
            << "[alloc.desc\n"
-           << ( alloc ? alloc->desc() : "no-alloc" )
-           << "]alloc.desc\n"
+           << (alloc ? alloc->desc() : "no-alloc") << "]alloc.desc\n"
            << "\n"
            << "[NOTES\n"
-           << _cudaMalloc_OOM_NOTES
-           << "]NOTES\n"
+           << _cudaMalloc_OOM_NOTES << "]NOTES\n"
            << "\n\n"
            << "[SEventConfig::AllocEstimate\n"
-           << ( estimate ? estimate->desc() : "no-estimate" )
-           << "]SEventConfig::AllocEstimate\n"
-           << "save salloc record to [" << out << "]\n" ;
-           ;
+           << (estimate ? estimate->desc() : "no-estimate") << "]SEventConfig::AllocEstimate\n"
+           << "save salloc record to [" << out << "]\n";
+        // alloc may be null (the report above falls back to "no-alloc"), so it is checked before saving below
 
         std::string msg = ss.str();
-        LOG(error) << msg ;
+        LOG(error) << msg;
 
-        sdirectory::MakeDirs(out,0);
-        alloc->save(out) ;
+        sdirectory::MakeDirs(out, 0);
+        if (alloc) alloc->save(out);
 
-        throw QUDA_Exception( msg.c_str() );
+        throw QUDA_Exception(msg.c_str());
     }
 }
 
-
-template<typename T>
-T* QU::device_alloc( unsigned num_items, const char* label )
+template <typename T> T *QU::device_alloc(unsigned num_items, const char *label)
 {
-    size_t size = num_items*sizeof(T) ;
+    size_t size = num_items * sizeof(T);
 
-    LOG(LEVEL)
-        << " num_items " << std::setw(10) << num_items
-        << " size " << std::setw(10) << size
-        << " label " << std::setw(15) << label
-        ;
+    LOG(LEVEL) << " num_items " << std::setw(10) << num_items << " size " << std::setw(10) << size << " label "
+               << std::setw(15) << label;
 
-    LOG_IF(info, MEMCHECK)
-        << " num_items " << std::setw(10) << num_items
-        << " size " << std::setw(10) << size
-        << " label " << std::setw(15) << label
-        ;
+    LOG_IF(info, MEMCHECK) << " num_items " << std::setw(10) << num_items << " size " << std::setw(10) << size
+                           << " label " << std::setw(15) << label;
 
+    alloc_add(label, num_items, sizeof(T));
 
-    alloc_add( label, num_items, sizeof(T) ) ;
+    T *d;
+    _cudaMalloc(reinterpret_cast<void **>(&d), size, label);
 
-    T* d ;
-    _cudaMalloc( reinterpret_cast<void**>( &d ), size, label );
-
-    return d ;
+    return d;
 }
 
-template QUDARAP_API float*     QU::device_alloc<float>(unsigned num_items, const char* label) ;
-template QUDARAP_API double*    QU::device_alloc<double>(unsigned num_items, const char* label) ;
-template QUDARAP_API unsigned*  QU::device_alloc<unsigned>(unsigned num_items, const char* label) ;
-template QUDARAP_API int*       QU::device_alloc<int>(unsigned num_items, const char* label) ;
-template QUDARAP_API uchar4*    QU::device_alloc<uchar4>(unsigned num_items, const char* label) ;
-template QUDARAP_API float4*    QU::device_alloc<float4>(unsigned num_items, const char* label) ;
-template QUDARAP_API quad*      QU::device_alloc<quad>(unsigned num_items, const char* label) ;
-template QUDARAP_API quad2*     QU::device_alloc<quad2>(unsigned num_items, const char* label) ;
-template QUDARAP_API quad4*     QU::device_alloc<quad4>(unsigned num_items, const char* label) ;
-template QUDARAP_API quad6*     QU::device_alloc<quad6>(unsigned num_items, const char* label) ;
-template QUDARAP_API sevent*    QU::device_alloc<sevent>(unsigned num_items, const char* label) ;
-template QUDARAP_API qdebug*    QU::device_alloc<qdebug>(unsigned num_items, const char* label) ;
-template QUDARAP_API sstate*    QU::device_alloc<sstate>(unsigned num_items, const char* label) ;
-template QUDARAP_API XORWOW*    QU::device_alloc<XORWOW>(unsigned num_items, const char* label) ;
-template QUDARAP_API Philox*    QU::device_alloc<Philox>(unsigned num_items, const char* label) ;
+template QUDARAP_API float *QU::device_alloc<float>(unsigned num_items, const char *label);
+template QUDARAP_API double *QU::device_alloc<double>(unsigned num_items, const char *label);
+template QUDARAP_API unsigned *QU::device_alloc<unsigned>(unsigned num_items, const char *label);
+template QUDARAP_API int *QU::device_alloc<int>(unsigned num_items, const char *label);
+template QUDARAP_API uchar4 *QU::device_alloc<uchar4>(unsigned num_items, const char *label);
+template QUDARAP_API float4 *QU::device_alloc<float4>(unsigned num_items, const char *label);
+template QUDARAP_API quad *QU::device_alloc<quad>(unsigned num_items, const char *label);
+template QUDARAP_API quad2 *QU::device_alloc<quad2>(unsigned num_items, const char *label);
+template QUDARAP_API quad4 *QU::device_alloc<quad4>(unsigned num_items, const char *label);
+template QUDARAP_API quad6 *QU::device_alloc<quad6>(unsigned num_items, const char *label);
+template QUDARAP_API sevent *QU::device_alloc<sevent>(unsigned num_items, const char *label);
+template QUDARAP_API qdebug *QU::device_alloc<qdebug>(unsigned num_items, const char *label);
+template QUDARAP_API sstate *QU::device_alloc<sstate>(unsigned num_items, const char *label);
+template QUDARAP_API XORWOW *QU::device_alloc<XORWOW>(unsigned num_items, const char *label);
+template QUDARAP_API Philox *QU::device_alloc<Philox>(unsigned num_items, const char *label);
 
 #ifndef PRODUCTION
-template QUDARAP_API srec*      QU::device_alloc<srec>(unsigned num_items, const char* label) ;
-template QUDARAP_API sseq*      QU::device_alloc<sseq>(unsigned num_items, const char* label) ;
+template QUDARAP_API srec *QU::device_alloc<srec>(unsigned num_items, const char *label);
+template QUDARAP_API sseq *QU::device_alloc<sseq>(unsigned num_items, const char *label);
 #endif
 
-template QUDARAP_API sphoton*   QU::device_alloc<sphoton>(unsigned num_items, const char* label) ;
-template QUDARAP_API sphotonlite*   QU::device_alloc<sphotonlite>(unsigned num_items, const char* label) ;
-
+template QUDARAP_API sphoton *QU::device_alloc<sphoton>(unsigned num_items, const char *label);
+template QUDARAP_API sphotonlite *QU::device_alloc<sphotonlite>(unsigned num_items, const char *label);
 
-template<typename T>
-T* QU::device_alloc_zero(unsigned num_items, const char* label)
+template <typename T> T *QU::device_alloc_zero(unsigned num_items, const char *label)
 {
-    size_t size = num_items*sizeof(T) ;
+    size_t size = num_items * sizeof(T);
 
-    LOG(LEVEL)
-        << " num_items " << std::setw(10) << num_items
-        << " sizeof(T) " << std::setw(10) << sizeof(T)
-        << " size " << std::setw(10) << size
-        << " label " << std::setw(15) << label
-        ;
+    LOG(LEVEL) << " num_items " << std::setw(10) << num_items << " sizeof(T) " << std::setw(10) << sizeof(T) << " size "
+               << std::setw(10) << size << " label " << std::setw(15) << label;
 
-    LOG_IF(info, MEMCHECK)
-        << " num_items " << std::setw(10) << num_items
-        << " sizeof(T) " << std::setw(10) << sizeof(T)
-        << " size " << std::setw(10) << size
-        << " label " << std::setw(15) << label
-        ;
+    LOG_IF(info, MEMCHECK) << " num_items " << std::setw(10) << num_items << " sizeof(T) " << std::setw(10) << sizeof(T)
+                           << " size " << std::setw(10) << size << " label " << std::setw(15) << label;
 
+    alloc_add(label, num_items, sizeof(T));
 
-    alloc_add( label, num_items, sizeof(T) ) ;
+    T *d;
+    _cudaMalloc(reinterpret_cast<void **>(&d), size, label);
 
-    T* d ;
-    _cudaMalloc( reinterpret_cast<void**>( &d ), size, label );
+    int value = 0;
+    QUDA_CHECK(cudaMemset(d, value, size));
 
-    int value = 0 ;
-    QUDA_CHECK( cudaMemset(d, value, size ));
-
-    return d ;
+    return d;
 }
 
-template QUDARAP_API sphoton*   QU::device_alloc_zero<sphoton>(unsigned num_items, const char* label) ;
-template QUDARAP_API sphotonlite*   QU::device_alloc_zero<sphotonlite>(unsigned num_items, const char* label) ;
-template QUDARAP_API quad2*     QU::device_alloc_zero<quad2>(  unsigned num_items, const char* label) ;
-template QUDARAP_API XORWOW*    QU::device_alloc_zero<XORWOW>(  unsigned num_items, const char* label) ;
-template QUDARAP_API Philox*    QU::device_alloc_zero<Philox>(  unsigned num_items, const char* label) ;
+template QUDARAP_API sphoton *QU::device_alloc_zero<sphoton>(unsigned num_items, const char *label);
+template QUDARAP_API sphotonlite *QU::device_alloc_zero<sphotonlite>(unsigned num_items, const char *label);
+template QUDARAP_API quad2 *QU::device_alloc_zero<quad2>(unsigned num_items, const char *label);
+template QUDARAP_API XORWOW *QU::device_alloc_zero<XORWOW>(unsigned num_items, const char *label);
+template QUDARAP_API Philox *QU::device_alloc_zero<Philox>(unsigned num_items, const char *label);
 
 #ifndef PRODUCTION
-template QUDARAP_API srec*      QU::device_alloc_zero<srec>(   unsigned num_items, const char* label) ;
-template QUDARAP_API sseq*      QU::device_alloc_zero<sseq>(   unsigned num_items, const char* label) ;
-template QUDARAP_API stag*      QU::device_alloc_zero<stag>(   unsigned num_items, const char* label) ;
-template QUDARAP_API sflat*     QU::device_alloc_zero<sflat>(  unsigned num_items, const char* label) ;
+template QUDARAP_API srec *QU::device_alloc_zero<srec>(unsigned num_items, const char *label);
+template QUDARAP_API sseq *QU::device_alloc_zero<sseq>(unsigned num_items, const char *label);
+template QUDARAP_API stag *QU::device_alloc_zero<stag>(unsigned num_items, const char *label);
+template QUDARAP_API sflat *QU::device_alloc_zero<sflat>(unsigned num_items, const char *label);
 #endif
 
-
-
-
-template<typename T>
-void QU::device_memset( T* d, int value, unsigned num_items )
+template <typename T> void QU::device_memset(T *d, int value, unsigned num_items)
 {
-    size_t size = num_items*sizeof(T) ;
+    size_t size = num_items * sizeof(T);
 
-    LOG_IF(info, MEMCHECK)
-        << " num_items " << std::setw(10) << num_items
-        << " sizeof(T) " << std::setw(10) << sizeof(T)
-        << " size " << std::setw(10) << size
-        ;
+    LOG_IF(info, MEMCHECK) << " num_items " << std::setw(10) << num_items << " sizeof(T) " << std::setw(10) << sizeof(T)
+                           << " size " << std::setw(10) << size;
 
-    QUDA_CHECK( cudaMemset(d, value, size ));
+    QUDA_CHECK(cudaMemset(d, value, size));
 }
 
-template QUDARAP_API void     QU::device_memset<int>(int*, int, unsigned ) ;
-template QUDARAP_API void     QU::device_memset<quad4>(quad4*, int, unsigned ) ;
-template QUDARAP_API void     QU::device_memset<quad6>(quad6*, int, unsigned ) ;
-template QUDARAP_API void     QU::device_memset<sphoton>(sphoton*, int, unsigned ) ;
-template QUDARAP_API void     QU::device_memset<sphotonlite>(sphotonlite*, int, unsigned ) ;
-
-
-
-
+template QUDARAP_API void QU::device_memset<int>(int *, int, unsigned);
+template QUDARAP_API void QU::device_memset<quad4>(quad4 *, int, unsigned);
+template QUDARAP_API void QU::device_memset<quad6>(quad6 *, int, unsigned);
+template QUDARAP_API void QU::device_memset<sphoton>(sphoton *, int, unsigned);
+template QUDARAP_API void QU::device_memset<sphotonlite>(sphotonlite *, int, unsigned);
 
-
-
-
-
-
-template<typename T>
-void QU::device_free( T* d)
+template <typename T> void QU::device_free(T *d)
 {
-    LOG_IF(info, MEMCHECK) ;
+    LOG_IF(info, MEMCHECK);
     // HMM: could use salloc to find the label ?
 
-    QUDA_CHECK( cudaFree(d) );
+    QUDA_CHECK(cudaFree(d));
 }
 
-template QUDARAP_API void   QU::device_free<float>(float*) ;
-template QUDARAP_API void   QU::device_free<double>(double*) ;
-template QUDARAP_API void   QU::device_free<unsigned>(unsigned*) ;
-template QUDARAP_API void   QU::device_free<quad2>(quad2*) ;
-template QUDARAP_API void   QU::device_free<quad4>(quad4*) ;
-template QUDARAP_API void   QU::device_free<sphoton>(sphoton*) ;
-template QUDARAP_API void   QU::device_free<sphotonlite>(sphotonlite*) ;
-template QUDARAP_API void   QU::device_free<uchar4>(uchar4*) ;
-template QUDARAP_API void   QU::device_free<XORWOW>(XORWOW*) ;
-template QUDARAP_API void   QU::device_free<Philox>(Philox*) ;
-
-
-template<typename T>
-int QU::copy_device_to_host( T* h, T* d,  unsigned num_items)
+template QUDARAP_API void QU::device_free<float>(float *);
+template QUDARAP_API void QU::device_free<double>(double *);
+template QUDARAP_API void QU::device_free<unsigned>(unsigned *);
+template QUDARAP_API void QU::device_free<quad2>(quad2 *);
+template QUDARAP_API void QU::device_free<quad4>(quad4 *);
+template QUDARAP_API void QU::device_free<sphoton>(sphoton *);
+template QUDARAP_API void QU::device_free<sphotonlite>(sphotonlite *);
+template QUDARAP_API void QU::device_free<uchar4>(uchar4 *);
+template QUDARAP_API void QU::device_free<XORWOW>(XORWOW *);
+template QUDARAP_API void QU::device_free<Philox>(Philox *);
+
+template <typename T> int QU::copy_device_to_host(T *h, T *d, unsigned num_items)
 {
-    if( d == nullptr ) std::cerr
-        << "QU::copy_device_to_host"
-        << " ERROR : device pointer is null "
-        << std::endl
-        ;
+    if (d == nullptr)
+        std::cerr << "QU::copy_device_to_host" << " ERROR : device pointer is null " << std::endl;
 
-    if( d == nullptr ) return 1 ;
+    if (d == nullptr)
+        return 1;
 
-    size_t size = num_items*sizeof(T) ;
-    QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( h ), d , size, cudaMemcpyDeviceToHost ));
+    size_t size = num_items * sizeof(T);
+    QUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(h), d, size, cudaMemcpyDeviceToHost));
 
-    return 0 ;
+    return 0;
 }
 
-
-template int QU::copy_device_to_host<int>(  int* h, int* d,  unsigned num_items);
-template int QU::copy_device_to_host<float>(  float* h, float* d,  unsigned num_items);
-template int QU::copy_device_to_host<double>( double* h, double* d,  unsigned num_items);
-template int QU::copy_device_to_host<quad>( quad* h, quad* d,  unsigned num_items);
-template int QU::copy_device_to_host<quad2>( quad2* h, quad2* d,  unsigned num_items);
-template int QU::copy_device_to_host<quad4>( quad4* h, quad4* d,  unsigned num_items);
-template int QU::copy_device_to_host<sphoton>( sphoton* h, sphoton* d,  unsigned num_items);
-template int QU::copy_device_to_host<sphotonlite>( sphotonlite* h, sphotonlite* d,  unsigned num_items);
-template int QU::copy_device_to_host<quad6>( quad6* h, quad6* d,  unsigned num_items);
-template int QU::copy_device_to_host<sstate>( sstate* h, sstate* d,  unsigned num_items);
-template int QU::copy_device_to_host<XORWOW>( XORWOW* h, XORWOW* d,  unsigned num_items);
-template int QU::copy_device_to_host<Philox>( Philox* h, Philox* d,  unsigned num_items);
+template int QU::copy_device_to_host<int>(int *h, int *d, unsigned num_items);
+template int QU::copy_device_to_host<float>(float *h, float *d, unsigned num_items);
+template int QU::copy_device_to_host<double>(double *h, double *d, unsigned num_items);
+template int QU::copy_device_to_host<quad>(quad *h, quad *d, unsigned num_items);
+template int QU::copy_device_to_host<quad2>(quad2 *h, quad2 *d, unsigned num_items);
+template int QU::copy_device_to_host<quad4>(quad4 *h, quad4 *d, unsigned num_items);
+template int QU::copy_device_to_host<sphoton>(sphoton *h, sphoton *d, unsigned num_items);
+template int QU::copy_device_to_host<sphotonlite>(sphotonlite *h, sphotonlite *d, unsigned num_items);
+template int QU::copy_device_to_host<quad6>(quad6 *h, quad6 *d, unsigned num_items);
+template int QU::copy_device_to_host<sstate>(sstate *h, sstate *d, unsigned num_items);
+template int QU::copy_device_to_host<XORWOW>(XORWOW *h, XORWOW *d, unsigned num_items);
+template int QU::copy_device_to_host<Philox>(Philox *h, Philox *d, unsigned num_items);
 #ifndef PRODUCTION
-template int QU::copy_device_to_host<srec>( srec* h, srec* d,  unsigned num_items);
-template int QU::copy_device_to_host<sseq>( sseq* h, sseq* d,  unsigned num_items);
-template int QU::copy_device_to_host<stag>( stag* h, stag* d,  unsigned num_items);
-template int QU::copy_device_to_host<sflat>( sflat* h, sflat* d,  unsigned num_items);
+template int QU::copy_device_to_host<srec>(srec *h, srec *d, unsigned num_items);
+template int QU::copy_device_to_host<sseq>(sseq *h, sseq *d, unsigned num_items);
+template int QU::copy_device_to_host<stag>(stag *h, stag *d, unsigned num_items);
+template int QU::copy_device_to_host<sflat>(sflat *h, sflat *d, unsigned num_items);
 #endif
 
-
 /**
 QU::copy_device_to_host_and_free
 ----------------------------------
@@ -534,60 +459,43 @@ results into the output array.
 
 **/
 
-template<typename T>
-void QU::copy_device_to_host_and_free( T* h, T* d,  unsigned num_items, const char* label)
+template <typename T> void QU::copy_device_to_host_and_free(T *h, T *d, unsigned num_items, const char *label)
 {
-    size_t size = num_items*sizeof(T) ;
-    LOG(LEVEL)
-        << "copy " << num_items
-        << " sizeof(T) " << sizeof(T)
-        << " label " << ( label ? label : "-" )
-        ;
+    size_t size = num_items * sizeof(T);
+    LOG(LEVEL) << "copy " << num_items << " sizeof(T) " << sizeof(T) << " label " << (label ? label : "-");
 
-    QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( h ), d , size, cudaMemcpyDeviceToHost ));
-    QUDA_CHECK( cudaFree(d) );
+    QUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(h), d, size, cudaMemcpyDeviceToHost));
+    QUDA_CHECK(cudaFree(d));
 }
 
-
-template void QU::copy_device_to_host_and_free<float>(  float* h, float* d,  unsigned num_items, const char* label );
-template void QU::copy_device_to_host_and_free<double>( double* h, double* d,  unsigned num_items, const char* label);
-template void QU::copy_device_to_host_and_free<quad>( quad* h, quad* d,  unsigned num_items, const char* label);
-template void QU::copy_device_to_host_and_free<quad2>( quad2* h, quad2* d,  unsigned num_items, const char* label);
-template void QU::copy_device_to_host_and_free<quad4>( quad4* h, quad4* d,  unsigned num_items, const char* label);
-template void QU::copy_device_to_host_and_free<sphoton>( sphoton* h, sphoton* d,  unsigned num_items, const char* label);
-template void QU::copy_device_to_host_and_free<sphotonlite>( sphotonlite* h, sphotonlite* d,  unsigned num_items, const char* label);
-template void QU::copy_device_to_host_and_free<quad6>( quad6* h, quad6* d,  unsigned num_items, const char* label);
-template void QU::copy_device_to_host_and_free<sstate>( sstate* h, sstate* d,  unsigned num_items, const char* label);
-
-
-
-
-
-
-
-
-
-
-
-
-template<typename T>
-void QU::copy_host_to_device( T* d, const T* h, unsigned num_items)
+template void QU::copy_device_to_host_and_free<float>(float *h, float *d, unsigned num_items, const char *label);
+template void QU::copy_device_to_host_and_free<double>(double *h, double *d, unsigned num_items, const char *label);
+template void QU::copy_device_to_host_and_free<quad>(quad *h, quad *d, unsigned num_items, const char *label);
+template void QU::copy_device_to_host_and_free<quad2>(quad2 *h, quad2 *d, unsigned num_items, const char *label);
+template void QU::copy_device_to_host_and_free<quad4>(quad4 *h, quad4 *d, unsigned num_items, const char *label);
+template void QU::copy_device_to_host_and_free<sphoton>(sphoton *h, sphoton *d, unsigned num_items, const char *label);
+template void QU::copy_device_to_host_and_free<sphotonlite>(sphotonlite *h, sphotonlite *d, unsigned num_items,
+                                                            const char *label);
+template void QU::copy_device_to_host_and_free<quad6>(quad6 *h, quad6 *d, unsigned num_items, const char *label);
+template void QU::copy_device_to_host_and_free<sstate>(sstate *h, sstate *d, unsigned num_items, const char *label);
+
+template <typename T> void QU::copy_host_to_device(T *d, const T *h, unsigned num_items)
 {
-    size_t size = num_items*sizeof(T) ;
-    QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( d ), h , size, cudaMemcpyHostToDevice ));
+    size_t size = num_items * sizeof(T);
+    QUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d), h, size, cudaMemcpyHostToDevice));
 }
 
-template void QU::copy_host_to_device<float>(    float* d,   const float* h, unsigned num_items);
-template void QU::copy_host_to_device<double>(   double* d,  const double* h, unsigned num_items);
-template void QU::copy_host_to_device<unsigned>( unsigned* d, const unsigned* h, unsigned num_items);
-template void QU::copy_host_to_device<sevent>(   sevent* d,   const sevent* h, unsigned num_items);
-template void QU::copy_host_to_device<quad4>(    quad4* d,    const quad4* h, unsigned num_items);
-template void QU::copy_host_to_device<sphoton>(  sphoton* d,  const sphoton* h, unsigned num_items);
-template void QU::copy_host_to_device<sphotonlite>(  sphotonlite* d,  const sphotonlite* h, unsigned num_items);
-template void QU::copy_host_to_device<quad6>(    quad6* d,    const quad6* h, unsigned num_items);
-template void QU::copy_host_to_device<quad2>(    quad2* d,    const quad2* h, unsigned num_items);
-template void QU::copy_host_to_device<XORWOW>(   XORWOW* d,   const XORWOW* h,   unsigned num_items);
-template void QU::copy_host_to_device<Philox>(   Philox* d,   const Philox* h,   unsigned num_items);
+template void QU::copy_host_to_device<float>(float *d, const float *h, unsigned num_items);
+template void QU::copy_host_to_device<double>(double *d, const double *h, unsigned num_items);
+template void QU::copy_host_to_device<unsigned>(unsigned *d, const unsigned *h, unsigned num_items);
+template void QU::copy_host_to_device<sevent>(sevent *d, const sevent *h, unsigned num_items);
+template void QU::copy_host_to_device<quad4>(quad4 *d, const quad4 *h, unsigned num_items);
+template void QU::copy_host_to_device<sphoton>(sphoton *d, const sphoton *h, unsigned num_items);
+template void QU::copy_host_to_device<sphotonlite>(sphotonlite *d, const sphotonlite *h, unsigned num_items);
+template void QU::copy_host_to_device<quad6>(quad6 *d, const quad6 *h, unsigned num_items);
+template void QU::copy_host_to_device<quad2>(quad2 *d, const quad2 *h, unsigned num_items);
+template void QU::copy_host_to_device<XORWOW>(XORWOW *d, const XORWOW *h, unsigned num_items);
+template void QU::copy_host_to_device<Philox>(Philox *d, const Philox *h, unsigned num_items);
 
 /**
 QU::NumItems
@@ -598,52 +506,50 @@ using the size of the template type and the shape of the NP array.
 
 **/
 
-template <typename T>
-unsigned QU::NumItems( const NP* a )
+template <typename T> unsigned QU::NumItems(const NP *a) // item count of a viewed as T; 0 when the shape is not recognised
 {
-    unsigned num_items = 0 ;
+    unsigned num_items = 0;
 
-    if( sizeof(T) == sizeof(float)*6*4 )   // looks like quad6
+    if (sizeof(T) == sizeof(float) * 6 * 4) // looks like quad6
     {
-        if(a->shape.size() == 3 )
+        if (a->shape.size() == 3)
         {
-            assert( a->has_shape( -1, 6, 4) );
-            num_items = a->shape[0] ;
+            assert(a->has_shape(-1, 6, 4));
+            num_items = a->shape[0];
         }
     }
-    else if( sizeof(T) == sizeof(float)*4*4 )   // looks like quad4
+    else if (sizeof(T) == sizeof(float) * 4 * 4) // looks like quad4
     {
-        if(a->shape.size() == 3 )
+        if (a->shape.size() == 3)
         {
-            assert( a->has_shape( -1, 4, 4) );
-            num_items = a->shape[0] ;
+            assert(a->has_shape(-1, 4, 4));
+            num_items = a->shape[0];
         }
-        else if(a->shape.size() == 4 )
+        else if (a->shape.size() == 4)
         {
-            assert( a->shape[2] == 2 && a->shape[3] == 4 );
-            num_items = a->shape[0]*a->shape[1] ;
+            assert(a->shape[2] == 4 && a->shape[3] == 4); // a quad4 item is 4x4 floats, consistent with the 3D case
+            num_items = a->shape[0] * a->shape[1];
         }
     }
-    else if( sizeof(T) == sizeof(float)*4*2 ) // looks like quad2
+    else if (sizeof(T) == sizeof(float) * 4 * 2) // looks like quad2
     {
-        if(a->shape.size() == 3 )
+        if (a->shape.size() == 3)
         {
-            assert( a->has_shape( -1, 2, 4) );
-            num_items = a->shape[0] ;
+            assert(a->has_shape(-1, 2, 4));
+            num_items = a->shape[0];
         }
-        else if(a->shape.size() == 4 )
+        else if (a->shape.size() == 4)
         {
-            assert( a->shape[2] == 2 && a->shape[3] == 4 );
-            num_items = a->shape[0]*a->shape[1] ;
+            assert(a->shape[2] == 2 && a->shape[3] == 4);
+            num_items = a->shape[0] * a->shape[1];
         }
     }
-    return num_items ;
+    return num_items;
 }
 
-template unsigned QU::NumItems<quad2>(const NP* );
-template unsigned QU::NumItems<quad4>(const NP* );
-template unsigned QU::NumItems<quad6>(const NP* );
-
+template unsigned QU::NumItems<quad2>(const NP *);
+template unsigned QU::NumItems<quad4>(const NP *);
+template unsigned QU::NumItems<quad6>(const NP *);
 
 /**
 QU::copy_host_to_device
@@ -658,29 +564,25 @@ Suggesting should generally use this via QEvt.
 
 **/
 
-template <typename T>
-unsigned QU::copy_host_to_device( T* d, const NP* a)
+template <typename T> unsigned QU::copy_host_to_device(T *d, const NP *a)
 {
     unsigned num_items = NumItems<T>(a);
-    if( num_items == 0 )
+    if (num_items == 0)
     {
-        LOG(fatal) << " failed to devine num_items for array " << a->sstr() << " with template type where sizeof(T) " << sizeof(T) ;
+        LOG(fatal) << " failed to devine num_items for array " << a->sstr() << " with template type where sizeof(T) "
+                   << sizeof(T);
     }
 
-    if( num_items > 0 )
+    if (num_items > 0)
     {
-        copy_host_to_device( d, (T*)a->bytes(), num_items );
+        copy_host_to_device(d, (T *)a->bytes(), num_items);
     }
-    return num_items ;
+    return num_items;
 }
 
-template unsigned QU::copy_host_to_device<quad2>( quad2* , const NP* );
-template unsigned QU::copy_host_to_device<quad4>( quad4* , const NP* );
-template unsigned QU::copy_host_to_device<quad6>( quad6* , const NP* );
-
-
-
-
+template unsigned QU::copy_host_to_device<quad2>(quad2 *, const NP *);
+template unsigned QU::copy_host_to_device<quad4>(quad4 *, const NP *);
+template unsigned QU::copy_host_to_device<quad6>(quad6 *, const NP *);
 
 /**
 QU::ConfigureLaunch
@@ -691,79 +593,64 @@ QU::ConfigureLaunch
 
 **/
 
-void QU::ConfigureLaunch( dim3& numBlocks, dim3& threadsPerBlock, unsigned width, unsigned height ) // static
+void QU::ConfigureLaunch(dim3 &numBlocks, dim3 &threadsPerBlock, unsigned width, unsigned height) // static
 {
-    threadsPerBlock.x = 512 ;
-    threadsPerBlock.y = 1 ;
-    threadsPerBlock.z = 1 ;
+    threadsPerBlock.x = 512;
+    threadsPerBlock.y = 1;
+    threadsPerBlock.z = 1;
 
-    numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ;
-    numBlocks.y = (height + threadsPerBlock.y - 1) / threadsPerBlock.y ;
-    numBlocks.z = 1 ;
+    numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x;
+    numBlocks.y = (height + threadsPerBlock.y - 1) / threadsPerBlock.y;
+    numBlocks.z = 1;
 
     // hmm this looks to not handle height other than 1
 }
 
-void QU::ConfigureLaunch1D( dim3& numBlocks, dim3& threadsPerBlock, unsigned num, unsigned threads_per_block ) // static
+void QU::ConfigureLaunch1D(dim3 &numBlocks, dim3 &threadsPerBlock, unsigned num, unsigned threads_per_block) // static
 {
-    threadsPerBlock.x = threads_per_block ;
-    threadsPerBlock.y = 1 ;
-    threadsPerBlock.z = 1 ;
+    threadsPerBlock.x = threads_per_block;
+    threadsPerBlock.y = 1;
+    threadsPerBlock.z = 1;
 
-    numBlocks.x = (num + threadsPerBlock.x - 1) / threadsPerBlock.x ;
-    numBlocks.y = 1 ;
-    numBlocks.z = 1 ;
+    numBlocks.x = (num + threadsPerBlock.x - 1) / threadsPerBlock.x;
+    numBlocks.y = 1;
+    numBlocks.z = 1;
 }
 
-
-
-void QU::ConfigureLaunch2D( dim3& numBlocks, dim3& threadsPerBlock, unsigned width, unsigned height ) // static
+void QU::ConfigureLaunch2D(dim3 &numBlocks, dim3 &threadsPerBlock, unsigned width, unsigned height) // static
 {
-    threadsPerBlock.x = 16 ;
-    threadsPerBlock.y = 16 ;
-    threadsPerBlock.z = 1 ;
+    threadsPerBlock.x = 16;
+    threadsPerBlock.y = 16;
+    threadsPerBlock.z = 1;
 
-    numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ;
-    numBlocks.y = (height + threadsPerBlock.y - 1) / threadsPerBlock.y ;
-    numBlocks.z = 1 ;
+    numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x;
+    numBlocks.y = (height + threadsPerBlock.y - 1) / threadsPerBlock.y;
+    numBlocks.z = 1;
 }
 
-
-void QU::ConfigureLaunch16( dim3& numBlocks, dim3& threadsPerBlock ) // static
+void QU::ConfigureLaunch16(dim3 &numBlocks, dim3 &threadsPerBlock) // static
 {
-    threadsPerBlock.x = 16 ;
-    threadsPerBlock.y = 1 ;
-    threadsPerBlock.z = 1 ;
+    threadsPerBlock.x = 16;
+    threadsPerBlock.y = 1;
+    threadsPerBlock.z = 1;
 
-    numBlocks.x = 1 ;
-    numBlocks.y = 1 ;
-    numBlocks.z = 1 ;
+    numBlocks.x = 1;
+    numBlocks.y = 1;
+    numBlocks.z = 1;
 }
 
-
-std::string QU::Desc(const dim3& d, int w) // static
+std::string QU::Desc(const dim3 &d, int w) // static
 {
-    std::stringstream ss ;
-    ss << "( "
-        << std::setw(w) << d.x
-        << " "
-        << std::setw(w) << d.y
-        << " "
-        << std::setw(w) << d.z
-        << ")"
-        ;
+    std::stringstream ss;
+    ss << "( " << std::setw(w) << d.x << " " << std::setw(w) << d.y << " " << std::setw(w) << d.z << ")";
     std::string s = ss.str();
-    return s ;
+    return s;
 }
 
-std::string QU::DescLaunch( const dim3& numBlocks, const dim3& threadsPerBlock ) // static
+std::string QU::DescLaunch(const dim3 &numBlocks, const dim3 &threadsPerBlock) // static
 {
-    std::stringstream ss ;
-    ss
-        << " numBlocks " << Desc(numBlocks,4)
-        << " threadsPerBlock " << Desc(threadsPerBlock, 4)
-        ;
+    std::stringstream ss;
+    ss << " numBlocks " << Desc(numBlocks, 4) << " threadsPerBlock " << Desc(threadsPerBlock, 4);
     std::string s = ss.str();
-    return s ;
+    return s;
 }
-
diff --git a/qudarap/QWls.cc b/qudarap/QWls.cc
new file mode 100644
index 000000000..e3888d7b5
--- /dev/null
+++ b/qudarap/QWls.cc
@@ -0,0 +1,131 @@
+#include <cassert>
+#include <csignal>
+#include <sstream>
+
+#include "scuda.h"
+#include "squad.h"
+
+#include "NP.hh"
+#include "SLOG.hh"
+#include "ssys.h"
+
+#include "QTex.hh"
+#include "QU.hh"
+#include "QUDA_CHECK.h"
+#include "QWls.hh"
+
+#include "qwls.h"
+
+const plog::Severity QWls::LEVEL = SLOG::EnvLevel("QWls", "DEBUG");
+
+const QWls *QWls::INSTANCE = nullptr; // assigned by the QWls constructor
+const QWls *QWls::Get()               // nullptr until a QWls has been constructed
+{
+    return INSTANCE;
+}
+
+/**
+QWls::QWls
+------------
+
+1. dsrc keeps wls_icdf when it is 8-byte; src is wls_icdf when 4-byte, otherwise NP::MakeNarrow(dsrc)
+2. MakeWlsTex uploads src into the GPU texture
+3. MakeInstance builds the host qwls (num_wls = time_constants->shape[0]); QU::UploadArray yields d_wls
+4. Publishes this object via INSTANCE. NOTE(review): an ebyte other than 4 or 8 hands nullptr to MakeNarrow
+**/
+
+QWls::QWls(const NP *wls_icdf, const NP *mat_map, const NP *time_constants, unsigned hd_factor)
+    : dsrc(wls_icdf->ebyte == 8 ? wls_icdf : nullptr), src(wls_icdf->ebyte == 4 ? wls_icdf : NP::MakeNarrow(dsrc)),
+      tex(MakeWlsTex(src, hd_factor)),
+      wls(MakeInstance(tex, mat_map, time_constants, hd_factor, time_constants->shape[0])),
+      d_wls(QU::UploadArray<qwls>(wls, 1, "QWls::QWls/d_wls"))
+{
+    INSTANCE = this;
+}
+
+/**
+QWls::MakeWlsTex
+-------------------
+
+Creates a 2D QTex<float> from the float ICDF array, records hd_factor on it and uploads the
+texture metadata. Expected shape is (num_wls*3, 4096, 1): axis 0 gives the texture height
+(3 HD layers per material) and axis 1 the texture width.
+
+**/
+
+QTex<float> *QWls::MakeWlsTex(const NP *src, unsigned hd_factor)
+{
+    assert(src);
+    assert(src->shape.size() == 3);
+
+    const unsigned height = src->shape[0]; // num_wls * 3
+    const unsigned width = src->shape[1];  // 4096
+    const unsigned depth = src->shape[2];  // 1
+
+    // reject arrays that do not have the expected ICDF layout (these are asserts: debug builds only)
+    assert(depth == 1);
+    assert(width == 4096);
+    assert(height % 3 == 0); // 3 HD layers per material
+    assert(src->uifc == 'f' && src->ebyte == 4);
+
+    // 'L' is the fourth QTex constructor argument - presumably the filter mode (linear); TODO confirm
+    const bool normalizedCoords = true;
+    QTex<float> *qtex = new QTex<float>(width, height, src->cvalues<float>(), 'L', normalizedCoords, src);
+
+    qtex->setHDFactor(hd_factor);
+    qtex->uploadMeta();
+
+    LOG(LEVEL) << " src " << src->desc() << " nx (width) " << width << " ny (height) " << height << " tx.HDFactor "
+               << qtex->getHDFactor();
+
+    return qtex;
+}
+
+/**
+QWls::MakeInstance
+---------------------
+
+Creates the host-side qwls struct populated with device pointers. Uploads mat_map->shape[0] ints
+from mat_map and num_wls floats from time_constants, so num_wls must not exceed
+time_constants->shape[0]. Nothing allocated here is freed here: the caller owns the result.
+**/
+
+qwls *QWls::MakeInstance(const QTex<float> *tex, const NP *mat_map, const NP *time_constants, unsigned hd_factor,
+                         unsigned num_wls)
+{
+    assert(tex && mat_map && time_constants);
+    assert(mat_map->uifc == 'i' && mat_map->ebyte == 4);
+    assert(time_constants->uifc == 'f' && time_constants->ebyte == 4);
+    assert(mat_map->shape.size() > 0 && time_constants->shape.size() > 0);
+    assert(static_cast<size_t>(time_constants->shape[0]) >= num_wls); // else the memcpy below over-reads
+
+    qwls *w = new qwls(); // value-initialised so any field not assigned below is zero when uploaded
+    w->wls_tex = tex->texObj;
+    w->hd_factor = hd_factor;
+    w->num_wls = num_wls;
+    w->tex_height = tex->height;
+
+    // Upload material_map to device
+    unsigned num_mat = mat_map->shape[0];
+    int *d_mat_map = nullptr;
+    size_t mat_map_size = num_mat * sizeof(int);
+    QUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_mat_map), mat_map_size));
+    QUDA_CHECK(cudaMemcpy(d_mat_map, mat_map->cvalues<int>(), mat_map_size, cudaMemcpyHostToDevice));
+    w->material_map = d_mat_map;
+
+    // Upload time_constants to device
+    float *d_tc = nullptr;
+    size_t tc_size = num_wls * sizeof(float);
+    QUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_tc), tc_size));
+    QUDA_CHECK(cudaMemcpy(d_tc, time_constants->cvalues<float>(), tc_size, cudaMemcpyHostToDevice));
+    w->time_constants = d_tc;
+    return w;
+}
+
+std::string QWls::desc() const // "-" stands in for any member that is null
+{
+    std::stringstream out;
+    out << "QWls dsrc " << (dsrc ? dsrc->desc() : "-") << " src " << (src ? src->desc() : "-");
+    out << " tex " << (tex ? tex->desc() : "-");
+    return out.str();
+}
diff --git a/qudarap/QWls.hh b/qudarap/QWls.hh
new file mode 100644
index 000000000..3134eba2b
--- /dev/null
+++ b/qudarap/QWls.hh
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "QUDARAP_API_EXPORT.hh"
+#include "plog/Severity.h"
+#include <string>
+
+struct NP;
+template <typename T> struct QTex;
+struct qwls;
+
+/**
+QWls : Host-side WLS ICDF Texture Upload
+============================================
+
+Uploads the WLS inverse CDF array into a GPU texture and creates
+the device-side qwls struct with material mapping and time constants.
+
+Follows the same pattern as QScint for scintillation ICDF textures.
+
+**/
+
+struct QUDARAP_API QWls
+{
+    static const plog::Severity LEVEL; // logging level used by LOG(LEVEL) in QWls.cc
+    static const QWls *INSTANCE;       // set to `this` by the constructor
+    static const QWls *Get();          // returns INSTANCE (nullptr until a QWls is constructed)
+
+    static QTex<float> *MakeWlsTex(const NP *src, unsigned hd_factor); // float ICDF -> 2D texture
+    static qwls *MakeInstance(const QTex<float> *tex, const NP *mat_map, const NP *time_constants, unsigned hd_factor,
+                              unsigned num_wls); // host-side qwls holding device pointers
+
+    const NP *dsrc;   // original double-precision ICDF, nullptr when the input was not 8-byte
+    const NP *src;    // float ICDF: the input itself when 4-byte, otherwise narrowed from dsrc
+    QTex<float> *tex; // GPU texture
+    qwls *wls;        // host-side instance (with device pointers)
+    qwls *d_wls;      // device copy of qwls struct
+    // NOTE(review): no destructor is declared, so tex, wls and d_wls are never released by this type
+    QWls(const NP *wls_icdf, const NP *mat_map, const NP *time_constants, unsigned hd_factor);
+
+    std::string desc() const; // one-line summary of dsrc, src and tex
+};
diff --git a/qudarap/qsim.h b/qudarap/qsim.h
index f8a94d091..34fcc3a66 100644
--- a/qudarap/qsim.h
+++ b/qudarap/qsim.h
@@ -23,178 +23,183 @@ Canonical use is from CSGOptiX/CSGOptiX7.cu:simulate
 **/
 
 #if defined(__CUDACC__) || defined(__CUDABE__)
-   #define QSIM_METHOD __device__
+#define QSIM_METHOD __device__
 #else
-   #define QSIM_METHOD
+#define QSIM_METHOD
 #endif
 
 #include "OpticksGenstep.h"
 #include "OpticksPhoton.h"
 
+#include "sc4u.h"
 #include "sflow.h"
+#include "sphoton.h"
 #include "sqat4.h"
-#include "sc4u.h"
 #include "sxyz.h"
-#include "sphoton.h"
 
-#include "storch.h"
 #include "scarrier.h"
 #include "sevent.h"
-#include "sstate.h"
 #include "smatsur.h"
-
+#include "sstate.h"
+#include "storch.h"
 
 #ifndef PRODUCTION
 #include "srec.h"
 #include "sseq.h"
 #include "stag.h"
 #ifdef DEBUG_LOGF
-#define KLUDGE_FASTMATH_LOGF(u) (u < 0.998f ? __logf(u) : __logf(u) - 0.46735790f*1e-7f )
+#define KLUDGE_FASTMATH_LOGF(u) (u < 0.998f ? __logf(u) : __logf(u) - 0.46735790f * 1e-7f)
 #endif
 #endif
 
 #include "sctx.h"
 
-#include "qrng.h"
 #include "qbase.h"
-#include "qprop.h"
-#include "qmultifilm.h"
 #include "qbnd.h"
-#include "qscint.h"
 #include "qcerenkov.h"
+#include "qmultifilm.h"
 #include "qpmt.h"
+#include "qprop.h"
+#include "qrng.h"
+#include "qscint.h"
+#include "qwls.h"
 #include "tcomplex.h"
 
-
-struct qcerenkov ;
+struct qcerenkov;
 
 struct qsim
 {
-    qbase*              base ;
-    sevent*             evt ;
-    qrng<RNG>*          rng ;
-    qbnd*               bnd ;
-    qmultifilm*         multifilm;
-    qcerenkov*          cerenkov ;
-    qscint*             scint ;
-    qpmt<float>*        pmt ;
+    qbase *base;
+    sevent *evt;
+    qrng<RNG> *rng;
+    qbnd *bnd;
+    qmultifilm *multifilm;
+    qcerenkov *cerenkov;
+    qscint *scint;
+    qwls *wls;
+    qpmt<float> *pmt;
 
 #if defined(__CUDACC__) || defined(__CUDABE__)
 #else
     qsim(); // instanciated on CPU (see QSim::init_sim) and copied to device so no ctor in device code
 #endif
 
-    QSIM_METHOD void    generate_photon_dummy( sphoton& p, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const ;
+    QSIM_METHOD void generate_photon_dummy(sphoton &p, RNG &rng, const quad6 &gs, unsigned long long photon_id,
+                                           unsigned genstep_id) const;
     QSIM_METHOD static float3 uniform_sphere(const float u0, const float u1);
-    QSIM_METHOD static float RandGaussQ_shoot( RNG& rng, float mean, float stdDev );
-    QSIM_METHOD static void SmearNormal_SigmaAlpha( RNG& rng, float3* smeared_normal, const float3* direction, const float3* normal, float sigma_alpha, const sctx& ctx );
-    QSIM_METHOD static void SmearNormal_Polish(     RNG& rng, float3* smeared_normal, const float3* direction, const float3* normal, float polish     , const sctx& ctx );
+    QSIM_METHOD static float RandGaussQ_shoot(RNG &rng, float mean, float stdDev);
+    QSIM_METHOD static void SmearNormal_SigmaAlpha(RNG &rng, float3 *smeared_normal, const float3 *direction,
+                                                   const float3 *normal, float sigma_alpha, const sctx &ctx);
+    QSIM_METHOD static void SmearNormal_Polish(RNG &rng, float3 *smeared_normal, const float3 *direction,
+                                               const float3 *normal, float polish, const sctx &ctx);
 
-#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
-    QSIM_METHOD static float3 uniform_sphere(RNG& rng);
+#if defined(__CUDACC__) || defined(__CUDABE__) || defined(MOCK_CURAND) || defined(MOCK_CUDA)
+    QSIM_METHOD static float3 uniform_sphere(RNG &rng);
 #endif
 
 #if defined(__CUDACC__) || defined(__CUDABE__)
-    QSIM_METHOD float4  multifilm_lookup(unsigned pmtType, float nm, float aoi);
+    QSIM_METHOD float4 multifilm_lookup(unsigned pmtType, float nm, float aoi);
 #endif
 
-#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND )  || defined(MOCK_CUDA)
-    QSIM_METHOD static void lambertian_direction(float3* dir, const float3* normal, float orient, RNG& rng, sctx& ctx );
-    QSIM_METHOD static void random_direction_marsaglia(float3* dir, RNG& rng, sctx& ctx );
-    QSIM_METHOD void rayleigh_scatter(RNG& rng, sctx& ctx );
-    QSIM_METHOD int     propagate_to_boundary( unsigned& flag, RNG& rng, sctx& ctx );
+#if defined(__CUDACC__) || defined(__CUDABE__) || defined(MOCK_CURAND) || defined(MOCK_CUDA)
+    QSIM_METHOD static void lambertian_direction(float3 *dir, const float3 *normal, float orient, RNG &rng, sctx &ctx);
+    QSIM_METHOD static void random_direction_marsaglia(float3 *dir, RNG &rng, sctx &ctx);
+    QSIM_METHOD void rayleigh_scatter(RNG &rng, sctx &ctx);
+    QSIM_METHOD int propagate_to_boundary(unsigned &flag, RNG &rng, sctx &ctx);
 #endif
 
-#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
-    QSIM_METHOD int     propagate_at_boundary(        unsigned& flag, RNG& rng, sctx& ctx, float theTransmittance=-1.f ) const ;
-    QSIM_METHOD int     propagate_at_boundary_with_T( unsigned& flag, RNG& rng, sctx& ctx, float theTransmittance ) const ;
+#if defined(__CUDACC__) || defined(__CUDABE__) || defined(MOCK_CURAND) || defined(MOCK_CUDA)
+    QSIM_METHOD int propagate_at_boundary(unsigned &flag, RNG &rng, sctx &ctx, float theTransmittance = -1.f) const;
+    QSIM_METHOD int propagate_at_boundary_with_T(unsigned &flag, RNG &rng, sctx &ctx, float theTransmittance) const;
 #endif
 
 #if defined(__CUDACC__) || defined(__CUDABE__)
-    QSIM_METHOD int     propagate_at_surface_MultiFilm(unsigned& flag, RNG& rng, sctx& ctx );
+    QSIM_METHOD int propagate_at_surface_MultiFilm(unsigned &flag, RNG &rng, sctx &ctx);
 #endif
 
-#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
-    QSIM_METHOD int     propagate_at_surface(           unsigned& flag, RNG& rng, sctx& ctx );
-    QSIM_METHOD int     propagate_at_surface_Detect(    unsigned& flag, RNG& rng, sctx& ctx ) const ;
-#if defined( WITH_CUSTOM4 )
-    QSIM_METHOD int     propagate_at_surface_CustomART( unsigned& flag, RNG& rng, sctx& ctx ) const ;
+#if defined(__CUDACC__) || defined(__CUDABE__) || defined(MOCK_CURAND) || defined(MOCK_CUDA)
+    QSIM_METHOD int propagate_at_surface(unsigned &flag, RNG &rng, sctx &ctx);
+    QSIM_METHOD int propagate_at_surface_Detect(unsigned &flag, RNG &rng, sctx &ctx) const;
+#if defined(WITH_CUSTOM4)
+    QSIM_METHOD int propagate_at_surface_CustomART(unsigned &flag, RNG &rng, sctx &ctx) const;
 #endif
 #endif
 
-#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
-    QSIM_METHOD void    reflect_diffuse(                       RNG& rng, sctx& ctx );
-    QSIM_METHOD void    reflect_specular(                      RNG& rng, sctx& ctx );
+#if defined(__CUDACC__) || defined(__CUDABE__) || defined(MOCK_CURAND) || defined(MOCK_CUDA)
+    QSIM_METHOD void reflect_diffuse(RNG &rng, sctx &ctx);
+    QSIM_METHOD void reflect_specular(RNG &rng, sctx &ctx);
 
-    QSIM_METHOD void    fake_propagate( sphoton& p, const quad2* mock_prd, RNG& rng, unsigned long long idx );
-    QSIM_METHOD int     propagate(const int bounce, RNG& rng, sctx& ctx );
+    QSIM_METHOD void fake_propagate(sphoton &p, const quad2 *mock_prd, RNG &rng, unsigned long long idx);
+    QSIM_METHOD int propagate(const int bounce, RNG &rng, sctx &ctx);
 
-    QSIM_METHOD void    hemisphere_polarized( unsigned polz, bool inwards, RNG& rng, sctx& ctx );
-    QSIM_METHOD void    generate_photon_simtrace(         quad4&   p, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const ;
-    QSIM_METHOD void    generate_photon_simtrace_frame(   quad4&   p, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const ;
-    QSIM_METHOD void    generate_photon(                  sphoton& p, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const ;
+    QSIM_METHOD void hemisphere_polarized(unsigned polz, bool inwards, RNG &rng, sctx &ctx);
+    QSIM_METHOD void generate_photon_simtrace(quad4 &p, RNG &rng, const quad6 &gs, unsigned long long photon_id,
+                                              unsigned genstep_id) const;
+    QSIM_METHOD void generate_photon_simtrace_frame(quad4 &p, RNG &rng, const quad6 &gs, unsigned long long photon_id,
+                                                    unsigned genstep_id) const;
+    QSIM_METHOD void generate_photon(sphoton &p, RNG &rng, const quad6 &gs, unsigned long long photon_id,
+                                     unsigned genstep_id) const;
 #endif
 };
 
 // CTOR
 #if defined(__CUDACC__) || defined(__CUDABE__)
 #else
-inline qsim::qsim()    // instanciated on CPU (see QSim::init_sim) and copied to device so no ctor in device code
-        :
-        base(nullptr),
-        evt(nullptr),
-        rng(nullptr),
-        bnd(nullptr),
-        multifilm(nullptr),
-        cerenkov(nullptr),
-        scint(nullptr),
-        pmt(nullptr)
-    {
-    }
+inline qsim::qsim() // instanciated on CPU (see QSim::init_sim) and copied to device so no ctor in device code
+    : base(nullptr), evt(nullptr), rng(nullptr), bnd(nullptr), multifilm(nullptr), cerenkov(nullptr), scint(nullptr),
+      wls(nullptr), pmt(nullptr)
+{
+}
 #endif
 
-inline QSIM_METHOD void qsim::generate_photon_dummy(sphoton& p_, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const
+inline QSIM_METHOD void qsim::generate_photon_dummy(sphoton &p_, RNG &rng, const quad6 &gs,
+                                                    unsigned long long photon_id, unsigned genstep_id) const
 {
-    quad4& p = (quad4&)p_ ;
+    quad4 &p = (quad4 &)p_;
 #ifndef PRODUCTION
     printf("//qsim::generate_photon_dummy  photon_id %3lld genstep_id %3d  gs.q0.i ( gencode:%3d %3d %3d %3d ) \n",
-       photon_id,
-       genstep_id,
-       gs.q0.i.x,
-       gs.q0.i.y,
-       gs.q0.i.z,
-       gs.q0.i.w
-      );
-#endif
-    p.q0.i.x = 1 ; p.q0.i.y = 2 ; p.q0.i.z = 3 ; p.q0.i.w = 4 ;
-    p.q1.i.x = 1 ; p.q1.i.y = 2 ; p.q1.i.z = 3 ; p.q1.i.w = 4 ;
-    p.q2.i.x = 1 ; p.q2.i.y = 2 ; p.q2.i.z = 3 ; p.q2.i.w = 4 ;
-    p.q3.i.x = 1 ; p.q3.i.y = 2 ; p.q3.i.z = 3 ; p.q3.i.w = 4 ;
+           photon_id, genstep_id, gs.q0.i.x, gs.q0.i.y, gs.q0.i.z, gs.q0.i.w);
+#endif
+    p.q0.i.x = 1;
+    p.q0.i.y = 2;
+    p.q0.i.z = 3;
+    p.q0.i.w = 4;
+    p.q1.i.x = 1;
+    p.q1.i.y = 2;
+    p.q1.i.z = 3;
+    p.q1.i.w = 4;
+    p.q2.i.x = 1;
+    p.q2.i.y = 2;
+    p.q2.i.z = 3;
+    p.q2.i.w = 4;
+    p.q3.i.x = 1;
+    p.q3.i.y = 2;
+    p.q3.i.z = 3;
+    p.q3.i.w = 4;
 
     p.set_flag(TORCH);
 }
 
 inline QSIM_METHOD float3 qsim::uniform_sphere(const float u0, const float u1)
 {
-    float phi = u0*2.f*M_PIf;
-    float cosTheta = 2.f*u1 - 1.f ; // -1.f -> 1.f
-    float sinTheta = sqrtf(1.f-cosTheta*cosTheta);
-    return make_float3(cosf(phi)*sinTheta, sinf(phi)*sinTheta, cosTheta);
+    float phi = u0 * 2.f * M_PIf;    // azimuth: u0 scaled by 2*pi
+    float cosTheta = 2.f * u1 - 1.f; // -1.f -> 1.f
+    float sinTheta = sqrtf(1.f - cosTheta * cosTheta);
+    return make_float3(cosf(phi) * sinTheta, sinf(phi) * sinTheta, cosTheta);
 }
 
-
-#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
+#if defined(__CUDACC__) || defined(__CUDABE__) || defined(MOCK_CURAND) || defined(MOCK_CUDA)
 /**
 qsim::uniform_sphere
 ---------------------
 
 **/
-inline QSIM_METHOD float3 qsim::uniform_sphere(RNG& rng)
+inline QSIM_METHOD float3 qsim::uniform_sphere(RNG &rng)
 {
-    float phi = curand_uniform(&rng)*2.f*M_PIf;
-    float cosTheta = 2.f*curand_uniform(&rng) - 1.f ; // -1.f -> 1.f
-    float sinTheta = sqrtf(1.f-cosTheta*cosTheta);
-    return make_float3(cosf(phi)*sinTheta, sinf(phi)*sinTheta, cosTheta);
+    float phi = curand_uniform(&rng) * 2.f * M_PIf;    // first draw sets the azimuth
+    float cosTheta = 2.f * curand_uniform(&rng) - 1.f; // -1.f -> 1.f
+    float sinTheta = sqrtf(1.f - cosTheta * cosTheta);
+    return make_float3(cosf(phi) * sinTheta, sinf(phi) * sinTheta, cosTheta);
 }
 
 /**
@@ -210,15 +215,14 @@ See::
     g4-cls G4MTRandGaussQ
 
 **/
-inline QSIM_METHOD float qsim::RandGaussQ_shoot( RNG& rng, float mean, float stdDev )
+inline QSIM_METHOD float qsim::RandGaussQ_shoot(RNG &rng, float mean, float stdDev)
 {
-    float u2 = 2.f*curand_uniform(&rng) ;
-    float v = -M_SQRT2f*erfcinvf(u2)*stdDev + mean ;
-    //printf("//qsim.RandGaussQ_shoot mean %10.5f stdDev %10.5f u2 %10.5f v %10.5f \n", mean, stdDev, u2, v  ) ;
-    return v ;
+    float u2 = 2.f * curand_uniform(&rng);
+    float v = -M_SQRT2f * erfcinvf(u2) * stdDev + mean;
+    // printf("//qsim.RandGaussQ_shoot mean %10.5f stdDev %10.5f u2 %10.5f v %10.5f \n", mean, stdDev, u2, v  ) ;
+    return v;
 }
 
-
 /**
 qsim::SmearNormal_SigmaAlpha
 ------------------------------
@@ -251,70 +255,71 @@ TODO: full simulation run with breakpoint "BP=C4OpBoundaryProcess::GetFacetNorma
 
 **/
 
-inline QSIM_METHOD void qsim::SmearNormal_SigmaAlpha(
-    RNG& rng,
-    float3* smeared_normal,
-    const float3* direction,
-    const float3* normal,
-    float sigma_alpha,
-    const sctx& ctx
-   )
+inline QSIM_METHOD void qsim::SmearNormal_SigmaAlpha(RNG &rng, float3 *smeared_normal, const float3 *direction,
+                                                     const float3 *normal, float sigma_alpha, const sctx &ctx)
 {
 #if !defined(PRODUCTION) && defined(MOCK_CUDA_DEBUG)
-    bool dump = ctx.pidx == -1 ;
+    bool dump = ctx.pidx == -1;
 #endif
 
-    if(sigma_alpha == 0.f)
+    if (sigma_alpha == 0.f)
     {
-        *smeared_normal = *normal ;
-        return ;
+        *smeared_normal = *normal;
+        return;
     }
-    float f_max = fminf(1.f,4.f*sigma_alpha);
+    float f_max = fminf(1.f, 4.f * sigma_alpha);
 
 #if !defined(PRODUCTION) && defined(MOCK_CUDA_DEBUG)
-    if(dump) printf("//qsim::SmearNormal_SigmaAlpha.MOCK_CUDA_DEBUG sigma_alpha %10.5f f_max %10.5f  \n", sigma_alpha, f_max );
+    if (dump)
+        printf("//qsim::SmearNormal_SigmaAlpha.MOCK_CUDA_DEBUG sigma_alpha %10.5f f_max %10.5f  \n", sigma_alpha,
+               f_max);
 #endif
 
-    float alpha, sin_alpha, phi, u0, u1, u2 ;
-    bool reject_alpha ;
-    bool reject_dir ;
+    float alpha, sin_alpha, phi, u0, u1, u2;
+    bool reject_alpha;
+    bool reject_dir;
 
-    do {
-        do {
-            //alpha = RandGaussQ_shoot(rng, 0.f, sigma_alpha );  // mean:0.f stdDev:sigma_alpha
-            u0 = curand_uniform(&rng) ;
-            alpha = -M_SQRT2f*erfcinvf(2.f*u0)*sigma_alpha ;
+    do
+    {
+        do
+        {
+            // alpha = RandGaussQ_shoot(rng, 0.f, sigma_alpha );  // mean:0.f stdDev:sigma_alpha
+            u0 = curand_uniform(&rng);
+            alpha = -M_SQRT2f * erfcinvf(2.f * u0) * sigma_alpha;
 
             sin_alpha = sinf(alpha);
-            u1 = curand_uniform(&rng) ;
-            reject_alpha = alpha >= M_PIf/2.f || (u1*f_max > sin_alpha) ;
+            u1 = curand_uniform(&rng);
+            reject_alpha = alpha >= M_PIf / 2.f || (u1 * f_max > sin_alpha);
 
 #if !defined(PRODUCTION) && defined(MOCK_CUDA_DEBUG)
-            if(dump) printf("//qsim::SmearNormal_SigmaAlpha.MOCK_CUDA_DEBUG u0 %10.5f alpha %10.5f sin_alpha %10.5f u1 %10.5f u1*f_max %10.5f  (u1*f_max > sin_alpha) %d reject_alpha %d  \n",
-               u0, alpha, sin_alpha, u1, u1*f_max, (u1*f_max > sin_alpha), reject_alpha );
+            if (dump)
+                printf("//qsim::SmearNormal_SigmaAlpha.MOCK_CUDA_DEBUG u0 %10.5f alpha %10.5f sin_alpha %10.5f u1 "
+                       "%10.5f u1*f_max %10.5f  (u1*f_max > sin_alpha) %d reject_alpha %d  \n",
+                       u0, alpha, sin_alpha, u1, u1 * f_max, (u1 * f_max > sin_alpha), reject_alpha);
             // theres lots of alpha rejected : eg all -ve sin_alpha
 #endif
 
-        } while( reject_alpha ) ;
+        } while (reject_alpha);
 
-        u2 = curand_uniform(&rng) ;
-        phi = u2*M_PIf*2.f ;
+        u2 = curand_uniform(&rng);
+        phi = u2 * M_PIf * 2.f;
 
-        smeared_normal->x = sin_alpha * cosf(phi) ;
-        smeared_normal->y = sin_alpha * sinf(phi) ;
-        smeared_normal->z = cosf(alpha) ;
+        smeared_normal->x = sin_alpha * cosf(phi);
+        smeared_normal->y = sin_alpha * sinf(phi);
+        smeared_normal->z = cosf(alpha);
 
         smath::rotateUz(*smeared_normal, *normal);
-        reject_dir = dot(*smeared_normal, *direction ) >= 0.f ;
+        reject_dir = dot(*smeared_normal, *direction) >= 0.f;
         // reject smears that move the normal into same hemi as direction
 
 #if !defined(PRODUCTION) && defined(MOCK_CUDA_DEBUG)
-        if(dump) printf("//qsim::SmearNormal_SigmaAlpha.MOCK_CUDA_DEBUG u2 %10.5f phi %10.5f smeared_normal ( %10.5f, %10.5f, %10.5f)  reject_dir %d  \n",
-               u2, phi, smeared_normal->x, smeared_normal->y, smeared_normal->z, reject_dir );
+        if (dump)
+            printf("//qsim::SmearNormal_SigmaAlpha.MOCK_CUDA_DEBUG u2 %10.5f phi %10.5f smeared_normal ( %10.5f, "
+                   "%10.5f, %10.5f)  reject_dir %d  \n",
+                   u2, phi, smeared_normal->x, smeared_normal->y, smeared_normal->z, reject_dir);
 #endif
 
-
-    } while( reject_dir ) ;
+    } while (reject_dir);
 }
 
 /**
@@ -325,53 +330,43 @@ CAUTION : THIS CURRENTLY NOT USED BY ANYTHING OTHER THAN TESTS : SEE DETAILS ABO
 
 **/
 
-inline QSIM_METHOD void qsim::SmearNormal_Polish(
-    RNG& rng,
-    float3* smeared_normal,
-    const float3* direction,
-    const float3* normal,
-    float polish,
-    const sctx& ctx
-    )
+inline QSIM_METHOD void qsim::SmearNormal_Polish(RNG &rng, float3 *smeared_normal, const float3 *direction,
+                                                 const float3 *normal, float polish, const sctx &ctx)
 {
 #if !defined(PRODUCTION) && defined(MOCK_CUDA_DEBUG)
-    bool dump = ctx.pidx == -1 ;
+    bool dump = ctx.pidx == -1;
 #endif
 
-    if(polish == 1.f)
+    if (polish == 1.f)
     {
-        *smeared_normal = *normal ;
-        return ;
+        *smeared_normal = *normal;
+        return;
     }
 
-    float u0, u1, u2 ;
-    float3 smear ;
-    bool reject_mag ;
-    bool reject_dir ;
+    float u0, u1, u2;
+    float3 smear;
+    bool reject_mag;
+    bool reject_dir;
 
-    do {
-        do {
+    do
+    {
+        do
+        {
             u0 = curand_uniform(&rng);
-            u1 = curand_uniform(&rng) ;
-            u2 = curand_uniform(&rng) ;
-            smear.x = 2.f*u0 - 1.f ;
-            smear.y = 2.f*u1 - 1.f ;
-            smear.z = 2.f*u2 - 1.f ;
-            reject_mag = length(smear) > 1.f  ;   // HMM: could this use just dot(smear, smear) ?
-       }
-       while( reject_mag );
-
-       *smeared_normal = *normal + (1.f-polish)*smear;
-       reject_dir = dot(*smeared_normal, *direction) >= 0.f ;
-    }
-    while( reject_dir );
+            u1 = curand_uniform(&rng);
+            u2 = curand_uniform(&rng);
+            smear.x = 2.f * u0 - 1.f;
+            smear.y = 2.f * u1 - 1.f;
+            smear.z = 2.f * u2 - 1.f;
+            reject_mag = length(smear) > 1.f; // HMM: could this use just dot(smear, smear) ?
+        } while (reject_mag);
+
+        *smeared_normal = *normal + (1.f - polish) * smear;
+        reject_dir = dot(*smeared_normal, *direction) >= 0.f;
+    } while (reject_dir);
     *smeared_normal = normalize(*smeared_normal);
 }
 
-
-
-
-
 #endif
 
 #if defined(__CUDACC__) || defined(__CUDABE__)
@@ -427,59 +422,56 @@ as opposed to local stack float3 : as this keeps changing the dir before
 arriving at the final one
 
 **/
-inline  QSIM_METHOD void qsim::lambertian_direction(float3* dir, const float3* normal, float orient, RNG& rng, sctx& ctx )
+inline QSIM_METHOD void qsim::lambertian_direction(float3 *dir, const float3 *normal, float orient, RNG &rng, sctx &ctx)
 {
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    unsigned long long PIDX = 0xffffffffff ;
-    if(ctx.pidx == PIDX )
+    unsigned long long PIDX = 0xffffffffff;
+    if (ctx.pidx == PIDX)
     {
-        printf("//qsim.lambertian_direction.head pidx %7lld : normal = np.array([%10.5f,%10.5f,%10.5f]) ; orient = %10.5f  \n",
-            ctx.pidx, normal->x, normal->y, normal->z, orient  );
+        printf("//qsim.lambertian_direction.head pidx %7lld : normal = np.array([%10.5f,%10.5f,%10.5f]) ; orient = "
+               "%10.5f  \n",
+               ctx.pidx, normal->x, normal->y, normal->z, orient);
     }
 #endif
 
-    float ndotv ;
-    int count = 0 ;
-    float u ;
+    float ndotv;
+    int count = 0;
+    float u;
     do
     {
-        count++ ;
+        count++;
         random_direction_marsaglia(dir, rng, ctx); // sets dir to random point on unit sphere
-        ndotv = dot( *dir, *normal )*orient ;
-        if( ndotv < 0.f )
+        ndotv = dot(*dir, *normal) * orient;
+        if (ndotv < 0.f)
         {
-            *dir = -1.f*(*dir) ;
-            ndotv = -1.f*ndotv ;
+            *dir = -1.f * (*dir);
+            ndotv = -1.f * ndotv;
         }
         // when random dir is in opposite hemisphere to oriented normal
         // flip the dir into same hemi and ndotv
 
-        u = curand_uniform(&rng) ;
+        u = curand_uniform(&rng);
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
 
-        if(ctx.pidx == PIDX)
+        if (ctx.pidx == PIDX)
         {
-            printf("//qsim.lambertian_direction.loop pidx %7lld : dir = np.array([%10.5f,%10.5f,%10.5f]) ; count = %d ; ndotv = %10.5f ; u = %10.5f \n",
-                ctx.pidx, dir->x, dir->y, dir->z, count, ndotv, u   );
-
+            printf("//qsim.lambertian_direction.loop pidx %7lld : dir = np.array([%10.5f,%10.5f,%10.5f]) ; count = %d "
+                   "; ndotv = %10.5f ; u = %10.5f \n",
+                   ctx.pidx, dir->x, dir->y, dir->z, count, ndotv, u);
         }
 #endif
-    }
-    while (!(u < ndotv) && (count < 1024)) ;
+    } while (!(u < ndotv) && (count < 1024));
     // distribution looks pretty similar without the while loop
 
-
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if(ctx.pidx == PIDX)
+    if (ctx.pidx == PIDX)
     {
-        printf("//qsim.lambertian_direction.tail pidx %7lld : dir = np.array([%10.5f,%10.5f,%10.5f]) ; count = %d ; ndotv = %10.5f \n",
-            ctx.pidx, dir->x, dir->y, dir->z, count, ndotv  );
-
+        printf("//qsim.lambertian_direction.tail pidx %7lld : dir = np.array([%10.5f,%10.5f,%10.5f]) ; count = %d ; "
+               "ndotv = %10.5f \n",
+               ctx.pidx, dir->x, dir->y, dir->z, count, ndotv);
     }
 #endif
-
-
 }
 
 /**
@@ -547,32 +539,29 @@ So that means the random 3D (x,y,z) points are on the unit sphere.
 
 **/
 
-
-inline QSIM_METHOD void qsim::random_direction_marsaglia(float3* dir,  RNG& rng, sctx& ctx  )
+inline QSIM_METHOD void qsim::random_direction_marsaglia(float3 *dir, RNG &rng, sctx &ctx)
 {
     // NB: no use of ctx.tagr so this has not been random aligned
-    float u0, u1 ;
-    float u, v, b, a  ;
+    float u0, u1;
+    float u, v, b, a;
     do
     {
         u0 = curand_uniform(&rng);
         u1 = curand_uniform(&rng);
-        //if( idx == 0u ) printf("//qsim.random_direction_marsaglia pidx %7lld u0 %10.4f u1 %10.4f \n", ctx.pidx, u0, u1 );
-        u = 2.f*u0 - 1.f ;
-        v = 2.f*u1 - 1.f ;
-        b = u*u + v*v ;
-    }
-    while( b > 1.f ) ;
-
-    a = 2.f*sqrtf( 1.f - b );
-
-    dir->x = a*u ;
-    dir->y = a*v ;
-    dir->z = 2.f*b - 1.f ;
+        // if( idx == 0u ) printf("//qsim.random_direction_marsaglia pidx %7lld u0 %10.4f u1 %10.4f \n", ctx.pidx, u0,
+        // u1 );
+        u = 2.f * u0 - 1.f;
+        v = 2.f * u1 - 1.f;
+        b = u * u + v * v;
+    } while (b > 1.f);
+
+    a = 2.f * sqrtf(1.f - b);
+
+    dir->x = a * u;
+    dir->y = a * v;
+    dir->z = 2.f * b - 1.f;
 }
 
-
-
 /**
 qsim::rayleigh_scatter
 ------------------------------
@@ -598,73 +587,74 @@ Transverse wave nature means::
 
 **/
 
-inline QSIM_METHOD void qsim::rayleigh_scatter(RNG& rng, sctx& ctx )
+inline QSIM_METHOD void qsim::rayleigh_scatter(RNG &rng, sctx &ctx)
 {
-    sphoton& p = ctx.p ;
-    float3 direction ;
-    float3 polarization ;
+    sphoton &p = ctx.p;
+    float3 direction;
+    float3 polarization;
 
-    bool looping(true) ;
+    bool looping(true);
     do
     {
-        float u0 = curand_uniform(&rng) ;
-        float u1 = curand_uniform(&rng) ;
-        float u2 = curand_uniform(&rng) ;
-        float u3 = curand_uniform(&rng) ;
-        float u4 = curand_uniform(&rng) ;
+        float u0 = curand_uniform(&rng);
+        float u1 = curand_uniform(&rng);
+        float u2 = curand_uniform(&rng);
+        float u3 = curand_uniform(&rng);
+        float u4 = curand_uniform(&rng);
 
 #if !defined(PRODUCTION) && defined(DEBUG_TAG)
-        stagr& tagr = ctx.tagr ;  // UNTESTED
+        stagr &tagr = ctx.tagr; // UNTESTED
         tagr.add(stag_sc, u0);
         tagr.add(stag_sc, u1);
         tagr.add(stag_sc, u2);
         tagr.add(stag_sc, u3);
         tagr.add(stag_sc, u4);
 #endif
-        float cosTheta = u0 ;
-        float sinTheta = sqrtf(1.0f-u0*u0);
-        if(u1 < 0.5f ) cosTheta = -cosTheta ;
+        float cosTheta = u0;
+        float sinTheta = sqrtf(1.0f - u0 * u0);
+        if (u1 < 0.5f)
+            cosTheta = -cosTheta;
         // could use uniform_sphere here : but not doing so to follow G4OpRayleigh more closely
 
-        float sinPhi ;
-        float cosPhi ;
+        float sinPhi;
+        float cosPhi;
 
-#if defined(MOCK_CURAND ) || defined(MOCK_CUDA)
+#if defined(MOCK_CURAND) || defined(MOCK_CUDA)
         //__sincosf(2.f*M_PIf*u2,&sinPhi,&cosPhi);   // apple extension
-        float phi = 2.f*M_PIf*u2 ;
+        float phi = 2.f * M_PIf * u2;
         sinPhi = sinf(phi);
         cosPhi = cosf(phi);
 #else
-        sincosf(2.f*M_PIf*u2,&sinPhi,&cosPhi);
+        sincosf(2.f * M_PIf * u2, &sinPhi, &cosPhi);
 #endif
 
         direction.x = sinTheta * cosPhi;
         direction.y = sinTheta * sinPhi;
-        direction.z = cosTheta ;
+        direction.z = cosTheta;
 
-        smath::rotateUz(direction, p.mom );
+        smath::rotateUz(direction, p.mom);
 
-        float constant = -dot(direction, p.pol );
+        float constant = -dot(direction, p.pol);
 
-        polarization.x = p.pol.x + constant*direction.x ;
-        polarization.y = p.pol.y + constant*direction.y ;
-        polarization.z = p.pol.z + constant*direction.z ;
+        polarization.x = p.pol.x + constant * direction.x;
+        polarization.y = p.pol.y + constant * direction.y;
+        polarization.z = p.pol.z + constant * direction.z;
 
-        if(dot(polarization, polarization) == 0.f )
+        if (dot(polarization, polarization) == 0.f)
         {
 
-#if defined( MOCK_CURAND ) || defined(MOCK_CUDA)
+#if defined(MOCK_CURAND) || defined(MOCK_CUDA)
             //__sincosf(2.f*M_PIf*u3,&sinPhi,&cosPhi);
-            phi = 2.f*M_PIf*u3 ;
+            phi = 2.f * M_PIf * u3;
             sinPhi = sinf(phi);
             cosPhi = cosf(phi);
 #else
-            sincosf(2.f*M_PIf*u3,&sinPhi,&cosPhi);
+            sincosf(2.f * M_PIf * u3, &sinPhi, &cosPhi);
 #endif
 
-            polarization.x = cosPhi ;
-            polarization.y = sinPhi ;
-            polarization.z = 0.f ;
+            polarization.x = cosPhi;
+            polarization.y = sinPhi;
+            polarization.z = 0.f;
 
             smath::rotateUz(polarization, direction);
         }
@@ -672,37 +662,41 @@ inline QSIM_METHOD void qsim::rayleigh_scatter(RNG& rng, sctx& ctx )
         {
             // There are two directions which are perpendicular
             // to the new momentum direction
-            if(u3 < 0.5f) polarization = -polarization ;
+            if (u3 < 0.5f)
+                polarization = -polarization;
         }
         polarization = normalize(polarization);
 
         // simulate according to the distribution cos^2(theta)
         // where theta is the angle between old and new polarizations
-        float doCosTheta = dot(polarization, p.pol ) ;
-        float doCosTheta2 = doCosTheta*doCosTheta ;
-        looping = doCosTheta2 < u4 ;
+        float doCosTheta = dot(polarization, p.pol);
+        float doCosTheta2 = doCosTheta * doCosTheta;
+        looping = doCosTheta2 < u4;
 
-    } while ( looping ) ;
+    } while (looping);
 
-    p.mom = direction ;
-    p.pol = polarization ;
+    p.mom = direction;
+    p.pol = polarization;
 }
 
-
 /**
 qsim::propagate_to_boundary
 ------------------------------
 
 +---------------------+------------------+---------------------------------------------------------+-------------------------------------------------------+
-| flag                |   command        |  changed                                                |  note                                                 |
+| flag                |   command        |  changed                                                |  note                                                 |
 +=====================+==================+=========================================================+=======================================================+
-|   BULK_REEMIT       |   CONTINUE       |  time, position, direction, polarization, wavelength    | advance to reemit position with everything changed    |
+|   BULK_REEMIT       |   CONTINUE       |  time, position, direction, polarization, wavelength    | advance to reemit                                     |
+|                     |                  |                                                         | position with everything changed                      |
 +---------------------+------------------+---------------------------------------------------------+-------------------------------------------------------+
-|   BULK_SCATTER      |   CONTINUE       |  time, position, direction, polarization                | advance to scatter position, new dir+pol              |
+|   BULK_SCATTER      |   CONTINUE       |  time, position, direction, polarization                | advance to scatter                                    |
+|                     |                  |                                                         | position, new dir+pol                                 |
 +---------------------+------------------+---------------------------------------------------------+-------------------------------------------------------+
-|   BULK_ABSORB       |   BREAK          |  time, position                                         | advance to absorption position, dir+pol unchanged     |
+|   BULK_ABSORB       |   BREAK          |  time, position                                         | advance to                                            |
+|                     |                  |                                                         | absorption position, dir+pol unchanged                |
 +---------------------+------------------+---------------------------------------------------------+-------------------------------------------------------+
-|   not set "SAIL"    |   BOUNDARY       |  time, position                                         | advanced to border position, dir+pol unchanged        |
+|   not set "SAIL"    |   BOUNDARY       |  time, position                                         | advanced to border                                    |
+|                     |                  |                                                         | position, dir+pol unchanged                           |
 +---------------------+------------------+---------------------------------------------------------+-------------------------------------------------------+
 
 
@@ -713,94 +707,156 @@ qsim::propagate_to_boundary
 
 **/
 
-
-
-inline QSIM_METHOD int qsim::propagate_to_boundary(unsigned& flag, RNG& rng, sctx& ctx)
+inline QSIM_METHOD int qsim::propagate_to_boundary(unsigned &flag, RNG &rng, sctx &ctx)
 {
-    sphoton& p = ctx.p ;
-    const sstate& s = ctx.s ;
-
-    const float& absorption_length = s.material1.y ;
-    const float& scattering_length = s.material1.z ;
-    const float& reemission_prob = s.material1.w ;
-    const float& group_velocity = s.m1group2.x ;
-    const float& distance_to_boundary = ctx.prd->q0.f.w ;
+    sphoton &p = ctx.p;
+    const sstate &s = ctx.s;
 
+    const float &absorption_length = s.material1.y;
+    const float &scattering_length = s.material1.z;
+    const float &reemission_prob = s.material1.w;
+    const float &group_velocity = s.m1group2.x;
+    const float &wls_absorption_length = s.m1group2.y;
+    const float &distance_to_boundary = ctx.prd->q0.f.w;
 
 #if !defined(PRODUCTION) && defined(DEBUG_TAG)
-    float u_to_sci = curand_uniform(&rng) ;  // purely for alignment with G4
-    float u_to_bnd = curand_uniform(&rng) ;  // purely for alignment with G4
+    float u_to_sci = curand_uniform(&rng); // purely for alignment with G4
+    float u_to_bnd = curand_uniform(&rng); // purely for alignment with G4
 #endif
-    float u_scattering = curand_uniform(&rng) ;
-    float u_absorption = curand_uniform(&rng) ;
+    float u_scattering = curand_uniform(&rng);
+    float u_absorption = curand_uniform(&rng);
+    float u_wls_absorption = (wls != nullptr) ? curand_uniform(&rng) : 2.f;
 
 #if !defined(PRODUCTION) && defined(DEBUG_TAG)
-    stagr& tagr = ctx.tagr ;
-    tagr.add( stag_to_sci, u_to_sci);
-    tagr.add( stag_to_bnd, u_to_bnd);
-    tagr.add( stag_to_sca, u_scattering);
-    tagr.add( stag_to_abs, u_absorption);
+    stagr &tagr = ctx.tagr;
+    tagr.add(stag_to_sci, u_to_sci);
+    tagr.add(stag_to_bnd, u_to_bnd);
+    tagr.add(stag_to_sca, u_scattering);
+    tagr.add(stag_to_abs, u_absorption);
 #endif
 
-
 #if !defined(PRODUCTION) && defined(DEBUG_LOGF)
-    // see notes/issues/U4LogTest_maybe_replacing_G4Log_G4UniformRand_in_Absorption_and_Scattering_with_float_version_will_avoid_deviations.rst
-    float scattering_distance = -scattering_length*KLUDGE_FASTMATH_LOGF(u_scattering);
-    float absorption_distance = -absorption_length*KLUDGE_FASTMATH_LOGF(u_absorption);
+    // see
+    // notes/issues/U4LogTest_maybe_replacing_G4Log_G4UniformRand_in_Absorption_and_Scattering_with_float_version_will_avoid_deviations.rst
+    float scattering_distance = -scattering_length * KLUDGE_FASTMATH_LOGF(u_scattering);
+    float absorption_distance = -absorption_length * KLUDGE_FASTMATH_LOGF(u_absorption);
+    float wls_absorption_distance = -wls_absorption_length * KLUDGE_FASTMATH_LOGF(u_wls_absorption);
 #else
-    float scattering_distance = -scattering_length*logf(u_scattering);
-    float absorption_distance = -absorption_length*logf(u_absorption);
+    float scattering_distance = -scattering_length * logf(u_scattering);
+    float absorption_distance = -absorption_length * logf(u_absorption);
+    float wls_absorption_distance = -wls_absorption_length * logf(u_wls_absorption);
 #endif
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
 
-    if(ctx.pidx == base->pidx)
+    if (ctx.pidx == base->pidx)
     {
-    printf("//qsim.propagate_to_boundary.head pidx %7lld : u_absorption %10.8f logf(u_absorption) %10.8f absorption_length %10.4f absorption_distance %10.6f \n",
-        ctx.pidx, u_absorption, logf(u_absorption), absorption_length, absorption_distance );
+        printf("//qsim.propagate_to_boundary.head pidx %7lld : u_absorption %10.8f logf(u_absorption) %10.8f "
+               "absorption_length %10.4f absorption_distance %10.6f \n",
+               ctx.pidx, u_absorption, logf(u_absorption), absorption_length, absorption_distance);
 
-    printf("//qsim.propagate_to_boundary.head pidx %7lld : post = np.array([%10.5f,%10.5f,%10.5f,%10.5f]) \n",
-        ctx.pidx, p.pos.x, p.pos.y, p.pos.z, p.time );
+        printf("//qsim.propagate_to_boundary.head pidx %7lld : post = np.array([%10.5f,%10.5f,%10.5f,%10.5f]) \n",
+               ctx.pidx, p.pos.x, p.pos.y, p.pos.z, p.time);
 
-    printf("//qsim.propagate_to_boundary.head pidx %7lld : distance_to_boundary %10.4f absorption_distance %10.4f scattering_distance %10.4f \n",
-             ctx.pidx, distance_to_boundary, absorption_distance, scattering_distance );
-
-    printf("//qsim.propagate_to_boundary.head pidx %7lld : u_scattering %10.4f u_absorption %10.4f \n",
-             ctx.pidx, u_scattering, u_absorption  );
+        printf("//qsim.propagate_to_boundary.head pidx %7lld : distance_to_boundary %10.4f absorption_distance %10.4f "
+               "scattering_distance %10.4f \n",
+               ctx.pidx, distance_to_boundary, absorption_distance, scattering_distance);
 
+        printf("//qsim.propagate_to_boundary.head pidx %7lld : u_scattering %10.4f u_absorption %10.4f \n", ctx.pidx,
+               u_scattering, u_absorption);
     }
 #endif
 
+    // WLS absorption competes with regular absorption and Rayleigh scattering.
+    // The process with the shortest sampled distance wins.
+    bool wls_wins = wls_absorption_distance <= absorption_distance && wls_absorption_distance <= scattering_distance;
+
+    if (wls != nullptr && wls_wins && wls_absorption_distance <= distance_to_boundary)
+    {
+        // WLS ABSORPTION: photon absorbed by wavelength shifting material
+        p.time += wls_absorption_distance / group_velocity;
+        p.pos += wls_absorption_distance * (p.mom);
+
+        unsigned mat_idx = s.index.x - 1u; // 0-based material index from 1-based optical index
 
+        if (wls->has_wls(mat_idx))
+        {
+            // Sample re-emitted wavelength from WLS emission spectrum ICDF
+            float u_wls_wl = curand_uniform(&rng);
+            float new_wavelength = wls->wavelength(mat_idx, u_wls_wl);
+
+            // Energy conservation: re-emitted photon must have lower energy (longer wavelength).
+            // Matches G4OpWLS algorithm: retry up to 100 times.
+            int attempts = 0;
+            while (new_wavelength < p.wavelength && attempts < 100)
+            {
+                u_wls_wl = curand_uniform(&rng);
+                new_wavelength = wls->wavelength(mat_idx, u_wls_wl);
+                attempts++;
+            }
+
+            if (new_wavelength < p.wavelength)
+            {
+                // Failed energy conservation after 100 attempts — absorb without re-emission
+                flag = BULK_ABSORB;
+                return BREAK;
+            }
 
+            p.wavelength = new_wavelength;
 
+            // Isotropic re-emission direction and random polarization
+            float u_wls_mom_ph = curand_uniform(&rng);
+            float u_wls_mom_ct = curand_uniform(&rng);
+            float u_wls_pol_ph = curand_uniform(&rng);
+            float u_wls_pol_ct = curand_uniform(&rng);
 
-    if (absorption_distance <= scattering_distance)
+            p.mom = uniform_sphere(u_wls_mom_ph, u_wls_mom_ct);
+            p.pol = normalize(cross(uniform_sphere(u_wls_pol_ph, u_wls_pol_ct), p.mom));
+
+            // Apply WLS time delay (exponential decay)
+            float tc = wls->time_constant(mat_idx);
+            if (tc > 0.f)
+            {
+                float u_wls_time = curand_uniform(&rng);
+                p.time += -tc * logf(u_wls_time);
+            }
+
+            flag = BULK_REEMIT;
+            return CONTINUE;
+        }
+        else
+        {
+            // Material map says no WLS — treat as regular absorption
+            flag = BULK_ABSORB;
+            return BREAK;
+        }
+    }
+    else if (absorption_distance <= scattering_distance)
     {
         if (absorption_distance <= distance_to_boundary)
         {
-            p.time += absorption_distance/group_velocity ;
-            p.pos  += absorption_distance*(p.mom) ;
-
+            p.time += absorption_distance / group_velocity;
+            p.pos += absorption_distance * (p.mom);
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-            float absorb_time_delta = absorption_distance/group_velocity ;
-            if( ctx.pidx == base->pidx )
+            float absorb_time_delta = absorption_distance / group_velocity;
+            if (ctx.pidx == base->pidx)
             {
-            printf("//qsim.propagate_to_boundary.body.BULK_ABSORB pidx %7lld : post = np.array([%10.5f,%10.5f,%10.5f,%10.5f]) ; absorb_time_delta = %10.8f   \n",
-                    ctx.pidx, p.pos.x, p.pos.y, p.pos.z, p.time, absorb_time_delta  );
-
+                printf("//qsim.propagate_to_boundary.body.BULK_ABSORB pidx %7lld : post = "
+                       "np.array([%10.5f,%10.5f,%10.5f,%10.5f]) ; absorb_time_delta = %10.8f   \n",
+                       ctx.pidx, p.pos.x, p.pos.y, p.pos.z, p.time, absorb_time_delta);
             }
 #endif
 
-            float u_reemit = reemission_prob == 0.f ? 2.f : curand_uniform(&rng);  // avoid consumption at absorption when not scintillator
-
+            float u_reemit = reemission_prob == 0.f
+                                 ? 2.f
+                                 : curand_uniform(&rng); // avoid consumption at absorption when not scintillator
 
 #if !defined(PRODUCTION) && defined(DEBUG_TAG)
-            if( u_reemit != 2.f ) tagr.add( stag_to_ree, u_reemit) ;
+            if (u_reemit != 2.f)
+                tagr.add(stag_to_ree, u_reemit);
 #endif
 
-
             if (u_reemit < reemission_prob)
             {
                 float u_re_wavelength = curand_uniform(&rng);
@@ -814,19 +870,19 @@ inline QSIM_METHOD int qsim::propagate_to_boundary(unsigned& flag, RNG& rng, sct
                 p.pol = normalize(cross(uniform_sphere(u_re_pol_ph, u_re_pol_ct), p.mom));
 
 #if !defined(PRODUCTION) && defined(DEBUG_TAG)
-                tagr.add( stag_re_wl, u_re_wavelength);
-                tagr.add( stag_re_mom_ph, u_re_mom_ph);
-                tagr.add( stag_re_mom_ct, u_re_mom_ct);
-                tagr.add( stag_re_pol_ph, u_re_pol_ph);
-                tagr.add( stag_re_pol_ct, u_re_pol_ct);
+                tagr.add(stag_re_wl, u_re_wavelength);
+                tagr.add(stag_re_mom_ph, u_re_mom_ph);
+                tagr.add(stag_re_mom_ct, u_re_mom_ct);
+                tagr.add(stag_re_pol_ph, u_re_pol_ph);
+                tagr.add(stag_re_pol_ct, u_re_pol_ct);
 #endif
 
-                flag = BULK_REEMIT ;
+                flag = BULK_REEMIT;
                 return CONTINUE;
             }
             else
             {
-                flag = BULK_ABSORB ;
+                flag = BULK_ABSORB;
                 return BREAK;
             }
         }
@@ -836,33 +892,33 @@ inline QSIM_METHOD int qsim::propagate_to_boundary(unsigned& flag, RNG& rng, sct
     {
         if (scattering_distance <= distance_to_boundary)
         {
-            p.time += scattering_distance/group_velocity ;
-            p.pos  += scattering_distance*(p.mom) ;
+            p.time += scattering_distance / group_velocity;
+            p.pos += scattering_distance * (p.mom);
 
-            rayleigh_scatter(rng, ctx);  // changes dir and pol, consumes 5u at each turn of rejection sampling loop
+            rayleigh_scatter(rng, ctx); // changes dir and pol, consumes 5u at each turn of rejection sampling loop
 
             flag = BULK_SCATTER;
 
             return CONTINUE;
         }
-          //  otherwise sail to boundary
-    }     // if scattering_distance < absorption_distance
-
-
+        //  otherwise sail to boundary
+    } // if scattering_distance < absorption_distance
 
-    p.pos  += distance_to_boundary*(p.mom) ;
-    p.time += distance_to_boundary/group_velocity   ;
+    p.pos += distance_to_boundary * (p.mom);
+    p.time += distance_to_boundary / group_velocity;
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    float sail_time_delta = distance_to_boundary/group_velocity ;
-    if( ctx.pidx == base->pidx ) printf("//qsim.propagate_to_boundary.tail.SAIL pidx %7lld : post = np.array([%10.5f,%10.5f,%10.5f,%10.5f]) ;  sail_time_delta = %10.5f   \n",
-          ctx.pidx, p.pos.x, p.pos.y, p.pos.z, p.time, sail_time_delta  );
+    float sail_time_delta = distance_to_boundary / group_velocity;
+    if (ctx.pidx == base->pidx)
+        printf("//qsim.propagate_to_boundary.tail.SAIL pidx %7lld : post = np.array([%10.5f,%10.5f,%10.5f,%10.5f]) ;  "
+               "sail_time_delta = %10.5f   \n",
+               ctx.pidx, p.pos.x, p.pos.y, p.pos.z, p.time, sail_time_delta);
 #endif
 
-    return BOUNDARY ;
+    return BOUNDARY;
 }
 #endif
-#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
+#if defined(__CUDACC__) || defined(__CUDABE__) || defined(MOCK_CURAND) || defined(MOCK_CUDA)
 /**
 qsim::propagate_at_boundary
 ------------------------------------------
@@ -995,74 +1051,88 @@ incidence.
 
 **/
 
-inline QSIM_METHOD int qsim::propagate_at_boundary(unsigned& flag, RNG& rng, sctx& ctx, float theTransmittance ) const
+inline QSIM_METHOD int qsim::propagate_at_boundary(unsigned &flag, RNG &rng, sctx &ctx, float theTransmittance) const
 {
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if(ctx.pidx == base->pidx)
-    printf("//propagate_at_boundary.DEBUG_PIDX ctx.pidx %7lld base %p base.pidx %7lld \n", ctx.pidx, base, base->pidx  );
+    if (ctx.pidx == base->pidx)
+        printf("//propagate_at_boundary.DEBUG_PIDX ctx.pidx %7lld base %p base.pidx %7lld \n", ctx.pidx, base,
+               base->pidx);
 #endif
 
 #if !defined(PRODUCTION) && defined(DEBUG_TAG)
-    if(ctx.pidx == base->pidx)
-    printf("//propagate_at_boundary.DEBUG_TAG ctx.pidx %7lld base %p base.pidx %7lld \n", ctx.pidx, base, base->pidx  );
+    if (ctx.pidx == base->pidx)
+        printf("//propagate_at_boundary.DEBUG_TAG ctx.pidx %7lld base %p base.pidx %7lld \n", ctx.pidx, base,
+               base->pidx);
 #endif
     // stray "return 0;" left here 2024-12-14 caused : ~/j/issues/jok-tds-missing-BR-BT-on-A-side.rst
 
-    sphoton& p = ctx.p ;
-    const sstate& s = ctx.s ;
+    sphoton &p = ctx.p;
+    const sstate &s = ctx.s;
 
-    const float& n1 = s.material1.x ;
-    const float& n2 = s.material2.x ;
-    const float eta = n1/n2 ;
+    const float &n1 = s.material1.x;
+    const float &n2 = s.material2.x;
+    const float eta = n1 / n2;
 
-    const float3* normal = (float3*)&ctx.prd->q0.f.x ;     // geometrical outwards normal
+    const float3 *normal = (float3 *)&ctx.prd->q0.f.x; // geometrical outwards normal
 
-    const float _c1 = -dot(p.mom, *normal );                // _c1 : cos(angle_of_incidence) not yet oriented
-    const float3 oriented_normal = _c1 < 0.f ? -(*normal) : (*normal) ; // oriented against incident p.mom
-    const float3 trans = cross(p.mom, oriented_normal) ;   // perpendicular to plane of incidence, S-pol direction
-    const float trans_length = length(trans) ;             // same as sin(theta), as p.mom and oriented_normal are unit vectors
-    const bool normal_incidence = trans_length < 1e-6f  ;  // p.mom parallel/anti-parallel to oriented_normal
-    const float3 A_trans = normal_incidence ? p.pol : trans/trans_length ; // normalized unit vector : perpendicular to plane of incidence
-    const float E1_perp = dot(p.pol, A_trans);     // amplitude of polarization in direction perpendicular to plane of incidence, ie S polarization
+    const float _c1 = -dot(p.mom, *normal);                            // _c1 : cos(angle_of_incidence) not yet oriented
+    const float3 oriented_normal = _c1 < 0.f ? -(*normal) : (*normal); // oriented against incident p.mom
+    const float3 trans = cross(p.mom, oriented_normal); // perpendicular to plane of incidence, S-pol direction
+    const float trans_length = length(trans); // same as sin(theta), as p.mom and oriented_normal are unit vectors
+    const bool normal_incidence = trans_length < 1e-6f; // p.mom parallel/anti-parallel to oriented_normal
+    const float3 A_trans =
+        normal_incidence ? p.pol : trans / trans_length; // normalized unit vector : perpendicular to plane of incidence
+    const float E1_perp =
+        dot(p.pol,
+            A_trans); // amplitude of polarization in direction perpendicular to plane of incidence, ie S polarization
 
-    const float c1 = fabs(_c1) ;
+    const float c1 = fabs(_c1);
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if(ctx.pidx == base->pidx)
+    if (ctx.pidx == base->pidx)
     {
-    printf("//qsim.propagate_at_boundary.head pidx %7lld : theTransmittance = %10.8f \n", ctx.pidx, theTransmittance  );
-    printf("//qsim.propagate_at_boundary.head pidx %7lld : nrm = np.array([%10.8f,%10.8f,%10.8f]) ; lnrm = %10.8f  \n",
-         ctx.pidx, oriented_normal.x, oriented_normal.y, oriented_normal.z, length(oriented_normal) );
-    printf("//qsim.propagate_at_boundary.head pidx %7lld : pos = np.array([%10.5f,%10.5f,%10.5f]) ; lpos = %10.8f \n",
-         ctx.pidx, p.pos.x, p.pos.y, p.pos.z, length(p.pos) );
-    printf("//qsim.propagate_at_boundary.head pidx %7lld : mom0 = np.array([%10.8f,%10.8f,%10.8f]) ; lmom0 = %10.8f \n",
-         ctx.pidx, p.mom.x, p.mom.y, p.mom.z, length(p.mom)  );
-    printf("//qsim.propagate_at_boundary.head pidx %7lld : pol0 = np.array([%10.8f,%10.8f,%10.8f]) ; lpol0 = %10.8f \n",
-          ctx.pidx, p.pol.x, p.pol.y, p.pol.z, length(p.pol)  );
-    printf("//qsim.propagate_at_boundary.head pidx %7lld : n1,n2,eta = (%10.8f,%10.8f,%10.8f) \n", ctx.pidx, n1, n2, eta );
-    printf("//qsim.propagate_at_boundary.head pidx %7lld : c1 = %10.8f ; normal_incidence = %d \n", ctx.pidx, c1, normal_incidence );
+        printf("//qsim.propagate_at_boundary.head pidx %7lld : theTransmittance = %10.8f \n", ctx.pidx,
+               theTransmittance);
+        printf(
+            "//qsim.propagate_at_boundary.head pidx %7lld : nrm = np.array([%10.8f,%10.8f,%10.8f]) ; lnrm = %10.8f  \n",
+            ctx.pidx, oriented_normal.x, oriented_normal.y, oriented_normal.z, length(oriented_normal));
+        printf(
+            "//qsim.propagate_at_boundary.head pidx %7lld : pos = np.array([%10.5f,%10.5f,%10.5f]) ; lpos = %10.8f \n",
+            ctx.pidx, p.pos.x, p.pos.y, p.pos.z, length(p.pos));
+        printf("//qsim.propagate_at_boundary.head pidx %7lld : mom0 = np.array([%10.8f,%10.8f,%10.8f]) ; lmom0 = "
+               "%10.8f \n",
+               ctx.pidx, p.mom.x, p.mom.y, p.mom.z, length(p.mom));
+        printf("//qsim.propagate_at_boundary.head pidx %7lld : pol0 = np.array([%10.8f,%10.8f,%10.8f]) ; lpol0 = "
+               "%10.8f \n",
+               ctx.pidx, p.pol.x, p.pol.y, p.pol.z, length(p.pol));
+        printf("//qsim.propagate_at_boundary.head pidx %7lld : n1,n2,eta = (%10.8f,%10.8f,%10.8f) \n", ctx.pidx, n1, n2,
+               eta);
+        printf("//qsim.propagate_at_boundary.head pidx %7lld : c1 = %10.8f ; normal_incidence = %d \n", ctx.pidx, c1,
+               normal_incidence);
     }
 #endif
 
-    const float c2c2 = 1.f - eta*eta*(1.f - c1 * c1 ) ;   // Snells law and trig identity
-    bool tir = c2c2 < 0.f ;
-    const float EdotN = dot(p.pol, oriented_normal ) ;  // used for TIR polarization
-    const float c2 = tir ? 0.f : sqrtf(c2c2) ;   // c2 chosen +ve, set to 0.f for TIR => reflection_coefficient = 1.0f : so will always reflect
-    const float n1c1 = n1*c1 ;
-    const float n2c2 = n2*c2 ;
-    const float n2c1 = n2*c1 ;
-    const float n1c2 = n1*c2 ;
-
-    const float2 E1   = normal_incidence ? make_float2( 0.f, 1.f) : make_float2( E1_perp , length( p.pol - (E1_perp*A_trans) ) );
-    const float2 E2_t = make_float2(  2.f*n1c1*E1.x/(n1c1+n2c2), 2.f*n1c1*E1.y/(n2c1+n1c2) ) ;  // ( S:perp, P:parl )
-    const float2 E2_r = make_float2( E2_t.x - E1.x             , (n2*E2_t.y/n1) - E1.y     ) ;  // ( S:perp, P:parl )
-    const float2 RR = normalize(E2_r) ;
-    const float2 TT = normalize(E2_t) ;
-    const float TransCoeff = theTransmittance >= 0.f ?
-                                                           theTransmittance
-                                                     :
-                                                           ( tir || n1c1 == 0.f ? 0.f : n2c2*dot(E2_t,E2_t)/n1c1 )
-                                                     ;
+    const float c2c2 = 1.f - eta * eta * (1.f - c1 * c1); // Snells law and trig identity
+    bool tir = c2c2 < 0.f;
+    const float EdotN = dot(p.pol, oriented_normal); // used for TIR polarization
+    const float c2 =
+        tir ? 0.f
+            : sqrtf(
+                  c2c2); // c2 chosen +ve, set to 0.f for TIR => reflection_coefficient = 1.0f : so will always reflect
+    const float n1c1 = n1 * c1;
+    const float n2c2 = n2 * c2;
+    const float n2c1 = n2 * c1;
+    const float n1c2 = n1 * c2;
+
+    const float2 E1 =
+        normal_incidence ? make_float2(0.f, 1.f) : make_float2(E1_perp, length(p.pol - (E1_perp * A_trans)));
+    const float2 E2_t =
+        make_float2(2.f * n1c1 * E1.x / (n1c1 + n2c2), 2.f * n1c1 * E1.y / (n2c1 + n1c2)); // ( S:perp, P:parl )
+    const float2 E2_r = make_float2(E2_t.x - E1.x, (n2 * E2_t.y / n1) - E1.y);             // ( S:perp, P:parl )
+    const float2 RR = normalize(E2_r);
+    const float2 TT = normalize(E2_t);
+    const float TransCoeff =
+        theTransmittance >= 0.f ? theTransmittance : (tir || n1c1 == 0.f ? 0.f : n2c2 * dot(E2_t, E2_t) / n1c1);
 
     /*
     E1, E2_t, E2_t: incident, transmitted and reflected amplitudes in S and P directions
@@ -1070,138 +1140,128 @@ inline QSIM_METHOD int qsim::propagate_at_boundary(unsigned& flag, RNG& rng, sct
     */
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if(ctx.pidx == base->pidx)
+    if (ctx.pidx == base->pidx)
     {
-    printf("//qsim.propagate_at_boundary.body pidx %7lld : TransCoeff = %10.8f ; n1c1 = %10.8f ; n2c2 = %10.8f \n",
-            ctx.pidx, TransCoeff, n1c1, n2c2 );
-
-    printf("//qsim.propagate_at_boundary.body pidx %7lld : E2_t = np.array([%10.8f,%10.8f]) ; lE2_t = %10.8f \n",
-            ctx.pidx,  E2_t.x, E2_t.y, length(E2_t) );
+        printf("//qsim.propagate_at_boundary.body pidx %7lld : TransCoeff = %10.8f ; n1c1 = %10.8f ; n2c2 = %10.8f \n",
+               ctx.pidx, TransCoeff, n1c1, n2c2);
 
-    printf("//qsim.propagate_at_boundary.body pidx %7lld : A_trans = np.array([%10.8f,%10.8f,%10.8f]) ; lA_trans = %10.8f \n",
-            ctx.pidx,  A_trans.x, A_trans.y, A_trans.z, length(A_trans) );
+        printf("//qsim.propagate_at_boundary.body pidx %7lld : E2_t = np.array([%10.8f,%10.8f]) ; lE2_t = %10.8f \n",
+               ctx.pidx, E2_t.x, E2_t.y, length(E2_t));
 
+        printf("//qsim.propagate_at_boundary.body pidx %7lld : A_trans = np.array([%10.8f,%10.8f,%10.8f]) ; lA_trans = "
+               "%10.8f \n",
+               ctx.pidx, A_trans.x, A_trans.y, A_trans.z, length(A_trans));
     }
 #endif
 
-
 #if !defined(PRODUCTION) && defined(DEBUG_TAG)
-    const float u_boundary_burn = curand_uniform(&rng) ;  // needed for random consumption alignment with Geant4 G4OpBoundaryProcess::PostStepDoIt
+    const float u_boundary_burn =
+        curand_uniform(&rng); // needed for random consumption alignment with Geant4 G4OpBoundaryProcess::PostStepDoIt
 #endif
-    const float u_reflect = curand_uniform(&rng) ;
-    bool reflect = u_reflect > TransCoeff  ;
+    const float u_reflect = curand_uniform(&rng);
+    bool reflect = u_reflect > TransCoeff;
 
 #if !defined(PRODUCTION) && defined(DEBUG_TAG)
-    stagr& tagr = ctx.tagr ;
-    tagr.add( stag_at_burn_sf_sd, u_boundary_burn);
-    tagr.add( stag_at_ref,  u_reflect);
+    stagr &tagr = ctx.tagr;
+    tagr.add(stag_at_burn_sf_sd, u_boundary_burn);
+    tagr.add(stag_at_ref, u_reflect);
 #endif
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if(ctx.pidx == base->pidx)
+    if (ctx.pidx == base->pidx)
     {
-    printf("//qsim.propagate_at_boundary.body pidx %7lld : u_reflect %10.4f TransCoeff %10.4f reflect %d \n",
-              ctx.pidx,  u_reflect, TransCoeff, reflect   );
-
-    printf("//qsim.propagate_at_boundary.body pidx %7lld : mom0 = np.array([%10.8f,%10.8f,%10.8f]) ; lmom0 = %10.8f \n",
-               ctx.pidx, p.mom.x, p.mom.y, p.mom.z, length(p.mom) ) ;
+        printf("//qsim.propagate_at_boundary.body pidx %7lld : u_reflect %10.4f TransCoeff %10.4f reflect %d \n",
+               ctx.pidx, u_reflect, TransCoeff, reflect);
 
-    printf("//qsim.propagate_at_boundary.body pidx %7lld : pos = np.array([%10.5f,%10.5f,%10.5f]) ; lpos = %10.8f \n",
-               ctx.pidx, p.pos.x, p.pos.y, p.pos.z, length(p.pos)  );
+        printf("//qsim.propagate_at_boundary.body pidx %7lld : mom0 = np.array([%10.8f,%10.8f,%10.8f]) ; lmom0 = "
+               "%10.8f \n",
+               ctx.pidx, p.mom.x, p.mom.y, p.mom.z, length(p.mom));
 
-    printf("//qsim.propagate_at_boundary.body pidx %7lld : nrm = np.array([%10.8f,%10.8f,%10.8f]) ; lnrm = %10.8f \n",
-               ctx.pidx, oriented_normal.x, oriented_normal.y, oriented_normal.z, length(oriented_normal) ) ;
+        printf(
+            "//qsim.propagate_at_boundary.body pidx %7lld : pos = np.array([%10.5f,%10.5f,%10.5f]) ; lpos = %10.8f \n",
+            ctx.pidx, p.pos.x, p.pos.y, p.pos.z, length(p.pos));
 
-    printf("//qsim.propagate_at_boundary.body pidx %7lld : n1 = %10.8f ; n2 = %10.8f ; eta = %10.8f  \n",
-               ctx.pidx, n1, n2, eta );
+        printf(
+            "//qsim.propagate_at_boundary.body pidx %7lld : nrm = np.array([%10.8f,%10.8f,%10.8f]) ; lnrm = %10.8f \n",
+            ctx.pidx, oriented_normal.x, oriented_normal.y, oriented_normal.z, length(oriented_normal));
 
-    printf("//qsim.propagate_at_boundary.body pidx %7lld : c1 = %10.8f ; eta_c1 = %10.8f ; c2 = %10.8f ; eta_c1__c2 = %10.8f \n",
-               ctx.pidx, c1, eta*c1, c2, (eta*c1 - c2) );
+        printf("//qsim.propagate_at_boundary.body pidx %7lld : n1 = %10.8f ; n2 = %10.8f ; eta = %10.8f  \n", ctx.pidx,
+               n1, n2, eta);
 
+        printf("//qsim.propagate_at_boundary.body pidx %7lld : c1 = %10.8f ; eta_c1 = %10.8f ; c2 = %10.8f ; "
+               "eta_c1__c2 = %10.8f \n",
+               ctx.pidx, c1, eta * c1, c2, (eta * c1 - c2));
     }
 #endif
 
-    p.mom = reflect
-                    ?
-                       p.mom + 2.0f*c1*oriented_normal
-                    :
-                       eta*(p.mom) + (eta*c1 - c2)*oriented_normal
-                    ;
-
+    p.mom = reflect ? p.mom + 2.0f * c1 * oriented_normal : eta * (p.mom) + (eta * c1 - c2) * oriented_normal;
 
     // Q: Does the new p.mom need to be normalized ?
     // A: NO, it is inherently normalized as derived in the comment below
 
+    const float3 A_paral = normalize(cross(p.mom, A_trans)); // new P-pol direction
 
-    const float3 A_paral = normalize(cross(p.mom, A_trans));   // new P-pol direction
-
-    p.pol =  normal_incidence ?
-                                         ( reflect ?  p.pol*(n2>n1? -1.f:1.f) : p.pol )
-                                      :
-                                         ( reflect ?
-                                                   ( tir ?  -p.pol + 2.f*EdotN*oriented_normal : RR.x*A_trans + RR.y*A_paral )
-
-                                                   :
-                                                       TT.x*A_trans + TT.y*A_paral
-
-                                                   )
-                                      ;
-
+    p.pol = normal_incidence
+                ? (reflect ? p.pol * (n2 > n1 ? -1.f : 1.f) : p.pol)
+                : (reflect ? (tir ? -p.pol + 2.f * EdotN * oriented_normal : RR.x * A_trans + RR.y * A_paral)
 
+                           : TT.x * A_trans + TT.y * A_paral
 
-     // Q: Above expression kinda implies A_trans and A_paral are same for reflect and transmit ?
-     // A: NO IT DOESNT,
-     //    A_trans is the same (except for normal incidence) as there is only one perpendicular
-     //    to the plane of incidence which is the same for i,r,t.
-     //
-     //    A_paral depends on the new p.mom (is has to be orthogonal to p.mom and A_trans)
-     //    and p.mom of course is different for r and t
-     //    (the reflect bool is used in multiple places, not just here)
-
+                  );
 
+    // Q: Above expression kinda implies A_trans and A_paral are same for reflect and transmit ?
+    // A: NO IT DOESNT,
+    //    A_trans is the same (except for normal incidence) as there is only one perpendicular
+    //    to the plane of incidence which is the same for i,r,t.
+    //
+    //    A_paral depends on the new p.mom (it has to be orthogonal to p.mom and A_trans)
+    //    and p.mom of course is different for r and t
+    //    (the reflect bool is used in multiple places, not just here)
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if(ctx.pidx == base->pidx)
+    if (ctx.pidx == base->pidx)
     {
-    printf("//qsim.propagate_at_boundary.tail pidx %7lld : reflect %d tir %d TransCoeff %10.4f u_reflect %10.4f \n", ctx.pidx, reflect, tir, TransCoeff, u_reflect );
-    printf("//qsim.propagate_at_boundary.tail pidx %7lld : mom1 = np.array([%10.8f,%10.8f,%10.8f]) ; lmom1 = %10.8f  \n",
-        ctx.pidx, p.mom.x, p.mom.y, p.mom.z, length(p.mom) );
-    printf("//qsim.propagate_at_boundary.tail pidx %7lld : pol1 = np.array([%10.8f,%10.8f,%10.8f]) ; lpol1 = %10.8f \n",
-        ctx.pidx, p.pol.x, p.pol.y, p.pol.z, length(p.pol) );
+        printf("//qsim.propagate_at_boundary.tail pidx %7lld : reflect %d tir %d TransCoeff %10.4f u_reflect %10.4f \n",
+               ctx.pidx, reflect, tir, TransCoeff, u_reflect);
+        printf("//qsim.propagate_at_boundary.tail pidx %7lld : mom1 = np.array([%10.8f,%10.8f,%10.8f]) ; lmom1 = "
+               "%10.8f  \n",
+               ctx.pidx, p.mom.x, p.mom.y, p.mom.z, length(p.mom));
+        printf("//qsim.propagate_at_boundary.tail pidx %7lld : pol1 = np.array([%10.8f,%10.8f,%10.8f]) ; lpol1 = "
+               "%10.8f \n",
+               ctx.pidx, p.pol.x, p.pol.y, p.pol.z, length(p.pol));
     }
 
     /*
     if(ctx.pidx == 251959)
     {
-        printf("//qsim.propagate_at_boundary RR.x %10.4f A_trans (%10.4f %10.4f %10.4f )  RR.y %10.4f  A_paral (%10.4f %10.4f %10.4f ) \n",
-              RR.x, A_trans.x, A_trans.y, A_trans.z,
-              RR.y, A_paral.x, A_paral.y, A_paral.z );
+        printf("//qsim.propagate_at_boundary RR.x %10.4f A_trans (%10.4f %10.4f %10.4f )  RR.y %10.4f  A_paral (%10.4f
+    %10.4f %10.4f ) \n", RR.x, A_trans.x, A_trans.y, A_trans.z, RR.y, A_paral.x, A_paral.y, A_paral.z );
 
-        printf("//qsim.propagate_at_boundary reflect %d  tir %d polarization (%10.4f, %10.4f, %10.4f) \n", reflect, tir, p.pol.x, p.pol.y, p.pol.z );
+        printf("//qsim.propagate_at_boundary reflect %d  tir %d polarization (%10.4f, %10.4f, %10.4f) \n", reflect, tir,
+    p.pol.x, p.pol.y, p.pol.z );
     }
     */
 #endif
 
-    flag = reflect ? BOUNDARY_REFLECT : BOUNDARY_TRANSMIT ;
-
+    flag = reflect ? BOUNDARY_REFLECT : BOUNDARY_TRANSMIT;
 
 #if !defined(PRODUCTION) && defined(DEBUG_TAG)
-    if( flag ==  BOUNDARY_REFLECT )
+    if (flag == BOUNDARY_REFLECT)
     {
-        const float u_br_align_0 = curand_uniform(&rng) ;
-        const float u_br_align_1 = curand_uniform(&rng) ;
-        const float u_br_align_2 = curand_uniform(&rng) ;
-        const float u_br_align_3 = curand_uniform(&rng) ;
+        const float u_br_align_0 = curand_uniform(&rng);
+        const float u_br_align_1 = curand_uniform(&rng);
+        const float u_br_align_2 = curand_uniform(&rng);
+        const float u_br_align_3 = curand_uniform(&rng);
 
         // switched below standard tags from stag_br_align_0/1/2/3 to simplify A:tag to B:stack mapping
-        tagr.add( stag_to_sci, u_br_align_0 );
-        tagr.add( stag_to_bnd, u_br_align_1 );
-        tagr.add( stag_to_sca, u_br_align_2 );
-        tagr.add( stag_to_abs, u_br_align_3 );
+        tagr.add(stag_to_sci, u_br_align_0);
+        tagr.add(stag_to_bnd, u_br_align_1);
+        tagr.add(stag_to_sca, u_br_align_2);
+        tagr.add(stag_to_abs, u_br_align_3);
     }
 #endif
 
-    return CONTINUE ;
+    return CONTINUE;
 }
 /**
 qsim::propagate_at_boundary_with_T
@@ -1221,93 +1281,83 @@ and leave this just for testing.
 
 **/
 
-inline QSIM_METHOD int qsim::propagate_at_boundary_with_T(unsigned& flag, RNG& rng, sctx& ctx, float theTransmittance ) const
+inline QSIM_METHOD int qsim::propagate_at_boundary_with_T(unsigned &flag, RNG &rng, sctx &ctx,
+                                                          float theTransmittance) const
 {
-    sphoton& p = ctx.p ;
-    const sstate& s = ctx.s ;
-
-    const float& n1 = s.material1.x ;
-    const float& n2 = s.material2.x ;
-    const float eta = n1/n2 ;
-
-    const float3* normal = (float3*)&ctx.prd->q0.f.x ;     // geometrical outwards normal
-
-    const float _c1 = -dot(p.mom, *normal );                // _c1 : cos(angle_of_incidence) not yet oriented
-    const float3 oriented_normal = _c1 < 0.f ? -(*normal) : (*normal) ; // oriented against incident p.mom
-    const float3 trans = cross(p.mom, oriented_normal) ;   // perpendicular to plane of incidence, S-pol direction
-    const float trans_length = length(trans) ;             // same as sin(theta), as p.mom and oriented_normal are unit vectors
-    const bool normal_incidence = trans_length < 1e-6f  ;  // p.mom parallel/anti-parallel to oriented_normal
-    const float3 A_trans = normal_incidence ? p.pol : trans/trans_length ; // normalized unit vector : perpendicular to plane of incidence
-    const float E1_perp = dot(p.pol, A_trans);     // amplitude of polarization in direction perpendicular to plane of incidence, ie S polarization
+    sphoton &p = ctx.p;
+    const sstate &s = ctx.s;
+
+    const float &n1 = s.material1.x;
+    const float &n2 = s.material2.x;
+    const float eta = n1 / n2;
+
+    const float3 *normal = (float3 *)&ctx.prd->q0.f.x; // geometrical outwards normal
+
+    const float _c1 = -dot(p.mom, *normal);                            // _c1 : cos(angle_of_incidence) not yet oriented
+    const float3 oriented_normal = _c1 < 0.f ? -(*normal) : (*normal); // oriented against incident p.mom
+    const float3 trans = cross(p.mom, oriented_normal); // perpendicular to plane of incidence, S-pol direction
+    const float trans_length = length(trans); // same as sin(theta), as p.mom and oriented_normal are unit vectors
+    const bool normal_incidence = trans_length < 1e-6f; // p.mom parallel/anti-parallel to oriented_normal
+    const float3 A_trans =
+        normal_incidence ? p.pol : trans / trans_length; // normalized unit vector : perpendicular to plane of incidence
+    const float E1_perp =
+        dot(p.pol,
+            A_trans); // amplitude of polarization in direction perpendicular to plane of incidence, ie S polarization
+
+    const float c1 = fabs(_c1);
+
+    const float c2c2 = 1.f - eta * eta * (1.f - c1 * c1); // Snells law and trig identity
+    bool tir = c2c2 < 0.f;
+    const float EdotN = dot(p.pol, oriented_normal); // used for TIR polarization
+    const float c2 =
+        tir ? 0.f
+            : sqrtf(
+                  c2c2); // c2 chosen +ve, set to 0.f for TIR => reflection_coefficient = 1.0f : so will always reflect
+    const float n1c1 = n1 * c1;
+    const float n2c2 = n2 * c2;
+    const float n2c1 = n2 * c1;
+    const float n1c2 = n1 * c2;
+
+    const float2 E1 =
+        normal_incidence ? make_float2(0.f, 1.f) : make_float2(E1_perp, length(p.pol - (E1_perp * A_trans)));
+    const float2 E2_t =
+        make_float2(2.f * n1c1 * E1.x / (n1c1 + n2c2), 2.f * n1c1 * E1.y / (n2c1 + n1c2)); // ( S:perp, P:parl )
+    const float2 E2_r = make_float2(E2_t.x - E1.x, (n2 * E2_t.y / n1) - E1.y);             // ( S:perp, P:parl )
+    const float2 RR = normalize(E2_r);
+    const float2 TT = normalize(E2_t);
 
-    const float c1 = fabs(_c1) ;
-
-    const float c2c2 = 1.f - eta*eta*(1.f - c1 * c1 ) ;   // Snells law and trig identity
-    bool tir = c2c2 < 0.f ;
-    const float EdotN = dot(p.pol, oriented_normal ) ;  // used for TIR polarization
-    const float c2 = tir ? 0.f : sqrtf(c2c2) ;   // c2 chosen +ve, set to 0.f for TIR => reflection_coefficient = 1.0f : so will always reflect
-    const float n1c1 = n1*c1 ;
-    const float n2c2 = n2*c2 ;
-    const float n2c1 = n2*c1 ;
-    const float n1c2 = n1*c2 ;
-
-    const float2 E1   = normal_incidence ? make_float2( 0.f, 1.f) : make_float2( E1_perp , length( p.pol - (E1_perp*A_trans) ) );
-    const float2 E2_t = make_float2(  2.f*n1c1*E1.x/(n1c1+n2c2), 2.f*n1c1*E1.y/(n2c1+n1c2) ) ;  // ( S:perp, P:parl )
-    const float2 E2_r = make_float2( E2_t.x - E1.x             , (n2*E2_t.y/n1) - E1.y     ) ;  // ( S:perp, P:parl )
-    const float2 RR = normalize(E2_r) ;
-    const float2 TT = normalize(E2_t) ;
-
-
-/*
-    const float TransCoeff = theTransmittance >= 0.f ?
-                                                           theTransmittance
-                                                     :
-                                                           ( tir || n1c1 == 0.f ? 0.f : n2c2*dot(E2_t,E2_t)/n1c1 )
-                                                     ;
-*/
-
-    const float& TransCoeff = theTransmittance ;
-
-    const float u_reflect = curand_uniform(&rng) ;
-    bool reflect = u_reflect > TransCoeff  ;
+    /*
+        const float TransCoeff = theTransmittance >= 0.f ?
+                                                               theTransmittance
+                                                         :
+                                                               ( tir || n1c1 == 0.f ? 0.f : n2c2*dot(E2_t,E2_t)/n1c1 )
+                                                         ;
+    */
 
-    p.mom = reflect
-                    ?
-                       p.mom + 2.0f*c1*oriented_normal
-                    :
-                       eta*(p.mom) + (eta*c1 - c2)*oriented_normal
-                    ;
+    const float &TransCoeff = theTransmittance;
 
+    const float u_reflect = curand_uniform(&rng);
+    bool reflect = u_reflect > TransCoeff;
 
-    const float3 A_paral = normalize(cross(p.mom, A_trans));   // new P-pol direction
+    p.mom = reflect ? p.mom + 2.0f * c1 * oriented_normal : eta * (p.mom) + (eta * c1 - c2) * oriented_normal;
 
-    p.pol =  normal_incidence ?
-                                         ( reflect ?  p.pol*(n2>n1? -1.f:1.f) : p.pol )
-                                      :
-                                         ( reflect ?
-                                                   ( tir ?  -p.pol + 2.f*EdotN*oriented_normal : RR.x*A_trans + RR.y*A_paral )
+    const float3 A_paral = normalize(cross(p.mom, A_trans)); // new P-pol direction
 
-                                                   :
-                                                       TT.x*A_trans + TT.y*A_paral
+    p.pol = normal_incidence
+                ? (reflect ? p.pol * (n2 > n1 ? -1.f : 1.f) : p.pol)
+                : (reflect ? (tir ? -p.pol + 2.f * EdotN * oriented_normal : RR.x * A_trans + RR.y * A_paral)
 
-                                                   )
-                                      ;
+                           : TT.x * A_trans + TT.y * A_paral
 
-    flag = reflect ? BOUNDARY_REFLECT : BOUNDARY_TRANSMIT ;
+                  );
 
+    flag = reflect ? BOUNDARY_REFLECT : BOUNDARY_TRANSMIT;
 
-    return CONTINUE ;
+    return CONTINUE;
 }
 
 #endif
 
-
-
-
-
-
-
-
 /**
 Reflected momentum vector
 ---------------------------
@@ -1444,12 +1494,6 @@ Compare transmitted vector with G4OpBoundaryProcess::DielectricDielectric
 
 **/
 
-
-
-
-
-
-
 /*
 G4OpBoundaryProcess::DielectricDielectric
 
@@ -1514,7 +1558,6 @@ transmit
 
 */
 
-
 /*
 qsim::propagate_at_surface_MultiFilm
 -------------------------------
@@ -1535,13 +1578,13 @@ Tp: p-component reflect probability
 */
 
 #if defined(__CUDACC__) || defined(__CUDABE__)
-inline QSIM_METHOD int qsim::propagate_at_surface_MultiFilm(unsigned& flag, RNG& rng, sctx& ctx )
+inline QSIM_METHOD int qsim::propagate_at_surface_MultiFilm(unsigned &flag, RNG &rng, sctx &ctx)
 {
 
-	const float one = 1.0f;
-    const sphoton& p = ctx.p ;
-    const float3* normal = (float3*)&ctx.prd->q0.f.x ;
-    int   lpmtid = ctx.prd->identity() - 1 ;  // identity comes from optixInstance.instanceId where 0 means not-a-sensor
+    const float one = 1.0f;
+    const sphoton &p = ctx.p;
+    const float3 *normal = (float3 *)&ctx.prd->q0.f.x;
+    int lpmtid = ctx.prd->identity() - 1; // identity comes from optixInstance.instanceId where 0 means not-a-sensor
 
     float minus_cos_theta = dot(p.mom, *normal);
     int pmtcat = pmt->get_lpmtcat_from_lpmtid(lpmtid);
@@ -1549,66 +1592,61 @@ inline QSIM_METHOD int qsim::propagate_at_surface_MultiFilm(unsigned& flag, RNG&
 
     float4 RsTsRpTp = multifilm->lookup(pmtcat, wv_nm, minus_cos_theta);
 
+    const float c1 = fabs(minus_cos_theta);
+    const float s1 = sqrtf(one - c1 * c1);
 
-    const float c1 = fabs(minus_cos_theta) ;
-    const float s1 = sqrtf(one -c1*c1);
-
-    float EsEs = s1 > 0.f ? dot(p.pol, cross( p.mom, *normal))/s1 : 0.f ;
-    EsEs *= EsEs;   //   orienting normal doesnt matter as squared : this is S_vs_P power fraction
-
-
-    float3 ART ;
-    ART.z = RsTsRpTp.y*EsEs + RsTsRpTp.w*(one - EsEs);
-    ART.y = RsTsRpTp.x*EsEs + RsTsRpTp.z*(one - EsEs);
-    ART.x = one - (ART.y+ART.z);
+    float EsEs = s1 > 0.f ? dot(p.pol, cross(p.mom, *normal)) / s1 : 0.f;
+    EsEs *= EsEs; //   orienting normal doesnt matter as squared : this is S_vs_P power fraction
 
-    const float& A = ART.x ;
-    const float& T = ART.z ;
+    float3 ART;
+    ART.z = RsTsRpTp.y * EsEs + RsTsRpTp.w * (one - EsEs);
+    ART.y = RsTsRpTp.x * EsEs + RsTsRpTp.z * (one - EsEs);
+    ART.x = one - (ART.y + ART.z);
 
+    const float &A = ART.x;
+    const float &T = ART.z;
 
-    float4 RsTsRpTpNormal = multifilm->lookup(pmtcat, wv_nm, -one );
+    float4 RsTsRpTpNormal = multifilm->lookup(pmtcat, wv_nm, -one);
     // Normal means the photon incident from glass to vacuum, AOI = 0 deg  cos_theta = -1.f
 
     float3 ART_normal;
-    ART_normal.z = 0.5f*(RsTsRpTpNormal.y + RsTsRpTpNormal.w); // T:0.5f*(Ts+Tp)
-    ART_normal.y = 0.5f*(RsTsRpTpNormal.x + RsTsRpTpNormal.z); // R:0.5f*(Rs+Rp)
-    ART_normal.x = one -(ART_normal.y + ART_normal.z) ;        // 1.f - (R+T)
+    ART_normal.z = 0.5f * (RsTsRpTpNormal.y + RsTsRpTpNormal.w); // T:0.5f*(Ts+Tp)
+    ART_normal.y = 0.5f * (RsTsRpTpNormal.x + RsTsRpTpNormal.z); // R:0.5f*(Rs+Rp)
+    ART_normal.x = one - (ART_normal.y + ART_normal.z);          // 1.f - (R+T)
 
-    const float& An = ART_normal.x ;
-    const float energy_eV = qpmt<float>::hc_eVnm/wv_nm ;
+    const float &An = ART_normal.x;
+    const float energy_eV = qpmt<float>::hc_eVnm / wv_nm;
     const float qe_scale = pmt->get_qescale_from_lpmtid(lpmtid);
     const float qe_shape = pmt->get_lpmtcat_qe(pmtcat, energy_eV);
 
     const float _qe = minus_cos_theta > 0.f ? 0.f : qe_scale * qe_shape;
 
-    const float& theAbsorption = A;
-    const float& theTransmittance = T/(one-A);
-    const float& theEfficiency = _qe/An;
+    const float &theAbsorption = A;
+    const float &theTransmittance = T / (one - A);
+    const float &theEfficiency = _qe / An;
 
     float u_theAbsorption = curand_uniform(&rng);
-    int action = u_theAbsorption < theAbsorption  ? BREAK : CONTINUE ;
+    int action = u_theAbsorption < theAbsorption ? BREAK : CONTINUE;
 
-    if( action == BREAK )
+    if (action == BREAK)
     {
-        float u_theEfficiency = curand_uniform(&rng) ;
-        flag = u_theEfficiency < theEfficiency ? SURFACE_DETECT : SURFACE_ABSORB ;
+        float u_theEfficiency = curand_uniform(&rng);
+        flag = u_theEfficiency < theEfficiency ? SURFACE_DETECT : SURFACE_ABSORB;
     }
     else
     {
-        propagate_at_boundary( flag, rng, ctx, theTransmittance  );
+        propagate_at_boundary(flag, rng, ctx, theTransmittance);
     }
 
-    //printf("//qsim.propagate_at_surface_MultiFilm pidx %7lld lpmtid %d ART ( %7.3f %7.3f %7.3f ) u_theAbsorption  %7.3f action %d \n",
-    //ctx.pidx, lpmtid, ART.x, ART.y, ART.z, u_theAbsorption, action);
-
-    return action ;
+    // printf("//qsim.propagate_at_surface_MultiFilm pidx %7lld lpmtid %d ART ( %7.3f %7.3f %7.3f ) "
+    //        "u_theAbsorption  %7.3f action %d \n", ctx.pidx, lpmtid, ART.x, ART.y, ART.z, u_theAbsorption, action);
 
+    return action;
 }
 
 #endif
 
-
-#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
+#if defined(__CUDACC__) || defined(__CUDABE__) || defined(MOCK_CURAND) || defined(MOCK_CUDA)
 
 /**
 qsim::propagate_at_surface   (HMM: perhaps propagate_at_simplified_surface )
@@ -1674,87 +1712,82 @@ The s.surface float4 is filled by qbnd::fill_state via::
 
 **/
 
-inline QSIM_METHOD int qsim::propagate_at_surface(unsigned& flag, RNG& rng, sctx& ctx)
+inline QSIM_METHOD int qsim::propagate_at_surface(unsigned &flag, RNG &rng, sctx &ctx)
 {
-    const sstate& s = ctx.s ;
-    const float& detect = s.surface.x ;
-    const float& absorb = s.surface.y ;
-    //const float& reflect_specular_ = s.surface.z ;
-    const float& reflect_diffuse_  = s.surface.w ;
+    const sstate &s = ctx.s;
+    const float &detect = s.surface.x;
+    const float &absorb = s.surface.y;
+    // const float& reflect_specular_ = s.surface.z ;
+    const float &reflect_diffuse_ = s.surface.w;
 
     float u_surface = curand_uniform(&rng);
 
 #if !defined(PRODUCTION) && defined(DEBUG_TAG)
-    stagr& tagr = ctx.tagr ;
+    stagr &tagr = ctx.tagr;
     float u_surface_burn = curand_uniform(&rng);
-    tagr.add( stag_at_burn_sf_sd, u_surface);
-    tagr.add( stag_sf_burn,       u_surface_burn);
+    tagr.add(stag_at_burn_sf_sd, u_surface);
+    tagr.add(stag_sf_burn, u_surface_burn);
 #endif
 
+    int action = u_surface < absorb + detect ? BREAK : CONTINUE;
 
-    int action = u_surface < absorb + detect ? BREAK : CONTINUE  ;
-
-    if( action == BREAK )
+    if (action == BREAK)
     {
 #if defined(WITH_CUSTOM4)
-        int pmtid = ctx.prd->identity() - 1 ;     // identity comes from optixInstance.instanceId where 0 means not-a-sensor
-        float qe = 1.f ;
+        int pmtid = ctx.prd->identity() - 1; // identity comes from optixInstance.instanceId where 0 means not-a-sensor
+        float qe = 1.f;
         float u_qe = curand_uniform(&rng);
-        if( s_pmt::is_spmtid(pmtid) )
+        if (s_pmt::is_spmtid(pmtid))
         {
-            const float energy_eV = qpmt<float>::hc_eVnm/ctx.p.wavelength ;
-            float qe_shape = pmt->s_qeshape_prop->interpolate( 0, energy_eV );
+            const float energy_eV = qpmt<float>::hc_eVnm / ctx.p.wavelength;
+            float qe_shape = pmt->s_qeshape_prop->interpolate(0, energy_eV);
             float qe_scale = pmt->get_s_qescale_from_spmtid(pmtid);
-            qe = qe_shape*qe_scale ;
+            qe = qe_shape * qe_scale;
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-            if(ctx.pidx == base->pidx)
-            printf("//qsim.propagate_at_surface.BREAK.is_spmtid pidx %7lld : pmtid %d energy_eV %7.3f qe_shape %7.3f qe_scale %7.3f qe %7.3f detect %7.3f absorb %7.3f reflect_specular %7.3f reflect_diffuse %7.3f \n" ,
-                ctx.pidx, pmtid, energy_eV, qe_shape, qe_scale, qe, detect, absorb, s.surface.z, reflect_diffuse_ );
+            if (ctx.pidx == base->pidx)
+                printf(
+                    "//qsim.propagate_at_surface.BREAK.is_spmtid pidx %7lld : pmtid %d energy_eV %7.3f qe_shape %7.3f "
+                    "qe_scale %7.3f qe %7.3f detect %7.3f absorb %7.3f reflect_specular %7.3f reflect_diffuse %7.3f \n",
+                    ctx.pidx, pmtid, energy_eV, qe_shape, qe_scale, qe, detect, absorb, s.surface.z, reflect_diffuse_);
 #endif
         }
-        flag = u_surface < absorb ?
-                                      SURFACE_ABSORB
-                                  :
-                                      ( u_qe < qe  ? EFFICIENCY_COLLECT : EFFICIENCY_CULL  )
-                                  ;
+        flag = u_surface < absorb ? SURFACE_ABSORB : (u_qe < qe ? EFFICIENCY_COLLECT : EFFICIENCY_CULL);
 #else
-        flag = u_surface < absorb ?
-                                      SURFACE_ABSORB
-                                  :
-                                      SURFACE_DETECT
-                                  ;
+        flag = u_surface < absorb ? SURFACE_ABSORB : SURFACE_DETECT;
 #endif
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-        if(ctx.pidx == base->pidx)
-        printf("//qsim.propagate_at_surface.SA/SD.BREAK pidx %7lld : flag %d \n" , ctx.pidx, flag );
+        if (ctx.pidx == base->pidx)
+            printf("//qsim.propagate_at_surface.SA/SD.BREAK pidx %7lld : flag %d \n", ctx.pidx, flag);
 #endif
     }
     else
     {
-        flag = u_surface < absorb + detect + reflect_diffuse_ ?  SURFACE_DREFLECT : SURFACE_SREFLECT ;
-        switch(flag)
+        flag = u_surface < absorb + detect + reflect_diffuse_ ? SURFACE_DREFLECT : SURFACE_SREFLECT;
+        switch (flag)
         {
-            case SURFACE_DREFLECT: reflect_diffuse( rng, ctx)  ; break ;
-            case SURFACE_SREFLECT: reflect_specular(rng, ctx)  ; break ;
+        case SURFACE_DREFLECT:
+            reflect_diffuse(rng, ctx);
+            break;
+        case SURFACE_SREFLECT:
+            reflect_specular(rng, ctx);
+            break;
         }
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-        if(ctx.pidx == base->pidx)
-        printf("//qsim.propagate_at_surface.DR/SR.CONTINUE pidx %7lld : flag %d \n" , ctx.pidx, flag );
+        if (ctx.pidx == base->pidx)
+            printf("//qsim.propagate_at_surface.DR/SR.CONTINUE pidx %7lld : flag %d \n", ctx.pidx, flag);
 #endif
     }
-    return action ;
+    return action;
 }
 
-
-inline QSIM_METHOD int qsim::propagate_at_surface_Detect(unsigned& flag, RNG& rng, sctx& ctx) const
+inline QSIM_METHOD int qsim::propagate_at_surface_Detect(unsigned &flag, RNG &rng, sctx &ctx) const
 {
-    float u_surface_burn = curand_uniform(&rng);  // for random alignment
-    flag = SURFACE_DETECT ;
-    return BREAK ;
+    float u_surface_burn = curand_uniform(&rng); // for random alignment
+    flag = SURFACE_DETECT;
+    return BREAK;
 }
 
-
 #if defined(WITH_CUSTOM4)
 
 /**
@@ -1780,125 +1813,134 @@ Where ctx.prd->identity() comes from ? Where is the "+ 1" done ?
 
 **/
 
-inline QSIM_METHOD int qsim::propagate_at_surface_CustomART(unsigned& flag, RNG& rng, sctx& ctx) const
+inline QSIM_METHOD int qsim::propagate_at_surface_CustomART(unsigned &flag, RNG &rng, sctx &ctx) const
 {
 
-    sphoton& p = ctx.p ;
-    const float3* normal = (float3*)&ctx.prd->q0.f.x ;  // geometrical outwards normal
-    int lpmtid = ctx.prd->identity() - 1 ;  // identity comes from optixInstance.instanceId where 0 means not-a-sensor
-    const float lposcost = ctx.prd->lposcost() ;  // local frame intersect position cosine theta
-
+    sphoton &p = ctx.p;
+    const float3 *normal = (float3 *)&ctx.prd->q0.f.x; // geometrical outwards normal
+    int lpmtid = ctx.prd->identity() - 1; // identity comes from optixInstance.instanceId where 0 means not-a-sensor
+    const float lposcost = ctx.prd->lposcost(); // local frame intersect position cosine theta
 
     float minus_cos_theta = dot(p.mom, *normal);
-    float dot_pol_cross_mom_nrm = dot(p.pol,cross(p.mom,*normal)) ;
+    float dot_pol_cross_mom_nrm = dot(p.pol, cross(p.mom, *normal));
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if( ctx.pidx_debug )
+    if (ctx.pidx_debug)
     {
-    float3 cross_mom_nrm = cross(p.mom, *normal) ;
-    printf("//qsim::propagate_at_surface_CustomART pidx %7lld : mom = np.array([%10.8f,%10.8f,%10.8f]) ; lmom = %10.8f \n",
-       ctx.pidx, p.mom.x, p.mom.y, p.mom.z, length(p.mom) );
-    printf("//qsim::propagate_at_surface_CustomART pidx %7lld : pol = np.array([%10.8f,%10.8f,%10.8f]) ; lpol = %10.8f \n",
-       ctx.pidx, p.pol.x, p.pol.y, p.pol.z, length(p.pol) );
-    printf("//qsim::propagate_at_surface_CustomART pidx %7lld : nrm = np.array([%10.8f,%10.8f,%10.8f]) ; lnrm = %10.8f \n",
-       ctx.pidx, normal->x, normal->y, normal->z, length(*normal) );
-    printf("//qsim::propagate_at_surface_CustomART pidx %7lld : cross_mom_nrm = np.array([%10.8f,%10.8f,%10.8f]) ; lcross_mom_nrm = %10.8f  \n",
-           ctx.pidx, cross_mom_nrm.x, cross_mom_nrm.y, cross_mom_nrm.z, length(cross_mom_nrm)  );
-    printf("//qsim::propagate_at_surface_CustomART pidx %7lld : dot_pol_cross_mom_nrm = %10.8f \n", ctx.pidx, dot_pol_cross_mom_nrm );
-    printf("//qsim::propagate_at_surface_CustomART pidx %7lld : minus_cos_theta = %10.8f \n", ctx.pidx, minus_cos_theta );
-    printf("//qsim::propagate_at_surface_CustomART pidx %7lld : lposcost = %10.8f (expect 0->1)\n", ctx.pidx, lposcost );
+        float3 cross_mom_nrm = cross(p.mom, *normal);
+        printf("//qsim::propagate_at_surface_CustomART pidx %7lld : mom = np.array([%10.8f,%10.8f,%10.8f]) ; lmom = "
+               "%10.8f \n",
+               ctx.pidx, p.mom.x, p.mom.y, p.mom.z, length(p.mom));
+        printf("//qsim::propagate_at_surface_CustomART pidx %7lld : pol = np.array([%10.8f,%10.8f,%10.8f]) ; lpol = "
+               "%10.8f \n",
+               ctx.pidx, p.pol.x, p.pol.y, p.pol.z, length(p.pol));
+        printf("//qsim::propagate_at_surface_CustomART pidx %7lld : nrm = np.array([%10.8f,%10.8f,%10.8f]) ; lnrm = "
+               "%10.8f \n",
+               ctx.pidx, normal->x, normal->y, normal->z, length(*normal));
+        printf("//qsim::propagate_at_surface_CustomART pidx %7lld : cross_mom_nrm = np.array([%10.8f,%10.8f,%10.8f]) ; "
+               "lcross_mom_nrm = %10.8f  \n",
+               ctx.pidx, cross_mom_nrm.x, cross_mom_nrm.y, cross_mom_nrm.z, length(cross_mom_nrm));
+        printf("//qsim::propagate_at_surface_CustomART pidx %7lld : dot_pol_cross_mom_nrm = %10.8f \n", ctx.pidx,
+               dot_pol_cross_mom_nrm);
+        printf("//qsim::propagate_at_surface_CustomART pidx %7lld : minus_cos_theta = %10.8f \n", ctx.pidx,
+               minus_cos_theta);
+        printf("//qsim::propagate_at_surface_CustomART pidx %7lld : lposcost = %10.8f (expect 0->1)\n", ctx.pidx,
+               lposcost);
     }
 #endif
 
     // formerly excluded Custom4 hits onto WP PMTs see ~/j/issues/jok-tds-mu-running-NOT-A-SENSOR-warnings.rst
-    //if(lpmtid < s_pmt::OFFSET_CD_LPMT || lpmtid >= s_pmt::OFFSET_WP_PMT_END )
-    //if(lpmtid < s_pmt::OFFSET_CD_LPMT || lpmtid >= s_pmt::OFFSET_WP_ATM_LPMT_END )
-    if(lpmtid < s_pmt::OFFSET_CD_LPMT || lpmtid >=   s_pmt::OFFSET_WP_WAL_PMT_END )
+    // if(lpmtid < s_pmt::OFFSET_CD_LPMT || lpmtid >= s_pmt::OFFSET_WP_PMT_END )
+    // if(lpmtid < s_pmt::OFFSET_CD_LPMT || lpmtid >= s_pmt::OFFSET_WP_ATM_LPMT_END )
+    if (lpmtid < s_pmt::OFFSET_CD_LPMT || lpmtid >= s_pmt::OFFSET_WP_WAL_PMT_END)
     {
-        flag = NAN_ABORT ;
+        flag = NAN_ABORT;
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-        printf("//qsim::propagate_at_surface_CustomART pidx %7lld lpmtid %d : ERROR UNEXPECTED LPMTID : NAN_ABORT \n", ctx.pidx, lpmtid );
+        printf("//qsim::propagate_at_surface_CustomART pidx %7lld lpmtid %d : ERROR UNEXPECTED LPMTID : NAN_ABORT \n",
+               ctx.pidx, lpmtid);
 #endif
-        return BREAK ;
+        return BREAK;
     }
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if( ctx.pidx_debug )
-    printf("//qsim::propagate_at_surface_CustomART pidx %7lld lpmtid %d wl %8.4f mct %8.4f dpcmn %8.4f pmt %p pre-ATQC \n",
-           ctx.pidx, lpmtid, p.wavelength, minus_cos_theta, dot_pol_cross_mom_nrm, pmt );
+    if (ctx.pidx_debug)
+        printf("//qsim::propagate_at_surface_CustomART pidx %7lld lpmtid %d wl %8.4f mct %8.4f dpcmn %8.4f pmt %p "
+               "pre-ATQC \n",
+               ctx.pidx, lpmtid, p.wavelength, minus_cos_theta, dot_pol_cross_mom_nrm, pmt);
 #endif
 
-    float ATQC[4] = {} ;
+    float ATQC[4] = {};
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if(lpmtid > -1 && pmt != nullptr) pmt->get_lpmtid_ATQC(ATQC, lpmtid, p.wavelength, minus_cos_theta, dot_pol_cross_mom_nrm, lposcost, ctx.pidx, ctx.pidx_debug );
+    if (lpmtid > -1 && pmt != nullptr)
+        pmt->get_lpmtid_ATQC(ATQC, lpmtid, p.wavelength, minus_cos_theta, dot_pol_cross_mom_nrm, lposcost, ctx.pidx,
+                             ctx.pidx_debug);
 #else
-    if(lpmtid > -1 && pmt != nullptr) pmt->get_lpmtid_ATQC(ATQC, lpmtid, p.wavelength, minus_cos_theta, dot_pol_cross_mom_nrm, lposcost );
+    if (lpmtid > -1 && pmt != nullptr)
+        pmt->get_lpmtid_ATQC(ATQC, lpmtid, p.wavelength, minus_cos_theta, dot_pol_cross_mom_nrm, lposcost);
 #endif
 
-
-
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if( ctx.pidx_debug )
-    printf("//qsim::propagate_at_surface_CustomART pidx %7lld lpmtid %d wl %8.4f mct %8.4f dpcmn %8.4f lpc %8.4f ATQC ( %8.4f %8.4f %8.4f %8.4f  ) \n",
-           ctx.pidx, lpmtid, p.wavelength, minus_cos_theta, dot_pol_cross_mom_nrm, lposcost, ATQC[0], ATQC[1], ATQC[2], ATQC[3] );
+    if (ctx.pidx_debug)
+        printf("//qsim::propagate_at_surface_CustomART pidx %7lld lpmtid %d wl %8.4f mct %8.4f dpcmn %8.4f lpc %8.4f "
+               "ATQC ( %8.4f %8.4f %8.4f %8.4f  ) \n",
+               ctx.pidx, lpmtid, p.wavelength, minus_cos_theta, dot_pol_cross_mom_nrm, lposcost, ATQC[0], ATQC[1],
+               ATQC[2], ATQC[3]);
 #endif
 
-
-    const float& theAbsorption        = ATQC[0];
-    const float& theTransmittance     = ATQC[1];
-    const float& theEfficiency        = ATQC[2];
-    const float& collectionEfficiency = ATQC[3];
+    const float &theAbsorption = ATQC[0];
+    const float &theTransmittance = ATQC[1];
+    const float &theEfficiency = ATQC[2];
+    const float &collectionEfficiency = ATQC[3];
 
     float u_theAbsorption = curand_uniform(&rng);
-    int action = u_theAbsorption < theAbsorption  ? BREAK : CONTINUE ;
-
+    int action = u_theAbsorption < theAbsorption ? BREAK : CONTINUE;
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if( ctx.pidx_debug )
-        printf("//qsim.propagate_at_surface_CustomART pidx %7lld lpmtid %d ATQC ( %8.4f %8.4f %8.4f %8.4f ) u_theAbsorption  %7.3f action %d \n",
-        ctx.pidx, lpmtid, ATQC[0], ATQC[1], ATQC[2], ATQC[3], u_theAbsorption, action  );
+    if (ctx.pidx_debug)
+        printf("//qsim.propagate_at_surface_CustomART pidx %7lld lpmtid %d ATQC ( %8.4f %8.4f %8.4f %8.4f ) "
+               "u_theAbsorption  %7.3f action %d \n",
+               ctx.pidx, lpmtid, ATQC[0], ATQC[1], ATQC[2], ATQC[3], u_theAbsorption, action);
 #endif
 
-    if( action == BREAK )
+    if (action == BREAK)
     {
-        float u_theEfficiency = curand_uniform(&rng) ;
+        float u_theEfficiency = curand_uniform(&rng);
         float u_collectionEfficiency = curand_uniform(&rng);
 
-        flag = u_theEfficiency < theEfficiency ?
-                                                    (  u_collectionEfficiency < collectionEfficiency ? EFFICIENCY_COLLECT : EFFICIENCY_CULL )
-                                               :
-                                                    SURFACE_ABSORB
-                                               ;
+        flag = u_theEfficiency < theEfficiency
+                   ? (u_collectionEfficiency < collectionEfficiency ? EFFICIENCY_COLLECT : EFFICIENCY_CULL)
+                   : SURFACE_ABSORB;
 
-        // former SD:SURFACE_DETECT, now becomes EC:EFFICIENCY_COLLECT or EX:EFFICIENCY_CULL depending on collectionEfficiency and random throw
+        // former SD:SURFACE_DETECT, now becomes EC:EFFICIENCY_COLLECT or EX:EFFICIENCY_CULL depending on
+        // collectionEfficiency and random throw
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-        if( ctx.pidx_debug )
-            printf("//qsim.propagate_at_surface_CustomART.BREAK.SD/SA EC/EX pidx %7lld lpmtid %d ATQC ( %8.4f %8.4f %8.4f %8.4f ) u_theEfficiency  %8.4f theEfficiency %8.4f flag %d \n",
-                                                                    ctx.pidx, lpmtid, ATQC[0],ATQC[1], ATQC[2],ATQC[3],  u_theEfficiency,  theEfficiency, flag );
+        if (ctx.pidx_debug)
+            printf("//qsim.propagate_at_surface_CustomART.BREAK.SD/SA EC/EX pidx %7lld lpmtid %d ATQC ( %8.4f %8.4f "
+                   "%8.4f %8.4f ) u_theEfficiency  %8.4f theEfficiency %8.4f flag %d \n",
+                   ctx.pidx, lpmtid, ATQC[0], ATQC[1], ATQC[2], ATQC[3], u_theEfficiency, theEfficiency, flag);
 #endif
-
     }
     else
     {
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-        if( ctx.pidx_debug )
-            printf("//qsim.propagate_at_surface_CustomART.CONTINUE pidx %7lld lpmtid %d ATQC ( %7.3f %7.3f %7.3f %7.3f ) theTransmittance %7.3f  \n",
-            ctx.pidx, lpmtid, ATQC[0], ATQC[1], ATQC[2], ATQC[3], theTransmittance  );
+        if (ctx.pidx_debug)
+            printf("//qsim.propagate_at_surface_CustomART.CONTINUE pidx %7lld lpmtid %d ATQC ( %7.3f %7.3f %7.3f %7.3f "
+                   ") theTransmittance %7.3f  \n",
+                   ctx.pidx, lpmtid, ATQC[0], ATQC[1], ATQC[2], ATQC[3], theTransmittance);
 #endif
 
-        propagate_at_boundary( flag, rng, ctx, theTransmittance  );
+        propagate_at_boundary(flag, rng, ctx, theTransmittance);
     }
-    return action ;
+    return action;
 }
 #endif
 
 #endif
 
-
-#if defined(__CUDACC__) || defined(__CUDABE__)  || defined(MOCK_CURAND) || defined(MOCK_CUDA)
+#if defined(__CUDACC__) || defined(__CUDABE__) || defined(MOCK_CURAND) || defined(MOCK_CUDA)
 
 /**
 qsim::reflect_diffuse cf G4OpBoundaryProcess::DoReflection
@@ -1974,41 +2016,39 @@ orient is used to flip the reflection normal back against the incident direction
 
 **/
 
-inline QSIM_METHOD void qsim::reflect_diffuse( RNG& rng, sctx& ctx )
+inline QSIM_METHOD void qsim::reflect_diffuse(RNG &rng, sctx &ctx)
 {
-    sphoton& p = ctx.p ;
+    sphoton &p = ctx.p;
 
-    float3 old_mom = p.mom ;
+    float3 old_mom = p.mom;
 
-    const float3* normal = ctx.prd->normal() ;    // geometrical outwards normal
+    const float3 *normal = ctx.prd->normal(); // geometrical outwards normal
 
-    //const float orient = -1.f ; // BUG : FIXED ORIENT FLIP CANNOT BE CORRECT
-    const float orient = dot( old_mom, *normal ) > 0.f ? -1.f : 1.f ;
+    // const float orient = -1.f ; // BUG : FIXED ORIENT FLIP CANNOT BE CORRECT
+    const float orient = dot(old_mom, *normal) > 0.f ? -1.f : 1.f;
 
-    lambertian_direction( &p.mom, normal, orient, rng, ctx );
+    lambertian_direction(&p.mom, normal, orient, rng, ctx);
 
-
-    float3 facet_normal = normalize( p.mom - old_mom );
-    const float EdotN = dot( p.pol, facet_normal );
-    p.pol = -1.f*(p.pol) + 2.f*EdotN*facet_normal ;
+    float3 facet_normal = normalize(p.mom - old_mom);
+    const float EdotN = dot(p.pol, facet_normal);
+    p.pol = -1.f * (p.pol) + 2.f * EdotN * facet_normal;
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if(ctx.pidx == base->pidx)
+    if (ctx.pidx == base->pidx)
     {
-    printf("//qsim.reflect_diffuse pidx %7lld : old_mom = np.array([%10.5f,%10.5f,%10.5f]) \n",
-        ctx.pidx,  old_mom.x, old_mom.y, old_mom.z ) ;
+        printf("//qsim.reflect_diffuse pidx %7lld : old_mom = np.array([%10.5f,%10.5f,%10.5f]) \n", ctx.pidx, old_mom.x,
+               old_mom.y, old_mom.z);
 
-    printf("//qsim.reflect_diffuse pidx %7lld : normal0 = np.array([%10.5f,%10.5f,%10.5f]) \n",
-        ctx.pidx,  normal->x, normal->y, normal->z ) ;
+        printf("//qsim.reflect_diffuse pidx %7lld : normal0 = np.array([%10.5f,%10.5f,%10.5f]) \n", ctx.pidx, normal->x,
+               normal->y, normal->z);
 
-    printf("//qsim.reflect_diffuse pidx %7lld : p.mom = np.array([%10.5f,%10.5f,%10.5f]) \n",
-        ctx.pidx,  p.mom.x, p.mom.y, p.mom.z ) ;
+        printf("//qsim.reflect_diffuse pidx %7lld : p.mom = np.array([%10.5f,%10.5f,%10.5f]) \n", ctx.pidx, p.mom.x,
+               p.mom.y, p.mom.z);
 
-    printf("//qsim.reflect_diffuse pidx %7lld : facet_normal = np.array([%10.5f,%10.5f,%10.5f]) \n",
-        ctx.pidx,  facet_normal.x, facet_normal.y, facet_normal.z ) ;
+        printf("//qsim.reflect_diffuse pidx %7lld : facet_normal = np.array([%10.5f,%10.5f,%10.5f]) \n", ctx.pidx,
+               facet_normal.x, facet_normal.y, facet_normal.z);
     }
 #endif
-
 }
 
 /**
@@ -2027,58 +2067,56 @@ to be helpful.
 
 **/
 
-inline QSIM_METHOD void qsim::reflect_specular( RNG& rng, sctx& ctx )
+inline QSIM_METHOD void qsim::reflect_specular(RNG &rng, sctx &ctx)
 {
-    sphoton& p = ctx.p ;
-    const float3* normal = ctx.prd->normal() ;
+    sphoton &p = ctx.p;
+    const float3 *normal = ctx.prd->normal();
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if(ctx.pidx == base->pidx)
+    if (ctx.pidx == base->pidx)
     {
-    printf("//qsim.reflect_specular.head pidx %7lld : normal0 = np.array([%10.5f,%10.5f,%10.5f]) \n",
-        ctx.pidx,  normal->x, normal->y, normal->z ) ;
+        printf("//qsim.reflect_specular.head pidx %7lld : normal0 = np.array([%10.5f,%10.5f,%10.5f]) \n", ctx.pidx,
+               normal->x, normal->y, normal->z);
 
-    printf("//qsim.reflect_specular.head pidx %7lld : mom0 = np.array([%10.5f,%10.5f,%10.5f]) \n",
-        ctx.pidx,  p.mom.x, p.mom.y, p.mom.z ) ;
+        printf("//qsim.reflect_specular.head pidx %7lld : mom0 = np.array([%10.5f,%10.5f,%10.5f]) \n", ctx.pidx,
+               p.mom.x, p.mom.y, p.mom.z);
 
-    printf("//qsim.reflect_specular.head pidx %7lld : pol0 = np.array([%10.5f,%10.5f,%10.5f]) \n",
-        ctx.pidx,  p.pol.x, p.pol.y, p.pol.z ) ;
+        printf("//qsim.reflect_specular.head pidx %7lld : pol0 = np.array([%10.5f,%10.5f,%10.5f]) \n", ctx.pidx,
+               p.pol.x, p.pol.y, p.pol.z);
     }
 #endif
 
 #ifdef WITH_ORIENT
-    //const float orient = -1.f ;
-    const float orient = 1.f ;
+    // const float orient = -1.f ;
+    const float orient = 1.f;
     // because orient appears twice in the below p.mom p.pol calcs
     // it being +1.f or -1.f makes no difference
 
-    const float PdotN = dot( p.mom, *normal )*orient ;
-    p.mom = p.mom - 2.f*PdotN*(*normal)*orient ;
+    const float PdotN = dot(p.mom, *normal) * orient;
+    p.mom = p.mom - 2.f * PdotN * (*normal) * orient;
 
-    const float EdotN = dot( p.pol, *normal )*orient ;
-    p.pol = -1.f*(p.pol) + 2.f*EdotN*(*normal)*orient  ;
+    const float EdotN = dot(p.pol, *normal) * orient;
+    p.pol = -1.f * (p.pol) + 2.f * EdotN * (*normal) * orient;
 #else
     // removed orient as does not effect calc, hence confusing and pointless
-    const float PdotN = dot( p.mom, *normal ) ;
-    p.mom = p.mom - 2.f*PdotN*(*normal) ;
+    const float PdotN = dot(p.mom, *normal);
+    p.mom = p.mom - 2.f * PdotN * (*normal);
 
-    const float EdotN = dot( p.pol, *normal ) ;
-    p.pol = -1.f*(p.pol) + 2.f*EdotN*(*normal)  ;
+    const float EdotN = dot(p.pol, *normal);
+    p.pol = -1.f * (p.pol) + 2.f * EdotN * (*normal);
 #endif
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if(ctx.pidx == base->pidx)
+    if (ctx.pidx == base->pidx)
     {
-    printf("//qsim.reflect_specular.tail pidx %7lld : mom1 = np.array([%10.5f,%10.5f,%10.5f]) ; PdotN = %10.5f ; EdotN = %10.5f \n",
-        ctx.pidx,  p.mom.x, p.mom.y, p.mom.z, PdotN, EdotN  ) ;
-
-    printf("//qsim.reflect_specular.tail pidx %7lld : pol1 = np.array([%10.5f,%10.5f,%10.5f]) \n",
-        ctx.pidx,  p.pol.x, p.pol.y, p.pol.z ) ;
+        printf("//qsim.reflect_specular.tail pidx %7lld : mom1 = np.array([%10.5f,%10.5f,%10.5f]) ; PdotN = %10.5f ; "
+               "EdotN = %10.5f \n",
+               ctx.pidx, p.mom.x, p.mom.y, p.mom.z, PdotN, EdotN);
 
+        printf("//qsim.reflect_specular.tail pidx %7lld : pol1 = np.array([%10.5f,%10.5f,%10.5f]) \n", ctx.pidx,
+               p.pol.x, p.pol.y, p.pol.z);
     }
 #endif
-
-
 }
 
 /**
@@ -2109,7 +2147,8 @@ Stages within bounce loop
 
 3. mutate photon and set flag using material properties
 
-  * note that photons that SAIL to boundary are mutated twice within the while loop (by propagate_to_boundary and propagate_at_boundary/surface)
+  * note that photons that SAIL to boundary are mutated twice within the while loop (by propagate_to_boundary and
+    propagate_at_boundary/surface)
 
 Thoughts
 ~~~~~~~~~~~
@@ -2120,47 +2159,50 @@ so can switch them off easily in production running
 
 **/
 
-inline QSIM_METHOD void qsim::fake_propagate( sphoton& p, const quad2* mock_prd, RNG& rng, unsigned long long idx )
+inline QSIM_METHOD void qsim::fake_propagate(sphoton &p, const quad2 *mock_prd, RNG &rng, unsigned long long idx)
 {
-    p.set_flag(TORCH);  // setting initial flag : in reality this should be done by generation
+    p.set_flag(TORCH); // setting initial flag : in reality this should be done by generation
 
-    qsim* sim = this ;
+    qsim *sim = this;
 
-    sctx ctx = {} ;
-    ctx.p = p ;     // Q: Why is this different from CSGOptiX7.cu:simulate ? A: Presumably due to input photon.
-    ctx.evt = evt ;
-    ctx.pidx = idx ;
+    sctx ctx = {};
+    ctx.p = p; // Q: Why is this different from CSGOptiX7.cu:simulate ? A: Presumably due to input photon.
+    ctx.evt = evt;
+    ctx.pidx = idx;
 
-    int command = START ;
-    int bounce = 0 ;
+    int command = START;
+    int bounce = 0;
 #ifndef PRODUCTION
     ctx.point(bounce);
 #endif
 
-    while( bounce < evt->max_bounce )
+    while (bounce < evt->max_bounce)
     {
-        ctx.prd = mock_prd + (evt->max_bounce*idx+bounce) ;
-        if( ctx.prd->boundary() == 0xffffu ) break ;   // SHOULD NEVER HAPPEN : propagate can do nothing meaningful without a boundary
+        ctx.prd = mock_prd + (evt->max_bounce * idx + bounce);
+        if (ctx.prd->boundary() == 0xffffu)
+            break; // SHOULD NEVER HAPPEN : propagate can do nothing meaningful without a boundary
 #ifndef PRODUCTION
         ctx.trace(bounce);
 #endif
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-        if(idx == base->pidx)
-        printf("//qsim.fake_propagate pidx %7lld bounce %d evt.max_bounce %d prd.q0.f.xyzw (%10.4f %10.4f %10.4f %10.4f) \n",
-             idx, bounce, evt->max_bounce, ctx.prd->q0.f.x, ctx.prd->q0.f.y, ctx.prd->q0.f.z, ctx.prd->q0.f.w );
+        if (idx == base->pidx)
+            printf("//qsim.fake_propagate pidx %7lld bounce %d evt.max_bounce %d prd.q0.f.xyzw (%10.4f %10.4f %10.4f "
+                   "%10.4f) \n",
+                   idx, bounce, evt->max_bounce, ctx.prd->q0.f.x, ctx.prd->q0.f.y, ctx.prd->q0.f.z, ctx.prd->q0.f.w);
 #endif
-        command = sim->propagate(bounce, rng, ctx );
+        command = sim->propagate(bounce, rng, ctx);
         bounce++;
 #ifndef PRODUCTION
         ctx.point(bounce);
 #endif
-        if(command == BREAK) break ;
+        if (command == BREAK)
+            break;
     }
 #ifndef PRODUCTION
     ctx.end();
 #endif
-    evt->photon[idx] = ctx.p ;
+    evt->photon[idx] = ctx.p;
 }
 
 /**
@@ -2215,43 +2257,44 @@ Prior to supporting special surfaces, within the command == BOUNDARY used::
 
 **/
 
-inline QSIM_METHOD int qsim::propagate(const int bounce, RNG& rng, sctx& ctx )  // ::simulate
+inline QSIM_METHOD int qsim::propagate(const int bounce, RNG &rng, sctx &ctx) // ::simulate
 {
-    const unsigned boundary = ctx.prd->boundary() ;
-    const unsigned identity = ctx.prd->identity() ; // sensor_identifier+1, 0:not-a-sensor
-    const unsigned iindex = ctx.prd->iindex() ;
-    const float lposcost = ctx.prd->lposcost() ;  // local frame intersect position cosine theta
+    const unsigned boundary = ctx.prd->boundary();
+    const unsigned identity = ctx.prd->identity(); // sensor_identifier+1, 0:not-a-sensor
+    const unsigned iindex = ctx.prd->iindex();
+    const float lposcost = ctx.prd->lposcost(); // local frame intersect position cosine theta
 
-    const float3* normal = ctx.prd->normal();
-    float cosTheta = dot(ctx.p.mom, *normal ) ;
+    const float3 *normal = ctx.prd->normal();
+    float cosTheta = dot(ctx.p.mom, *normal);
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if( ctx.pidx == base->pidx )
+    if (ctx.pidx == base->pidx)
     {
-    printf("\n//qsim.propagate.head pidx %7lld : ctx.evt.index %d evt.index %d \n", ctx.pidx, ctx.evt->index, evt->index );
-
-    printf("\n//qsim.propagate.head pidx %7lld : bnc %d boundary %d cosTheta %10.8f \n", ctx.pidx, bounce, boundary, cosTheta );
+        printf("\n//qsim.propagate.head pidx %7lld : ctx.evt.index %d evt.index %d \n", ctx.pidx, ctx.evt->index,
+               evt->index);
 
-    printf("//qsim.propagate.head pidx %7lld : mom = np.array([%10.8f,%10.8f,%10.8f]) ; lmom = %10.8f  \n",
-                 ctx.pidx, ctx.p.mom.x, ctx.p.mom.y, ctx.p.mom.z, length(ctx.p.mom) ) ;
+        printf("\n//qsim.propagate.head pidx %7lld : bnc %d boundary %d cosTheta %10.8f \n", ctx.pidx, bounce, boundary,
+               cosTheta);
 
-    printf("//qsim.propagate.head pidx %7lld : pos = np.array([%10.5f,%10.5f,%10.5f]) ; lpos = %10.8f \n",
-                 ctx.pidx, ctx.p.pos.x, ctx.p.pos.y, ctx.p.pos.z, length(ctx.p.pos) ) ;
+        printf("//qsim.propagate.head pidx %7lld : mom = np.array([%10.8f,%10.8f,%10.8f]) ; lmom = %10.8f  \n",
+               ctx.pidx, ctx.p.mom.x, ctx.p.mom.y, ctx.p.mom.z, length(ctx.p.mom));
 
-    printf("//qsim.propagate.head pidx %7lld : nrm = np.array([(%10.8f,%10.8f,%10.8f]) ; lnrm = %10.8f  \n",
-                 ctx.pidx, normal->x, normal->y, normal->z, length(*normal) );
+        printf("//qsim.propagate.head pidx %7lld : pos = np.array([%10.5f,%10.5f,%10.5f]) ; lpos = %10.8f \n", ctx.pidx,
+               ctx.p.pos.x, ctx.p.pos.y, ctx.p.pos.z, length(ctx.p.pos));
 
+        printf("//qsim.propagate.head pidx %7lld : nrm = np.array([(%10.8f,%10.8f,%10.8f]) ; lnrm = %10.8f  \n",
+               ctx.pidx, normal->x, normal->y, normal->z, length(*normal));
     }
 #endif
 
     // copy geometry info into the sphoton struct
-    ctx.p.set_prd(boundary, identity, cosTheta, iindex );  // HMM: lposcost not passed along
+    ctx.p.set_prd(boundary, identity, cosTheta, iindex); // HMM: lposcost not passed along
 
-    bnd->fill_state(ctx.s, boundary, ctx.p.wavelength, cosTheta, ctx.pidx, base->pidx );
+    bnd->fill_state(ctx.s, boundary, ctx.p.wavelength, cosTheta, ctx.pidx, base->pidx);
 
-    unsigned flag = 0 ;
+    unsigned flag = 0;
 
-    int command = propagate_to_boundary( flag, rng, ctx );
+    int command = propagate_to_boundary(flag, rng, ctx);
     /**
     command possibilities:
 
@@ -2262,52 +2305,55 @@ inline QSIM_METHOD int qsim::propagate(const int bounce, RNG& rng, sctx& ctx )
     **/
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if( ctx.pidx == base->pidx )
-    printf("//qsim.propagate.body pidx %7lld bounce %d command %d flag %d s.optical.x %d s.optical.y %d \n",
-          ctx.pidx, bounce, command, flag, ctx.s.optical.x, ctx.s.optical.y );
+    if (ctx.pidx == base->pidx)
+        printf("//qsim.propagate.body pidx %7lld bounce %d command %d flag %d s.optical.x %d s.optical.y %d \n",
+               ctx.pidx, bounce, command, flag, ctx.s.optical.x, ctx.s.optical.y);
 #endif
 
-    if( command == BOUNDARY )
+    if (command == BOUNDARY)
     {
-        const int& ems = ctx.s.optical.y ;
+        const int &ems = ctx.s.optical.y;
 
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-        if( ctx.pidx == base->pidx )
+        if (ctx.pidx == base->pidx)
         {
 #if defined(WITH_CUSTOM4)
-            printf("//qsim.propagate.body.WITH_CUSTOM4 pidx %7lld  BOUNDARY ems %d lposcost %7.3f \n", ctx.pidx, ems, lposcost );
+            printf("//qsim.propagate.body.WITH_CUSTOM4 pidx %7lld  BOUNDARY ems %d lposcost %7.3f \n", ctx.pidx, ems,
+                   lposcost);
 #else
-            printf("//qsim.propagate.body.NOT:WITH_CUSTOM4 pidx %7lld BOUNDARY ems %d lposcost %7.3f \n", ctx.pidx, ems, lposcost);
+            printf("//qsim.propagate.body.NOT:WITH_CUSTOM4 pidx %7lld BOUNDARY ems %d lposcost %7.3f \n", ctx.pidx, ems,
+                   lposcost);
 #endif
         }
 #endif
 
-        if( ems == smatsur_NoSurface )
+        if (ems == smatsur_NoSurface)
         {
-            command = propagate_at_boundary( flag, rng, ctx ) ;
+            command = propagate_at_boundary(flag, rng, ctx);
         }
-        else if( ems == smatsur_Surface )
+        else if (ems == smatsur_Surface)
         {
-            command = propagate_at_surface( flag, rng, ctx ) ;
+            command = propagate_at_surface(flag, rng, ctx);
         }
-        else if( lposcost < 0.f )  // could combine with prior, but handy for debug to keep separate
+        else if (lposcost < 0.f) // could combine with prior, but handy for debug to keep separate
         {
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-            if( ctx.pidx == base->pidx )
+            if (ctx.pidx == base->pidx)
                 printf("//qsim.propagate.body (lposcost < 0.f) pidx %7lld bounce %d command %d flag %d ems %d \n",
-                ctx.pidx, bounce, command, flag, ems  );
+                       ctx.pidx, bounce, command, flag, ems);
 #endif
-            command = propagate_at_surface( flag, rng, ctx ) ;
+            command = propagate_at_surface(flag, rng, ctx);
         }
-        else if( ems == smatsur_Surface_zplus_sensor_A )
+        else if (ems == smatsur_Surface_zplus_sensor_A)
         {
-            command = propagate_at_surface_Detect( flag, rng, ctx ) ;
+            command = propagate_at_surface_Detect(flag, rng, ctx);
         }
-        else if( ems == smatsur_Surface_zplus_sensor_CustomART )
+        else if (ems == smatsur_Surface_zplus_sensor_CustomART)
         {
 #if defined(WITH_CUSTOM4)
-            command = propagate_at_surface_CustomART( flag, rng, ctx ) ;
-            //command = base->custom_lut == 0u ? propagate_at_surface_CustomART( flag, rng, ctx ) : propagate_at_surface_MultiFilm(flag, rng, ctx );
+            command = propagate_at_surface_CustomART(flag, rng, ctx);
+            // command = base->custom_lut == 0u ? propagate_at_surface_CustomART( flag, rng, ctx ) :
+            // propagate_at_surface_MultiFilm(flag, rng, ctx );
 
 #endif
         }
@@ -2316,14 +2362,13 @@ inline QSIM_METHOD int qsim::propagate(const int bounce, RNG& rng, sctx& ctx )
     // Q: Does flag need to be single bit at this point OR can multiple "flags" be OR-ed together here ?
     // A: Decided to keep the flag as single bitted, and directly set EFFICENCY_COLLECT/CULL into ctx.p.flagmask
 
-
 #if !defined(PRODUCTION) && defined(DEBUG_PIDX)
-    if( ctx.pidx == base->pidx )
-    printf("//qsim.propagate.tail pidx %7lld bounce %d command %d flag %d ctx.s.optical.y(ems) %d \n",
-             ctx.pidx, bounce, command, flag, ctx.s.optical.y  );
+    if (ctx.pidx == base->pidx)
+        printf("//qsim.propagate.tail pidx %7lld bounce %d command %d flag %d ctx.s.optical.y(ems) %d \n", ctx.pidx,
+               bounce, command, flag, ctx.s.optical.y);
 #endif
 
-    return command ;
+    return command;
 }
 /**
 Q: Where does ctx.s.optical come from ?
@@ -2339,8 +2384,6 @@ A: YES, but non-trivially and probably confusingly. This is because
 
 **/
 
-
-
 /**
 qsim::hemisphere_polarized
 ------------------------------
@@ -2378,48 +2421,55 @@ inwards.
 
 **/
 
-inline QSIM_METHOD void qsim::hemisphere_polarized( unsigned polz, bool inwards, RNG& rng, sctx& ctx )
+inline QSIM_METHOD void qsim::hemisphere_polarized(unsigned polz, bool inwards, RNG &rng, sctx &ctx)
 {
-    sphoton& p = ctx.p ;
-    const float3* normal = ctx.prd->normal() ;
+    sphoton &p = ctx.p;
+    const float3 *normal = ctx.prd->normal();
 
-    //printf("//qsim.hemisphere_polarized polz %d normal (%10.4f, %10.4f, %10.4f) \n", polz, normal->x, normal->y, normal->z );
+    // printf("//qsim.hemisphere_polarized polz %d normal (%10.4f, %10.4f, %10.4f) \n", polz, normal->x, normal->y,
+    // normal->z );
 
-    float u_hemipol_phi = curand_uniform(&rng) ;
-    float phi = u_hemipol_phi*2.f*M_PIf;  // 0->2pi
-    float cosTheta = curand_uniform(&rng) ;      // 0->1
+    float u_hemipol_phi = curand_uniform(&rng);
+    float phi = u_hemipol_phi * 2.f * M_PIf; // 0->2pi
+    float cosTheta = curand_uniform(&rng);   // 0->1
 
 #if !defined(PRODUCTION) && defined(DEBUG_TAG)
-    stagr& tagr = ctx.tagr ;
-    tagr.add( stag_hp_ph, u_hemipol_phi );
-    tagr.add( stag_hp_ph, cosTheta );    // trying to reduce stag::BITS from 5 to 4, so change stag_hp_ct to stag_hp_ph
+    stagr &tagr = ctx.tagr;
+    tagr.add(stag_hp_ph, u_hemipol_phi);
+    tagr.add(stag_hp_ph, cosTheta); // trying to reduce stag::BITS from 5 to 4, so change stag_hp_ct to stag_hp_ph
 #endif
 
-    float sinTheta = sqrtf(1.f-cosTheta*cosTheta);
+    float sinTheta = sqrtf(1.f - cosTheta * cosTheta);
 
-    p.mom.x = cosf(phi)*sinTheta ;
-    p.mom.y = sinf(phi)*sinTheta ;
-    p.mom.z = cosTheta ;
+    p.mom.x = cosf(phi) * sinTheta;
+    p.mom.y = sinf(phi) * sinTheta;
+    p.mom.z = cosTheta;
 
-    smath::rotateUz( p.mom, (*normal) * ( inwards ? -1.f : 1.f ));
+    smath::rotateUz(p.mom, (*normal) * (inwards ? -1.f : 1.f));
 
-    //printf("//qsim.hemisphere_polarized polz %d p.mom (%10.4f, %10.4f, %10.4f) \n", polz, p.mom.x, p.mom.y, p.mom.z );
+    // printf("//qsim.hemisphere_polarized polz %d p.mom (%10.4f, %10.4f, %10.4f) \n", polz, p.mom.x, p.mom.y, p.mom.z
+    // );
 
     // what about normal incidence ?
-    const float3 transverse = normalize(cross(p.mom, (*normal) * ( inwards ? -1.f : 1.f )  )) ; // perpendicular to plane of incidence
-    const float3 within = normalize( cross(p.mom, transverse) );  //   within plane of incidence and perpendicular to direction
+    const float3 transverse =
+        normalize(cross(p.mom, (*normal) * (inwards ? -1.f : 1.f))); // perpendicular to plane of incidence
+    const float3 within =
+        normalize(cross(p.mom, transverse)); //   within plane of incidence and perpendicular to direction
 
-    switch(polz)
+    switch (polz)
     {
-        case 0: p.pol = transverse ; break ;   // S-polarizatiom
-        case 1: p.pol = within     ; break ;   // P-polarization
-        case 2: p.pol = normalize( 0.5f*transverse + (1.f-0.5f)*within )  ; break ;  // equal admixture
+    case 0:
+        p.pol = transverse;
+        break; // S-polarization
+    case 1:
+        p.pol = within;
+        break; // P-polarization
+    case 2:
+        p.pol = normalize(0.5f * transverse + (1.f - 0.5f) * within);
+        break; // equal admixture
     }
 }
 
-
-
-
 /**
 qsim::generate_photon_simtrace
 --------------------------------
@@ -2446,68 +2496,96 @@ SEE : sevent::add_simtrace
 
 **/
 
-inline QSIM_METHOD void qsim::generate_photon_simtrace(quad4& p, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const
+inline QSIM_METHOD void qsim::generate_photon_simtrace(quad4 &p, RNG &rng, const quad6 &gs,
+                                                       unsigned long long photon_id, unsigned genstep_id) const
 {
-    const int& gencode = gs.q0.i.x ;
-    switch(gencode)
+    const int &gencode = gs.q0.i.x;
+    switch (gencode)
     {
-        case OpticksGenstep_FRAME:                   generate_photon_simtrace_frame(p, rng, gs, photon_id, genstep_id ); break ;
-        case OpticksGenstep_INPUT_PHOTON_SIMTRACE:   { p = (quad4&)evt->simtrace[photon_id] ; }                          ; break ;
+    case OpticksGenstep_FRAME:
+        generate_photon_simtrace_frame(p, rng, gs, photon_id, genstep_id);
+        break;
+    case OpticksGenstep_INPUT_PHOTON_SIMTRACE: {
+        p = (quad4 &)evt->simtrace[photon_id];
+    };
+    break;
     }
 }
 
-inline QSIM_METHOD void qsim::generate_photon_simtrace_frame(quad4& p, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const
+inline QSIM_METHOD void qsim::generate_photon_simtrace_frame(quad4 &p, RNG &rng, const quad6 &gs,
+                                                             unsigned long long photon_id, unsigned genstep_id) const
 {
-    C4U gsid ;
+    C4U gsid;
 
-    //int gencode          = gs.q0.i.x ;
-    int gridaxes           = gs.q0.i.y ;  // { XYZ, YZ, XZ, XY }
-    gsid.u                 = gs.q0.i.z ;
-    //unsigned num_photons = gs.q0.u.w ;
+    // int gencode          = gs.q0.i.x ;
+    int gridaxes = gs.q0.i.y; // { XYZ, YZ, XZ, XY }
+    gsid.u = gs.q0.i.z;
+    // unsigned num_photons = gs.q0.u.w ;
 
-    p.q0.f.x = gs.q1.f.x ;   // start with genstep local frame position, typically origin  (0,0,0)
-    p.q0.f.y = gs.q1.f.y ;
-    p.q0.f.z = gs.q1.f.z ;
-    p.q0.f.w = 1.f ;
+    p.q0.f.x = gs.q1.f.x; // start with genstep local frame position, typically origin  (0,0,0)
+    p.q0.f.y = gs.q1.f.y;
+    p.q0.f.z = gs.q1.f.z;
+    p.q0.f.w = 1.f;
 
-    //printf("//qsim.generate_photon_simtrace_frame gridaxes %d gs.q1 (%10.4f %10.4f %10.4f %10.4f) \n", gridaxes, gs.q1.f.x, gs.q1.f.y, gs.q1.f.z, gs.q1.f.w );
+    // printf("//qsim.generate_photon_simtrace_frame gridaxes %d gs.q1 (%10.4f %10.4f %10.4f %10.4f) \n", gridaxes,
+    // gs.q1.f.x, gs.q1.f.y, gs.q1.f.z, gs.q1.f.w );
 
     float u0 = curand_uniform(&rng);
     float sinPhi, cosPhi;
 #if defined(MOCK_CURAND) || defined(MOCK_CUDA)
-    __sincosf(2.f*M_PIf*u0,&sinPhi,&cosPhi);
+    __sincosf(2.f * M_PIf * u0, &sinPhi, &cosPhi);
 #else
-    sincosf(2.f*M_PIf*u0,&sinPhi,&cosPhi);
+    sincosf(2.f * M_PIf * u0, &sinPhi, &cosPhi);
 #endif
 
     float u1 = curand_uniform(&rng);
-    float cosTheta = 2.f*u1 - 1.f ;
-    float sinTheta = sqrtf(1.f-cosTheta*cosTheta) ;
+    float cosTheta = 2.f * u1 - 1.f;
+    float sinTheta = sqrtf(1.f - cosTheta * cosTheta);
 
-    //printf("//qsim.generate_photon_simtrace_frame u0 %10.4f sinPhi   %10.4f cosPhi   %10.4f \n", u0, sinPhi, cosPhi );
-    //printf("//qsim.generate_photon_simtrace_frame u1 %10.4f sinTheta %10.4f cosTheta %10.4f \n", u1, sinTheta, cosTheta );
-    //printf("//qsim.generate_photon_simtrace_frame  u0 %10.4f sinPhi   %10.4f cosPhi   %10.4f u1 %10.4f sinTheta %10.4f cosTheta %10.4f \n",  u0, sinPhi, cosPhi, u1, sinTheta, cosTheta );
+    // printf("//qsim.generate_photon_simtrace_frame u0 %10.4f sinPhi   %10.4f cosPhi   %10.4f \n", u0, sinPhi, cosPhi
+    // ); printf("//qsim.generate_photon_simtrace_frame u1 %10.4f sinTheta %10.4f cosTheta %10.4f \n", u1, sinTheta,
+    // cosTheta ); printf("//qsim.generate_photon_simtrace_frame  u0 %10.4f sinPhi   %10.4f cosPhi   %10.4f u1 %10.4f
+    // sinTheta %10.4f cosTheta %10.4f \n",  u0, sinPhi, cosPhi, u1, sinTheta, cosTheta );
 
-    switch( gridaxes )
+    switch (gridaxes)
     {
-        case YZ:  { p.q1.f.x = 0.f    ;  p.q1.f.y = cosPhi ;  p.q1.f.z = sinPhi ;  p.q1.f.w = 0.f ; } ; break ;
-        case XZ:  { p.q1.f.x = cosPhi ;  p.q1.f.y = 0.f    ;  p.q1.f.z = sinPhi ;  p.q1.f.w = 0.f ; } ; break ;
-        case XY:  { p.q1.f.x = cosPhi ;  p.q1.f.y = sinPhi ;  p.q1.f.z = 0.f    ;  p.q1.f.w = 0.f ; } ; break ;
-        case XYZ: { p.q1.f.x = sinTheta*cosPhi ;
-                    p.q1.f.y = sinTheta*sinPhi ;
-                    p.q1.f.z = cosTheta        ;
-                    p.q1.f.w = 0.f ; } ; break ;   // previously used XZ
+    case YZ: {
+        p.q1.f.x = 0.f;
+        p.q1.f.y = cosPhi;
+        p.q1.f.z = sinPhi;
+        p.q1.f.w = 0.f;
+    };
+    break;
+    case XZ: {
+        p.q1.f.x = cosPhi;
+        p.q1.f.y = 0.f;
+        p.q1.f.z = sinPhi;
+        p.q1.f.w = 0.f;
+    };
+    break;
+    case XY: {
+        p.q1.f.x = cosPhi;
+        p.q1.f.y = sinPhi;
+        p.q1.f.z = 0.f;
+        p.q1.f.w = 0.f;
+    };
+    break;
+    case XYZ: {
+        p.q1.f.x = sinTheta * cosPhi;
+        p.q1.f.y = sinTheta * sinPhi;
+        p.q1.f.z = cosTheta;
+        p.q1.f.w = 0.f;
+    };
+    break; // previously used XZ
     }
 
+    qat4 qt(gs);                            // copy 4x4 transform from last 4 quads of genstep
+    qt.right_multiply_inplace(p.q0.f, 1.f); // position
+    qt.right_multiply_inplace(p.q1.f, 0.f); // direction
 
-    qat4 qt(gs) ; // copy 4x4 transform from last 4 quads of genstep
-    qt.right_multiply_inplace( p.q0.f, 1.f );   // position
-    qt.right_multiply_inplace( p.q1.f, 0.f );   // direction
-
-
-    unsigned char ucj = (photon_id < 255 ? photon_id : 255 ) ;
-    gsid.c4.w = ucj ;
-    p.q3.u.w = gsid.u ;   // WARNING : THIS GSID LOOKS TO BE STOMPED ON BY sevent::add_simtrace
+    unsigned char ucj = (photon_id < 255 ? photon_id : 255);
+    gsid.c4.w = ucj;
+    p.q3.u.w = gsid.u; // WARNING : THIS GSID LOOKS TO BE STOMPED ON BY sevent::add_simtrace
 }
 
 /**
@@ -2518,26 +2596,38 @@ Moved non-standard center-extent (aka frame) gensteps to use qsim::generate_phot
 
 **/
 
-inline QSIM_METHOD void qsim::generate_photon(sphoton& p, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const
+inline QSIM_METHOD void qsim::generate_photon(sphoton &p, RNG &rng, const quad6 &gs, unsigned long long photon_id,
+                                              unsigned genstep_id) const
 {
-    const int& gencode = gs.q0.i.x ;
-    switch(gencode)
+    const int &gencode = gs.q0.i.x;
+    switch (gencode)
     {
-        case OpticksGenstep_CARRIER:         scarrier::generate(     p, rng, gs, photon_id, genstep_id)  ; break ;
-        case OpticksGenstep_TORCH:           storch::generate(       p, rng, gs, photon_id, genstep_id ) ; break ;
-
-        case OpticksGenstep_G4Cerenkov_modified:
-        case OpticksGenstep_CERENKOV:
-                                              cerenkov->generate(    p, rng, gs, photon_id, genstep_id ) ; break ;
-
-        case OpticksGenstep_DsG4Scintillation_r4695:
-        case OpticksGenstep_SCINTILLATION:
-                                              scint->generate(        p, rng, gs, photon_id, genstep_id ) ; break ;
-
-        case OpticksGenstep_INPUT_PHOTON:    { p = evt->photon[photon_id] ; p.set_flag(TORCH) ; }        ; break ;
-        default:                             generate_photon_dummy(  p, rng, gs, photon_id, genstep_id)  ; break ;
+    case OpticksGenstep_CARRIER:
+        scarrier::generate(p, rng, gs, photon_id, genstep_id);
+        break;
+    case OpticksGenstep_TORCH:
+        storch::generate(p, rng, gs, photon_id, genstep_id);
+        break;
+
+    case OpticksGenstep_G4Cerenkov_modified:
+    case OpticksGenstep_CERENKOV:
+        cerenkov->generate(p, rng, gs, photon_id, genstep_id);
+        break;
+
+    case OpticksGenstep_DsG4Scintillation_r4695:
+    case OpticksGenstep_SCINTILLATION:
+        scint->generate(p, rng, gs, photon_id, genstep_id);
+        break;
+
+    case OpticksGenstep_INPUT_PHOTON: {
+        p = evt->photon[photon_id];
+        p.set_flag(TORCH);
+    };
+    break;
+    default:
+        generate_photon_dummy(p, rng, gs, photon_id, genstep_id);
+        break;
     }
-    p.set_index(photon_id) ;
+    p.set_index(photon_id);
 }
 #endif
-
diff --git a/qudarap/qwls.h b/qudarap/qwls.h
new file mode 100644
index 000000000..0e4100726
--- /dev/null
+++ b/qudarap/qwls.h
@@ -0,0 +1,148 @@
+#pragma once
+/**
+qwls.h : GPU-side Wavelength Shifting
+=========================================
+
+Device-side struct for WLS wavelength sampling via ICDF texture lookup.
+Supports multiple WLS materials indexed by material ID.
+
+The ICDF texture layout:
+- Each WLS material occupies 3 rows (standard, LHS HD, RHS HD)
+- material_map[mat_idx] gives the base row for that material (-1 = no WLS)
+- time_constants[wls_idx], where wls_idx = base_row / 3, gives the re-emission time constant in ns
+
+Wavelength sampling uses the same HD (high-definition) technique as qscint.h:
+- hd_factor=20: 20x resolution at extremes (u < 0.05 or u > 0.95); hd_factor=10: 10x (u < 0.1 or u > 0.9)
+- Normalized texture coordinates with linear interpolation
+
+**/
+
+#if defined(__CUDACC__) || defined(__CUDABE__)
+#define QWLS_METHOD __device__
+#else
+#define QWLS_METHOD
+#endif
+
+struct qwls
+{
+    cudaTextureObject_t wls_tex; // ICDF texture: (num_wls*3, 4096, 1)
+    unsigned hd_factor;          // 0, 10, or 20
+    int *material_map;           // device ptr: mat_idx -> base ICDF row (-1 = no WLS)
+    float *time_constants;       // device ptr: per-WLS-material time constant (ns)
+    unsigned num_wls;            // number of WLS materials
+    unsigned tex_height;         // total rows in texture = num_wls * 3
+
+#if defined(__CUDACC__) || defined(__CUDABE__) || defined(MOCK_CURAND) || defined(MOCK_CUDA)
+
+    QWLS_METHOD bool has_wls(unsigned mat_idx) const;
+    QWLS_METHOD float wavelength(unsigned mat_idx, const float &u0) const;
+    QWLS_METHOD float wavelength_at_row(unsigned base_row, const float &u0) const;
+    QWLS_METHOD float time_constant(unsigned mat_idx) const;
+
+#endif
+};
+
+#if defined(__CUDACC__) || defined(__CUDABE__) || defined(MOCK_CURAND) || defined(MOCK_CUDA)
+
+/**
+qwls::has_wls
+---------------
+
+Returns true if material at mat_idx has WLS properties.
+The material_map holds -1 for non-WLS materials.
+
+**/
+
+inline QWLS_METHOD bool qwls::has_wls(unsigned mat_idx) const
+{
+    return material_map[mat_idx] >= 0;
+}
+
+/**
+qwls::time_constant
+---------------------
+
+Returns the WLS re-emission time constant in ns for the given material.
+Returns 0.f if material has no WLS (instant re-emission / no delay).
+
+**/
+
+inline QWLS_METHOD float qwls::time_constant(unsigned mat_idx) const
+{
+    int base_row = material_map[mat_idx];
+    if (base_row < 0)
+        return 0.f;
+    unsigned wls_idx = base_row / 3;
+    return time_constants[wls_idx];
+}
+
+/**
+qwls::wavelength
+-------------------
+
+Sample a re-emitted wavelength from the WLS emission spectrum ICDF
+for the material at mat_idx, using uniform random u0 in [0,1).
+
+Returns 0.f if material has no WLS (should not happen in practice
+as callers check has_wls first).
+
+**/
+
+inline QWLS_METHOD float qwls::wavelength(unsigned mat_idx, const float &u0) const
+{
+    int base_row = material_map[mat_idx];
+    if (base_row < 0)
+        return 0.f;
+    return wavelength_at_row(unsigned(base_row), u0);
+}
+
+/**
+qwls::wavelength_at_row
+--------------------------
+
+ICDF texture lookup with HD (high-definition) support.
+base_row is the first of 3 rows for this WLS material:
+  row 0: standard resolution  (full CDF range)
+  row 1: LHS high-res         (0.00 -> 0.05 for hd_factor=20)
+  row 2: RHS high-res         (0.95 -> 1.00 for hd_factor=20)
+
+Uses normalized texture coordinates with linear interpolation,
+matching the qscint.h implementation.
+
+**/
+
+inline QWLS_METHOD float qwls::wavelength_at_row(unsigned base_row, const float &u0) const
+{
+    float y0 = (float(base_row) + 0.5f) / float(tex_height);
+    float y1 = (float(base_row + 1) + 0.5f) / float(tex_height);
+    float y2 = (float(base_row + 2) + 0.5f) / float(tex_height);
+
+    float wl;
+
+    if (hd_factor == 0)
+    {
+        wl = tex2D<float>(wls_tex, u0, y0);
+    }
+    else if (hd_factor == 10)
+    {
+        if (u0 < 0.1f)
+            wl = tex2D<float>(wls_tex, u0 * 10.f, y1);
+        else if (u0 > 0.9f)
+            wl = tex2D<float>(wls_tex, (u0 - 0.9f) * 10.f, y2);
+        else
+            wl = tex2D<float>(wls_tex, u0, y0);
+    }
+    else // any other hd_factor value is sampled as 20
+    {
+        if (u0 < 0.05f)
+            wl = tex2D<float>(wls_tex, u0 * 20.f, y1);
+        else if (u0 > 0.95f)
+            wl = tex2D<float>(wls_tex, (u0 - 0.95f) * 20.f, y2);
+        else
+            wl = tex2D<float>(wls_tex, u0, y0);
+    }
+
+    return wl;
+}
+
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b58cd89c2..996a91cb3 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -77,6 +77,24 @@ target_include_directories(GPUPhotonFileSource PRIVATE
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
 )
 
+# StandAloneGeant4Validation - pure G4 optical photon simulation (no opticks GPU)
+# Links U4 for aligned mode (U4Random, InstrumentedG4OpBoundaryProcess, ShimG4Op*)
+add_executable(StandAloneGeant4Validation StandAloneGeant4Validation.cpp StandAloneGeant4Validation.h)
+target_link_libraries(StandAloneGeant4Validation gphox gphox_g4_deps U4)
+target_compile_definitions(StandAloneGeant4Validation PRIVATE WITH_INSTRUMENTED_DEBUG)
+target_include_directories(StandAloneGeant4Validation PRIVATE
+    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/include>
+    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
+)
+
+# G4ValidationGenstep - pure G4 electron→scintillation/Cerenkov→optical photon simulation
+add_executable(G4ValidationGenstep G4ValidationGenstep.cpp G4ValidationGenstep.h)
+target_link_libraries(G4ValidationGenstep gphox gphox_g4_deps)
+target_include_directories(G4ValidationGenstep PRIVATE
+    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/include>
+    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
+)
+
 # simtox creates a numpy file with initial photons for simulation
 add_executable(simtox simtox.cpp)
 
@@ -87,7 +105,7 @@ target_include_directories(simtox PRIVATE
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
 )
 
-install(TARGETS consgeo simg4ox GPUCerenkov GPURaytrace GPUPhotonSource GPUPhotonSourceMinimal GPUPhotonFileSource simtox gphox gphox_g4_deps
+install(TARGETS consgeo simg4ox GPUCerenkov GPURaytrace GPUPhotonSource GPUPhotonSourceMinimal GPUPhotonFileSource StandAloneGeant4Validation G4ValidationGenstep simtox gphox gphox_g4_deps
     EXPORT ${PROJECT_NAME}Targets
     RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
     LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
diff --git a/src/G4ValidationGenstep.cpp b/src/G4ValidationGenstep.cpp
new file mode 100644
index 000000000..c37ee170f
--- /dev/null
+++ b/src/G4ValidationGenstep.cpp
@@ -0,0 +1,109 @@
+#include <string>
+
+#include <argparse/argparse.hpp>
+
+#include "FTFP_BERT.hh"
+#include "G4OpticalPhysics.hh"
+#include "G4RunManager.hh"
+#include "G4UImanager.hh"
+#include "G4VModularPhysicsList.hh"
+
+#include "G4ValidationGenstep.h"
+
+using namespace std;
+
+// Runs a pure-Geant4 electron gun on a GDML geometry (FTFP_BERT + G4OpticalPhysics); hits are
+// gathered in a GenstepHitAccumulator that is handed to the detector construction and user actions.
+int main(int argc, char **argv)
+{
+    argparse::ArgumentParser program("G4ValidationGenstep", "0.0.0");
+
+    string gdml_file;
+    double energy_MeV = 1.0;
+    int num_events = 1;
+
+    program.add_argument("-g", "--gdml")
+        .help("path to GDML file")
+        .default_value(string("apex.gdml"))
+        .nargs(1)
+        .store_into(gdml_file);
+
+    program.add_argument("-e", "--energy")
+        .help("electron kinetic energy in MeV")
+        .default_value(1.0)
+        .scan<'g', double>()
+        .store_into(energy_MeV);
+
+    program.add_argument("-n", "--nevents")
+        .help("number of events")
+        .default_value(1)
+        .scan<'i', int>()
+        .store_into(num_events);
+
+    program.add_argument("-s", "--seed").help("random seed").scan<'i', long>();
+
+    program.add_argument("--pos")
+        .help("electron position x,y,z in mm (comma-separated)")
+        .default_value(string("0,0,0"));
+    program.add_argument("--dir").help("electron direction x,y,z (comma-separated)").default_value(string("0,0,1"));
+
+    try
+    {
+        program.parse_args(argc, argv);
+    }
+    catch (const exception &err)
+    {
+        cerr << err.what() << endl;
+        cerr << program;
+        return EXIT_FAILURE;
+    }
+
+    // Fall back to a wall-clock seed only when none was given on the command line.
+    const long seed = program.is_used("--seed") ? program.get<long>("--seed") : static_cast<long>(time(nullptr));
+
+    // Parse an "x,y,z" option; a malformed value is fatal instead of silently using the default.
+    auto parse_vec3 = [&program](const char *opt) {
+        const string s = program.get<string>(opt);
+        double x = 0., y = 0., z = 0.;
+        char extra = 0; // only converted when something non-blank follows the third number
+        if (sscanf(s.c_str(), "%lf,%lf,%lf %c", &x, &y, &z, &extra) != 3)
+        {
+            cerr << "G4ValidationGenstep: invalid " << opt << " '" << s << "', expected x,y,z" << endl;
+            exit(EXIT_FAILURE);
+        }
+        return G4ThreeVector(x, y, z);
+    };
+
+    const G4ThreeVector pos = parse_vec3("--pos");
+    const G4ThreeVector dir = parse_vec3("--dir");
+    if (dir.mag2() == 0.)
+    {
+        cerr << "G4ValidationGenstep: --dir must be a non-zero vector" << endl;
+        return EXIT_FAILURE;
+    }
+
+    G4cout << "G4ValidationGenstep:" << G4endl;
+    G4cout << "  GDML:     " << gdml_file << G4endl;
+    G4cout << "  Energy:   " << energy_MeV << " MeV" << G4endl;
+    G4cout << "  Events:   " << num_events << G4endl;
+    G4cout << "  Position: (" << pos.x() << "," << pos.y() << "," << pos.z() << ") mm" << G4endl;
+    G4cout << "  Direction: (" << dir.x() << "," << dir.y() << "," << dir.z() << ")" << G4endl;
+    G4cout << "  Seed:     " << seed << G4endl;
+
+    GenstepHitAccumulator accumulator;
+    G4VModularPhysicsList *physics = new FTFP_BERT;
+    physics->RegisterPhysics(new G4OpticalPhysics);
+
+    G4RunManager run_mgr;
+    run_mgr.SetUserInitialization(physics);
+    run_mgr.SetUserInitialization(new GenstepDetectorConstruction(gdml_file, &accumulator));
+    run_mgr.SetUserInitialization(new GenstepActionInitialization(&accumulator, pos, dir, energy_MeV, num_events));
+    run_mgr.Initialize();
+
+    CLHEP::HepRandom::setTheSeed(seed);
+
+    G4cout << "G4Genstep: Starting " << num_events << " events..." << G4endl;
+    run_mgr.BeamOn(num_events);
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/G4ValidationGenstep.h b/src/G4ValidationGenstep.h
new file mode 100644
index 000000000..b58211330
--- /dev/null
+++ b/src/G4ValidationGenstep.h
@@ -0,0 +1,277 @@
+#pragma once
+/**
+G4ValidationGenstep.h
+======================
+
+Pure G4 simulation with electron primary that produces scintillation/Cerenkov
+optical photons. G4 handles all physics including optical photon propagation.
+Collects hits via sensitive detector. Used as the CPU reference for comparison
+with GPU (simg4ox) genstep-based optical simulation.
+**/
+
+#include <filesystem>
+#include <mutex>
+#include <vector>
+
+#include "G4Electron.hh"
+#include "G4Event.hh"
+#include "G4GDMLParser.hh"
+#include "G4OpticalPhoton.hh"
+#include "G4PhysicalConstants.hh"
+#include "G4PrimaryParticle.hh"
+#include "G4PrimaryVertex.hh"
+#include "G4Run.hh"
+#include "G4SDManager.hh"
+#include "G4SystemOfUnits.hh"
+#include "G4THitsCollection.hh"
+#include "G4ThreeVector.hh"
+#include "G4Track.hh"
+#include "G4TrackStatus.hh"
+#include "G4UserEventAction.hh"
+#include "G4UserRunAction.hh"
+#include "G4VHit.hh"
+#include "G4VPhysicalVolume.hh"
+#include "G4VUserActionInitialization.hh"
+#include "G4VUserDetectorConstruction.hh"
+#include "G4VUserPrimaryGeneratorAction.hh"
+
+#include "sysrap/NP.hh"
+#include "sysrap/sphoton.h"
+
+// ---- Hit accumulator ----
+
+/// Mutex-guarded list of sphoton hits accumulated across events and written out through NP.
+struct GenstepHitAccumulator
+{
+    std::mutex mtx;
+    std::vector<sphoton> hits;
+    int total_optical_photons = 0;
+    int total_scintillation = 0;
+    int total_cerenkov = 0;
+
+    void AddHits(const std::vector<sphoton> &event_hits)
+    {
+        std::lock_guard<std::mutex> guard(mtx);
+        hits.insert(hits.cend(), event_hits.cbegin(), event_hits.cend());
+    }
+
+    /// Saves the hits to `filename` as a (num_hits, 4, 4) float array, i.e. 16 floats per sphoton.
+    void Save(const char *filename)
+    {
+        std::lock_guard<std::mutex> guard(mtx);
+        G4int num_hits = hits.size();
+        NP *arr = NP::Make<float>(num_hits, 4, 4);
+        float *dst = arr->values<float>();
+        for (const sphoton &hit : hits)
+            dst = std::copy(reinterpret_cast<const float *>(&hit), reinterpret_cast<const float *>(&hit) + 16, dst);
+        arr->save(filename);
+        delete arr;
+        G4cout << "G4Genstep: Saved " << num_hits << " hits to " << filename << G4endl;
+    }
+};
+
+// ---- Sensitive Detector ----
+
+struct GenstepPhotonHit : public G4VHit // Geant4 hit that stores one detected photon as an sphoton
+{
+    GenstepPhotonHit() = default;
+    // Fills the sphoton from step data; position/direction/polarization components are cast to float.
+    GenstepPhotonHit(G4double energy, G4double time, G4ThreeVector position, G4ThreeVector direction,
+                     G4ThreeVector polarization)
+        : photon()
+    {
+        photon.pos = {static_cast<float>(position.x()), static_cast<float>(position.y()),
+                      static_cast<float>(position.z())};
+        photon.time = time;
+        photon.mom = {static_cast<float>(direction.x()), static_cast<float>(direction.y()),
+                      static_cast<float>(direction.z())};
+        photon.pol = {static_cast<float>(polarization.x()), static_cast<float>(polarization.y()),
+                      static_cast<float>(polarization.z())};
+        photon.wavelength = h_Planck * c_light / (energy * CLHEP::eV); // hc/E; presumably in nm - TODO confirm
+    }
+    // Streams the stored sphoton to G4cout.
+    void Print() override
+    {
+        G4cout << photon << G4endl;
+    }
+    sphoton photon;
+};
+
+using GenstepPhotonHitsCollection = G4THitsCollection<GenstepPhotonHit>;
+
+/// Sensitive detector: stores each optical photon step as a GenstepPhotonHit and kills the track.
+struct GenstepPhotonSD : public G4VSensitiveDetector
+{
+    GenstepHitAccumulator *accumulator;
+
+    GenstepPhotonSD(G4String name, GenstepHitAccumulator *acc) : G4VSensitiveDetector(name), accumulator(acc)
+    {
+        collectionName.insert(name + "_HC");
+    }
+
+    // Creates this event's hits collection; the collection ID is looked up once and cached.
+    void Initialize(G4HCofThisEvent *hce) override
+    {
+        if (fHCID < 0)
+            fHCID = G4SDManager::GetSDMpointer()->GetCollectionID(collectionName[0]);
+        fHC = new GenstepPhotonHitsCollection(SensitiveDetectorName, collectionName[0]);
+        hce->AddHitsCollection(fHCID, fHC);
+    }
+
+    // Records post-step position, direction and polarization; non-optical particles are ignored.
+    G4bool ProcessHits(G4Step *aStep, G4TouchableHistory *) override
+    {
+        G4Track *trk = aStep->GetTrack();
+        if (trk->GetDefinition() != G4OpticalPhoton::OpticalPhotonDefinition())
+            return false;
+        const auto *post = aStep->GetPostStepPoint();
+        fHC->insert(new GenstepPhotonHit(trk->GetTotalEnergy(), trk->GetGlobalTime(), post->GetPosition(),
+                                         post->GetMomentumDirection(), post->GetPolarization()));
+        trk->SetTrackStatus(fStopAndKill);
+        return true;
+    }
+
+    void EndOfEvent(G4HCofThisEvent *) override
+    {
+        std::vector<sphoton> event_hits;
+        event_hits.reserve(fHC->entries());
+        for (std::size_t i = 0; i < fHC->entries(); ++i)
+            event_hits.push_back((*fHC)[i]->photon);
+        accumulator->AddHits(event_hits);
+    }
+
+  private:
+    GenstepPhotonHitsCollection *fHC = nullptr;
+    G4int fHCID = -1;
+};
+
+// ---- Detector Construction ----
+
+/// Builds the world from a GDML file and attaches a GenstepPhotonSD to every "SensDet" auxiliary entry.
+struct GenstepDetectorConstruction : G4VUserDetectorConstruction
+{
+    GenstepDetectorConstruction(std::filesystem::path gdml_file, GenstepHitAccumulator *acc)
+        : gdml_file_(gdml_file), accumulator_(acc)
+    {
+    }
+
+    // Reads the GDML file (second Read argument false; presumably schema validation off) and returns the world.
+    G4VPhysicalVolume *Construct() override
+    {
+        parser_.Read(gdml_file_.string(), false);
+        return parser_.GetWorldVolume();
+    }
+
+    // One detector per (logical volume, "SensDet" aux) pair, named "<volume>_<aux value>".
+    void ConstructSDandField() override
+    {
+        G4SDManager *sd_manager = G4SDManager::GetSDMpointer();
+        for (auto const &[volume, aux_list] : *parser_.GetAuxMap())
+        {
+            for (auto const &aux : aux_list)
+            {
+                if (aux.type != "SensDet")
+                    continue;
+                const G4String sd_name = volume->GetName() + "_" + aux.value;
+                G4cout << "G4Genstep: Attaching SD to " << volume->GetName() << G4endl;
+                auto *detector = new GenstepPhotonSD(sd_name, accumulator_);
+                sd_manager->AddNewDetector(detector);
+                volume->SetSensitiveDetector(detector);
+            }
+        }
+    }
+
+  private:
+    std::filesystem::path gdml_file_;
+    G4GDMLParser parser_;
+    GenstepHitAccumulator *accumulator_;
+};
+
+// ---- Electron Primary Generator ----
+
+/// Shoots one electron per event from `position` along `direction` (normalised in the constructor).
+struct ElectronPrimaryGenerator : G4VUserPrimaryGeneratorAction
+{
+    G4ThreeVector position;
+    G4ThreeVector direction;
+    G4double energy_MeV;
+    ElectronPrimaryGenerator(G4ThreeVector pos, G4ThreeVector dir, G4double energy)
+        : position(pos), direction(dir.unit()), energy_MeV(energy)
+    {
+    }
+
+    void GeneratePrimaries(G4Event *event) override
+    {
+        auto *electron = new G4PrimaryParticle(G4Electron::Definition());
+        electron->SetKineticEnergy(energy_MeV * MeV);
+        electron->SetMomentumDirection(direction);
+        auto *vertex = new G4PrimaryVertex(position, 0.0);
+        vertex->SetPrimary(electron);
+        event->AddPrimaryVertex(vertex);
+    }
+};
+
+// ---- Event Action with progress reporting ----
+
+/// Prints a progress line after the first event, every tenth event and the last event.
+struct GenstepEventAction : G4UserEventAction
+{
+    GenstepHitAccumulator *accumulator;
+    int total_events;
+
+    GenstepEventAction(GenstepHitAccumulator *acc, int total) : accumulator(acc), total_events(total)
+    {
+    }
+    void EndOfEventAction(const G4Event *event) override
+    {
+        const int done = event->GetEventID() + 1; // event ID shifted to a 1-based number
+        if (done == 1 || done % 10 == 0 || done == total_events)
+            G4cout << "G4Genstep: Event " << done << "/" << total_events << G4endl;
+    }
+};
+
+// ---- Run Action ----
+
+struct GenstepRunAction : G4UserRunAction // writes the accumulated hits to disk when the run ends
+{
+    GenstepHitAccumulator *accumulator;
+    // `acc` is kept as a raw pointer; nothing in this struct deletes it.
+    GenstepRunAction(GenstepHitAccumulator *acc) : accumulator(acc)
+    {
+    }
+    // Prints the total hit count, then saves the hits to "g4_genstep_hits.npy".
+    void EndOfRunAction(const G4Run *) override
+    {
+        G4cout << "G4Genstep: Total hits: " << accumulator->hits.size() << G4endl;
+        accumulator->Save("g4_genstep_hits.npy");
+    }
+};
+
+// ---- Action Initialization ----
+
+struct GenstepActionInitialization : G4VUserActionInitialization // installs the user actions for a run
+{
+    GenstepHitAccumulator *accumulator;
+    G4ThreeVector position;
+    G4ThreeVector direction;
+    G4double energy_MeV;
+    int num_events;
+    // Stores the electron settings and the accumulator pointer that Build()/BuildForMaster() pass on.
+    GenstepActionInitialization(G4ThreeVector pos, G4ThreeVector dir, G4double energy,
+                                int nevt)
+        : accumulator(acc), position(pos), direction(dir), energy_MeV(energy), num_events(nevt)
+    {
+    }
+    // Master side: only the run action is registered.
+    void BuildForMaster() const override
+    {
+        SetUserAction(new GenstepRunAction(accumulator));
+    }
+    // Registers the primary generator plus the event and run actions (the latter two get the accumulator).
+    void Build() const override
+    {
+        SetUserAction(new ElectronPrimaryGenerator(position, direction, energy_MeV));
+        SetUserAction(new GenstepEventAction(accumulator, num_events));
+        SetUserAction(new GenstepRunAction(accumulator));
+    }
+};
diff --git a/src/GPURaytrace.cpp b/src/GPURaytrace.cpp
index c97c7ec11..bb668f293 100644
--- a/src/GPURaytrace.cpp
+++ b/src/GPURaytrace.cpp
@@ -11,7 +11,7 @@
 #include "G4VisExecutive.hh"
 
 #include "sysrap/OPTICKS_LOG.hh"
-
+#include "config.h"
 #include "GPURaytrace.h"
 
 #include "G4RunManager.hh"
@@ -68,6 +68,11 @@ int main(int argc, char **argv)
         .nargs(1)
         .store_into(macro_name);
 
+    program.add_argument("-c", "--config")
+        .help("config file name (without .json extension)")
+        .default_value(string(""))
+        .nargs(1);
+
     program.add_argument("-i", "--interactive")
         .help("whether to open an interactive window with a viewer")
         .flag()
@@ -108,6 +113,14 @@ int main(int argc, char **argv)
 
     G4App *g4app = new G4App(gdml_file);
 
+    // Load config and apply savephotonhistory flag if provided
+    string config_name = program.get<string>("--config");
+    if (!config_name.empty())
+    {
+        gphox::Config cfg(config_name);
+        g4app->run_act_->fSavePhotonHistory = cfg.savephotonhistory;
+    }
+
     ActionInitialization *actionInit = new ActionInitialization(g4app);
     run_mgr->SetUserInitialization(actionInit);
     run_mgr->SetUserInitialization(g4app->det_cons_);
diff --git a/src/GPURaytrace.h b/src/GPURaytrace.h
index f8443a167..589e60d75 100644
--- a/src/GPURaytrace.h
+++ b/src/GPURaytrace.h
@@ -1,3 +1,4 @@
+#include <array>
 #include <filesystem>
 #include <fstream>
 #include <iostream>
@@ -47,7 +48,9 @@
 namespace
 {
 G4Mutex genstep_mutex = G4MUTEX_INITIALIZER;
-}
+G4Mutex g4hits_mutex = G4MUTEX_INITIALIZER;
+std::vector<std::array<float, 16>> g4_accumulated_hits;
+} // namespace
 
 bool IsSubtractionSolid(G4VSolid *solid)
 {
@@ -331,7 +334,26 @@ struct EventAction : G4UserEventAction
             for (G4int i = 0; i < hce->GetNumberOfCollections(); i++)
             {
                 G4VHitsCollection *hc = hce->GetHC(i);
-                if (hc)
+                if (!hc)
+                    continue;
+
+                PhotonHitsCollection *phc = dynamic_cast<PhotonHitsCollection *>(hc);
+                if (phc)
+                {
+                    G4AutoLock lock(&g4hits_mutex);
+                    for (size_t j = 0; j < phc->entries(); j++)
+                    {
+                        PhotonHit *hit = (*phc)[j];
+                        float wl = 1239.84198f / static_cast<float>(hit->fenergy);
+                        g4_accumulated_hits.push_back(
+                            {float(hit->fposition.x()), float(hit->fposition.y()), float(hit->fposition.z()),
+                             float(hit->ftime), float(hit->fdirection.x()), float(hit->fdirection.y()),
+                             float(hit->fdirection.z()), 0.f, float(hit->fpolarization.x()),
+                             float(hit->fpolarization.y()), float(hit->fpolarization.z()), wl, 0.f, 0.f, 0.f, 0.f});
+                    }
+                    fTotalG4Hits += phc->entries();
+                }
+                else
                 {
                     fTotalG4Hits += hc->GetSize();
                 }
@@ -348,6 +370,7 @@ struct EventAction : G4UserEventAction
 struct RunAction : G4UserRunAction
 {
     EventAction *fEventAction;
+    bool fSavePhotonHistory{false};
 
     RunAction(EventAction *eventAction) : fEventAction(eventAction)
     {
@@ -379,44 +402,39 @@ struct RunAction : G4UserRunAction
             std::cout << "Opticks: NumCollected:  " << sev->GetNumPhotonCollected(0) << std::endl;
             std::cout << "Opticks: NumHits:  " << num_hits << std::endl;
             std::cout << "Geant4: NumHits:  " << fEventAction->GetTotalG4Hits() << std::endl;
-            std::ofstream outFile("opticks_hits_output.txt");
-            if (!outFile.is_open())
-            {
-                std::cerr << "Error opening output file!" << std::endl;
-                return;
-            }
 
-            for (int idx = 0; idx < int(num_hits); idx++)
+            if (fSavePhotonHistory)
             {
-                sphoton hit;
-                sev->getHit(hit, idx);
-                G4ThreeVector position = G4ThreeVector(hit.pos.x, hit.pos.y, hit.pos.z);
-                G4ThreeVector direction = G4ThreeVector(hit.mom.x, hit.mom.y, hit.mom.z);
-                G4ThreeVector polarization = G4ThreeVector(hit.pol.x, hit.pol.y, hit.pol.z);
-                int theCreationProcessid;
-                if (OpticksPhoton::HasCerenkovFlag(hit.flagmask))
-                {
-                    theCreationProcessid = 0;
-                }
-                else if (OpticksPhoton::HasScintillationFlag(hit.flagmask))
+                // Save full SEvt (photon, record, seq, hit) when DebugLite/DebugHeavy
+                sev->save();
+                std::cout << "SEvt::save() complete" << std::endl;
+
+                // Save GPU hits as .npy (sphoton layout: N x 4 x 4 float32)
                 {
-                    theCreationProcessid = 1;
+                    NP *gpu_h = NP::Make<float>(num_hits, 4, 4);
+                    for (unsigned idx = 0; idx < num_hits; idx++)
+                    {
+                        sphoton hit;
+                        sev->getHit(hit, idx);
+                        memcpy(gpu_h->bytes() + idx * sizeof(sphoton), &hit, sizeof(sphoton));
+                    }
+                    gpu_h->save("gpu_hits.npy");
+                    std::cout << "Saved GPU hits: " << num_hits << " to gpu_hits.npy" << std::endl;
                 }
-                else
+
+                // Save G4 hits as .npy (same layout: N x 4 x 4 float32)
                 {
-                    theCreationProcessid = -1;
+                    G4AutoLock lock(&g4hits_mutex);
+                    size_t ng4 = g4_accumulated_hits.size();
+                    if (ng4 > 0)
+                    {
+                        NP *g4h = NP::Make<float>(ng4, 4, 4);
+                        memcpy(g4h->bytes(), g4_accumulated_hits.data(), ng4 * 16 * sizeof(float));
+                        g4h->save("g4_hits.npy");
+                        std::cout << "Saved G4 hits: " << ng4 << " to g4_hits.npy" << std::endl;
+                    }
                 }
-                //    std::cout << "Adding hit from Opticks:" << hit.wavelength << " " << position << " " << direction
-                //    << "
-                //    "
-                //              << polarization << std::endl;
-                outFile << hit.time << " " << hit.wavelength << "  " << "(" << position.x() << ", " << position.y()
-                        << ", " << position.z() << ")  " << "(" << direction.x() << ", " << direction.y() << ", "
-                        << direction.z() << ")  " << "(" << polarization.x() << ", " << polarization.y() << ", "
-                        << polarization.z() << ")  " << "CreationProcessID=" << theCreationProcessid << std::endl;
             }
-
-            outFile.close();
         }
     }
 };
@@ -523,10 +541,42 @@ struct SteppingAction : G4UserSteppingAction
                                    << G4endl;
                             return;
                         }
-                        G4double SCINTILLATIONTIMECONSTANT1 = MPT->GetConstProperty(kSCINTILLATIONTIMECONSTANT1);
+                        // G4 11.x supports up to 3 scintillation components
+                        const G4int tcKeys[3] = {kSCINTILLATIONTIMECONSTANT1, kSCINTILLATIONTIMECONSTANT2, kSCINTILLATIONTIMECONSTANT3};
+                        const G4int yieldKeys[3] = {kSCINTILLATIONYIELD1, kSCINTILLATIONYIELD2, kSCINTILLATIONYIELD3};
 
-                        U4::CollectGenstep_DsG4Scintillation_r4695(aTrack, aStep, fNumPhotons, 1,
-                                                                   SCINTILLATIONTIMECONSTANT1);
+                        G4double tc[3] = {0, 0, 0};
+                        G4double yield[3] = {0, 0, 0};
+                        G4double yieldSum = 0;
+                        G4int nComp = 0;
+
+                        for (G4int c = 0; c < 3; c++)
+                        {
+                            if (MPT->ConstPropertyExists(tcKeys[c]))
+                            {
+                                tc[c] = MPT->GetConstProperty(tcKeys[c]);
+                                yield[c] = MPT->ConstPropertyExists(yieldKeys[c])
+                                               ? MPT->GetConstProperty(yieldKeys[c])
+                                               : (c == 0 ? 1.0 : 0.0);
+                                yieldSum += yield[c];
+                                nComp = c + 1;
+                            }
+                        }
+
+                        G4AutoLock lock(&genstep_mutex);
+                        G4int nRemaining = fNumPhotons;
+                        for (G4int c = 0; c < nComp; c++)
+                        {
+                            G4int nPhotComp;
+                            if (c == nComp - 1)
+                                nPhotComp = nRemaining; // last component gets remainder
+                            else
+                                nPhotComp = static_cast<G4int>(fNumPhotons * yield[c] / yieldSum);
+                            nRemaining -= nPhotComp;
+
+                            if (nPhotComp > 0)
+                                U4::CollectGenstep_DsG4Scintillation_r4695(aTrack, aStep, nPhotComp, c + 1, tc[c]);
+                        }
                     }
                 }
             }
diff --git a/src/StandAloneGeant4Validation.cpp b/src/StandAloneGeant4Validation.cpp
new file mode 100644
index 000000000..40e3102fa
--- /dev/null
+++ b/src/StandAloneGeant4Validation.cpp
@@ -0,0 +1,163 @@
+#include <string>
+#include <thread>
+
+#include <argparse/argparse.hpp>
+
+#include "FTFP_BERT.hh"
+#include "G4MTRunManager.hh"
+#include "G4OpticalPhysics.hh"
+#include "G4RunManager.hh"
+#include "G4UImanager.hh"
+#include "G4VModularPhysicsList.hh"
+
+#include "G4OpticalParameters.hh"
+
+#include "StandAloneGeant4Validation.h"
+#include "config.h"
+
+using namespace std;
+
+// Pure-Geant4 optical-photon run sized by cfg.torch.numphoton; runs multi-threaded, sequential or aligned.
+int main(int argc, char **argv)
+{
+    argparse::ArgumentParser program("StandAloneGeant4Validation", "0.0.0");
+
+    string gdml_file, config_name;
+    int num_threads = 0;
+
+    program.add_argument("-g", "--gdml")
+        .help("path to GDML file")
+        .default_value(string("geom.gdml"))
+        .nargs(1)
+        .store_into(gdml_file);
+
+    program.add_argument("-c", "--config")
+        .help("the name of a config file")
+        .default_value(string("dev"))
+        .nargs(1)
+        .store_into(config_name);
+
+    program.add_argument("-s", "--seed").help("fixed random seed (default: time-based)").scan<'i', long>();
+
+    program.add_argument("-t", "--threads")
+        .help("number of threads (0=sequential, default: hardware concurrency)")
+        .default_value(-1)
+        .scan<'i', int>()
+        .store_into(num_threads);
+
+    program.add_argument("--aligned")
+        .help("enable photon-by-photon aligned comparison with GPU (forces sequential)")
+        .default_value(false)
+        .implicit_value(true);
+
+    try
+    {
+        program.parse_args(argc, argv);
+    }
+    catch (const exception &err)
+    {
+        cerr << err.what() << endl;
+        cerr << program;
+        return EXIT_FAILURE;
+    }
+
+    const long seed = program.is_used("--seed") ? program.get<long>("--seed") : static_cast<long>(time(nullptr));
+
+    bool aligned = program.get<bool>("--aligned");
+
+    gphox::Config cfg(config_name);
+    int total_photons = cfg.torch.numphoton;
+    if (total_photons <= 0) // also protects the photons_per_event division in the MT split below
+    {
+        cerr << "StandAloneGeant4Validation: torch.numphoton must be positive, got " << total_photons << endl;
+        return EXIT_FAILURE;
+    }
+
+    // Aligned mode forces sequential (U4Random is single-threaded)
+    if (aligned)
+        num_threads = 0;
+
+    // Determine threading mode
+    bool use_mt = (num_threads != 0);
+    if (num_threads < 0)
+        num_threads = std::thread::hardware_concurrency();
+    if (num_threads < 1)
+        num_threads = 1;
+
+    // In MT mode: split photons across events, one event per thread-batch
+    // In sequential mode: one event with all photons (original behavior)
+    int num_events, photons_per_event;
+    if (use_mt)
+    {
+        num_events = num_threads * 4; // 4 events per thread for load balancing
+        photons_per_event = (total_photons + num_events - 1) / num_events;
+        // Drop events that would be wholly surplus; rounding up can still overshoot (see "actual" below)
+        num_events = (total_photons + photons_per_event - 1) / photons_per_event;
+    }
+    else
+    {
+        num_events = 1;
+        photons_per_event = total_photons;
+    }
+
+    int actual_photons = num_events * photons_per_event;
+
+    G4cout << "Random seed set to: " << seed << G4endl;
+    G4cout << "G4: " << total_photons << " photons, " << num_events << " events x " << photons_per_event
+           << " photons/event" << " (" << actual_photons << " actual)"
+           << (use_mt ? ", " + to_string(num_threads) + " threads" : ", sequential") << G4endl;
+
+    HitAccumulator accumulator;
+    PhotonFateAccumulator fate;
+
+    if (aligned)
+        fate.Resize(total_photons);
+
+    G4VModularPhysicsList *physics = new FTFP_BERT;
+    if (aligned)
+        physics->RegisterPhysics(new AlignedOpticalPhysics);
+    else
+        physics->RegisterPhysics(new G4OpticalPhysics);
+
+    // Use exponential WLS time profile (default is delta = zero delay)
+    G4OpticalParameters::Instance()->SetWLSTimeProfile("exponential");
+
+    if (use_mt)
+    {
+        G4MTRunManager run_mgr; // stack object, destroyed at the end of this branch like the sequential one
+        run_mgr.SetNumberOfThreads(num_threads);
+        run_mgr.SetUserInitialization(physics);
+        run_mgr.SetUserInitialization(new G4OnlyDetectorConstruction(gdml_file, &accumulator));
+        run_mgr.SetUserInitialization(
+            new G4OnlyActionInitialization(cfg, &accumulator, &fate, photons_per_event, num_events, aligned));
+        run_mgr.Initialize();
+
+        CLHEP::HepRandom::setTheSeed(seed);
+
+        G4cout << "G4: Starting MT run with " << num_events << " events..." << G4endl;
+        run_mgr.BeamOn(num_events);
+    }
+    else
+    {
+        G4RunManager run_mgr;
+        run_mgr.SetUserInitialization(physics);
+        run_mgr.SetUserInitialization(new G4OnlyDetectorConstruction(gdml_file, &accumulator));
+        run_mgr.SetUserInitialization(
+            new G4OnlyActionInitialization(cfg, &accumulator, &fate, photons_per_event, num_events, aligned));
+
+        if (aligned)
+        {
+            G4cout << "G4: Aligned mode — creating U4Random" << G4endl;
+            U4Random::Create();
+        }
+
+        run_mgr.Initialize();
+
+        CLHEP::HepRandom::setTheSeed(seed);
+
+        G4cout << "G4: Starting sequential run..." << G4endl;
+        run_mgr.BeamOn(num_events);
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/StandAloneGeant4Validation.h b/src/StandAloneGeant4Validation.h
new file mode 100644
index 000000000..49dd92c67
--- /dev/null
+++ b/src/StandAloneGeant4Validation.h
@@ -0,0 +1,625 @@
+#pragma once
+
+#include <cmath>
+#include <filesystem>
+#include <mutex>
+#include <vector>
+
+#include "G4Event.hh"
+#include "G4GDMLParser.hh"
+#include "G4OpBoundaryProcess.hh"
+#include "G4OpWLS.hh"
+#include "G4OpticalPhoton.hh"
+#include "G4PhysicalConstants.hh"
+#include "G4PrimaryParticle.hh"
+#include "G4PrimaryVertex.hh"
+#include "G4ProcessManager.hh"
+#include "G4Run.hh"
+#include "G4SDManager.hh"
+#include "G4SystemOfUnits.hh"
+#include "G4THitsCollection.hh"
+#include "G4ThreeVector.hh"
+#include "G4Track.hh"
+#include "G4TrackStatus.hh"
+#include "G4UserEventAction.hh"
+#include "G4UserRunAction.hh"
+#include "G4UserSteppingAction.hh"
+#include "G4UserTrackingAction.hh"
+#include "G4VHit.hh"
+#include "G4VPhysicalVolume.hh"
+#include "G4VPhysicsConstructor.hh"
+#include "G4VUserActionInitialization.hh"
+#include "G4VUserDetectorConstruction.hh"
+#include "G4VUserPrimaryGeneratorAction.hh"
+
+#include "ShimG4OpAbsorption.hh"
+#include "ShimG4OpRayleigh.hh"
+#include "U4Random.hh"
+
+#include "sysrap/NP.hh"
+#include "sysrap/sphoton.h"
+
+#include "config.h"
+#include "torch.h"
+
+// ---- Global hit accumulator: mutex-guarded list of detected photons ----
+// AddHits() appends one event's hits; Save() writes all of them as an (N, 4, 4) float array.
+struct HitAccumulator
+{
+    std::mutex mtx;            // taken by both AddHits() and Save()
+    std::vector<sphoton> hits; // every hit received so far
+
+    void AddHits(const std::vector<sphoton> &event_hits)
+    {
+        const std::lock_guard<std::mutex> guard(mtx);
+        hits.insert(hits.end(), event_hits.cbegin(), event_hits.cend());
+    }
+
+    void Save(const char *filename)
+    {
+        const std::lock_guard<std::mutex> guard(mtx);
+        G4int count = hits.size();
+        NP *out = NP::Make<float>(count, 4, 4);
+        for (int row = 0; row < count; ++row)
+        {
+            const float *src = reinterpret_cast<const float *>(&hits[row]); // one hit viewed as 16 floats
+            std::copy(src, src + 16, out->values<float>() + row * 16);
+        }
+        out->save(filename);
+        delete out;
+        G4cout << "G4: Saved " << count << " total hits to " << filename << G4endl;
+    }
+};
+
+// ---- Sensitive Detector: collects optical photon hits per event ----
+// G4PhotonHit: one recorded photon, stored as an sphoton filled from Geant4 step quantities.
+struct G4PhotonHit : public G4VHit
+{
+    G4PhotonHit() = default; // photon is zeroed by its default member initializer
+    // Builds the hit from step quantities; wavelength = h*c / E, or 0 when the energy is not positive.
+    G4PhotonHit(G4double energy, G4double time, G4ThreeVector position, G4ThreeVector direction,
+                G4ThreeVector polarization)
+        : photon()
+    {
+        photon.pos = {static_cast<float>(position.x()), static_cast<float>(position.y()),
+                      static_cast<float>(position.z())};
+        photon.time = time;
+        photon.mom = {static_cast<float>(direction.x()), static_cast<float>(direction.y()),
+                      static_cast<float>(direction.z())};
+        photon.pol = {static_cast<float>(polarization.x()), static_cast<float>(polarization.y()),
+                      static_cast<float>(polarization.z())};
+        photon.wavelength = (energy > 0) ? float(h_Planck * c_light / (energy * CLHEP::eV)) : 0.f;
+    }
+
+    void Print() override
+    {
+        G4cout << photon << G4endl;
+    }
+
+    sphoton photon = {}; // value-initialized so a default-constructed hit never holds indeterminate data
+};
+
+using G4PhotonHitsCollection = G4THitsCollection<G4PhotonHit>;
+// G4PhotonSD: records one G4PhotonHit per optical-photon step and hands each event's hits to the accumulator.
+struct G4PhotonSD : public G4VSensitiveDetector
+{
+    HitAccumulator *accumulator; // destination for the per-event hits; never deleted by this class
+
+    G4PhotonSD(G4String name, HitAccumulator *acc) : G4VSensitiveDetector(name), accumulator(acc)
+    {
+        G4String HCname = name + "_HC"; // hits-collection name registered for this detector
+        collectionName.insert(HCname);
+    }
+
+    void Initialize(G4HCofThisEvent *hce) override
+    {
+        fHitsCollection = new G4PhotonHitsCollection(SensitiveDetectorName, collectionName[0]);
+        if (fHCID < 0) // the collection ID is looked up only while it is still unset
+            fHCID = G4SDManager::GetSDMpointer()->GetCollectionID(collectionName[0]);
+        hce->AddHitsCollection(fHCID, fHitsCollection);
+    }
+
+    G4bool ProcessHits(G4Step *aStep, G4TouchableHistory *) override
+    {
+        G4Track *track = aStep->GetTrack();
+        if (track->GetDefinition() != G4OpticalPhoton::OpticalPhotonDefinition())
+            return false; // anything that is not an optical photon is ignored
+
+        G4PhotonHit *hit = new G4PhotonHit(
+            track->GetTotalEnergy(), track->GetGlobalTime(), aStep->GetPostStepPoint()->GetPosition(),
+            aStep->GetPostStepPoint()->GetMomentumDirection(), aStep->GetPostStepPoint()->GetPolarization());
+
+        fHitsCollection->insert(hit);
+        track->SetTrackStatus(fStopAndKill); // the photon is not tracked further once it has been recorded
+        return true;
+    }
+
+    void EndOfEvent(G4HCofThisEvent *) override
+    {
+        G4int num_hits = fHitsCollection->entries();
+
+        std::vector<sphoton> event_hits; // plain sphoton copies of this event's hits
+        event_hits.reserve(num_hits);
+        for (G4PhotonHit *hit : *fHitsCollection->GetVector())
+            event_hits.push_back(hit->photon);
+
+        accumulator->AddHits(event_hits); // AddHits() takes the accumulator's mutex
+    }
+
+  private:
+    G4PhotonHitsCollection *fHitsCollection = nullptr; // re-created in every Initialize() call
+    G4int fHCID = -1;                                  // -1 until the collection ID has been looked up
+};
+
+// ---- Detector Construction: reads the GDML file and attaches a G4PhotonSD per "SensDet" auxiliary ----
+// Every sensitive detector created here reports into the HitAccumulator passed to the constructor.
+struct G4OnlyDetectorConstruction : G4VUserDetectorConstruction
+{
+    G4OnlyDetectorConstruction(std::filesystem::path gdml_file, HitAccumulator *acc)
+        : gdml_file_(gdml_file), accumulator_(acc)
+    {
+    }
+
+    G4VPhysicalVolume *Construct() override
+    {
+        const auto gdml_path = gdml_file_.string();
+        parser_.Read(gdml_path, false); // NOTE(review): false presumably disables schema validation; confirm
+        return parser_.GetWorldVolume();
+    }
+
+    void ConstructSDandField() override
+    {
+        G4SDManager *sd_manager = G4SDManager::GetSDMpointer();
+
+        for (auto const &[volume, aux_list] : *parser_.GetAuxMap())
+        {
+            for (auto const &aux : aux_list)
+            {
+                if (aux.type != "SensDet")
+                    continue; // only auxiliaries of type SensDet get a detector
+
+                G4String sd_name = volume->GetName() + "_" + aux.value;
+                G4cout << "G4: Attaching SD to " << volume->GetName() << G4endl;
+                G4PhotonSD *detector = new G4PhotonSD(sd_name, accumulator_);
+                sd_manager->AddNewDetector(detector);
+                volume->SetSensitiveDetector(detector);
+            }
+        }
+    }
+
+  private:
+    std::filesystem::path gdml_file_;
+    G4GDMLParser parser_;
+    HitAccumulator *accumulator_;
+};
+
+// ---- Primary Generator: emits photons_per_event torch photons as the primaries of each event ----
+// Each sphoton returned by generate_photons() becomes one vertex carrying one optical-photon primary.
+struct G4OnlyPrimaryGenerator : G4VUserPrimaryGeneratorAction
+{
+    gphox::Config cfg;     // copy of the run configuration; only cfg.torch is read here
+    int photons_per_event; // batch size generated on every GeneratePrimaries() call
+
+    G4OnlyPrimaryGenerator(const gphox::Config &cfg, int photons_per_event)
+        : cfg(cfg), photons_per_event(photons_per_event)
+    {
+    }
+
+    void GeneratePrimaries(G4Event *event) override
+    {
+        // Generate this event's batch; the event ID is passed on as the event-specific seed offset
+        int event_id = event->GetEventID();
+
+        storch torch_cfg = cfg.torch;
+        torch_cfg.numphoton = photons_per_event;
+        std::vector<sphoton> batch = generate_photons(torch_cfg, photons_per_event, event_id);
+
+        for (const sphoton &src : batch)
+        {
+            // Photon energy from its wavelength (taken in nm): E = h * c / lambda
+            G4double lambda = src.wavelength;
+            G4double photon_energy = h_Planck * c_light / (lambda * nm);
+            G4ThreeVector origin(src.pos.x, src.pos.y, src.pos.z);
+            G4ThreeVector heading(src.mom.x, src.mom.y, src.mom.z);
+            G4ThreeVector efield(src.pol.x, src.pol.y, src.pol.z);
+
+            G4PrimaryParticle *primary = new G4PrimaryParticle(G4OpticalPhoton::Definition());
+            primary->SetKineticEnergy(photon_energy);
+            primary->SetMomentumDirection(heading);
+            primary->SetPolarization(efield);
+
+            G4PrimaryVertex *vtx = new G4PrimaryVertex(origin, src.time);
+            vtx->SetPrimary(primary);
+            event->AddPrimaryVertex(vtx);
+        }
+    }
+};
+
+// ---- Photon fate accumulator: final state of every photon, appended via Add() or stored by index via Set() ----
+struct PhotonFateAccumulator
+{
+    std::mutex mtx;               // guards photons in Set(), Add() and Save()
+    std::vector<sphoton> photons; // one entry per recorded photon fate
+    bool indexed = false; // true for aligned mode: store by photon index
+
+    // Opticks flag enum values
+    static constexpr unsigned TORCH = 0x0004;
+    static constexpr unsigned BULK_ABSORB = 0x0008;
+    static constexpr unsigned BULK_REEMIT = 0x0010;
+    static constexpr unsigned BULK_SCATTER = 0x0020;
+    static constexpr unsigned SURFACE_DETECT = 0x0040;
+    static constexpr unsigned SURFACE_ABSORB = 0x0080;
+    static constexpr unsigned SURFACE_DREFLECT = 0x0100;
+    static constexpr unsigned SURFACE_SREFLECT = 0x0200;
+    static constexpr unsigned BOUNDARY_REFLECT = 0x0400;
+    static constexpr unsigned BOUNDARY_TRANSMIT = 0x0800;
+    static constexpr unsigned MISS = 0x8000;
+
+    void Resize(int n) // pre-sizes the array and enables indexed mode; takes no lock, so call before the run
+    {
+        photons.resize(n);
+        indexed = true;
+    }
+
+    void Set(int idx, const sphoton &p) // stores p at idx; out-of-range indices are silently ignored
+    {
+        std::lock_guard<std::mutex> lock(mtx); // same guard as Add()/Save(): workers may write concurrently
+        if (idx >= 0 && idx < static_cast<int>(photons.size()))
+            photons[idx] = p;
+    }
+
+    void Add(const sphoton &p) // appends p at the end (used when not in indexed mode)
+    {
+        std::lock_guard<std::mutex> lock(mtx);
+        photons.push_back(p);
+    }
+
+    void Save(const char *filename) // writes all entries as an (n, 4, 4) float array
+    {
+        std::lock_guard<std::mutex> lock(mtx);
+        int n = photons.size();
+        NP *arr = NP::Make<float>(n, 4, 4);
+        for (int i = 0; i < n; i++)
+        {
+            float *data = reinterpret_cast<float *>(&photons[i]); // one sphoton viewed as 16 floats
+            std::copy(data, data + 16, arr->values<float>() + i * 16);
+        }
+        arr->save(filename);
+        delete arr;
+        G4cout << "G4: Saved " << n << " photon fates to " << filename << G4endl;
+    }
+};
+
+// ---- Stepping Action: tracks photon fates with opticks-compatible flags ----
+// Tallies boundary statuses on every optical-photon step and records a photon's final state when its track stops.
+struct G4OnlySteppingAction : G4UserSteppingAction
+{
+    PhotonFateAccumulator *fate; // destination for the final photon states
+    bool aligned; // when true (and fate->indexed) states are stored by trackID - 1 instead of appended
+    std::map<std::string, int> proc_death_counts; // "<process>[(status)]/Kill|Alive" -> number of stopped photons
+    std::map<int, int> boundary_status_counts; // int(G4OpBoundaryProcessStatus) -> number of steps
+    std::mutex count_mtx; // guards the two maps above
+
+    G4OnlySteppingAction(PhotonFateAccumulator *f, bool aligned_ = false) : fate(f), aligned(aligned_)
+    {
+    }
+
+    ~G4OnlySteppingAction() // prints both tallies when the action is destroyed
+    {
+        std::lock_guard<std::mutex> lock(count_mtx);
+        if (!proc_death_counts.empty())
+        {
+            G4cout << "\nG4: Photon death process summary:" << G4endl;
+            for (auto &[name, count] : proc_death_counts)
+                G4cout << "  " << name << ": " << count << G4endl;
+        }
+        if (!boundary_status_counts.empty())
+        {
+            G4cout << "\nG4: OpBoundary status counts (all steps):" << G4endl;
+            const char *bnames[] = {"Undefined", // table is indexed by int(G4OpBoundaryProcessStatus)
+                                    "Transmission",
+                                    "FresnelRefraction",
+                                    "FresnelReflection",
+                                    "TotalInternalReflection",
+                                    "LambertianReflection",
+                                    "LobeReflection",
+                                    "SpikeReflection",
+                                    "BackScattering",
+                                    "Absorption",
+                                    "Detection",
+                                    "NotAtBoundary",
+                                    "SameMaterial",
+                                    "StepTooSmall",
+                                    "NoRINDEX",
+                                    "PolishedLumirrorAirReflection",
+                                    "PolishedLumirrorGlueReflection",
+                                    "PolishedAirReflection",
+                                    "PolishedTeflonAirReflection",
+                                    "PolishedTiOAirReflection",
+                                    "PolishedTyvekAirReflection",
+                                    "PolishedVM2000AirReflection",
+                                    "PolishedVM2000GlueReflection",
+                                    "EtchedLumirrorAirReflection",
+                                    "EtchedLumirrorGlueReflection",
+                                    "EtchedAirReflection",
+                                    "EtchedTeflonAirReflection",
+                                    "EtchedTiOAirReflection",
+                                    "EtchedTyvekAirReflection",
+                                    "EtchedVM2000AirReflection",
+                                    "EtchedVM2000GlueReflection",
+                                    "GroundLumirrorAirReflection",
+                                    "GroundLumirrorGlueReflection",
+                                    "GroundAirReflection",
+                                    "GroundTeflonAirReflection",
+                                    "GroundTiOAirReflection",
+                                    "GroundTyvekAirReflection",
+                                    "GroundVM2000AirReflection",
+                                    "GroundVM2000GlueReflection",
+                                    "Dichroic",
+                                    "CoatedDielectricReflection",
+                                    "CoatedDielectricRefraction",
+                                    "CoatedDielectricFrustratedTransmission"};
+            for (auto &[st, count] : boundary_status_counts)
+            {
+                const char *nm = (st >= 0 && st < 43) ? bnames[st] : "?"; // 43 == number of entries in bnames
+                G4cout << "  " << nm << "(" << st << "): " << count << G4endl;
+            }
+        }
+    }
+
+    void UserSteppingAction(const G4Step *aStep) override
+    {
+        G4Track *track = aStep->GetTrack();
+        if (track->GetDefinition() != G4OpticalPhoton::OpticalPhotonDefinition())
+            return; // only optical photons are analysed
+
+        G4StepPoint *post = aStep->GetPostStepPoint();
+        G4TrackStatus status = track->GetTrackStatus();
+
+        // Find the OpBoundary process to get its status (for ALL steps)
+        G4OpBoundaryProcess *boundary = nullptr;
+        G4ProcessManager *pm = track->GetDefinition()->GetProcessManager();
+        for (int i = 0; i < pm->GetPostStepProcessVector()->entries(); i++)
+        {
+            G4VProcess *p = (*pm->GetPostStepProcessVector())[i];
+            boundary = dynamic_cast<G4OpBoundaryProcess *>(p);
+            if (boundary)
+                break; // first boundary process in the post-step list wins
+        }
+
+        G4OpBoundaryProcessStatus bStatus = boundary ? boundary->GetStatus() : Undefined;
+
+        // Count boundary status for ALL steps
+        if (boundary && bStatus != NotAtBoundary && bStatus != Undefined && bStatus != StepTooSmall)
+        {
+            std::lock_guard<std::mutex> lock(count_mtx);
+            boundary_status_counts[int(bStatus)]++;
+        }
+
+        // Only record photon state when the photon is about to die
+        if (status != fStopAndKill && status != fStopButAlive)
+            return;
+
+        // Identify the process
+        const G4VProcess *proc = post->GetProcessDefinedStep();
+        G4String procName = proc ? proc->GetProcessName() : "Unknown";
+
+        // Build detailed key for counting
+        std::string key = procName;
+        if (procName == "OpBoundary" && boundary)
+            key += "(" + std::to_string(int(bStatus)) + ")";
+        key += (status == fStopAndKill ? "/Kill" : "/Alive");
+
+        {
+            std::lock_guard<std::mutex> lock(count_mtx); // scoped so the lock is released before the flag mapping
+            proc_death_counts[key]++;
+        }
+
+        // Map to opticks flag
+        unsigned flag = 0;
+
+        if (procName == "OpAbsorption")
+        {
+            flag = PhotonFateAccumulator::BULK_ABSORB;
+        }
+        else if (procName == "OpWLS")
+        {
+            flag = PhotonFateAccumulator::BULK_REEMIT;
+        }
+        else if (procName == "OpBoundary" && boundary)
+        {
+            switch (bStatus)
+            {
+            case Detection:
+                flag = PhotonFateAccumulator::SURFACE_DETECT;
+                break;
+            case Absorption:
+                flag = PhotonFateAccumulator::SURFACE_ABSORB;
+                break;
+            case FresnelReflection:
+            case TotalInternalReflection:
+                flag = PhotonFateAccumulator::BOUNDARY_REFLECT;
+                break;
+            case FresnelRefraction:
+                flag = PhotonFateAccumulator::BOUNDARY_TRANSMIT;
+                break;
+            case LambertianReflection:
+            case LobeReflection:
+                flag = PhotonFateAccumulator::SURFACE_DREFLECT;
+                break;
+            case SpikeReflection:
+                flag = PhotonFateAccumulator::SURFACE_SREFLECT;
+                break;
+            case BackScattering:
+                flag = PhotonFateAccumulator::SURFACE_DREFLECT;
+                break;
+            default: // every other boundary status is reported as a surface absorption
+                flag = PhotonFateAccumulator::SURFACE_ABSORB;
+                break;
+            }
+        }
+        else if (procName == "Transportation")
+        {
+            // Check if an SD killed this photon (SURFACE_DETECT)
+            G4StepPoint *pre = aStep->GetPreStepPoint();
+            G4VPhysicalVolume *preVol = pre->GetPhysicalVolume();
+            G4VPhysicalVolume *postVol = post->GetPhysicalVolume();
+            G4LogicalVolume *preLog = preVol ? preVol->GetLogicalVolume() : nullptr;
+            G4LogicalVolume *postLog = postVol ? postVol->GetLogicalVolume() : nullptr;
+            bool sd_pre = preLog && preLog->GetSensitiveDetector();
+            bool sd_post = postLog && postLog->GetSensitiveDetector();
+            if (sd_pre || sd_post) // a sensitive volume on either side of the step counts as detection
+                flag = PhotonFateAccumulator::SURFACE_DETECT;
+            else
+                flag = PhotonFateAccumulator::BOUNDARY_TRANSMIT;
+        }
+
+        if (flag == 0)
+            flag = PhotonFateAccumulator::MISS; // catch-all
+
+        // Build sphoton with the final state
+        G4ThreeVector pos = post->GetPosition();
+        G4ThreeVector mom = post->GetMomentumDirection();
+        G4ThreeVector pol = post->GetPolarization();
+        G4double time = post->GetGlobalTime();
+        G4double energy = post->GetTotalEnergy();
+
+        sphoton p = {};
+        p.pos = {float(pos.x()), float(pos.y()), float(pos.z())};
+        p.time = float(time);
+        p.mom = {float(mom.x()), float(mom.y()), float(mom.z())};
+        p.pol = {float(pol.x()), float(pol.y()), float(pol.z())};
+        p.wavelength = (energy > 0) ? float(h_Planck * c_light / (energy * CLHEP::eV)) : 0.f; // 0 when E <= 0
+
+        p.orient_boundary_flag = flag & 0xFFFF;
+        p.flagmask = flag;
+
+        if (aligned && fate->indexed) // NOTE(review): index ignores the event ID; confirm behaviour for >1 event
+        {
+            int photon_idx = track->GetTrackID() - 1; // G4 trackIDs are 1-based
+            fate->Set(photon_idx, p);
+        }
+        else
+        {
+            fate->Add(p);
+        }
+    }
+};
+
+// ---- Tracking Action: selects the U4Random sequence index around each optical-photon track ----
+// The index is trackID - 1 while a photon is tracked and is reset to -1 when its tracking ends.
+struct G4OnlyTrackingAction : G4UserTrackingAction
+{
+    void PreUserTrackingAction(const G4Track *track) override
+    {
+        const bool is_photon = track->GetDefinition() == G4OpticalPhoton::OpticalPhotonDefinition();
+        if (is_photon)
+            U4Random::SetSequenceIndex(track->GetTrackID() - 1); // G4 trackIDs are 1-based
+        // any other particle leaves the sequence index untouched
+    }
+
+    void PostUserTrackingAction(const G4Track *track) override
+    {
+        const bool is_photon = track->GetDefinition() == G4OpticalPhoton::OpticalPhotonDefinition();
+        if (is_photon)
+            U4Random::SetSequenceIndex(-1);
+    }
+};
+
+// ---- AlignedOpticalPhysics: uses Shim processes for precise RNILL matching ----
+// Adds absorption, Rayleigh, boundary and WLS to the optical photon as discrete processes, in that order.
+struct AlignedOpticalPhysics : G4VPhysicsConstructor
+{
+    AlignedOpticalPhysics() : G4VPhysicsConstructor("AlignedOptical")
+    {
+    }
+    void ConstructParticle() override // intentionally empty: no particle is constructed here
+    {
+    }
+    void ConstructProcess() override
+    {
+        auto *pm = G4OpticalPhoton::OpticalPhoton()->GetProcessManager();
+        pm->AddDiscreteProcess(new ShimG4OpAbsorption()); // Shim variant instead of the stock absorption process
+        pm->AddDiscreteProcess(new ShimG4OpRayleigh());   // Shim variant instead of the stock Rayleigh process
+        pm->AddDiscreteProcess(new G4OpBoundaryProcess());
+        pm->AddDiscreteProcess(new G4OpWLS());
+    }
+};
+
+// ---- Event Action: prints a progress line for selected events ----
+// A line is printed for event ID 0, whenever (ID + 1) is a multiple of 10, and for the last event.
+struct G4OnlyEventAction : G4UserEventAction
+{
+    int total_events; // denominator shown in the progress message
+
+    G4OnlyEventAction(int total_events) : total_events(total_events)
+    {
+    }
+
+    void EndOfEventAction(const G4Event *event) override
+    {
+        const int ordinal = event->GetEventID() + 1; // value shown to the user: event ID plus one
+        if (ordinal == 1 || ordinal % 10 == 0 || ordinal == total_events)
+            G4cout << "G4: Event " << ordinal << "/" << total_events << G4endl;
+    }
+};
+
+// ---- Run Action: writes the accumulated hits (and photon fates, if tracked) when the run ends ----
+// Output happens only on the master thread, or when the application is not multithreaded at all.
+struct G4OnlyRunAction : G4UserRunAction
+{
+    HitAccumulator *accumulator; // saved to g4_hits.npy
+    PhotonFateAccumulator *fate; // optional; saved to g4_photon.npy when non-null
+
+    G4OnlyRunAction(HitAccumulator *acc, PhotonFateAccumulator *f = nullptr) : accumulator(acc), fate(f)
+    {
+    }
+
+    void EndOfRunAction(const G4Run *) override
+    {
+        const bool should_write = G4Threading::IsMasterThread() || !G4Threading::IsMultithreadedApplication();
+        if (!should_write)
+            return;
+
+        G4cout << "G4: Total accumulated hits: " << accumulator->hits.size() << G4endl;
+        accumulator->Save("g4_hits.npy");
+        if (fate == nullptr)
+            return;
+        G4cout << "G4: Total photon fates: " << fate->photons.size() << G4endl;
+        fate->Save("g4_photon.npy");
+    }
+};
+
+// ---- Action Initialization (required for MT) ----
+// Creates the user actions: only a run action in BuildForMaster(), the full set in Build().
+struct G4OnlyActionInitialization : G4VUserActionInitialization
+{
+    gphox::Config cfg;           // handed to each G4OnlyPrimaryGenerator
+    HitAccumulator *accumulator; // shared hit store passed to the run actions
+    PhotonFateAccumulator *fate; // shared fate store passed to the run and stepping actions
+    int photons_per_event;       // batch size for the primary generator
+    int num_events;              // total shown by the event action's progress output
+    bool aligned;                // enables indexed fate storage and the tracking action
+
+    G4OnlyActionInitialization(const gphox::Config &cfg, HitAccumulator *acc, PhotonFateAccumulator *f,
+                               int photons_per_event, int num_events, bool aligned_ = false)
+        : cfg(cfg), accumulator(acc), fate(f), photons_per_event(photons_per_event), num_events(num_events),
+          aligned(aligned_)
+    {
+    }
+
+    void BuildForMaster() const override
+    {
+        SetUserAction(new G4OnlyRunAction(accumulator, fate)); // the master only needs the run action
+    }
+
+    void Build() const override
+    {
+        SetUserAction(new G4OnlyPrimaryGenerator(cfg, photons_per_event));
+        SetUserAction(new G4OnlyEventAction(num_events));
+        SetUserAction(new G4OnlyRunAction(accumulator, fate));
+        SetUserAction(new G4OnlySteppingAction(fate, aligned));
+        if (aligned) // the tracking action (U4Random sequence index) is installed in aligned mode only
+            SetUserAction(new G4OnlyTrackingAction());
+    }
+};
diff --git a/src/config.cpp b/src/config.cpp
index 844244017..8c898cdce 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -6,15 +6,16 @@
 #include <sys/stat.h>
 #include <vector>
 
-#include <nlohmann/json.hpp>
 #include <cuda_runtime.h>
+#include <nlohmann/json.hpp>
 
 #include "sysrap/SEventConfig.hh"
 
 #include "config.h"
 #include "config_path.h"
 
-namespace gphox {
+namespace gphox
+{
 
 using namespace std;
 
@@ -28,10 +29,9 @@ bool FileExists(const std::string &path)
     return std::filesystem::exists(path, ec) && !ec;
 }
 
-Config::Config(std::string config_name) :
-  name{std::getenv("GPHOX_CONFIG") ? std::getenv("GPHOX_CONFIG") : config_name}
+Config::Config(std::string config_name) : name{std::getenv("GPHOX_CONFIG") ? std::getenv("GPHOX_CONFIG") : config_name}
 {
-  ReadConfig(Locate(name + ".json"));
+    ReadConfig(Locate(name + ".json"));
 }
 
 std::string Config::PtxPath(const std::string &ptx_name)
@@ -54,93 +54,96 @@ std::string Config::PtxPath(const std::string &ptx_name)
 
 std::string Config::Locate(std::string filename) const
 {
-  std::vector<std::string> search_paths;
+    std::vector<std::string> search_paths;
+
+    const std::string user_dir{std::getenv("GPHOX_CONFIG_DIR") ? std::getenv("GPHOX_CONFIG_DIR") : ""};
 
-  const std::string user_dir{std::getenv("GPHOX_CONFIG_DIR") ? std::getenv("GPHOX_CONFIG_DIR") : ""};
+    if (user_dir.empty())
+    {
+        std::string paths(GPHOX_CONFIG_SEARCH_PATHS);
 
-  if (user_dir.empty())
-  {
-    std::string paths(GPHOX_CONFIG_SEARCH_PATHS);
+        size_t last = 0;
+        size_t next = 0;
+        while ((next = paths.find(':', last)) != std::string::npos)
+        {
+            search_paths.push_back(paths.substr(last, next - last));
+            last = next + 1;
+        }
 
-    size_t last = 0;
-    size_t next = 0;
-    while ((next = paths.find(':', last)) != std::string::npos)
+        search_paths.push_back(paths.substr(last));
+    }
+    else
     {
-      search_paths.push_back(paths.substr(last, next-last));
-      last = next + 1;
+        search_paths.push_back(user_dir);
     }
 
-    search_paths.push_back(paths.substr(last));
-  }
-  else
-  {
-    search_paths.push_back(user_dir);
-  }
-
-  struct stat buffer;
-  std::string filepath{""};
-  for (std::string path : search_paths)
-  {
-    std::string fpath{path + "/" + filename};
-    if (stat(fpath.c_str(), &buffer) == 0)
+    struct stat buffer;
+    std::string filepath{""};
+    for (std::string path : search_paths)
     {
-      filepath = fpath;
-      break;
+        std::string fpath{path + "/" + filename};
+        if (stat(fpath.c_str(), &buffer) == 0)
+        {
+            filepath = fpath;
+            break;
+        }
     }
-  }
 
-  if (filepath.empty())
-  {
-    std::string errmsg{"Could not find config file \"" + filename + "\" in "};
-    for (std::string path : search_paths) errmsg += (path + ":");
-    throw std::runtime_error(errmsg);
-  }
+    if (filepath.empty())
+    {
+        std::string errmsg{"Could not find config file \"" + filename + "\" in "};
+        for (std::string path : search_paths)
+            errmsg += (path + ":");
+        throw std::runtime_error(errmsg);
+    }
 
-  return filepath;
+    return filepath;
 }
 
-
 /**
  * Expects a valid filepath.
  */
 void Config::ReadConfig(std::string filepath)
 {
-  nlohmann::json json;
-
-  try {
-    std::ifstream ifs(filepath);
-    ifs >> json;
-
-    nlohmann::json torch_ = json["torch"];
-
-    torch = {
-      .gentype = OpticksGenstep_::Type(torch_["gentype"]),
-      .trackid = torch_["trackid"],
-      .matline = torch_["matline"],
-      .numphoton = torch_["numphoton"],
-      .pos = make_float3(torch_["pos"][0], torch_["pos"][1], torch_["pos"][2]),
-      .time = torch_["time"],
-      .mom = normalize(make_float3(torch_["mom"][0], torch_["mom"][1], torch_["mom"][2])),
-      .weight = torch_["weight"],
-      .pol = make_float3(torch_["pol"][0], torch_["pol"][1], torch_["pol"][2]),
-      .wavelength = torch_["wavelength"],
-      .zenith = make_float2(torch_["zenith"][0], torch_["zenith"][1]),
-      .azimuth = make_float2(torch_["azimuth"][0], torch_["azimuth"][1]),
-      .radius = torch_["radius"],
-      .distance = torch_["distance"],
-      .mode = torch_["mode"],
-      .type = storchtype::Type(torch_["type"])
-    };
-
-    nlohmann::json event_ = json["event"];
-
-    SEventConfig::SetEventMode( string(event_["mode"]).c_str() );
-    SEventConfig::SetMaxSlot( event_["maxslot"] );
-  }
-  catch (nlohmann::json::exception& e) {
-    std::string errmsg{"Failed reading config parameters from " + filepath + "\n" + e.what()};
-    throw std::runtime_error{errmsg};
-  }
-}
+    nlohmann::json json;
 
+    try
+    {
+        std::ifstream ifs(filepath);
+        ifs >> json;
+
+        nlohmann::json torch_ = json["torch"];
+
+        torch = {.gentype = OpticksGenstep_::Type(torch_["gentype"]),
+                 .trackid = torch_["trackid"],
+                 .matline = torch_["matline"],
+                 .numphoton = torch_["numphoton"],
+                 .pos = make_float3(torch_["pos"][0], torch_["pos"][1], torch_["pos"][2]),
+                 .time = torch_["time"],
+                 .mom = normalize(make_float3(torch_["mom"][0], torch_["mom"][1], torch_["mom"][2])),
+                 .weight = torch_["weight"],
+                 .pol = make_float3(torch_["pol"][0], torch_["pol"][1], torch_["pol"][2]),
+                 .wavelength = torch_["wavelength"],
+                 .zenith = make_float2(torch_["zenith"][0], torch_["zenith"][1]),
+                 .azimuth = make_float2(torch_["azimuth"][0], torch_["azimuth"][1]),
+                 .radius = torch_["radius"],
+                 .distance = torch_["distance"],
+                 .mode = torch_["mode"],
+                 .type = storchtype::Type(torch_["type"])};
+
+        nlohmann::json event_ = json["event"];
+
+        SEventConfig::SetEventMode(string(event_["mode"]).c_str());
+        SEventConfig::SetMaxSlot(event_["maxslot"]);
+
+        if (event_.contains("savephotonhistory"))
+            savephotonhistory = event_["savephotonhistory"].get<bool>();
+    }
+    catch (nlohmann::json::exception &e)
+    {
+        std::string errmsg{"Failed reading config parameters from " + filepath + "\n" + e.what()};
+        throw std::runtime_error{errmsg};
+    }
 }
+
+} // namespace gphox
diff --git a/src/config.h b/src/config.h
index 1fc5c838d..28e26416c 100644
--- a/src/config.h
+++ b/src/config.h
@@ -7,29 +7,29 @@
 #include "sysrap/srng.h"
 #include "sysrap/storch.h"
 
-namespace gphox {
-
+namespace gphox
+{
 
 /**
  * Provides access to all configuration types and data.
  */
 class Config
 {
- public:
-
-  Config(std::string config_name = "dev");
+  public:
+    Config(std::string config_name = "dev");
 
-  static std::string PtxPath(const std::string &ptx_name = "CSGOptiX7.ptx");
+    static std::string PtxPath(const std::string &ptx_name = "CSGOptiX7.ptx");
 
-  /// A unique name associated with this Config
-  std::string name;
+    /// A unique name associated with this Config
+    std::string name;
 
-  storch torch;
+    storch torch;
 
- private:
+    bool savephotonhistory{false};
 
-  std::string Locate(std::string filename) const;
-  void ReadConfig(std::string filepath);
+  private:
+    std::string Locate(std::string filename) const;
+    void ReadConfig(std::string filepath);
 };
 
-}
+} // namespace gphox
diff --git a/sysrap/SEventConfig.cc b/sysrap/SEventConfig.cc
index c37de3632..b3d95633d 100644
--- a/sysrap/SEventConfig.cc
+++ b/sysrap/SEventConfig.cc
@@ -776,7 +776,7 @@ void SEventConfig::LIMIT_Check()
    //assert( _MaxBounce >= 0 && _MaxBounce <  LIMIT ) ;
    // MaxBounce should not in principal be limited
 
-   assert( _MaxRecord >= 0 && _MaxRecord <= RecordLimit() ) ;
+   assert(_MaxRecord >= 0); // RecordLimit relaxed to allow large record arrays for step analysis
    assert( _MaxRec    >= 0 && _MaxRec    <= RecordLimit() ) ;
    assert( _MaxPrd    >= 0 && _MaxPrd    <= RecordLimit() ) ;
 
@@ -1590,7 +1590,8 @@ void SEventConfig::Initialize_Comp_Simulate_(unsigned& gather_mask, unsigned& sa
         else if(IsDebugLite())
         {
             SEventConfig::SetMaxRec(0);
-            SEventConfig::SetMaxRecord(record_limit);
+            int env_max_record = ssys::getenvint(kMaxRecord, 0);
+            SEventConfig::SetMaxRecord(env_max_record > 0 ? env_max_record : record_limit);
             SEventConfig::SetMaxSeq(1);  // formerly incorrectly set to max_bounce+1
         }
 
diff --git a/sysrap/snam.h b/sysrap/snam.h
index 1f2e49f57..4dd713e9f 100644
--- a/sysrap/snam.h
+++ b/sysrap/snam.h
@@ -16,6 +16,10 @@ struct snam
     static constexpr const char* OPTICAL = "optical.npy" ;
     static constexpr const char* ICDF = "icdf.npy" ;
 
+    static constexpr const char *WLS_ICDF = "wls_icdf.npy";
+    static constexpr const char *WLS_MAT_MAP = "wls_mat_map.npy";
+    static constexpr const char *WLS_TIME_CONSTANTS = "wls_time_constants.npy";
+
     static constexpr const char* MULTIFILM = "multifilm.npy" ;
     static constexpr const char* PROPCOM = "propcom.npy" ;
 
diff --git a/sysrap/sproplist.h b/sysrap/sproplist.h
index deeffa02a..9ca1b3239 100644
--- a/sysrap/sproplist.h
+++ b/sysrap/sproplist.h
@@ -3,25 +3,25 @@
 sproplist.h
 ===================
 
-For MATERIAL the property default constants 
+For MATERIAL the property default constants
 are taken from  GMaterialLib::defineDefaults
 
 For SURFACE setting the prop values::
 
     (detect, absorb, reflect_specular, reflect_diffuse
 
-requires access to optical surface type, 
-if not already present need to add metadata 
-to the surface NPFold/NP to carry that info.   
+requires access to optical surface type,
+if not already present need to add metadata
+to the surface NPFold/NP to carry that info.
 
 
-Nov 1 2023 : Increase default ABSLENGTH RAYLEIGH 1e6 -> 1e12 mm 
+Nov 1 2023 : Increase default ABSLENGTH RAYLEIGH 1e6 -> 1e12 mm
 ------------------------------------------------------------------
 
 Increase default ABSLENGTH RAYLEIGH from 1e6 to 1e12
 due to notes/issues/G4CXTest_raindrop_shakedown.rst
-This is relevant to simple tests where it is common 
-not to define ABSLENGTH and RAYLEIGH properties. 
+This is relevant to simple tests where it is common
+not to define ABSLENGTH and RAYLEIGH properties.
 
 **/
 
@@ -29,19 +29,19 @@ not to define ABSLENGTH and RAYLEIGH properties.
 
 struct sproplist
 {
-    static constexpr const char* MATERIAL = R"(
+    static constexpr const char *MATERIAL = R"(
     0 0 RINDEX          1
     0 1 ABSLENGTH       1e12
     0 2 RAYLEIGH        1e12
     0 3 REEMISSIONPROB  0.
     1 0 GROUPVEL        299.792458
-    1 1 SPARE11         0.
+    1 1 WLSABSLENGTH    1e12
     1 2 SPARE12         0.
     1 3 SPARE13         0.
-    )" ;
-    // default GROUPVEL set to c_light_mm_per_ns, see U4PhysicalConstants.h 
+    )";
+    // default GROUPVEL set to c_light_mm_per_ns, see U4PhysicalConstants.h
 
-    static constexpr const char* SURFACE = R"(
+    static constexpr const char *SURFACE = R"(
     0 0 EFFICIENCY      -2
     0 1 SPARE01         -2
     0 2 REFLECTIVITY    -2
@@ -50,48 +50,46 @@ struct sproplist
     1 1 SPARE11         -2
     1 2 SPARE12         -2
     1 3 SPARE13         -2
-    )" ;   
+    )";
 
-    static const sproplist* Material() ; 
-    static const sproplist* Surface() ; 
+    static const sproplist *Material();
+    static const sproplist *Surface();
 
-    std::vector<sprop> PROP ; 
-    sproplist(const char* spec ); 
+    std::vector<sprop> PROP;
+    sproplist(const char *spec);
 
-    std::string desc() const ; 
-    void getNames(std::vector<std::string>& pnames, const char* skip_prefix="SPARE") const ; 
-    const sprop* findProp(const char* pname) const ; 
-    const sprop* get(int g, int p) const ; 
+    std::string desc() const;
+    void getNames(std::vector<std::string> &pnames, const char *skip_prefix = "SPARE") const;
+    const sprop *findProp(const char *pname) const;
+    const sprop *get(int g, int p) const;
 };
 
-inline const sproplist* sproplist::Material() // static
+inline const sproplist *sproplist::Material() // static
 {
-    return new sproplist(MATERIAL) ; 
+    return new sproplist(MATERIAL);
 }
-inline const sproplist* sproplist::Surface() // static
+inline const sproplist *sproplist::Surface() // static
 {
-    return new sproplist(SURFACE) ; 
+    return new sproplist(SURFACE);
 }
 
-inline sproplist::sproplist(const char* spec)
+inline sproplist::sproplist(const char *spec)
 {
-    sprop::Parse(PROP, spec); 
+    sprop::Parse(PROP, spec);
 }
-inline std::string sproplist::desc() const 
+inline std::string sproplist::desc() const
 {
-    return sprop::Desc(PROP); 
+    return sprop::Desc(PROP);
 }
-inline void sproplist::getNames(std::vector<std::string>& pnames, const char* skip_prefix ) const 
+inline void sproplist::getNames(std::vector<std::string> &pnames, const char *skip_prefix) const
 {
-    sprop::GetNames(pnames, PROP, skip_prefix);  
+    sprop::GetNames(pnames, PROP, skip_prefix);
 }
-inline const sprop* sproplist::findProp(const char* pname) const 
+inline const sprop *sproplist::findProp(const char *pname) const
 {
-    return sprop::FindProp(PROP, pname); 
+    return sprop::FindProp(PROP, pname);
 }
-inline const sprop* sproplist::get(int g, int v) const 
+inline const sprop *sproplist::get(int g, int v) const
 {
-    return sprop::Find(PROP, g, v) ; 
+    return sprop::Find(PROP, g, v);
 }
-
-
diff --git a/sysrap/sstandard.h b/sysrap/sstandard.h
index 1c62d6f0c..c14370a66 100644
--- a/sysrap/sstandard.h
+++ b/sysrap/sstandard.h
@@ -77,92 +77,70 @@ In the old X4/GGeo workflow, the bnd buffer was created with::
 
 **/
 
-#include <limits>
 #include <array>
 #include <csignal>
+#include <limits>
 
 #include "NPFold.h"
 #include "NPX.h"
-#include "sproplist.h"
 #include "sdomain.h"
 #include "smatsur.h"
 #include "snam.h"
+#include "sproplist.h"
 
 struct sstandard
 {
-    static constexpr const bool VERBOSE = false ;
-    static constexpr const char* IMPLICIT_PREFIX = "Implicit_RINDEX_NoRINDEX" ;
-    const sdomain* dom ;
+    static constexpr const bool VERBOSE = false;
+    static constexpr const char *IMPLICIT_PREFIX = "Implicit_RINDEX_NoRINDEX";
+    const sdomain *dom;
 
-    const NP* wavelength ;
-    const NP* energy ;
-    const NP* rayleigh ;
-    const NP* mat ;
-    const NP* sur ;
-    const NP* bd ;
-    const NP* bnd ;
-    const NP* optical ;
+    const NP *wavelength;
+    const NP *energy;
+    const NP *rayleigh;
+    const NP *mat;
+    const NP *sur;
+    const NP *bd;
+    const NP *bnd;
+    const NP *optical;
 
-    const NP* icdf ;
+    const NP *icdf;
 
+    const NP *wls_icdf; // wavelength-shifting (WLS) arrays: null from ctor, not set by deferred_init
+    const NP *wls_mat_map;
+    const NP *wls_time_constants;
 
     sstandard();
 
-    void deferred_init(
-        const std::vector<int4>& vbd,
-        const std::vector<std::string>& bdname,
-        const std::vector<std::string>& suname,
-        const NPFold* surface
-    );
-
-    NPFold* serialize() const ;
-    void import(const NPFold* fold );
-
-    void save(const char* base, const char* rel );
-    void load(const char* base, const char* rel );
-
-
-    static NP* make_bd(
-        const std::vector<int4>& vbd,
-        const std::vector<std::string>& bdname
-    );
-
-    static NP* make_optical(
-        const std::vector<int4>& vbd,
-        const std::vector<std::string>& suname,
-        const NPFold* surface
-    );
-
-    static NP* make_bnd(
-        const std::vector<int4>& vbd,
-        const std::vector<std::string>& bdname,
-        const NP* mat,
-        const NP* sur
-    );
-
-    static void column_range(int4& mn, int4& mx,  const std::vector<int4>& vbd) ;
-    static NP* unused_mat(const std::vector<std::string>& names, const NPFold* fold );
-    static NP* unused_sur(const std::vector<std::string>& names, const NPFold* fold );
-    static NP* unused_create(const sproplist* pl,  const std::vector<std::string>& names, const NPFold* fold );
-};
+    void deferred_init(const std::vector<int4> &vbd, const std::vector<std::string> &bdname,
+                       const std::vector<std::string> &suname, const NPFold *surface);
+
+    NPFold *serialize() const;
+    void import(const NPFold *fold);
+
+    void save(const char *base, const char *rel);
+    void load(const char *base, const char *rel);
+
+    static NP *make_bd(const std::vector<int4> &vbd, const std::vector<std::string> &bdname);
 
+    static NP *make_optical(const std::vector<int4> &vbd, const std::vector<std::string> &suname,
+                            const NPFold *surface);
+
+    static NP *make_bnd(const std::vector<int4> &vbd, const std::vector<std::string> &bdname, const NP *mat,
+                        const NP *sur);
+
+    static void column_range(int4 &mn, int4 &mx, const std::vector<int4> &vbd);
+    static NP *unused_mat(const std::vector<std::string> &names, const NPFold *fold);
+    static NP *unused_sur(const std::vector<std::string> &names, const NPFold *fold);
+    static NP *unused_create(const sproplist *pl, const std::vector<std::string> &names, const NPFold *fold);
+};
 
 inline sstandard::sstandard()
-    :
-    dom(nullptr),
-    wavelength(nullptr),
-    energy(nullptr),
-    rayleigh(nullptr),
-    mat(nullptr),
-    sur(nullptr),
-    bd(nullptr),
-    bnd(nullptr),
-    optical(nullptr),
-    icdf(nullptr)
+    : dom(nullptr), wavelength(nullptr), energy(nullptr), rayleigh(nullptr), mat(nullptr), sur(nullptr), bd(nullptr),
+      bnd(nullptr), optical(nullptr), icdf(nullptr), wls_icdf(nullptr), wls_mat_map(nullptr),
+      wls_time_constants(nullptr)
 {
 }
 
-
 /**
 sstandard::deferred_init
 --------------------------
@@ -176,45 +154,44 @@ after mat and sur have been filled.
 
 **/
 
-inline void sstandard::deferred_init(
-        const std::vector<int4>& vbd,
-        const std::vector<std::string>& bdname,
-        const std::vector<std::string>& suname,
-        const NPFold* surface
-    )
+inline void sstandard::deferred_init(const std::vector<int4> &vbd, const std::vector<std::string> &bdname,
+                                     const std::vector<std::string> &suname, const NPFold *surface)
 {
-    dom = new sdomain ;
+    dom = new sdomain;
 
-    wavelength = dom->get_wavelength_nm() ;
-    energy = dom->get_energy_eV() ;
+    wavelength = dom->get_wavelength_nm();
+    energy = dom->get_energy_eV();
 
-    bd      = make_bd(     vbd, bdname );
-    bnd     = make_bnd(    vbd, bdname, mat, sur ) ;
-    optical = make_optical(vbd, suname, surface) ;
+    bd = make_bd(vbd, bdname);
+    bnd = make_bnd(vbd, bdname, mat, sur);
+    optical = make_optical(vbd, suname, surface);
 }
 
-
-inline NPFold* sstandard::serialize() const
+inline NPFold *sstandard::serialize() const
 {
-    NPFold* fold = new NPFold ;
+    NPFold *fold = new NPFold;
+
+    fold->add(snam::WAVELENGTH, wavelength);
+    fold->add(snam::ENERGY, energy);
 
-    fold->add(snam::WAVELENGTH , wavelength );
-    fold->add(snam::ENERGY,      energy );
+    fold->add(snam::RAYLEIGH, rayleigh);
+    fold->add(snam::MAT, mat);
+    fold->add(snam::SUR, sur);
 
-    fold->add(snam::RAYLEIGH,    rayleigh );
-    fold->add(snam::MAT ,    mat );
-    fold->add(snam::SUR ,    sur );
+    fold->add(snam::BD, bd);
+    fold->add(snam::BND, bnd);
+    fold->add(snam::OPTICAL, optical);
 
-    fold->add(snam::BD,      bd );
-    fold->add(snam::BND,     bnd );
-    fold->add(snam::OPTICAL, optical );
+    fold->add(snam::ICDF, icdf);
 
-    fold->add(snam::ICDF, icdf) ;
+    fold->add(snam::WLS_ICDF, wls_icdf); // NOTE(review): WLS arrays may be nullptr here - confirm add() copes
+    fold->add(snam::WLS_MAT_MAP, wls_mat_map);
+    fold->add(snam::WLS_TIME_CONSTANTS, wls_time_constants);
 
-    return fold ;
+    return fold;
 }
 
-inline void sstandard::import(const NPFold* fold )
+inline void sstandard::import(const NPFold *fold)
 {
     wavelength = fold->get(snam::WAVELENGTH);
     energy = fold->get(snam::ENERGY);
@@ -228,21 +205,24 @@ inline void sstandard::import(const NPFold* fold )
     optical = fold->get(snam::OPTICAL);
 
     icdf = fold->get(snam::ICDF);
+
+    wls_icdf = fold->get(snam::WLS_ICDF); // NOTE(review): presumably nullptr if fold predates WLS - confirm
+    wls_mat_map = fold->get(snam::WLS_MAT_MAP);
+    wls_time_constants = fold->get(snam::WLS_TIME_CONSTANTS);
 }
 
-inline void sstandard::save(const char* base, const char* rel )
+inline void sstandard::save(const char *base, const char *rel)
 {
-    NPFold* fold = serialize();
+    NPFold *fold = serialize();
     fold->save(base, rel);
 }
 
-inline void sstandard::load(const char* base, const char* rel )
+inline void sstandard::load(const char *base, const char *rel)
 {
-    NPFold* fold = NPFold::Load(base, rel) ;
-    import(fold) ;
+    NPFold *fold = NPFold::Load(base, rel);
+    import(fold);
 }
 
-
 /**
 sstandard::make_bd
 -------------------
@@ -251,11 +231,11 @@ Create array of shape (num_bd, 4) holding int "pointers" to (omat,osur,isur,imat
 
 **/
 
-inline NP* sstandard::make_bd( const std::vector<int4>& vbd, const std::vector<std::string>& bdname )
+inline NP *sstandard::make_bd(const std::vector<int4> &vbd, const std::vector<std::string> &bdname)
 {
-    NP* a_bd = NPX::ArrayFromVec<int, int4>( vbd );
-    a_bd->set_names( bdname );
-    return a_bd ;
+    NP *a_bd = NPX::ArrayFromVec<int, int4>(vbd);
+    a_bd->set_names(bdname);
+    return a_bd;
 }
 
 /**
@@ -308,96 +288,96 @@ that via the ems smatsur.h enum value.
 
 **/
 
-inline NP* sstandard::make_optical(
-     const std::vector<int4>& vbd,
-     const std::vector<std::string>& suname,
-     const NPFold* surface )
+inline NP *sstandard::make_optical(const std::vector<int4> &vbd, const std::vector<std::string> &suname,
+                                   const NPFold *surface)
 {
-    int ni = vbd.size() ;
-    int nj = 4 ;
-    int nk = 4 ;
+    int ni = vbd.size();
+    int nj = 4;
+    int nk = 4;
 
-    NP* op = NP::Make<int>(ni, nj, nk);
-    int* op_v = op->values<int>();
+    NP *op = NP::Make<int>(ni, nj, nk);
+    int *op_v = op->values<int>();
 
-    for(int i=0 ; i < ni ; i++)       // over vbd
+    for (int i = 0; i < ni; i++) // over vbd
     {
-        const int4& bd_ = vbd[i] ;
-        for(int j=0 ; j < nj ; j++)   // over (omat,osur,isur,imat)
+        const int4 &bd_ = vbd[i];
+        for (int j = 0; j < nj; j++) // over (omat,osur,isur,imat)
         {
-            int op_index = i*nj*nk + j*nk ;
+            int op_index = i * nj * nk + j * nk;
 
-            int idx = -2 ;
-            switch(j)
+            int idx = -2;
+            switch (j)
             {
-                case 0: idx = bd_.x ; break ;
-                case 1: idx = bd_.y ; break ;
-                case 2: idx = bd_.z ; break ;
-                case 3: idx = bd_.w ; break ;
+            case 0:
+                idx = bd_.x;
+                break;
+            case 1:
+                idx = bd_.y;
+                break;
+            case 2:
+                idx = bd_.z;
+                break;
+            case 3:
+                idx = bd_.w;
+                break;
             }
-            int idx1 = idx+1 ;  // 1-based idx
-            bool is_mat = j == 0 || j == 3 ;
-            bool is_sur = j == 1 || j == 2 ;
+            int idx1 = idx + 1; // 1-based idx
+            bool is_mat = j == 0 || j == 3;
+            bool is_sur = j == 1 || j == 2;
 
-            if(is_mat)
+            if (is_mat)
             {
-                assert( idx > -1 );   // omat,imat must always be present
-                op_v[op_index+0] = idx1 ;
-                op_v[op_index+1] = 0 ;
-                op_v[op_index+2] = 0 ;
-                op_v[op_index+3] = 0 ;
+                assert(idx > -1); // omat,imat must always be present
+                op_v[op_index + 0] = idx1;
+                op_v[op_index + 1] = 0;
+                op_v[op_index + 2] = 0;
+                op_v[op_index + 3] = 0;
             }
-            else if(is_sur)
+            else if (is_sur)
             {
-                const char* surfname = snam::get(suname, idx) ;
-
-                bool no_surfname_for_surface_idx = idx > -1 && surfname == nullptr ;
-
-                if(no_surfname_for_surface_idx) std::cerr
-                    << "sstandard::make_optical"
-                    << " ERROR "
-                    << " no_surfname_for_surface_idx " << ( no_surfname_for_surface_idx ? "YES" : "NO " )
-                    << " sur idx from bd " << idx
-                    << " but no corresponding surfname "
-                    << " suname.size " << suname.size()
-                    << " surface.subfold.size " << surface->subfold.size()
-                    << " surface.ff.size " << surface->ff.size()
-                    << "\n"
-                    << " snam::Desc(suname)\n"
-                    << snam::Desc(suname)
-                    << "\n"
-                    ;
-
-                if(idx > -1 ) assert(surfname) ;
+                const char *surfname = snam::get(suname, idx);
+
+                bool no_surfname_for_surface_idx = idx > -1 && surfname == nullptr;
+
+                if (no_surfname_for_surface_idx)
+                    std::cerr << "sstandard::make_optical" << " ERROR " << " no_surfname_for_surface_idx "
+                              << (no_surfname_for_surface_idx ? "YES" : "NO ") << " sur idx from bd " << idx
+                              << " but no corresponding surfname " << " suname.size " << suname.size()
+                              << " surface.subfold.size " << surface->subfold.size() << " surface.ff.size "
+                              << surface->ff.size() << "\n"
+                              << " snam::Desc(suname)\n"
+                              << snam::Desc(suname) << "\n";
+
+                if (idx > -1)
+                    assert(surfname);
                 // all surf should have name, do not always have surf
 
-                NPFold* surf = surfname ? surface->get_subfold(surfname) : nullptr ;
-                bool is_implicit = surfname && strncmp(surfname, IMPLICIT_PREFIX, strlen(IMPLICIT_PREFIX) ) == 0 ;
-                int Type = -2 ;
-                int Finish = -2 ;
-                int ModelValuePercent = -2 ;
-                std::string OSN = "-" ;
+                NPFold *surf = surfname ? surface->get_subfold(surfname) : nullptr;
+                bool is_implicit = surfname && strncmp(surfname, IMPLICIT_PREFIX, strlen(IMPLICIT_PREFIX)) == 0;
+                int Type = -2;
+                int Finish = -2;
+                int ModelValuePercent = -2;
+                std::string OSN = "-";
 
-                if( is_implicit )
+                if (is_implicit)
                 {
-                    assert( surf == nullptr ) ;  // not expecting to find surf for implicits
-                    Type = 1 ;
-                    Finish = 1 ;
-                    ModelValuePercent = 100 ;  // placeholders to match old_optical ones
-                    OSN = "X" ;  // Implicits classified as ordinary Surface as they have bnd/sur entries
+                    assert(surf == nullptr); // not expecting to find surf for implicits
+                    Type = 1;
+                    Finish = 1;
+                    ModelValuePercent = 100; // placeholders to match old_optical ones
+                    OSN = "X";               // Implicits classified as ordinary Surface as they have bnd/sur entries
                 }
                 else
                 {
-                    int missing = 0 ;  // -2 better, but use 0 to match old_optical
-                    Type              = surf ? surf->get_meta<int>("Type",-1) : missing ;
-                    Finish            = surf ? surf->get_meta<int>("Finish", -1 ) : missing ;
-                    ModelValuePercent = surf ? int(100.*surf->get_meta<double>("ModelValue", 0.)) : missing ;
-                    OSN = surf ? surf->get_meta<std::string>("OpticalSurfaceName", "-") : "-" ;
+                    int missing = 0; // -2 better, but use 0 to match old_optical
+                    Type = surf ? surf->get_meta<int>("Type", -1) : missing;
+                    Finish = surf ? surf->get_meta<int>("Finish", -1) : missing;
+                    ModelValuePercent = surf ? int(100. * surf->get_meta<double>("ModelValue", 0.)) : missing;
+                    OSN = surf ? surf->get_meta<std::string>("OpticalSurfaceName", "-") : "-";
                 }
 
-
-                char OSN0 = *OSN.c_str() ;
-                int ems = smatsur::TypeFromChar(OSN0) ;
+                char OSN0 = *OSN.c_str();
+                int ems = smatsur::TypeFromChar(OSN0);
 
                 /**
                 HERE CAN DETECT FINISH AND ModelValuePercent THAT
@@ -405,37 +385,26 @@ inline NP* sstandard::make_optical(
                 FOR WHICH WILL NEED NEW smatsur.h enum value
                 **/
 
-                int Payload_Y = ems ;
-
-                if(VERBOSE) std::cout
-                    << " bnd:i "   << std::setw(3) << i
-                    << " sur:idx " << std::setw(3) << idx
-                    << " Type " << std::setw(2) << Type
-                    << " Finish " << std::setw(2) << Finish
-                    << " MVP " << std::setw(3) << ModelValuePercent
-                    << " surf " << ( surf ? "YES" : "NO " )
-                    << " impl " << ( is_implicit ? "YES" : "NO " )
-                    << " osn0 " << ( OSN0 == '\0' ? '0' : OSN0 )
-                    << " OSN " << OSN
-                    << " ems " << ems
-                    << " emsn " << smatsur::Name(ems)
-                    << " surfname " << ( surfname ? surfname : "-" )
-                    << std::endl
-                    ;
-
-                op_v[op_index+0] = idx1 ;
-                op_v[op_index+1] = Payload_Y ;
-                op_v[op_index+2] = Finish ;
-                op_v[op_index+3] = ModelValuePercent ;
+                int Payload_Y = ems;
+
+                if (VERBOSE)
+                    std::cout << " bnd:i " << std::setw(3) << i << " sur:idx " << std::setw(3) << idx << " Type "
+                              << std::setw(2) << Type << " Finish " << std::setw(2) << Finish << " MVP " << std::setw(3)
+                              << ModelValuePercent << " surf " << (surf ? "YES" : "NO ") << " impl "
+                              << (is_implicit ? "YES" : "NO ") << " osn0 " << (OSN0 == '\0' ? '0' : OSN0) << " OSN "
+                              << OSN << " ems " << ems << " emsn " << smatsur::Name(ems) << " surfname "
+                              << (surfname ? surfname : "-") << std::endl;
+
+                op_v[op_index + 0] = idx1;
+                op_v[op_index + 1] = Payload_Y;
+                op_v[op_index + 2] = Finish;
+                op_v[op_index + 3] = ModelValuePercent;
             }
         }
     }
-    return op ;
+    return op;
 }
 
-
-
-
 /**
 sstandard::make_bnd
 ---------------------
@@ -444,116 +413,129 @@ Form bnd array by interleaving mat and sur array entries as directed by vbd int
 
 **/
 
-inline NP* sstandard::make_bnd(
-    const std::vector<int4>& vbd,
-    const std::vector<std::string>& bdname,
-    const NP* mat,
-    const NP* sur )
+inline NP *sstandard::make_bnd(const std::vector<int4> &vbd, const std::vector<std::string> &bdname, const NP *mat,
+                               const NP *sur)
 {
-    assert( mat->shape.size() == 4 );
-    assert( sur->shape.size() == 4 );
+    assert(mat->shape.size() == 4);
+    assert(sur->shape.size() == 4);
 
-    int num_mat = mat->shape[0] ;
-    int num_sur = sur->shape[0] ;
+    int num_mat = mat->shape[0];
+    int num_sur = sur->shape[0];
 
-    for(int d=1 ; d < 4 ; d++) assert( mat->shape[d] == sur->shape[d] ) ;
+    for (int d = 1; d < 4; d++)
+        assert(mat->shape[d] == sur->shape[d]);
 
-    assert( mat->shape[1] == sprop::NUM_PAYLOAD_GRP );
-    int num_domain = mat->shape[2] ;
-    assert( mat->shape[3] == sprop::NUM_PAYLOAD_VAL );
+    assert(mat->shape[1] == sprop::NUM_PAYLOAD_GRP);
+    int num_domain = mat->shape[2];
+    assert(mat->shape[3] == sprop::NUM_PAYLOAD_VAL);
 
-    const double* mat_v = mat->cvalues<double>();
-    const double* sur_v = sur->cvalues<double>();
+    const double *mat_v = mat->cvalues<double>();
+    const double *sur_v = sur->cvalues<double>();
 
-    int num_bnd = vbd.size() ;
-    int num_bdname = bdname.size() ;
+    int num_bnd = vbd.size();
+    int num_bdname = bdname.size();
 
-    bool num_bnd_expect = num_bnd == num_bdname ;
-    if(!num_bnd_expect) std::raise(SIGINT) ;
-    assert( num_bnd_expect);
+    bool num_bnd_expect = num_bnd == num_bdname;
+    if (!num_bnd_expect)
+        std::raise(SIGINT);
+    assert(num_bnd_expect);
 
-    int4 mn ;
-    int4 mx ;
-    column_range(mn, mx, vbd );
-    if(VERBOSE) std::cout << " sstandard::bnd mn " << mn << " mx " << mx << std::endl ;
+    int4 mn;
+    int4 mx;
+    column_range(mn, mx, vbd);
+    if (VERBOSE)
+        std::cout << " sstandard::bnd mn " << mn << " mx " << mx << std::endl;
 
-    bool mat_expect = mx.x < num_mat && mx.w < num_mat ;
-    bool sur_expect = mx.y < num_sur && mx.z < num_sur ;
+    bool mat_expect = mx.x < num_mat && mx.w < num_mat;
+    bool sur_expect = mx.y < num_sur && mx.z < num_sur;
 
-    if(!mat_expect) std::raise(SIGINT);
-    if(!sur_expect) std::raise(SIGINT);
+    if (!mat_expect)
+        std::raise(SIGINT);
+    if (!sur_expect)
+        std::raise(SIGINT);
 
-    assert( mat_expect );
-    assert( sur_expect );
+    assert(mat_expect);
+    assert(sur_expect);
 
-    int ni = num_bnd ;                // ~53
-    int nj = sprop::NUM_MATSUR ;      //   4  (omat,osur,isur,imat)
-    int nk = sprop::NUM_PAYLOAD_GRP ; //   2
-    int nl = num_domain ;             // 761  fine domain
-    int nn = sprop::NUM_PAYLOAD_VAL ; //   4
+    int ni = num_bnd;                // ~53
+    int nj = sprop::NUM_MATSUR;      //   4  (omat,osur,isur,imat)
+    int nk = sprop::NUM_PAYLOAD_GRP; //   2
+    int nl = num_domain;             // 761  fine domain
+    int nn = sprop::NUM_PAYLOAD_VAL; //   4
 
-    int np = nk*nl*nn ;               // 2*761*4  number of payload values for one mat/sur
+    int np = nk * nl * nn; // 2*761*4  number of payload values for one mat/sur
 
-
-    NP* bnd_ = NP::Make<double>(ni, nj, nk, nl, nn );
-    bnd_->fill<double>(-1.) ; // trying to match X4/GGeo unfilled
-    bnd_->set_names( bdname );
+    NP *bnd_ = NP::Make<double>(ni, nj, nk, nl, nn);
+    bnd_->fill<double>(-1.); // trying to match X4/GGeo unfilled
+    bnd_->set_names(bdname);
 
     // metadata needed by QBnd::MakeBoundaryTex
-    bnd_->set_meta<float>("domain_low",  sdomain::DomainLow() );
-    bnd_->set_meta<float>("domain_high", sdomain::DomainHigh() );
-    bnd_->set_meta<float>("domain_step", sdomain::DomainStep() );
-    bnd_->set_meta<float>("domain_range", sdomain::DomainRange() );
+    bnd_->set_meta<float>("domain_low", sdomain::DomainLow());
+    bnd_->set_meta<float>("domain_high", sdomain::DomainHigh());
+    bnd_->set_meta<float>("domain_step", sdomain::DomainStep());
+    bnd_->set_meta<float>("domain_range", sdomain::DomainRange());
 
-    double* bnd_v = bnd_->values<double>() ;
+    double *bnd_v = bnd_->values<double>();
 
-    for(int i=0 ; i < ni ; i++)
+    for (int i = 0; i < ni; i++)
     {
-        std::array<int, 4> _bd = {{ vbd[i].x, vbd[i].y, vbd[i].z, vbd[i].w }} ;
-        for(int j=0 ; j < nj ; j++)
+        std::array<int, 4> _bd = {{vbd[i].x, vbd[i].y, vbd[i].z, vbd[i].w}};
+        for (int j = 0; j < nj; j++)
         {
-            int ptr     = _bd[j] ;  // omat,osur,isur,imat index "pointer" into mat or sur arrays
-            if( ptr < 0 ) continue ;
-            bool is_mat =  j == 0 || j == 3 ;
-            bool is_sur =  j == 1 || j == 2 ;
-            if(is_mat) assert( ptr < num_mat );
-            if(is_sur) assert( ptr < num_sur );
-
-            int src_index = ptr*np ;
-            int dst_index = (i*nj + j)*np ;
-            const double* src_v = is_mat ? mat_v : sur_v ;
-
-            for(int p=0 ; p < np ; p++) bnd_v[dst_index + p] = src_v[src_index + p] ;
+            int ptr = _bd[j]; // omat,osur,isur,imat index "pointer" into mat or sur arrays
+            if (ptr < 0)
+                continue;
+            bool is_mat = j == 0 || j == 3;
+            bool is_sur = j == 1 || j == 2;
+            if (is_mat)
+                assert(ptr < num_mat);
+            if (is_sur)
+                assert(ptr < num_sur);
+
+            int src_index = ptr * np;
+            int dst_index = (i * nj + j) * np;
+            const double *src_v = is_mat ? mat_v : sur_v;
+
+            for (int p = 0; p < np; p++)
+                bnd_v[dst_index + p] = src_v[src_index + p];
         }
     }
-    return bnd_  ;
+    return bnd_;
 }
 
-inline void sstandard::column_range(int4& mn, int4& mx,  const std::vector<int4>& vbd)
+inline void sstandard::column_range(int4 &mn, int4 &mx, const std::vector<int4> &vbd)
 {
-    mn.x = std::numeric_limits<int>::max() ;
-    mn.y = std::numeric_limits<int>::max() ;
-    mn.z = std::numeric_limits<int>::max() ;
-    mn.w = std::numeric_limits<int>::max() ;
+    mn.x = std::numeric_limits<int>::max();
+    mn.y = std::numeric_limits<int>::max();
+    mn.z = std::numeric_limits<int>::max();
+    mn.w = std::numeric_limits<int>::max();
 
-    mx.x = std::numeric_limits<int>::min() ;
-    mx.y = std::numeric_limits<int>::min() ;
-    mx.z = std::numeric_limits<int>::min() ;
-    mx.w = std::numeric_limits<int>::min() ;
+    mx.x = std::numeric_limits<int>::min();
+    mx.y = std::numeric_limits<int>::min();
+    mx.z = std::numeric_limits<int>::min();
+    mx.w = std::numeric_limits<int>::min();
 
     int num = vbd.size();
-    for(int i=0 ; i < num ; i++)
+    for (int i = 0; i < num; i++)
     {
-        const int4& b = vbd[i] ;
-        if(b.x > mx.x) mx.x = b.x ;
-        if(b.y > mx.y) mx.y = b.y ;
-        if(b.z > mx.z) mx.z = b.z ;
-        if(b.w > mx.w) mx.w = b.w ;
-
-        if(b.x < mn.x) mn.x = b.x ;
-        if(b.y < mn.y) mn.y = b.y ;
-        if(b.z < mn.z) mn.z = b.z ;
-        if(b.w < mn.w) mn.w = b.w ;
+        const int4 &b = vbd[i];
+        if (b.x > mx.x)
+            mx.x = b.x;
+        if (b.y > mx.y)
+            mx.y = b.y;
+        if (b.z > mx.z)
+            mx.z = b.z;
+        if (b.w > mx.w)
+            mx.w = b.w;
+
+        if (b.x < mn.x)
+            mn.x = b.x;
+        if (b.y < mn.y)
+            mn.y = b.y;
+        if (b.z < mn.z)
+            mn.z = b.z;
+        if (b.w < mn.w)
+            mn.w = b.w;
     }
 }
 
@@ -569,11 +551,11 @@ In principal it should give equivalent results to Geant4 interpolation.
 However its simpler to just use Geant4 interpolation from U4Tree level.
 
 **/
-inline NP* sstandard::unused_mat( const std::vector<std::string>& names, const NPFold* fold )
+inline NP *sstandard::unused_mat(const std::vector<std::string> &names, const NPFold *fold)
 {
     assert(0);
-    const sproplist* pl = sproplist::Material() ;
-    return unused_create(pl, names, fold );
+    const sproplist *pl = sproplist::Material();
+    return unused_create(pl, names, fold);
 }
 
 /**
@@ -587,11 +569,11 @@ like the mat array this approach is anyhow unworkable as it stands.
 
 **/
 
-inline NP* sstandard::unused_sur( const std::vector<std::string>& names, const NPFold* fold )
+inline NP *sstandard::unused_sur(const std::vector<std::string> &names, const NPFold *fold)
 {
     assert(0);
-    const sproplist* pl = sproplist::Surface() ;
-    return unused_create(pl, names, fold );
+    const sproplist *pl = sproplist::Surface();
+    return unused_create(pl, names, fold);
 }
 
 /**
@@ -603,60 +585,51 @@ and the array content. That is true for "mat" but not for "sur"
 
 **/
 
-inline NP* sstandard::unused_create(const sproplist* pl, const std::vector<std::string>& names, const NPFold* fold )
+inline NP *sstandard::unused_create(const sproplist *pl, const std::vector<std::string> &names, const NPFold *fold)
 {
     assert(0);
-    sdomain dom ;
+    sdomain dom;
 
-    int ni = names.size() ;
-    int nj = sprop::NUM_PAYLOAD_GRP ;
-    int nk = dom.length ;
-    int nl = sprop::NUM_PAYLOAD_VAL ;
+    int ni = names.size();
+    int nj = sprop::NUM_PAYLOAD_GRP;
+    int nk = dom.length;
+    int nl = sprop::NUM_PAYLOAD_VAL;
 
-    NP* sta = NP::Make<double>(ni, nj, nk, nl) ;
+    NP *sta = NP::Make<double>(ni, nj, nk, nl);
     sta->set_names(names);
-    double* sta_v = sta->values<double>();
+    double *sta_v = sta->values<double>();
 
-    std::cout << "sstandard::create sta.sstr " << sta->sstr() << std::endl ;
+    std::cout << "sstandard::create sta.sstr " << sta->sstr() << std::endl;
 
-    for(int i=0 ; i < ni ; i++ )               // names
+    for (int i = 0; i < ni; i++) // names
     {
-        const char* name = names[i].c_str() ;
-        NPFold* sub = fold->get_subfold(name) ;
-
-        std::cout
-            << std::setw(4) << i
-            << " : "
-            << std::setw(60) << name
-            << " : "
-            << sub->stats()
-            << std::endl
-            ;
-
-        for(int j=0 ; j < nj ; j++)           // payload groups
+        const char *name = names[i].c_str();
+        NPFold *sub = fold->get_subfold(name);
+
+        std::cout << std::setw(4) << i << " : " << std::setw(60) << name << " : " << sub->stats() << std::endl;
+
+        for (int j = 0; j < nj; j++) // payload groups
         {
-            for(int k=0 ; k < nk ; k++)       // wavelength
+            for (int k = 0; k < nk; k++) // wavelength
             {
-                //double wavelength_nm = dom.wavelength_nm[k] ;
-                double energy_eV = dom.energy_eV[k] ;
-                double energy = energy_eV * 1.e-6 ;  // Geant4 actual energy unit is MeV
+                // double wavelength_nm = dom.wavelength_nm[k] ;
+                double energy_eV = dom.energy_eV[k];
+                double energy = energy_eV * 1.e-6; // Geant4 actual energy unit is MeV
 
-                for(int l=0 ; l < nl ; l++)   // payload values
+                for (int l = 0; l < nl; l++) // payload values
                 {
-                    const sprop* prop = pl->get(j,l) ;
-                    assert( prop );
+                    const sprop *prop = pl->get(j, l);
+                    assert(prop);
 
-                    const char* pn = prop->name ;
-                    const NP* a = sub->get(pn) ;
-                    double value = a ? a->interp( energy ) : prop->def ;
+                    const char *pn = prop->name;
+                    const NP *a = sub->get(pn);
+                    double value = a ? a->interp(energy) : prop->def;
 
-                    int index = i*nj*nk*nl + j*nk*nl + k*nl + l ;
-                    sta_v[index] = value ;
+                    int index = i * nj * nk * nl + j * nk * nl + k * nl + l;
+                    sta_v[index] = value;
                 }
             }
         }
     }
-    return sta ;
+    return sta;
 }
-
-
diff --git a/sysrap/sstate.h b/sysrap/sstate.h
index 0299e756f..e51735533 100644
--- a/sysrap/sstate.h
+++ b/sysrap/sstate.h
@@ -25,7 +25,7 @@ BUT seems no point doing that, can just directly use them from PRD.
 struct sstate
 {
     float4 material1 ;    // refractive_index/absorption_length/scattering_length/reemission_prob
-    float4 m1group2 ;     // group_velocity/spare1/spare2/spare3
+    float4 m1group2;      // group_velocity/wls_absorption_length/spare2/spare3
     float4 material2 ;   
     float4 surface ;      // detect/absorb/reflect_specular/reflect_diffuse
 
@@ -71,7 +71,7 @@ inline std::ostream& operator<<(std::ostream& os, const sstate& s )
        << " (refractive_index/absorption_length/scattering_length/reemission_prob) " 
        << std::endl 
        << " m1group2 " << s.m1group2
-       << " (group_velocity/spare1/spare2/spare3) "
+       << " (group_velocity/wls_absorption_length/spare2/spare3) "
        << std::endl 
        << " material2 " << s.material2 
        << " (refractive_index/absorption_length/scattering_length/reemission_prob) " 
diff --git a/tests/geom/DUNE_example_detector.gdml b/tests/geom/DUNE_example_detector.gdml
new file mode 100644
index 000000000..11ae55814
--- /dev/null
+++ b/tests/geom/DUNE_example_detector.gdml
@@ -0,0 +1,373 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<gdml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://cern.ch/service-spi/app/releases/GDML/schema/gdml.xsd">
+
+  <define>
+    <matrix coldim="2" name="RINDEX0x6000000ed5f0" values="2.33932e-06 1.65 2.91728e-06 1.65 3.0996e-06 1.65 3.64659e-06 1.65 4.06506e-06 1.65 7.74901e-06 1.65 9.68627e-06 1.65 1.16966e-05 1.65"/>
+    <matrix coldim="2" name="GROUPVEL0x6000000ed6c0" values="2.33932e-06 181.692 2.6283e-06 181.692 3.00844e-06 181.692 3.3731e-06 181.692 3.85582e-06 181.692 5.90703e-06 181.692 8.71764e-06 181.692 1.16966e-05 181.692"/>
+    <matrix coldim="2" name="WLSCOMPONENT0x6000000ed860" values="2.33932e-06 0 2.91728e-06 0.0005 3.0996e-06 0.002 3.64659e-06 0.022 4.06506e-06 0.0005 7.74901e-06 0 9.68627e-06 0 1.16966e-05 0"/>
+    <matrix coldim="2" name="WLSABSLENGTH0x6000000ed790" values="2.33932e-06 10000 2.91728e-06 10000 3.0996e-06 10000 3.64659e-06 10000 4.06506e-06 0.187 7.74901e-06 0.0005 9.68627e-06 0.0005 1.16966e-05 0.0005"/>
+    <matrix coldim="1" name="WLSTIMECONSTANT0x6000010e1b20" values="1.136"/>
+    <matrix coldim="2" name="RINDEX0x6000000ed930" values="2.33932e-06 1.5 2.91728e-06 1.5 3.0996e-06 1.5 3.64659e-06 1.5 4.06506e-06 1.5 7.74901e-06 1.5 9.68627e-06 1.5 1.16966e-05 1.5"/>
+    <matrix coldim="2" name="GROUPVEL0x6000000eda00" values="2.33932e-06 199.862 2.6283e-06 199.862 3.00844e-06 199.862 3.3731e-06 199.862 3.85582e-06 199.862 5.90703e-06 199.862 8.71764e-06 199.862 1.16966e-05 199.862"/>
+    <matrix coldim="2" name="RINDEX0x6000000edad0" values="2.33932e-06 1.58 2.91728e-06 1.58 3.0996e-06 1.58 3.64659e-06 1.58 4.06506e-06 1.58 7.74901e-06 1.58 9.68627e-06 1.58 1.16966e-05 1.58"/>
+    <matrix coldim="2" name="GROUPVEL0x6000000edba0" values="2.33932e-06 189.742 2.6283e-06 189.742 3.00844e-06 189.742 3.3731e-06 189.742 3.85582e-06 189.742 5.90703e-06 189.742 8.71764e-06 189.742 1.16966e-05 189.742"/>
+    <matrix coldim="2" name="WLSCOMPONENT0x6000000edd40" values="2.33932e-06 0.0005 2.91728e-06 0.02 3.0996e-06 0 3.64659e-06 0 4.06506e-06 0 7.74901e-06 0 9.68627e-06 0 1.16966e-05 0"/>
+    <matrix coldim="2" name="WLSABSLENGTH0x6000000edc70" values="2.33932e-06 2000 2.91728e-06 2000 3.0996e-06 0.8 3.64659e-06 0.8 4.06506e-06 3 7.74901e-06 0.0001 9.68627e-06 0.0001 1.16966e-05 0.0001"/>
+    <matrix coldim="1" name="WLSTIMECONSTANT0x6000010e1c00" values="1.26"/>
+    <matrix coldim="2" name="RINDEX0x6000000ede10" values="2.33932e-06 1.23 2.91728e-06 1.23 3.0996e-06 1.23 3.64659e-06 1.23 4.06506e-06 1.235 7.74901e-06 1.315 9.68627e-06 1.45 1.16966e-05 5.45"/>
+    <matrix coldim="2" name="GROUPVEL0x6000000edee0" values="2.33932e-06 243.734 2.6283e-06 243.734 3.00844e-06 243.734 3.3731e-06 243.734 3.85582e-06 234.483 5.90703e-06 214.29 8.71764e-06 150.84 1.16966e-05 11.2451"/>
+    <matrix coldim="2" name="RAYLEIGH0x6000000ee220" values="2.33932e-06 900 2.91728e-06 900 3.0996e-06 900 3.64659e-06 900 4.06506e-06 900 7.74901e-06 900 9.68627e-06 900 1.16966e-05 900"/>
+    <matrix coldim="2" name="ABSLENGTH0x6000000ee150" values="2.33932e-06 10000 2.91728e-06 10000 3.0996e-06 10000 3.64659e-06 10000 4.06506e-06 10000 7.74901e-06 10000 9.68627e-06 10000 1.16966e-05 10000"/>
+    <matrix coldim="2" name="SCINTILLATIONCOMPONENT10x6000000edfb0" values="2.33932e-06 0 2.91728e-06 0 3.0996e-06 0 3.64659e-06 0 4.06506e-06 0 7.74901e-06 0.000238409 9.68627e-06 0.0398859 1.16966e-05 0.00422473"/>
+    <matrix coldim="2" name="SCINTILLATIONCOMPONENT20x6000000ee080" values="2.33932e-06 0 2.91728e-06 0 3.0996e-06 0 3.64659e-06 0 4.06506e-06 0 7.74901e-06 0.000238409 9.68627e-06 0.0398859 1.16966e-05 0.00422473"/>
+    <matrix coldim="1" name="SCINTILLATIONYIELD0x6000010e1c70" values="24000"/>
+    <matrix coldim="1" name="RESOLUTIONSCALE0x6000010e1c70" values="1"/>
+    <matrix coldim="1" name="SCINTILLATIONTIMECONSTANT10x6000010e1c70" values="7"/>
+    <matrix coldim="1" name="SCINTILLATIONTIMECONSTANT20x6000010e1c70" values="1400"/>
+    <matrix coldim="1" name="SCINTILLATIONYIELD10x6000010e1c70" values="0.75"/>
+    <matrix coldim="1" name="SCINTILLATIONYIELD20x6000010e1c70" values="0.25"/>
+    <matrix coldim="2" name="REFLECTIVITY0x6000000ee2f0" values="2.33932e-06 0.98 2.91728e-06 0.98 3.0996e-06 0.98 3.64659e-06 0.98 4.06506e-06 0.98 7.74901e-06 0.98 9.68627e-06 0.98 1.16966e-05 0.98"/>
+    <matrix coldim="2" name="EFFICIENCY_SIPM" values="2.33932e-06 1.0 2.91728e-06 1.0 3.0996e-06 1.0 3.64659e-06 1.0 4.06506e-06 1.0 7.74901e-06 1.0 9.68627e-06 1.0 1.16966e-05 1.0"/>
+  </define>
+
+  <materials>
+    <isotope N="12" Z="6" name="C120x600002ee5980">
+      <atom unit="g/mole" value="12"/>
+    </isotope>
+    <isotope N="13" Z="6" name="C130x600002ee59c0">
+      <atom unit="g/mole" value="13.0034"/>
+    </isotope>
+    <element name="C0x600000ce1080">
+      <fraction n="0.9893" ref="C120x600002ee5980"/>
+      <fraction n="0.0107" ref="C130x600002ee59c0"/>
+    </element>
+    <isotope N="1" Z="1" name="H10x600002ee5a40">
+      <atom unit="g/mole" value="1.00782503081372"/>
+    </isotope>
+    <isotope N="2" Z="1" name="H20x600002ee5a80">
+      <atom unit="g/mole" value="2.01410199966617"/>
+    </isotope>
+    <element name="H0x600000ce1130">
+      <fraction n="0.999885" ref="H10x600002ee5a40"/>
+      <fraction n="0.000115" ref="H20x600002ee5a80"/>
+    </element>
+    <material name="pTP0x121e06e10" state="solid">
+      <property name="RINDEX" ref="RINDEX0x6000000ed5f0"/>
+      <property name="GROUPVEL" ref="GROUPVEL0x6000000ed6c0"/>
+      <property name="WLSCOMPONENT" ref="WLSCOMPONENT0x6000000ed860"/>
+      <property name="WLSABSLENGTH" ref="WLSABSLENGTH0x6000000ed790"/>
+      <property name="WLSTIMECONSTANT" ref="WLSTIMECONSTANT0x6000010e1b20"/>
+      <T unit="K" value="293.15"/>
+      <MEE unit="eV" value="68.6661371678143"/>
+      <D unit="g/cm3" value="1.23"/>
+      <fraction n="0.938728183298287" ref="C0x600000ce1080"/>
+      <fraction n="0.0612718167017128" ref="H0x600000ce1130"/>
+    </material>
+    <isotope N="16" Z="8" name="O160x600002ee5ac0">
+      <atom unit="g/mole" value="15.9949"/>
+    </isotope>
+    <isotope N="17" Z="8" name="O170x600002ee5b40">
+      <atom unit="g/mole" value="16.9991"/>
+    </isotope>
+    <isotope N="18" Z="8" name="O180x600002ee5b80">
+      <atom unit="g/mole" value="17.9992"/>
+    </isotope>
+    <element name="O0x600000ce11e0">
+      <fraction n="0.99757" ref="O160x600002ee5ac0"/>
+      <fraction n="0.00038" ref="O170x600002ee5b40"/>
+      <fraction n="0.00205" ref="O180x600002ee5b80"/>
+    </element>
+    <material name="acrylicMcMaster0x121e06f20" state="solid">
+      <property name="RINDEX" ref="RINDEX0x6000000ed930"/>
+      <property name="GROUPVEL" ref="GROUPVEL0x6000000eda00"/>
+      <T unit="K" value="293.15"/>
+      <MEE unit="eV" value="68.6088807971964"/>
+      <D unit="g/cm3" value="1.19"/>
+      <fraction n="0.599841070879962" ref="C0x600000ce1080"/>
+      <fraction n="0.0805418407442842" ref="H0x600000ce1130"/>
+      <fraction n="0.319617088375753" ref="O0x600000ce11e0"/>
+    </material>
+    <material name="bluewlsacrylic0x121e075c0" state="solid">
+      <property name="RINDEX" ref="RINDEX0x6000000edad0"/>
+      <property name="GROUPVEL" ref="GROUPVEL0x6000000edba0"/>
+      <property name="WLSCOMPONENT" ref="WLSCOMPONENT0x6000000edd40"/>
+      <property name="WLSABSLENGTH" ref="WLSABSLENGTH0x6000000edc70"/>
+      <property name="WLSTIMECONSTANT" ref="WLSTIMECONSTANT0x6000010e1c00"/>
+      <T unit="K" value="293.15"/>
+      <MEE unit="eV" value="64.6844741120544"/>
+      <D unit="g/cm3" value="1.023"/>
+      <fraction n="0.914708531800025" ref="C0x600000ce1080"/>
+      <fraction n="0.0852914681999746" ref="H0x600000ce1130"/>
+    </material>
+    <isotope N="36" Z="18" name="Ar360x600002ee5bc0">
+      <atom unit="g/mole" value="35.9675"/>
+    </isotope>
+    <isotope N="38" Z="18" name="Ar380x600002ee5c00">
+      <atom unit="g/mole" value="37.9627"/>
+    </isotope>
+    <isotope N="40" Z="18" name="Ar400x600002ee5b00">
+      <atom unit="g/mole" value="39.9624"/>
+    </isotope>
+    <element name="Ar0x600000ce1290">
+      <fraction n="0.003365" ref="Ar360x600002ee5bc0"/>
+      <fraction n="0.000632" ref="Ar380x600002ee5c00"/>
+      <fraction n="0.996003" ref="Ar400x600002ee5b00"/>
+    </element>
+    <material name="G4_lAr0x121e076d0" state="liquid">
+      <property name="RINDEX" ref="RINDEX0x6000000ede10"/>
+      <property name="GROUPVEL" ref="GROUPVEL0x6000000edee0"/>
+      <property name="RAYLEIGH" ref="RAYLEIGH0x6000000ee220"/>
+      <property name="ABSLENGTH" ref="ABSLENGTH0x6000000ee150"/>
+      <property name="SCINTILLATIONCOMPONENT1" ref="SCINTILLATIONCOMPONENT10x6000000edfb0"/>
+      <property name="SCINTILLATIONCOMPONENT2" ref="SCINTILLATIONCOMPONENT20x6000000ee080"/>
+      <property name="SCINTILLATIONYIELD" ref="SCINTILLATIONYIELD0x6000010e1c70"/>
+      <property name="RESOLUTIONSCALE" ref="RESOLUTIONSCALE0x6000010e1c70"/>
+      <property name="SCINTILLATIONTIMECONSTANT1" ref="SCINTILLATIONTIMECONSTANT10x6000010e1c70"/>
+      <property name="SCINTILLATIONTIMECONSTANT2" ref="SCINTILLATIONTIMECONSTANT20x6000010e1c70"/>
+      <property name="SCINTILLATIONYIELD1" ref="SCINTILLATIONYIELD10x6000010e1c70"/>
+      <property name="SCINTILLATIONYIELD2" ref="SCINTILLATIONYIELD20x6000010e1c70"/>
+      <T unit="K" value="293.15"/>
+      <MEE unit="eV" value="188"/>
+      <D unit="g/cm3" value="1.396"/>
+      <fraction n="1" ref="Ar0x600000ce1290"/>
+    </material>
+  </materials>
+
+  <solids>
+    <box lunit="mm" name="pTPlayer0x6000010e2920" x="500" y="500" z="0.002"/>
+    <box lunit="mm" name="pTPsubstrate0x6000010e2990" x="500" y="500" z="6"/>
+    <box lunit="mm" name="BlueWLSplate0x6000010e2a00" x="500" y="500" z="6"/>
+    <box lunit="mm" name="SiPMs0x6000010e2a70" x="6" y="1" z="6"/>
+    <box lunit="mm" name="ReflectiveFoilBackPlane0x6000010e2ae0" x="500" y="500" z="0.065"/>
+    <opticalsurface finish="3" model="1" name="Vikuiti0x6000010e1ce0" type="0" value="0">
+      <property name="REFLECTIVITY" ref="REFLECTIVITY0x6000000ee2f0"/>
+    </opticalsurface>
+    <opticalsurface finish="0" model="1" name="SiPMSurface" type="0" value="0">
+      <property name="EFFICIENCY" ref="EFFICIENCY_SIPM"/>
+    </opticalsurface>
+    <box lunit="mm" name="ReflectiveFoilEdgeTop0x6000010e2b50" x="500" y="0.065" z="6"/>
+    <box lunit="mm" name="ReflectiveFoilEdgeBot0x6000010e2bc0" x="500" y="0.065" z="6"/>
+    <box lunit="mm" name="ReflectiveFoilEdgeLeft0x6000010e2c30" x="0.065" y="500" z="6"/>
+    <box lunit="mm" name="ReflectiveFoilEdgeRight0x6000010e2ca0" x="0.065" y="500" z="6"/>
+    <box lunit="mm" name="solidWorld0x6000010e2300" x="1000" y="1000" z="1000"/>
+  </solids>
+
+  <structure>
+    <volume name="logicpTPlayer0x600000ae2300">
+      <materialref ref="pTP0x121e06e10"/>
+      <solidref ref="pTPlayer0x6000010e2920"/>
+    </volume>
+    <volume name="logicpTPsubstrate0x600000ae23a0">
+      <materialref ref="acrylicMcMaster0x121e06f20"/>
+      <solidref ref="pTPsubstrate0x6000010e2990"/>
+    </volume>
+    <volume name="logicBlueWLSplate0x600000ae2440">
+      <materialref ref="bluewlsacrylic0x121e075c0"/>
+      <solidref ref="BlueWLSplate0x6000010e2a00"/>
+    </volume>
+    <volume name="logicSiPMs0x600000ae24e0">
+      <materialref ref="G4_lAr0x121e076d0"/>
+      <solidref ref="SiPMs0x6000010e2a70"/>
+      <auxiliary auxtype="SensDet" auxvalue="PhotonDetector"/>
+    </volume>
+    <volume name="logicReflectiveFoilBackPlane0x600000ae2580">
+      <materialref ref="acrylicMcMaster0x121e06f20"/>
+      <solidref ref="ReflectiveFoilBackPlane0x6000010e2ae0"/>
+    </volume>
+    <volume name="logicReflectiveFoilEdgeTop0x600000ae2620">
+      <materialref ref="acrylicMcMaster0x121e06f20"/>
+      <solidref ref="ReflectiveFoilEdgeTop0x6000010e2b50"/>
+    </volume>
+    <volume name="logicReflectiveFoilEdgeBot0x600000ae26c0">
+      <materialref ref="acrylicMcMaster0x121e06f20"/>
+      <solidref ref="ReflectiveFoilEdgeBot0x6000010e2bc0"/>
+    </volume>
+    <volume name="logicReflectiveFoilEdgeLeft0x600000ae2760">
+      <materialref ref="acrylicMcMaster0x121e06f20"/>
+      <solidref ref="ReflectiveFoilEdgeLeft0x6000010e2c30"/>
+    </volume>
+    <volume name="logicReflectiveFoilEdgeRight0x600000ae2800">
+      <materialref ref="acrylicMcMaster0x121e06f20"/>
+      <solidref ref="ReflectiveFoilEdgeRight0x6000010e2ca0"/>
+    </volume>
+    <volume name="logicWorld0x600000ae1f40">
+      <materialref ref="G4_lAr0x121e076d0"/>
+      <solidref ref="solidWorld0x6000010e2300"/>
+      <physvol name="physpTPlayer0x6000018e2580">
+        <volumeref ref="logicpTPlayer0x600000ae2300"/>
+        <position name="physpTPlayer0x6000018e2580_pos" unit="mm" x="0" y="0" z="246.999"/>
+      </physvol>
+      <physvol name="physpTPsubstrate0x6000018e26c0">
+        <volumeref ref="logicpTPsubstrate0x600000ae23a0"/>
+        <position name="physpTPsubstrate0x6000018e26c0_pos" unit="mm" x="0" y="0" z="250"/>
+      </physvol>
+      <physvol name="physBlueWLSplate0x6000018e2800">
+        <volumeref ref="logicBlueWLSplate0x600000ae2440"/>
+        <position name="physBlueWLSplate0x6000018e2800_pos" unit="mm" x="0" y="0" z="258"/>
+      </physvol>
+      <physvol name="physSiPMs0x6000018e2940">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2940_pos" unit="mm" x="-241.935483870968" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="1" name="physSiPMs0x6000018e29e0">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e29e0_pos" unit="mm" x="-225.806451612903" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="2" name="physSiPMs0x6000018e2a30">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2a30_pos" unit="mm" x="-209.677419354839" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="3" name="physSiPMs0x6000018e2a80">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2a80_pos" unit="mm" x="-193.548387096774" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="4" name="physSiPMs0x6000018e2ad0">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2ad0_pos" unit="mm" x="-177.41935483871" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="5" name="physSiPMs0x6000018e2b20">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2b20_pos" unit="mm" x="-161.290322580645" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="6" name="physSiPMs0x6000018e2b70">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2b70_pos" unit="mm" x="-145.161290322581" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="7" name="physSiPMs0x6000018e2bc0">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2bc0_pos" unit="mm" x="-129.032258064516" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="8" name="physSiPMs0x6000018e2c10">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2c10_pos" unit="mm" x="-112.903225806452" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="9" name="physSiPMs0x6000018e2c60">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2c60_pos" unit="mm" x="-96.7741935483871" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="10" name="physSiPMs0x6000018e2cb0">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2cb0_pos" unit="mm" x="-80.6451612903226" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="11" name="physSiPMs0x6000018e2d00">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2d00_pos" unit="mm" x="-64.5161290322581" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="12" name="physSiPMs0x6000018e2d50">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2d50_pos" unit="mm" x="-48.3870967741935" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="13" name="physSiPMs0x6000018e2da0">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2da0_pos" unit="mm" x="-32.258064516129" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="14" name="physSiPMs0x6000018e2df0">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2df0_pos" unit="mm" x="-16.1290322580645" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="15" name="physSiPMs0x6000018e2e40">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2e40_pos" unit="mm" x="0" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="16" name="physSiPMs0x6000018e2e90">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2e90_pos" unit="mm" x="16.1290322580645" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="17" name="physSiPMs0x6000018e2ee0">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2ee0_pos" unit="mm" x="32.258064516129" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="18" name="physSiPMs0x6000018e2f30">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2f30_pos" unit="mm" x="48.3870967741935" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="19" name="physSiPMs0x6000018e2f80">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2f80_pos" unit="mm" x="64.516129032258" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="20" name="physSiPMs0x6000018e2fd0">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e2fd0_pos" unit="mm" x="80.6451612903226" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="21" name="physSiPMs0x6000018e3020">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e3020_pos" unit="mm" x="96.7741935483871" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="22" name="physSiPMs0x6000018e3070">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e3070_pos" unit="mm" x="112.903225806452" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="23" name="physSiPMs0x6000018e30c0">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e30c0_pos" unit="mm" x="129.032258064516" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="24" name="physSiPMs0x6000018e3110">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e3110_pos" unit="mm" x="145.161290322581" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="25" name="physSiPMs0x6000018e3160">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e3160_pos" unit="mm" x="161.290322580645" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="26" name="physSiPMs0x6000018e31b0">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e31b0_pos" unit="mm" x="177.41935483871" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="27" name="physSiPMs0x6000018e3200">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e3200_pos" unit="mm" x="193.548387096774" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="28" name="physSiPMs0x6000018e3250">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e3250_pos" unit="mm" x="209.677419354839" y="251" z="258"/>
+      </physvol>
+      <physvol copynumber="29" name="physSiPMs0x6000018e32a0">
+        <volumeref ref="logicSiPMs0x600000ae24e0"/>
+        <position name="physSiPMs0x6000018e32a0_pos" unit="mm" x="225.806451612903" y="251" z="258"/>
+      </physvol>
+      <physvol name="physReflectiveFoilBackPlane0x6000018e3390">
+        <volumeref ref="logicReflectiveFoilBackPlane0x600000ae2580"/>
+        <position name="physReflectiveFoilBackPlane0x6000018e3390_pos" unit="mm" x="0" y="0" z="261.033"/>
+      </physvol>
+      <physvol name="physReflectiveFoilEdgeTop0x6000018e36b0">
+        <volumeref ref="logicReflectiveFoilEdgeTop0x600000ae2620"/>
+        <position name="physReflectiveFoilEdgeTop0x6000018e36b0_pos" unit="mm" x="0" y="251.533" z="258"/>
+      </physvol>
+      <physvol name="physReflectiveFoilEdgeBot0x6000018e3750">
+        <volumeref ref="logicReflectiveFoilEdgeBot0x600000ae26c0"/>
+        <position name="physReflectiveFoilEdgeBot0x6000018e3750_pos" unit="mm" x="0" y="-250.033" z="258"/>
+      </physvol>
+      <physvol name="physReflectiveFoilEdgeLeft0x6000018e37f0">
+        <volumeref ref="logicReflectiveFoilEdgeLeft0x600000ae2760"/>
+        <position name="physReflectiveFoilEdgeLeft0x6000018e37f0_pos" unit="mm" x="250.033" y="0" z="258"/>
+      </physvol>
+      <physvol name="physReflectiveFoilEdgeRight0x6000018e3890">
+        <volumeref ref="logicReflectiveFoilEdgeRight0x600000ae2800"/>
+        <position name="physReflectiveFoilEdgeRight0x6000018e3890_pos" unit="mm" x="-250.033" y="0" z="258"/>
+      </physvol>
+    </volume>
+    <skinsurface name="skin0x600002e09bc0" surfaceproperty="Vikuiti0x6000010e1ce0">
+      <volumeref ref="logicReflectiveFoilBackPlane0x600000ae2580"/>
+    </skinsurface>
+    <skinsurface name="skinedgetop0x600002e09b80" surfaceproperty="Vikuiti0x6000010e1ce0">
+      <volumeref ref="logicReflectiveFoilEdgeTop0x600000ae2620"/>
+    </skinsurface>
+    <skinsurface name="skinedgebot0x600002e09c00" surfaceproperty="Vikuiti0x6000010e1ce0">
+      <volumeref ref="logicReflectiveFoilEdgeBot0x600000ae26c0"/>
+    </skinsurface>
+    <skinsurface name="skinedgeleft0x600002e09c40" surfaceproperty="Vikuiti0x6000010e1ce0">
+      <volumeref ref="logicReflectiveFoilEdgeLeft0x600000ae2760"/>
+    </skinsurface>
+    <skinsurface name="skinedgeright0x600002e09c80" surfaceproperty="Vikuiti0x6000010e1ce0">
+      <volumeref ref="logicReflectiveFoilEdgeRight0x600000ae2800"/>
+    </skinsurface>
+    <skinsurface name="skinSiPM" surfaceproperty="SiPMSurface">
+      <volumeref ref="logicSiPMs0x600000ae24e0"/>
+    </skinsurface>
+  </structure>
+
+  <setup name="Default" version="1.0">
+    <world ref="logicWorld0x600000ae1f40"/>
+  </setup>
+
+</gdml>
diff --git a/tests/geom/wls_scatter_viz.gdml b/tests/geom/wls_scatter_viz.gdml
new file mode 100644
index 000000000..db1dc872b
--- /dev/null
+++ b/tests/geom/wls_scatter_viz.gdml
@@ -0,0 +1,111 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<gdml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://service-spi.web.cern.ch/service-spi/app/releases/GDML/schema/gdml.xsd">
+
+  <define>
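+    <!-- Energy grid (matrix values are in MeV): 2.33932e-06=530nm (2.34eV), 2.91728e-06=425nm (2.92eV), 3.0996e-06=400nm (3.10eV), 3.54241e-06=350nm (3.54eV), 4.13281e-06=300nm (4.13eV), 1.16966e-05=106nm (11.7eV) -->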
+    <matrix coldim="2" name="RINDEX_ALL" values="2.33932e-06 1.5 3.0996e-06 1.5 3.54241e-06 1.5 4.13281e-06 1.5 1.16966e-05 1.5"/>
+    <matrix coldim="2" name="GROUPVEL_ALL" values="2.33932e-06 200 3.0996e-06 200 3.54241e-06 200 4.13281e-06 200 1.16966e-05 200"/>
+    <matrix coldim="2" name="ABSLENGTH_LONG" values="2.33932e-06 100000 1.16966e-05 100000"/>
+
+    <!-- WLS: absorbs at 350nm (WLSABSLENGTH 14.4mm gives 1-exp(-10/14.4) = 50% absorption over a 10mm path), effectively transparent at visible wavelengths -->
+    <matrix coldim="2" name="WLSABSLENGTH_VIZ" values="2.33932e-06 100000 3.0996e-06 100000 3.54241e-06 14.4 4.13281e-06 5 1.16966e-05 2"/>
+    <matrix coldim="2" name="WLSCOMPONENT_VIZ" values="2.33932e-06 0.04 2.91728e-06 0.8 3.0996e-06 0.3 3.54241e-06 0.01 1.16966e-05 0.0"/>
+    <matrix coldim="1" name="WLSTIMECONSTANT_VIZ" values="0.5"/>
+
+    <!-- Rayleigh: 10mm mean free path, i.e. about 2 scatters on average across the 20mm gap between the WLS sphere (rmax=10mm) and the detector shell (rmin=30mm) -->
+    <matrix coldim="2" name="RAYLEIGH_SCATTER" values="2.33932e-06 10 3.0996e-06 10 3.54241e-06 10 4.13281e-06 10 1.16966e-05 10"/>
+
+    <!-- Effectively no Rayleigh scattering inside the WLS sphere (100000mm scattering length) -->
+    <matrix coldim="2" name="RAYLEIGH_NONE" values="2.33932e-06 100000 1.16966e-05 100000"/>
+
+    <!-- Detector efficiency -->
+    <matrix coldim="2" name="EFFICIENCY_DET" values="2.33932e-06 1.0 1.16966e-05 1.0"/>
+  </define>
+
+  <materials>
+    <element Z="18" name="Ar">
+      <atom unit="g/mole" value="39.948"/>
+    </element>
+
+    <!-- Scattering medium: world material; fills the gap between the WLS sphere and the detector shell, and the region outside the shell -->
+    <material name="ScatterMedium" state="liquid">
+      <property name="RINDEX" ref="RINDEX_ALL"/>
+      <property name="GROUPVEL" ref="GROUPVEL_ALL"/>
+      <property name="ABSLENGTH" ref="ABSLENGTH_LONG"/>
+      <property name="RAYLEIGH" ref="RAYLEIGH_SCATTER"/>
+      <T unit="K" value="293.15"/>
+      <MEE unit="eV" value="188"/>
+      <D unit="g/cm3" value="1.4"/>
+      <fraction n="1" ref="Ar"/>
+    </material>
+
+    <!-- WLS material: absorbs UV, re-emits visible -->
+    <material name="WLSMaterial" state="solid">
+      <property name="RINDEX" ref="RINDEX_ALL"/>
+      <property name="GROUPVEL" ref="GROUPVEL_ALL"/>
+      <property name="ABSLENGTH" ref="ABSLENGTH_LONG"/>
+      <property name="RAYLEIGH" ref="RAYLEIGH_NONE"/>
+      <property name="WLSABSLENGTH" ref="WLSABSLENGTH_VIZ"/>
+      <property name="WLSCOMPONENT" ref="WLSCOMPONENT_VIZ"/>
+      <property name="WLSTIMECONSTANT" ref="WLSTIMECONSTANT_VIZ"/>
+      <T unit="K" value="293.15"/>
+      <MEE unit="eV" value="188"/>
+      <D unit="g/cm3" value="1.2"/>
+      <fraction n="1" ref="Ar"/>
+    </material>
+
+    <!-- Detector material -->
+    <material name="DetectorMat" state="liquid">
+      <property name="RINDEX" ref="RINDEX_ALL"/>
+      <property name="GROUPVEL" ref="GROUPVEL_ALL"/>
+      <property name="ABSLENGTH" ref="ABSLENGTH_LONG"/>
+      <T unit="K" value="293.15"/>
+      <MEE unit="eV" value="188"/>
+      <D unit="g/cm3" value="1.4"/>
+      <fraction n="1" ref="Ar"/>
+    </material>
+  </materials>
+
+  <solids>
+    <sphere aunit="deg" lunit="mm" name="WorldSphere" rmax="50" rmin="0" startphi="0" deltaphi="360" starttheta="0" deltatheta="180"/>
+    <sphere aunit="deg" lunit="mm" name="WLSSphere" rmax="10" rmin="0" startphi="0" deltaphi="360" starttheta="0" deltatheta="180"/>
+    <sphere aunit="deg" lunit="mm" name="DetShell" rmax="30.5" rmin="30" startphi="0" deltaphi="360" starttheta="0" deltatheta="180"/>
+
+    <opticalsurface finish="0" model="1" name="DetSurface" type="0" value="0">
+      <property name="EFFICIENCY" ref="EFFICIENCY_DET"/>
+    </opticalsurface>
+  </solids>
+
+  <structure>
+    <volume name="logicWLS">
+      <materialref ref="WLSMaterial"/>
+      <solidref ref="WLSSphere"/>
+    </volume>
+
+    <volume name="logicDetector">
+      <materialref ref="DetectorMat"/>
+      <solidref ref="DetShell"/>
+      <auxiliary auxtype="SensDet" auxvalue="PhotonDetector"/>
+    </volume>
+
+    <volume name="logicWorld">
+      <materialref ref="ScatterMedium"/>
+      <solidref ref="WorldSphere"/>
+      <physvol name="physWLS">
+        <volumeref ref="logicWLS"/>
+      </physvol>
+      <physvol name="physDetector">
+        <volumeref ref="logicDetector"/>
+      </physvol>
+    </volume>
+
+    <skinsurface name="skinDetector" surfaceproperty="DetSurface">
+      <volumeref ref="logicDetector"/>
+    </skinsurface>
+  </structure>
+
+  <setup name="Default" version="1.0">
+    <world ref="logicWorld"/>
+  </setup>
+
+</gdml>
diff --git a/tests/geom/wls_slab.gdml b/tests/geom/wls_slab.gdml
new file mode 100644
index 000000000..50eb18af5
--- /dev/null
+++ b/tests/geom/wls_slab.gdml
@@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<gdml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+      xsi:noNamespaceSchemaLocation="">
+
+  <define>
+    <matrix coldim="2" name="RINDEX_AIR" values="1.55e-06 1.0 1.55e-05 1.0"/>
+    <matrix coldim="2" name="ABSLENGTH_AIR" values="1.55e-06 1.0e6 1.55e-05 1.0e6"/>
+
+    <!-- WLS slab: n=1.59, same optical properties as the WLS material in wls_test.gdml -->
+    <matrix coldim="2" name="RINDEX_WLS" values="1.55e-06 1.59 1.55e-05 1.59"/>
+    <matrix coldim="2" name="ABSLENGTH_WLS" values="1.55e-06 1.0e6 1.55e-05 1.0e6"/>
+
+    <!-- Same WLS absorption spectrum as wls_test.gdml -->
+    <matrix coldim="2" name="WLSABSLENGTH_WLS" values="1.77e-06 10000.0 2.07e-06 10000.0 2.48e-06 10000.0 2.76e-06 100.0 3.10e-06 1.0 3.54e-06 0.1 4.13e-06 0.01"/>
+
+    <!-- Same WLS emission spectrum -->
+    <matrix coldim="2" name="WLSCOMPONENT_WLS" values="1.77e-06 0.00 2.07e-06 0.02 2.25e-06 0.10 2.48e-06 0.50 2.58e-06 1.00 2.70e-06 0.80 2.76e-06 0.50 2.88e-06 0.10 3.10e-06 0.00"/>
+    <matrix coldim="1" name="WLSTIMECONSTANT_WLS" values="0.5"/>
+
+    <!-- Detector glass -->
+    <matrix coldim="2" name="RINDEX_GLASS" values="1.55e-06 1.50 1.55e-05 1.50"/>
+    <matrix coldim="2" name="EFFICIENCYDET" values="1.55e-06 1.0 1.55e-05 1.0"/>
+  </define>
+
+  <materials>
+    <element name="N" formula="N" Z="7"><atom value="14.007" unit="g/mole"/></element>
+    <element name="O" formula="O" Z="8"><atom value="15.999" unit="g/mole"/></element>
+    <element name="C" formula="C" Z="6"><atom value="12.011" unit="g/mole"/></element>
+    <element name="H" formula="H" Z="1"><atom value="1.008" unit="g/mole"/></element>
+
+    <material name="Air" state="gas">
+      <D value="0.00120479" unit="g/cm3"/>
+      <fraction n="0.7" ref="N"/>
+      <fraction n="0.3" ref="O"/>
+      <property name="RINDEX" ref="RINDEX_AIR"/>
+      <property name="ABSLENGTH" ref="ABSLENGTH_AIR"/>
+    </material>
+
+    <material name="WLSMaterial" state="solid">
+      <D value="1.05" unit="g/cm3"/>
+      <fraction n="0.915" ref="C"/>
+      <fraction n="0.085" ref="H"/>
+      <property name="RINDEX" ref="RINDEX_WLS"/>
+      <property name="ABSLENGTH" ref="ABSLENGTH_WLS"/>
+      <property name="WLSABSLENGTH" ref="WLSABSLENGTH_WLS"/>
+      <property name="WLSCOMPONENT" ref="WLSCOMPONENT_WLS"/>
+      <property name="WLSTIMECONSTANT" ref="WLSTIMECONSTANT_WLS"/>
+    </material>
+
+    <material name="GlassMaterial" state="solid">
+      <D value="2.5" unit="g/cm3"/>
+      <fraction n="1.0" ref="O"/>
+      <property name="RINDEX" ref="RINDEX_GLASS"/>
+    </material>
+  </materials>
+
+  <solids>
+    <!--
+      Geometry: pencil beam along +Z
+        Source at z=-50mm in air, shooting +Z
+        WLS slab: 1mm thick box centered at z=0  (z from -0.5 to +0.5)
+        Detector: thin box at z=10mm
+
+      At 400nm input: WLSABSLENGTH=1.0mm, slab=1mm → P_abs = 1-exp(-1) = 63.2%
+      At 350nm input: WLSABSLENGTH=0.1mm, slab=1mm → P_abs = 1-exp(-10) ≈ 100%
+    -->
+    <box name="WorldBox" x="200" y="200" z="200" lunit="mm"/>
+    <box name="WLSSlab" x="100" y="100" z="1.0" lunit="mm"/>
+    <box name="DetectorBox" x="100" y="100" z="0.5" lunit="mm"/>
+
+    <opticalsurface name="DetSurface" type="0" model="1" finish="0" value="0">
+      <property name="EFFICIENCY" ref="EFFICIENCYDET"/>
+    </opticalsurface>
+  </solids>
+
+  <structure>
+    <volume name="WLS_logical">
+      <materialref ref="WLSMaterial"/>
+      <solidref ref="WLSSlab"/>
+    </volume>
+
+    <volume name="Detector_logical">
+      <materialref ref="GlassMaterial"/>
+      <solidref ref="DetectorBox"/>
+      <auxiliary auxtype="SensDet" auxvalue="PhotonDetector"/>
+    </volume>
+
+    <volume name="World_logical">
+      <materialref ref="Air"/>
+      <solidref ref="WorldBox"/>
+
+      <!-- WLS slab centered at z=0 -->
+      <physvol name="WLS_phys">
+        <volumeref ref="WLS_logical"/>
+        <position name="WLSpos" unit="mm" x="0" y="0" z="0"/>
+      </physvol>
+
+      <!-- Detector at z=10mm (behind the slab) -->
+      <physvol name="Detector_phys">
+        <volumeref ref="Detector_logical"/>
+        <position name="Detpos" unit="mm" x="0" y="0" z="10"/>
+      </physvol>
+    </volume>
+
+    <skinsurface name="DetSkinSurface" surfaceproperty="DetSurface">
+      <volumeref ref="Detector_logical"/>
+    </skinsurface>
+  </structure>
+
+  <setup name="Default" version="1.0">
+    <world ref="World_logical"/>
+  </setup>
+</gdml>
diff --git a/tests/geom/wls_test.gdml b/tests/geom/wls_test.gdml
new file mode 100644
index 000000000..984f99bf4
--- /dev/null
+++ b/tests/geom/wls_test.gdml
@@ -0,0 +1,144 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<gdml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+      xsi:noNamespaceSchemaLocation="">
+
+  <define>
+    <!-- Air refractive index (constant ~1.0) -->
+    <matrix coldim="2" name="RINDEX_AIR" values="1.55e-06 1.0 1.55e-05 1.0"/>
+    <matrix coldim="2" name="ABSLENGTH_AIR" values="1.55e-06 1.0e6 1.55e-05 1.0e6"/>
+
+    <!-- WLS material refractive index -->
+    <matrix coldim="2" name="RINDEX_WLS" values="1.55e-06 1.59 1.55e-05 1.59"/>
+
+    <!-- WLS regular absorption: transparent (WLS absorption does the work) -->
+    <matrix coldim="2" name="ABSLENGTH_WLS" values="1.55e-06 1.0e6 1.55e-05 1.0e6"/>
+
+    <!--
+      WLS absorption length (mm): absorbs UV strongly, transparent to visible.
+      Energy(MeV)   ~Wavelength    WLSABSLENGTH(mm)
+      1.77e-6       700nm          10000
+      2.07e-6       600nm          10000
+      2.48e-6       500nm          10000
+      2.76e-6       450nm          100
+      3.10e-6       400nm          1.0
+      3.54e-6       350nm          0.1
+      4.13e-6       300nm          0.01
+    -->
+    <matrix coldim="2" name="WLSABSLENGTH_WLS" values="1.77e-06 10000.0 2.07e-06 10000.0 2.48e-06 10000.0 2.76e-06 100.0 3.10e-06 1.0 3.54e-06 0.1 4.13e-06 0.01"/>
+
+    <!--
+      WLS emission spectrum: peaked in blue-green (around 450-500 nm).
+      Energy(MeV)   ~Wavelength   Intensity
+      1.77e-6       700nm         0.00
+      2.07e-6       600nm         0.02
+      2.25e-6       551nm         0.10
+      2.48e-6       500nm         0.50
+      2.58e-6       481nm         1.00 (peak)
+      2.70e-6       459nm         0.80
+      2.76e-6       449nm         0.50
+      2.88e-6       430nm         0.10
+      3.10e-6       400nm         0.00
+    -->
+    <matrix coldim="2" name="WLSCOMPONENT_WLS" values="1.77e-06 0.00 2.07e-06 0.02 2.25e-06 0.10 2.48e-06 0.50 2.58e-06 1.00 2.70e-06 0.80 2.76e-06 0.50 2.88e-06 0.10 3.10e-06 0.00"/>
+
+    <!-- WLS time constant: 0.5 ns -->
+    <matrix coldim="1" name="WLSTIMECONSTANT_WLS" values="0.5"/>
+
+    <!-- Detector shell refractive index (glass) -->
+    <matrix coldim="2" name="RINDEX_GLASS" values="1.55e-06 1.50 1.55e-05 1.50"/>
+
+    <!-- Detector efficiency: 100% -->
+    <matrix coldim="2" name="EFFICIENCYDET" values="1.55e-06 1.0 1.55e-05 1.0"/>
+  </define>
+
+  <materials>
+    <element name="N" formula="N" Z="7">
+      <atom value="14.007" unit="g/mole"/>
+    </element>
+    <element name="O" formula="O" Z="8">
+      <atom value="15.999" unit="g/mole"/>
+    </element>
+    <element name="C" formula="C" Z="6">
+      <atom value="12.011" unit="g/mole"/>
+    </element>
+    <element name="H" formula="H" Z="1">
+      <atom value="1.008" unit="g/mole"/>
+    </element>
+
+    <material name="Air" state="gas">
+      <D value="0.00120479" unit="g/cm3"/>
+      <fraction n="0.7" ref="N"/>
+      <fraction n="0.3" ref="O"/>
+      <property name="RINDEX" ref="RINDEX_AIR"/>
+      <property name="ABSLENGTH" ref="ABSLENGTH_AIR"/>
+    </material>
+
+    <material name="WLSMaterial" state="solid">
+      <D value="1.05" unit="g/cm3"/>
+      <fraction n="0.915" ref="C"/>
+      <fraction n="0.085" ref="H"/>
+      <property name="RINDEX" ref="RINDEX_WLS"/>
+      <property name="ABSLENGTH" ref="ABSLENGTH_WLS"/>
+      <property name="WLSABSLENGTH" ref="WLSABSLENGTH_WLS"/>
+      <property name="WLSCOMPONENT" ref="WLSCOMPONENT_WLS"/>
+      <property name="WLSTIMECONSTANT" ref="WLSTIMECONSTANT_WLS"/>
+    </material>
+
+    <material name="GlassMaterial" state="solid">
+      <D value="2.5" unit="g/cm3"/>
+      <fraction n="1.0" ref="O"/>
+      <property name="RINDEX" ref="RINDEX_GLASS"/>
+    </material>
+  </materials>
+
+  <solids>
+    <sphere name="WorldSphere" rmin="0" rmax="200" deltaphi="6.28318530718" deltatheta="3.14159265359" aunit="rad" lunit="mm"/>
+    <sphere name="WLSSphere" rmin="0" rmax="20" deltaphi="6.28318530718" deltatheta="3.14159265359" aunit="rad" lunit="mm"/>
+    <sphere name="DetectorOuter" rmin="0" rmax="30" deltaphi="6.28318530718" deltatheta="3.14159265359" aunit="rad" lunit="mm"/>
+    <sphere name="DetectorInner" rmin="0" rmax="28" deltaphi="6.28318530718" deltatheta="3.14159265359" aunit="rad" lunit="mm"/>
+    <subtraction name="DetectorShell">
+      <first ref="DetectorOuter"/>
+      <second ref="DetectorInner"/>
+    </subtraction>
+
+    <opticalsurface name="DetSurface" type="0" model="1" finish="0" value="0">
+      <property name="EFFICIENCY" ref="EFFICIENCYDET"/>
+    </opticalsurface>
+  </solids>
+
+  <structure>
+    <volume name="WLS_logical">
+      <materialref ref="WLSMaterial"/>
+      <solidref ref="WLSSphere"/>
+    </volume>
+
+    <volume name="Detector_logical">
+      <materialref ref="GlassMaterial"/>
+      <solidref ref="DetectorShell"/>
+      <auxiliary auxtype="SensDet" auxvalue="PhotonDetector"/>
+    </volume>
+
+    <volume name="World_logical">
+      <materialref ref="Air"/>
+      <solidref ref="WorldSphere"/>
+
+      <physvol name="WLS_phys">
+        <volumeref ref="WLS_logical"/>
+        <position name="WLSpos" unit="mm" x="0" y="0" z="0"/>
+      </physvol>
+
+      <physvol name="Detector_phys">
+        <volumeref ref="Detector_logical"/>
+        <position name="Detpos" unit="mm" x="0" y="0" z="0"/>
+      </physvol>
+    </volume>
+
+    <skinsurface name="DetSkinSurface" surfaceproperty="DetSurface">
+      <volumeref ref="Detector_logical"/>
+    </skinsurface>
+  </structure>
+
+  <setup name="Default" version="1.0">
+    <world ref="World_logical"/>
+  </setup>
+</gdml>
diff --git a/tests/test_wavelength_shifting.sh b/tests/test_wavelength_shifting.sh
new file mode 100755
index 000000000..74278fec8
--- /dev/null
+++ b/tests/test_wavelength_shifting.sh
@@ -0,0 +1,257 @@
+#!/bin/bash
+#
+# test_wavelength_shifting.sh
+# ============================
+# End-to-end test: GPU vs G4 wavelength shifting physics
+#
+# Fires 10000 UV photons (350nm) from outside a WLS sphere into a scattering
+# medium. Compares GPU (opticks) and G4 hit counts, WLS conversion fraction,
+# and shifted-photon wavelength / arrival-time distributions (two-sample KS tests).
+#
+# Geometry: tests/geom/wls_scatter_viz.gdml
+#   - WLS sphere r=10mm (absorbs UV, re-emits visible)
+#   - Scattering medium (Rayleigh, 10mm mean free path)
+#   - Detector shell r=30mm (100% efficiency)
+#
+# Usage:
+#   ./tests/test_wavelength_shifting.sh [seed]
+#
+# Exit code 0 = PASS, 1 = FAIL
+#
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+SEED=${1:-42}
+NUMPHOTON=10000
+GEOM="$REPO_DIR/tests/geom/wls_scatter_viz.gdml"
+CONFIG="wls_scatter_viz"
+
+source /opt/eic-opticks/eic-opticks-env.sh 2>/dev/null || true
+export OPTICKS_MAX_BOUNCE=100
+export OPTICKS_EVENT_MODE=HitPhoton
+export OPTICKS_MAX_SLOT=100000
+
+echo "=============================================="
+echo " WLS Test: GPU vs G4 Wavelength Shifting"
+echo "=============================================="
+echo "  Geometry: $GEOM"
+echo "  Photons:  $NUMPHOTON (350nm UV)"
+echo "  Seed:     $SEED"
+echo ""
+
+# --- GPU run ---
+echo "[GPU] Running GPUPhotonSourceMinimal..."
+GPU_OUT=$(/opt/eic-opticks/bin/GPUPhotonSourceMinimal \
+    -g "$GEOM" -c "$CONFIG" -m "$REPO_DIR/tests/run.mac" -s "$SEED" 2>&1)
+GPU_HITS=$(echo "$GPU_OUT" | grep "Opticks: NumHits" | head -1 | awk '{print $NF}')
+echo "[GPU] Hits: $GPU_HITS"
+
+GPU_HIT_FILE="/tmp/MISSING_USER/opticks/GEOM/GEOM/GPUPhotonSourceMinimal/ALL0_no_opticks_event_name/A000/hit.npy"
+
+# --- G4 run ---
+echo "[G4]  Running StandAloneGeant4Validation..."
+G4_OUT=$(/opt/eic-opticks/bin/StandAloneGeant4Validation \
+    -g "$GEOM" -c "$CONFIG" -s "$SEED" 2>&1)
+G4_HITS=$(echo "$G4_OUT" | grep "Total accumulated hits" | awk '{print $NF}')
+echo "[G4]  Hits: $G4_HITS"
+
+G4_HIT_FILE="g4_hits.npy"
+
+# --- Compare ---
+echo ""
+echo "[COMPARE] Analyzing wavelength and time distributions..."
+echo ""
+
+python3 - "$GPU_HIT_FILE" "$G4_HIT_FILE" "$GPU_HITS" "$G4_HITS" << 'PYEOF'
+import sys
+import numpy as np
+
+gpu_hit_file = sys.argv[1]
+g4_hit_file = sys.argv[2]
+gpu_nhits = int(sys.argv[3])
+g4_nhits = int(sys.argv[4])
+
+gpu = np.load(gpu_hit_file).reshape(-1, 4, 4)
+g4 = np.load(g4_hit_file).reshape(-1, 4, 4)
+
+gpu_wl = gpu[:, 2, 3]
+g4_wl = g4[:, 2, 3]
+gpu_time = gpu[:, 0, 3]
+g4_time = g4[:, 0, 3]
+
+PASS = True
+ALPHA = 0.001  # significance level (tolerates minor ICDF interpolation difference)
+
+
+def chi2_test(h_obs, h_exp, label):
+    """Chi-squared comparison of two identically binned histograms.
+
+    h_exp is rescaled to the total of h_obs and only bins with more than 5
+    expected entries are used. Returns (chi2, ndf, p_value, passed), where
+    passed means p_value >= ALPHA; with fewer than 2 usable bins the
+    comparison is skipped and (0, 0, 1.0, True) is returned.
+    """
+    import math  # local import: keeps the function independent of import order
+    # Scale expected to match observed total
+    scale = h_obs.sum() / h_exp.sum() if h_exp.sum() > 0 else 1.0
+    h_exp_scaled = h_exp * scale
+
+    # Only use bins with sufficient statistics (>5 expected)
+    mask = h_exp_scaled > 5
+    if mask.sum() < 2:
+        print(f"  {label}: Too few bins with sufficient stats")
+        return 0, 0, 1.0, True
+
+    obs = h_obs[mask].astype(float)
+    exp = h_exp_scaled[mask].astype(float)
+    chi2 = np.sum((obs - exp) ** 2 / exp)
+    ndf = mask.sum() - 1  # always >= 1 here because of the guard above
+    # Wilson-Hilferty approximation: map chi2/ndf to a standard-normal z, then
+    # take the upper-tail probability 1 - Phi(z) as the p-value.
+    z = ((chi2 / ndf) ** (1.0 / 3) - (1 - 2.0 / (9 * ndf))) / np.sqrt(2.0 / (9 * ndf))
+    p = 0.5 * (1.0 + math.erf(-z / np.sqrt(2)))
+    return chi2, ndf, p, p >= ALPHA
+
+
+def ks_test(a, b):
+    """Two-sample Kolmogorov-Smirnov test; returns (D, approximate p-value)."""
+    xs = np.sort(a)
+    ys = np.sort(b)
+    # Evaluate both empirical CDFs at every pooled sample value.
+    grid = np.sort(np.concatenate([xs, ys]))
+    ecdf_x = np.searchsorted(xs, grid, side='right') / len(xs)
+    ecdf_y = np.searchsorted(ys, grid, side='right') / len(ys)
+    d = np.abs(ecdf_x - ecdf_y).max()
+    n_eff = np.sqrt(len(xs) * len(ys) / (len(xs) + len(ys)))
+    # p is approximated by 2*exp(-2*(n_eff*d)^2), capped at 1.
+    return d, min(2.0 * np.exp(-2.0 * (n_eff * d) ** 2), 1.0)
+
+
+# -------------------------------------------------------
+# Test 1: Hit count comparison
+# -------------------------------------------------------
+print("=" * 55)
+print("  TEST 1: Hit Count")
+print("=" * 55)
+print(f"  GPU: {len(gpu)}")
+print(f"  G4:  {len(g4)}")
+import math
+sigma = math.sqrt(len(gpu) + len(g4))
+z = abs(len(gpu) - len(g4)) / sigma if sigma > 0 else 0
+print(f"  |Z| = {z:.1f}σ")
+t1_pass = z < 5
+status = "PASS" if t1_pass else "FAIL"
+print(f"  Result: {status} (threshold: 5σ)")
+PASS = PASS and t1_pass
+
+
+# -------------------------------------------------------
+# Test 2: WLS conversion fraction
+# -------------------------------------------------------
+print()
+print("=" * 55)
+print("  TEST 2: WLS Conversion Fraction")
+print("=" * 55)
+WLS_THRESHOLD = 380  # nm
+
+gpu_frac = np.mean(gpu_wl > WLS_THRESHOLD)
+g4_frac = np.mean(g4_wl > WLS_THRESHOLD)
+frac_diff = abs(gpu_frac - g4_frac)
+
+print(f"  GPU shifted: {100*gpu_frac:.1f}%")
+print(f"  G4  shifted: {100*g4_frac:.1f}%")
+print(f"  |Difference|: {100*frac_diff:.2f}%")
+t2_pass = frac_diff < 0.03  # 3% tolerance
+status = "PASS" if t2_pass else "FAIL"
+print(f"  Result: {status} (threshold: 3%)")
+PASS = PASS and t2_pass
+
+
+# Pre-compute shifted/unshifted arrays
+gpu_shifted = gpu_wl[gpu_wl > WLS_THRESHOLD]
+g4_shifted = g4_wl[g4_wl > WLS_THRESHOLD]
+
+# -------------------------------------------------------
+# Test 3: Shifted wavelength spectrum (KS test)
+# -------------------------------------------------------
+print()
+print("=" * 55)
+print("  TEST 3: Shifted Wavelength Spectrum (KS Test)")
+print("=" * 55)
+
+if len(gpu_shifted) > 10 and len(g4_shifted) > 10:
+    d, p3 = ks_test(gpu_shifted, g4_shifted)
+    print(f"  GPU shifted: N={len(gpu_shifted)}, mean={gpu_shifted.mean():.1f}nm")
+    print(f"  G4  shifted: N={len(g4_shifted)}, mean={g4_shifted.mean():.1f}nm")
+    print(f"  KS D={d:.6f}  p={p3:.4f}")
+    t3_pass = p3 >= ALPHA
+else:
+    print("  Too few shifted photons for KS test")
+    t3_pass = True
+
+status = "PASS" if t3_pass else "FAIL"
+print(f"  Result: {status} (threshold: p > {ALPHA})")
+PASS = PASS and t3_pass
+
+
+# -------------------------------------------------------
+# Test 4: Arrival time for shifted photons (KS test)
+# -------------------------------------------------------
+print()
+print("=" * 55)
+print("  TEST 4: Shifted Photon Arrival Time (KS Test)")
+print("=" * 55)
+
+# Compare shifted photon times — these include WLS exponential delay + transport
+# With the G4 WLS time profile set to "exponential", distributions should match
+gpu_shifted_t = gpu_time[gpu_wl > WLS_THRESHOLD]
+g4_shifted_t = g4_time[g4_wl > WLS_THRESHOLD]
+
+print(f"  GPU shifted: N={len(gpu_shifted_t)}, mean={gpu_shifted_t.mean():.3f}ns, std={gpu_shifted_t.std():.3f}ns")
+print(f"  G4  shifted: N={len(g4_shifted_t)}, mean={g4_shifted_t.mean():.3f}ns, std={g4_shifted_t.std():.3f}ns")
+print(f"  Std ratio: {gpu_shifted_t.std()/g4_shifted_t.std():.3f} (expect ~1.0)")
+
+if len(gpu_shifted_t) > 10 and len(g4_shifted_t) > 10:
+    d_t, p_t = ks_test(gpu_shifted_t, g4_shifted_t)
+    print(f"  KS D={d_t:.6f}  p={p_t:.4f}")
+    t4_pass = p_t >= ALPHA
+else:
+    print("  Too few shifted photons for KS test")
+    t4_pass = True
+
+# Also check unshifted time (pure transport, no WLS delay)
+gpu_unshifted_t = gpu_time[gpu_wl <= WLS_THRESHOLD]
+g4_unshifted_t = g4_time[g4_wl <= WLS_THRESHOLD]
+print(f"  Unshifted time: GPU mean={gpu_unshifted_t.mean():.3f}ns  G4 mean={g4_unshifted_t.mean():.3f}ns")
+
+status = "PASS" if t4_pass else "FAIL"
+print(f"  Result: {status} (KS p > {ALPHA})")
+PASS = PASS and t4_pass
+
+
+# -------------------------------------------------------
+# Summary
+# -------------------------------------------------------
+print()
+print("=" * 55)
+print("  SUMMARY")
+print("=" * 55)
+tests = [
+    ("Hit count",             t1_pass),
+    ("WLS fraction",          t2_pass),
+    ("Shifted wavelength KS", t3_pass),
+    ("Shifted time KS",       t4_pass),
+]
+for name, passed in tests:
+    print(f"  {name:>25s}: {'PASS' if passed else 'FAIL'}")
+
+print()
+if PASS:
+    print("  *** ALL TESTS PASSED ***")
+    sys.exit(0)
+else:
+    print("  *** SOME TESTS FAILED ***")
+    sys.exit(1)
+PYEOF
diff --git a/tools/generate_precooked_rng.cu b/tools/generate_precooked_rng.cu
new file mode 100644
index 000000000..79cb2910d
--- /dev/null
+++ b/tools/generate_precooked_rng.cu
@@ -0,0 +1,113 @@
+/**
+generate_precooked_rng.cu
+==========================
+
+Generates precooked curand Philox sequences for U4Random aligned mode.
+Each photon gets its own random stream matching the GPU simulation.
+
+Build:
+    nvcc -o generate_precooked_rng tools/generate_precooked_rng.cu \
+         -I. -I/opt/eic-opticks/include/eic-opticks -lcurand -std=c++17
+
+Usage:
+    ./generate_precooked_rng [num_photons] [num_randoms_per_photon]
+    Defaults: 100000 photons, 256 randoms each (nj=16, nk=16)
+
+Output:
+    ~/.opticks/precooked/QSimTest/rng_sequence/
+        rng_sequence_f_ni<NI>_nj<NJ>_nk<NK>_tranche<NI>/
+            rng_sequence_f_ni<NI>_nj<NJ>_nk<NK>_ioffset000000.npy
+*/
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <sys/stat.h>
+
+#include <curand_kernel.h>
+#include "sysrap/NP.hh"
+
+// Fills row idx of out (ni rows of nv floats) with the first nv uniforms of the
+// Philox stream whose subsequence is id_offset + idx.
+__global__ void generate_sequences(float* out, unsigned ni, unsigned nv, unsigned id_offset)
+{
+    unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= ni) return;  // surplus threads beyond the ni rows do nothing
+    // Match GPU simulation: curand_init(seed=0, subsequence=photon_idx, offset=0)
+    unsigned photon_idx = id_offset + idx;
+    curandStatePhilox4_32_10_t rng;
+    curand_init(0ULL, (unsigned long long)photon_idx, 0ULL, &rng);
+
+    // Widen before multiplying: idx * nv wraps in 32 bits once ni * nv exceeds 2^32.
+    float* row = out + (size_t)idx * nv;
+    for (unsigned j = 0; j < nv; j++) row[j] = curand_uniform(&rng);
+}
+
+// Equivalent of `mkdir -p`: creates every '/'-terminated prefix of path, then path itself.
+// mkdir() return values are not checked; paths of 1024+ bytes are truncated by snprintf.
+static void mkdirp(const char* path)
+{
+    char buf[1024];
+    snprintf(buf, sizeof(buf), "%s", path);
+    for (char* cur = buf + 1; *cur != '\0'; ++cur)
+        if (*cur == '/') { *cur = '\0'; mkdir(buf, 0755); *cur = '/'; }
+    mkdir(buf, 0755);
+}
+
+// Entry point: generate_precooked_rng [num_photons] [num_randoms_per_photon].
+// Returns 0 after saving the (ni, nj, nk) array, 1 on zero counts, unset HOME or a CUDA error.
+int main(int argc, char** argv)
+{
+    unsigned ni = 100000;
+    unsigned nj = 16;
+    unsigned nk = 16;
+    if (argc > 1) ni = atoi(argv[1]);
+    if (argc > 2)
+    {
+        // nj = largest divisor f of total with f <= 64 and f*f <= total (1 if none); f <= 64 also stops f*f wrapping.
+        unsigned total = atoi(argv[2]);
+        nj = 1; nk = total;
+        for (unsigned f = 2; f <= 64 && f * f <= total; f++)
+            if (total % f == 0) { nj = f; nk = total / f; }
+    }
+
+    unsigned nv = nj * nk;
+    // atoi yields 0 for non-numeric input; zero counts leave nothing to generate.
+    if (ni == 0 || nv == 0) { fprintf(stderr, "num_photons and num_randoms_per_photon must be > 0\n"); return 1; }
+    printf("Generating precooked curand Philox sequences:\n");
+    printf("  photons: %u, randoms/photon: %u (nj=%u, nk=%u), memory: %.1f MB\n",
+           ni, nv, nj, nk, (double)ni * nv * sizeof(float) / (1024 * 1024));
+
+    const char* home = getenv("HOME");
+    if (home == nullptr) { fprintf(stderr, "HOME is not set\n"); return 1; }  // "%s" with NULL is undefined
+    char dirpath[512], filename[256], fullpath[768];
+    snprintf(dirpath, sizeof(dirpath),
+        "%s/.opticks/precooked/QSimTest/rng_sequence/rng_sequence_f_ni%u_nj%u_nk%u_tranche%u",
+        home, ni, nj, nk, ni);
+    mkdirp(dirpath);
+    snprintf(filename, sizeof(filename),
+        "rng_sequence_f_ni%u_nj%u_nk%u_ioffset%06u.npy", ni, nj, nk, 0);
+    snprintf(fullpath, sizeof(fullpath), "%s/%s", dirpath, filename);
+
+    const size_t nbytes = (size_t)ni * nv * sizeof(float);
+    float* d_out = nullptr;
+    cudaError_t err = cudaMalloc(&d_out, nbytes);
+    if (err != cudaSuccess) { fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err)); return 1; }
+    unsigned threads = 256;
+    unsigned blocks = (ni + threads - 1) / threads;
+    generate_sequences<<<blocks, threads>>>(d_out, ni, nv, 0);
+    err = cudaGetLastError();                               // launch failures
+    if (err == cudaSuccess) err = cudaDeviceSynchronize();  // failures while the kernel ran
+    if (err != cudaSuccess) { fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err)); cudaFree(d_out); return 1; }
+
+    NP* seq = NP::Make<float>(ni, nj, nk);
+    err = cudaMemcpy(seq->values<float>(), d_out, nbytes, cudaMemcpyDeviceToHost);
+    cudaFree(d_out);
+    if (err != cudaSuccess) { fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err)); delete seq; return 1; }
+    seq->save(fullpath);
+    printf("Saved: %s\n", fullpath);
+    printf("Set OPTICKS_RANDOM_SEQPATH=%s\n", fullpath);
+    delete seq;
+    return 0;
+}
diff --git a/u4/CMakeLists.txt b/u4/CMakeLists.txt
index 3a18458d9..868af18b9 100644
--- a/u4/CMakeLists.txt
+++ b/u4/CMakeLists.txt
@@ -69,6 +69,7 @@ set(HEADERS
     U4Material.hh
     U4Mat.h
     U4Scint.h
+    U4WLS.h
 
     U4Volume.h
     U4Surface.h
diff --git a/u4/U4Tree.h b/u4/U4Tree.h
index c7495afad..b92865f67 100644
--- a/u4/U4Tree.h
+++ b/u4/U4Tree.h
@@ -82,6 +82,7 @@ controlled via envvar::
 
 #include "U4Mesh.h"
 #include "U4Scint.h"
+#include "U4WLS.h"
 
 #include "U4Solid.h"
 #include "U4PhysicsTable.h"
@@ -112,6 +113,7 @@ struct U4Tree
     std::vector<const G4VSolid*>                solids ;
     U4PhysicsTable<G4OpRayleigh>*               rayleigh_table ;
     U4Scint*                                    scint ;
+    U4WLS *wls;
 
     // disable the below with settings with by defining the below envvar
     static constexpr const char* __DISABLE_OSUR_IMPLICIT = "U4Tree__DISABLE_OSUR_IMPLICIT" ;
@@ -152,6 +154,7 @@ struct U4Tree
     void initMaterial(const G4Material* const mt);
 
     void initScint();
+    void initWLS();
     void initSurfaces();
 
     void initSolids();
@@ -262,6 +265,7 @@ inline U4Tree::U4Tree(
     num_surface_standard(-1),
     rayleigh_table(CreateRayleighTable()),
     scint(nullptr),
+    wls(nullptr),
     enable_osur(!ssys::getenvbool(__DISABLE_OSUR_IMPLICIT)),
     enable_isur(!ssys::getenvbool(__DISABLE_ISUR_IMPLICIT)),
     material_debug(ssys::getenvint(__MATERIAL_DEBUG,0)),
@@ -292,6 +296,9 @@ inline void U4Tree::init()
     LOG(LEVEL) << "-initScint" ;
     initScint();
 
+    LOG(LEVEL) << "-initWLS";
+    initWLS();
+
     LOG(LEVEL) << "-initSurfaces" ;
     initSurfaces();
 
@@ -384,6 +391,28 @@ inline void U4Tree::initScint()
     }
 }
 
+/**
+U4Tree::initWLS
+------------------
+
+Scans all G4 materials for WLS properties (WLSCOMPONENT, WLSTIMECONSTANT).
+Creates inverse CDF texture data and material mapping for GPU-side WLS
+wavelength sampling. Stored in st->standard for serialization and upload.
+
+**/
+
+inline void U4Tree::initWLS()
+{
+    wls = U4WLS::Create(st->material, materials);
+    if (wls == nullptr) return; // Create found no material with WLSCOMPONENT
+
+    // Expose the three arrays through st->standard.
+    st->standard->wls_icdf = wls->icdf;
+    st->standard->wls_mat_map = wls->mat_map;
+    st->standard->wls_time_constants = wls->time_constants;
+    LOG(LEVEL) << wls->desc();
+}
+
 /**
 U4Tree::CreateRayleighTable
 ----------------------------
diff --git a/u4/U4WLS.h b/u4/U4WLS.h
new file mode 100644
index 000000000..c915ae92d
--- /dev/null
+++ b/u4/U4WLS.h
@@ -0,0 +1,209 @@
+#pragma once
+/**
+U4WLS.h : Wavelength Shifting ICDF Creation
+===============================================
+
+Creates inverse CDF textures for wavelength shifting (WLS) materials,
+analogous to U4Scint.h for scintillation. Supports multiple WLS materials
+by stacking ICDF rows into a single texture.
+
+For each material with a WLSCOMPONENT property:
+1. Integrates the emission spectrum to get a CDF
+2. Inverts it at 4096 uniformly-spaced CDF values (3 resolutions for HD)
+3. Extracts WLSTIMECONSTANT from the material properties table
+
+The output arrays:
+- icdf: shape (num_wls_mat*3, 4096, 1) — stacked HD ICDF rows
+- mat_map: shape (num_total_mat,) int — maps material index to WLS row (-1 = no WLS)
+- time_constants: shape (num_wls_mat,) float — per-WLS-material time constant
+
+The G4 WLS process (G4OpWLS) uses these material properties:
+- WLSABSLENGTH: absorption length as f(energy) — handled via boundary texture
+- WLSCOMPONENT: emission spectrum as f(energy) — converted to ICDF here
+- WLSTIMECONSTANT: re-emission time delay (scalar) — extracted here
+
+**/
+
+#include <cassert>
+#include <iomanip>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "G4Material.hh"
+#include "G4MaterialPropertiesTable.hh"
+#include "G4MaterialPropertyVector.hh"
+#include "G4PhysicalConstants.hh"
+#include "G4SystemOfUnits.hh"
+
+#include "NP.hh"
+#include "NPFold.h"
+#include "SLOG.hh"
+#include "U4MaterialPropertyVector.h"
+#include "U4Scint.h" // reuse Integral and CreateGeant4InterpolatedInverseCDF
+
+struct U4WLS
+{
+    static constexpr const char *WLSCOMPONENT_KEY = "WLSCOMPONENT";       // emission spectrum property
+    static constexpr const char *WLSTIMECONSTANT_KEY = "WLSTIMECONSTANT"; // optional const property
+    // Factory: returns nullptr when no material in mats has a WLSCOMPONENT property.
+    static U4WLS *Create(const NPFold *materials, const std::vector<const G4Material *> &mats);
+    // Arrays built by the constructor; this struct declares no destructor to release them.
+    const NP *icdf;           // (num_wls*3, 4096, 1) stacked HD ICDF for all WLS materials
+    const NP *mat_map;        // (num_total_mat,) int: material idx -> base ICDF row, or -1
+    const NP *time_constants; // (num_wls,) float: time constant per WLS material
+
+    unsigned num_wls; // number of WLS materials (wls_indices.size())
+    unsigned num_mat; // size of the full material list (mats.size())
+    // The three vectors after mats are parallel: one entry per WLS material.
+    U4WLS(const std::vector<const G4Material *> &mats, const std::vector<int> &wls_indices,
+          const std::vector<const G4MaterialPropertyVector *> &wls_components,
+          const std::vector<double> &wls_time_consts);
+
+    std::string desc() const; // one-line summary: counts plus each array's sstr()
+};
+
+/**
+U4WLS::Create
+---------------
+
+Scans all materials for WLSCOMPONENT property. For each material
+that has it, extracts the emission spectrum and time constant.
+
+Returns nullptr if no WLS materials are found.
+
+**/
+
+inline U4WLS *U4WLS::Create(const NPFold *materials, const std::vector<const G4Material *> &mats)
+{
+    // Parallel vectors: one entry per material that carries a WLSCOMPONENT spectrum.
+    std::vector<int> found_idx;
+    std::vector<const G4MaterialPropertyVector *> found_spectrum;
+    std::vector<double> found_tau;
+
+    // Only the Geant4 material list is scanned; `materials` is not consulted here.
+    const unsigned count = mats.size();
+    for (unsigned m = 0; m < count; ++m)
+    {
+        G4MaterialPropertiesTable *table = mats[m]->GetMaterialPropertiesTable();
+        G4MaterialPropertyVector *spectrum = table ? table->GetProperty(WLSCOMPONENT_KEY) : nullptr;
+        if (spectrum == nullptr)
+        {
+            continue; // no properties table, or no emission spectrum
+        }
+
+        // WLSTIMECONSTANT is optional: without it the time constant stays 0.
+        // Dividing by the ns unit expresses the stored value in nanoseconds.
+        double tau = 0.0;
+        const bool has_tau = table->ConstPropertyExists(WLSTIMECONSTANT_KEY);
+        if (has_tau)
+            tau = table->GetConstProperty(WLSTIMECONSTANT_KEY) / ns;
+
+        found_idx.push_back(m);
+        found_spectrum.push_back(spectrum);
+        found_tau.push_back(tau);
+    }
+
+    // Nothing to build when no material has a WLS emission spectrum.
+    if (found_idx.empty())
+        return nullptr;
+    return new U4WLS(mats, found_idx, found_spectrum, found_tau);
+}
+
+/**
+U4WLS::U4WLS
+--------------
+
+Builds the ICDF texture data and material mapping arrays.
+
+For each WLS material:
+1. Integrate WLSCOMPONENT to get CDF (reuses U4Scint::Integral)
+2. Build 3-layer HD ICDF (reuses U4Scint::CreateGeant4InterpolatedInverseCDF)
+3. Stack into combined ICDF array
+
+**/
+
+inline U4WLS::U4WLS(const std::vector<const G4Material *> &mats, const std::vector<int> &wls_indices,
+                    const std::vector<const G4MaterialPropertyVector *> &wls_components,
+                    const std::vector<double> &wls_time_consts)
+    : icdf(nullptr), mat_map(nullptr), time_constants(nullptr), num_wls(wls_indices.size()), num_mat(mats.size())
+{
+    // wls_indices, wls_components and wls_time_consts are parallel: entry k of each
+    // describes the k-th WLS material.
+    assert(num_wls > 0);
+    assert(wls_components.size() == num_wls);
+    assert(wls_time_consts.size() == num_wls);
+
+    // Parameters forwarded to U4Scint::CreateGeant4InterpolatedInverseCDF.
+    int num_bins = 4096;
+    int hd_factor = 20;
+    unsigned block_len = 3 * num_bins * 1; // values per material: 3 layers x num_bins x 1
+
+    // Step 1: one (3, num_bins, 1) inverse CDF per WLS material.
+    std::vector<const NP *> per_material;
+    for (unsigned k = 0; k < num_wls; ++k)
+    {
+        const char *label = mats[wls_indices[k]]->GetName().c_str();
+
+        // Integrate the emission spectrum to get the CDF, then build the
+        // 3-layer HD ICDF from it (wavelength values in nm).
+        G4MaterialPropertyVector *cumulative = U4Scint::Integral(wls_components[k]);
+        NP *layered = U4Scint::CreateGeant4InterpolatedInverseCDF(cumulative, num_bins, hd_factor, label,
+                                                                  false /*energy_not_wavelength*/);
+
+        assert(layered);
+        assert(layered->has_shape(3, num_bins, 1));
+        per_material.push_back(layered);
+    }
+
+    // Step 2: copy the per-material blocks back to back into one
+    // (num_wls*3, num_bins, 1) array of doubles.
+    NP *all_icdf = NP::Make<double>(num_wls * 3, num_bins, 1);
+    double *out = all_icdf->values<double>();
+    for (unsigned k = 0; k < num_wls; ++k)
+    {
+        const double *from = per_material[k]->cvalues<double>();
+        double *to = out + k * block_len;
+        memcpy(to, from, block_len * sizeof(double));
+    }
+
+    // Record the construction parameters as metadata of the stacked array.
+    all_icdf->set_meta<int>("hd_factor", hd_factor);
+    all_icdf->set_meta<int>("num_bins", num_bins);
+    all_icdf->set_meta<int>("num_wls", num_wls);
+    icdf = all_icdf;
+
+    // Step 3: material index -> base row of that material in the stacked
+    // ICDF (0, 3, 6, ...), or -1 for a material without WLS.
+    NP *lookup = NP::Make<int>(num_mat);
+    int *row_of = lookup->values<int>();
+    for (unsigned m = 0; m < num_mat; ++m)
+    {
+        row_of[m] = -1;
+    }
+
+    // Then point each WLS material at its own block.
+    for (unsigned k = 0; k < num_wls; ++k)
+    {
+        row_of[wls_indices[k]] = k * 3; // first of this material's 3 HD layers
+    }
+    mat_map = lookup;
+
+    // Step 4: per-WLS-material time constants, narrowed from double to float.
+    NP *taus = NP::Make<float>(num_wls);
+    float *tau_of = taus->values<float>();
+    for (unsigned k = 0; k < num_wls; ++k)
+    {
+        tau_of[k] = float(wls_time_consts[k]);
+    }
+    time_constants = taus;
+}
+
+inline std::string U4WLS::desc() const
+{
+    const auto shape = [](const NP *a) { return a ? a->sstr() : std::string("-"); }; // "-" when unset
+    std::stringstream out;
+    out << "U4WLS::desc num_wls " << num_wls << " num_mat " << num_mat;
+    out << " icdf " << shape(icdf) << " mat_map " << shape(mat_map) << " time_constants " << shape(time_constants);
+    return out.str();
+}