diff --git a/optiphy/ana/compare_aligned.py b/optiphy/ana/compare_aligned.py
new file mode 100644
index 000000000..b044ce6f2
--- /dev/null
+++ b/optiphy/ana/compare_aligned.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+"""
+compare_aligned.py - Photon-by-photon comparison of GPU vs G4 aligned simulations.
+
+Usage:
+    python compare_aligned.py <gpu_photon.npy> <g4_photon.npy>
+
+Performs:
+  1. Per-photon flag comparison (exact match rate)
+  2. Position comparison at multiple thresholds
+  3. Chi-squared test on flag distributions (gold-standard validation metric)
+  4. Glancing-angle photon identification (normal sign ambiguity)
+  5. Divergent photon listing
+"""
+import sys
+import numpy as np
+
+FLAG_NAMES = {
+    0x0004: "TORCH", 0x0008: "BULK_ABSORB", 0x0010: "BULK_REEMIT",
+    0x0020: "BULK_SCATTER", 0x0040: "SURFACE_DETECT", 0x0080: "SURFACE_ABSORB",
+    0x0100: "SURFACE_DREFLECT", 0x0200: "SURFACE_SREFLECT",
+    0x0400: "BOUNDARY_REFLECT", 0x0800: "BOUNDARY_TRANSMIT", 0x8000: "MISS",
+}
+
+def flag_name(f):
+    return FLAG_NAMES.get(f, f"0x{f:04x}")
+
+def extract_flag(photon):
+    """Extract flag from q3.x (orient_boundary_flag) - lower 16 bits."""
+    q3 = photon.view(np.uint32).reshape(-1, 4, 4)
+    return q3[:, 3, 0] & 0xFFFF
+
+def chi2_flag_distribution(gpu_flags, g4_flags):
+    """
+    Chi-squared comparison of flag distributions.
+
+    Compares the frequency of each flag value between GPU and G4.
+    This is the opticks gold-standard validation metric.
+
+    Returns (chi2, ndof, flags_used, gpu_counts, g4_counts).
+    """
+    all_flags = sorted(set(gpu_flags) | set(g4_flags))
+    gpu_counts = np.array([(gpu_flags == f).sum() for f in all_flags], dtype=float)
+    g4_counts  = np.array([(g4_flags == f).sum() for f in all_flags], dtype=float)
+
+    total = gpu_counts + g4_counts
+    mask = total > 0
+    gpu_c = gpu_counts[mask]
+    g4_c  = g4_counts[mask]
+    tot   = total[mask]
+    flags_used = [f for f, m in zip(all_flags, mask) if m]
+
+    n_gpu = gpu_c.sum()
+    n_g4  = g4_c.sum()
+    expected_gpu = tot * n_gpu / (n_gpu + n_g4)
+    expected_g4  = tot * n_g4  / (n_gpu + n_g4)
+
+    chi2 = 0.0
+    for i in range(len(flags_used)):
+        if expected_gpu[i] > 0:
+            chi2 += (gpu_c[i] - expected_gpu[i])**2 / expected_gpu[i]
+        if expected_g4[i] > 0:
+            chi2 += (g4_c[i] - expected_g4[i])**2 / expected_g4[i]
+
+    ndof = max(len(flags_used) - 1, 1)
+    return chi2, ndof, flags_used, gpu_c, g4_c
+
+def identify_glancing(gpu, g4):
+    """
+    Identify glancing-angle photons where the normal sign ambiguity
+    causes momentum negation between GPU and G4.
+
+    At glancing incidence cos(theta) ~ 0, float32 vs float64 can produce
+    opposite normal signs, reflecting the photon in the opposite direction.
+    These photons have matching flags but very different positions.
+
+    Returns boolean mask of glancing photons.
+    """
+    gpu_mom = gpu[:, 1, :3]
+    g4_mom  = g4[:, 1, :3]
+
+    # Normalize momenta (should already be unit vectors, but be safe)
+    gpu_norm = np.linalg.norm(gpu_mom, axis=1, keepdims=True)
+    g4_norm  = np.linalg.norm(g4_mom, axis=1, keepdims=True)
+    gpu_norm[gpu_norm == 0] = 1
+    g4_norm[g4_norm == 0] = 1
+
+    gpu_hat = gpu_mom / gpu_norm
+    g4_hat  = g4_mom / g4_norm
+
+    # Dot product of momentum directions: -1 = fully negated (normal flip)
+    mom_dot = np.sum(gpu_hat * g4_hat, axis=1)
+
+    # Glancing: momentum vectors are nearly anti-parallel (dot ~ -1)
+    glancing = mom_dot < -0.5
+    return glancing, mom_dot
+
+def main():
+    if len(sys.argv) < 3:
+        print(f"Usage: {sys.argv[0]} <gpu_photon.npy> <g4_photon.npy>")
+        sys.exit(1)
+
+    gpu = np.load(sys.argv[1])
+    g4  = np.load(sys.argv[2])
+
+    print(f"GPU shape: {gpu.shape}")
+    print(f"G4  shape: {g4.shape}")
+
+    n = min(len(gpu), len(g4))
+    gpu = gpu[:n]
+    g4  = g4[:n]
+
+    gpu_flags = extract_flag(gpu)
+    g4_flags  = extract_flag(g4)
+
+    # ---- 1. Per-photon flag comparison ----
+    match = gpu_flags == g4_flags
+    n_match = match.sum()
+    n_diff = n - n_match
+    print(f"\n{'='*60}")
+    print(f"FLAG COMPARISON ({n} photons)")
+    print(f"{'='*60}")
+    print(f"  Matching: {n_match} ({100*n_match/n:.1f}%)")
+    print(f"  Differ:   {n_diff} ({100*n_diff/n:.1f}%)")
+
+    # ---- 2. Position comparison ----
+    gpu_pos = gpu[:, 0, :3]
+    g4_pos  = g4[:, 0, :3]
+    pos_diff = np.linalg.norm(gpu_pos - g4_pos, axis=1)
+    zero_g4 = np.all(g4_pos == 0, axis=1)
+
+    valid = ~zero_g4
+    n_valid = valid.sum()
+    print(f"\n{'='*60}")
+    print(f"POSITION COMPARISON ({n_valid} valid, {zero_g4.sum()} unrecorded)")
+    print(f"{'='*60}")
+    if n_valid > 0:
+        vdiff = pos_diff[valid]
+        print(f"  Mean dist:   {vdiff.mean():.4f} mm")
+        print(f"  Max dist:    {vdiff.max():.4f} mm")
+        print(f"  < 0.01 mm:   {(vdiff < 0.01).sum()} ({100*(vdiff < 0.01).sum()/n_valid:.1f}%)")
+        print(f"  < 0.1 mm:    {(vdiff < 0.1).sum()} ({100*(vdiff < 0.1).sum()/n_valid:.1f}%)")
+        print(f"  < 1.0 mm:    {(vdiff < 1.0).sum()} ({100*(vdiff < 1.0).sum()/n_valid:.1f}%)")
+
+    # ---- 3. Chi-squared test on flag distributions ----
+    print(f"\n{'='*60}")
+    print(f"CHI-SQUARED TEST (flag distribution)")
+    print(f"{'='*60}")
+
+    chi2_val, ndof, flags_used, gpu_c, g4_c = chi2_flag_distribution(gpu_flags, g4_flags)
+
+    print(f"  {'Flag':<20s} {'GPU':>8s} {'G4':>8s} {'Diff':>8s}")
+    print(f"  {'-'*20} {'-'*8} {'-'*8} {'-'*8}")
+    for i, f in enumerate(flags_used):
+        diff = int(gpu_c[i] - g4_c[i])
+        sign = "+" if diff > 0 else ""
+        print(f"  {flag_name(f):<20s} {int(gpu_c[i]):>8d} {int(g4_c[i]):>8d} {sign}{diff:>7d}")
+
+    deviant_frac = 100 * n_diff / n if n > 0 else 0
+    print(f"\n  chi2/ndof = {chi2_val:.2f}/{ndof} = {chi2_val/ndof:.2f}")
+    print(f"  deviant fraction: {deviant_frac:.2f}% ({n_diff}/{n})")
+
+    # ---- 4. Glancing-angle analysis ----
+    print(f"\n{'='*60}")
+    print(f"GLANCING-ANGLE ANALYSIS (normal sign ambiguity)")
+    print(f"{'='*60}")
+
+    glancing, mom_dot = identify_glancing(gpu, g4)
+    n_glancing = glancing.sum()
+
+    # Among matching-flag photons, how many are glancing with large pos diff?
+    match_glancing = match & glancing
+    match_large_pos = match & (pos_diff > 1.0)
+    match_glancing_large = match & glancing & (pos_diff > 1.0)
+
+    print(f"  Glancing photons (mom dot < -0.5):  {n_glancing}")
+    print(f"  Matching flag + pos diff > 1mm:      {match_large_pos.sum()}")
+    print(f"  Of those, glancing:                  {match_glancing_large.sum()}")
+    if match_large_pos.sum() > 0:
+        frac = 100 * match_glancing_large.sum() / match_large_pos.sum()
+        print(f"  Fraction explained by glancing:      {frac:.0f}%")
+
+    # Position stats excluding glancing photons
+    non_glancing_match = match & ~glancing & valid
+    if non_glancing_match.sum() > 0:
+        ng_diff = pos_diff[non_glancing_match]
+        print(f"\n  Position (matching, non-glancing, {non_glancing_match.sum()} photons):")
+        print(f"    Max dist:  {ng_diff.max():.6f} mm")
+        print(f"    Mean dist: {ng_diff.mean():.6f} mm")
+        print(f"    < 0.01 mm: {(ng_diff < 0.01).sum()} ({100*(ng_diff < 0.01).sum()/non_glancing_match.sum():.1f}%)")
+
+    # ---- 5. Divergent photon listing ----
+    if n_diff > 0:
+        div_idx = np.where(~match)[0]
+        print(f"\n{'='*60}")
+        print(f"DIVERGENT PHOTONS (first 10 of {n_diff})")
+        print(f"{'='*60}")
+        for i in div_idx[:10]:
+            gf = flag_name(gpu_flags[i])
+            cf = flag_name(g4_flags[i])
+            gp = gpu_pos[i]
+            cp = g4_pos[i]
+            print(f"  [{i:5d}] GPU: {gf:20s} pos=({gp[0]:8.2f},{gp[1]:8.2f},{gp[2]:8.2f})")
+            print(f"          G4:  {cf:20s} pos=({cp[0]:8.2f},{cp[1]:8.2f},{cp[2]:8.2f})")
+
+if __name__ == "__main__":
+    main()
diff --git a/optiphy/ana/run_genstep_comparison.py b/optiphy/ana/run_genstep_comparison.py
new file mode 100644
index 000000000..cd87230bb
--- /dev/null
+++ b/optiphy/ana/run_genstep_comparison.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+"""
+run_genstep_comparison.py
+==========================
+
+Runs GPU (simg4ox) and G4 (G4ValidationGenstep) simulations with the same
+electron primary, then compares the optical photon hit distributions.
+
+Usage:
+    python run_genstep_comparison.py [--gdml det.gdml] [--energy 1.0] [--nevents 10] [--seed 42]
+"""
+import os
+import sys
+import subprocess
+import argparse
+import numpy as np
+from pathlib import Path
+
+def find_gpu_hits():
+    """Find the most recent GPU hit.npy output."""
+    base = Path(f"/tmp/{os.environ.get('USER','MISSING_USER')}/opticks")
+    candidates = sorted(base.rglob("hit.npy"), key=lambda p: p.stat().st_mtime, reverse=True)
+    return str(candidates[0]) if candidates else None
+
+def run_g4(gdml, energy, nevents, seed, pos, direction):
+    """Run pure G4 simulation with electron primary."""
+    cmd = [
+        "G4ValidationGenstep",
+        "-g", gdml,
+        "-e", str(energy),
+        "-n", str(nevents),
+        "-s", str(seed),
+        "--pos", pos,
+        "--dir", direction,
+    ]
+    print(f"=== Running G4: {' '.join(cmd)}")
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
+
+    # Extract hit count from output
+    g4_hits = 0
+    for line in result.stdout.split('\n'):
+        if "Total hits:" in line:
+            g4_hits = int(line.split("Total hits:")[-1].strip())
+
+    print(f"G4: {g4_hits} hits")
+    if result.returncode != 0:
+        print(f"G4 STDERR (last 5 lines):")
+        for line in result.stderr.strip().split('\n')[-5:]:
+            print(f"  {line}")
+    return g4_hits
+
+def run_gpu(gdml, config, macro, seed):
+    """Run GPU simulation via simg4ox."""
+    env = os.environ.copy()
+    env["OPTICKS_INTEGRATION_MODE"] = "1"  # Minimal mode: G4 tracks electron, GPU propagates optical
+
+    cmd = [
+        "simg4ox",
+        "-g", gdml,
+        "-c", config,
+        "-m", macro,
+    ]
+    print(f"\n=== Running GPU: {' '.join(cmd)}")
+    print(f"    OPTICKS_INTEGRATION_MODE=1 (Minimal)")
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env)
+
+    if result.returncode != 0:
+        print(f"GPU STDERR (last 10 lines):")
+        for line in result.stderr.strip().split('\n')[-10:]:
+            print(f"  {line}")
+        return 0
+
+    # Find hit output
+    hit_path = find_gpu_hits()
+    if hit_path and os.path.exists(hit_path):
+        hits = np.load(hit_path)
+        print(f"GPU: {len(hits)} hits (from {hit_path})")
+        return len(hits)
+    else:
+        print("GPU: no hit.npy found")
+        return 0
+
+def compare_hits(g4_path, gpu_path):
+    """Compare G4 and GPU hit arrays."""
+    if not os.path.exists(g4_path):
+        print(f"G4 hits not found: {g4_path}")
+        return
+    if not gpu_path or not os.path.exists(gpu_path):
+        print(f"GPU hits not found")
+        return
+
+    g4 = np.load(g4_path)
+    gpu = np.load(gpu_path)
+
+    print(f"\n{'='*60}")
+    print(f"HIT COMPARISON")
+    print(f"{'='*60}")
+    print(f"  G4 hits:  {len(g4)}")
+    print(f"  GPU hits: {len(gpu)}")
+
+    if len(g4) > 0 and len(gpu) > 0:
+        diff = len(gpu) - len(g4)
+        pct = 100 * diff / len(g4) if len(g4) > 0 else 0
+        sign = "+" if diff > 0 else ""
+        print(f"  Diff:     {sign}{diff} ({sign}{pct:.1f}%)")
+
+    # Position distributions
+    if len(g4) > 0:
+        g4_pos = g4[:, 0, :3]
+        print(f"\n  G4 hit positions:")
+        print(f"    x: [{g4_pos[:,0].min():.1f}, {g4_pos[:,0].max():.1f}] mm")
+        print(f"    y: [{g4_pos[:,1].min():.1f}, {g4_pos[:,1].max():.1f}] mm")
+        print(f"    z: [{g4_pos[:,2].min():.1f}, {g4_pos[:,2].max():.1f}] mm")
+
+    if len(gpu) > 0:
+        gpu_pos = gpu[:, 0, :3]
+        print(f"\n  GPU hit positions:")
+        print(f"    x: [{gpu_pos[:,0].min():.1f}, {gpu_pos[:,0].max():.1f}] mm")
+        print(f"    y: [{gpu_pos[:,1].min():.1f}, {gpu_pos[:,1].max():.1f}] mm")
+        print(f"    z: [{gpu_pos[:,2].min():.1f}, {gpu_pos[:,2].max():.1f}] mm")
+
+    # Wavelength distributions
+    if len(g4) > 0:
+        g4_wl = g4[:, 2, 3]
+        print(f"\n  G4 wavelength:  mean={g4_wl.mean():.1f} std={g4_wl.std():.1f} nm")
+    if len(gpu) > 0:
+        gpu_wl = gpu[:, 2, 3]
+        print(f"  GPU wavelength: mean={gpu_wl.mean():.1f} std={gpu_wl.std():.1f} nm")
+
+    # Time distributions
+    if len(g4) > 0:
+        g4_t = g4[:, 0, 3]
+        print(f"\n  G4 time:  mean={g4_t.mean():.2f} max={g4_t.max():.2f} ns")
+    if len(gpu) > 0:
+        gpu_t = gpu[:, 0, 3]
+        print(f"  GPU time: mean={gpu_t.mean():.2f} max={gpu_t.max():.2f} ns")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Compare GPU vs G4 electron genstep simulation")
+    parser.add_argument("--gdml", default="det.gdml", help="GDML geometry file")
+    parser.add_argument("--energy", type=float, default=1.0, help="Electron energy in MeV")
+    parser.add_argument("--nevents", type=int, default=10, help="Number of events")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    parser.add_argument("--pos", default="0,0,100", help="Electron position x,y,z mm")
+    parser.add_argument("--dir", default="0,0,1", help="Electron direction x,y,z")
+    args = parser.parse_args()
+
+    # Run G4
+    g4_hits = run_g4(args.gdml, args.energy, args.nevents, args.seed, args.pos, args.dir)
+
+    # Compare
+    g4_path = "g4_genstep_hits.npy"
+    gpu_path = find_gpu_hits()
+
+    if os.path.exists(g4_path):
+        g4 = np.load(g4_path)
+        print(f"\n{'='*60}")
+        print(f"G4 RESULTS ({args.nevents} events, {args.energy} MeV electron)")
+        print(f"{'='*60}")
+        print(f"  Total hits: {len(g4)}")
+        print(f"  Hits/event: {len(g4)/args.nevents:.1f}")
+        if len(g4) > 0:
+            g4_wl = g4[:, 2, 3]
+            g4_pos = g4[:, 0, :3]
+            print(f"  Wavelength: mean={g4_wl.mean():.1f} nm")
+            print(f"  Hit y range: [{g4_pos[:,1].min():.1f}, {g4_pos[:,1].max():.1f}] mm")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b58cd89c2..996a91cb3 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -77,6 +77,24 @@ target_include_directories(GPUPhotonFileSource PRIVATE
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
 )
 
+# StandAloneGeant4Validation - pure G4 optical photon simulation (no opticks GPU)
+# Links U4 for aligned mode (U4Random, InstrumentedG4OpBoundaryProcess, ShimG4Op*)
+add_executable(StandAloneGeant4Validation StandAloneGeant4Validation.cpp StandAloneGeant4Validation.h)
+target_link_libraries(StandAloneGeant4Validation gphox gphox_g4_deps U4)
+target_compile_definitions(StandAloneGeant4Validation PRIVATE WITH_INSTRUMENTED_DEBUG)
+target_include_directories(StandAloneGeant4Validation PRIVATE
+    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/include>
+    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
+)
+
+# G4ValidationGenstep - pure G4 electron→scintillation/Cerenkov→optical photon simulation
+add_executable(G4ValidationGenstep G4ValidationGenstep.cpp G4ValidationGenstep.h)
+target_link_libraries(G4ValidationGenstep gphox gphox_g4_deps)
+target_include_directories(G4ValidationGenstep PRIVATE
+    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/include>
+    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
+)
+
 # simtox creates a numpy file with initial photons for simulation
 add_executable(simtox simtox.cpp)
 
@@ -87,7 +105,7 @@ target_include_directories(simtox PRIVATE
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
 )
 
-install(TARGETS consgeo simg4ox GPUCerenkov GPURaytrace GPUPhotonSource GPUPhotonSourceMinimal GPUPhotonFileSource simtox gphox gphox_g4_deps
+install(TARGETS consgeo simg4ox GPUCerenkov GPURaytrace GPUPhotonSource GPUPhotonSourceMinimal GPUPhotonFileSource StandAloneGeant4Validation G4ValidationGenstep simtox gphox gphox_g4_deps
     EXPORT ${PROJECT_NAME}Targets
     RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
     LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
diff --git a/src/G4ValidationGenstep.cpp b/src/G4ValidationGenstep.cpp
new file mode 100644
index 000000000..c0a3c8733
--- /dev/null
+++ b/src/G4ValidationGenstep.cpp
@@ -0,0 +1,114 @@
+#include <string>
+
+#include <argparse/argparse.hpp>
+
+#include "FTFP_BERT.hh"
+#include "G4OpticalPhysics.hh"
+#include "G4RunManager.hh"
+#include "G4UImanager.hh"
+#include "G4VModularPhysicsList.hh"
+
+#include "G4ValidationGenstep.h"
+
+using namespace std;
+
+int main(int argc, char** argv)
+{
+    argparse::ArgumentParser program("G4ValidationGenstep", "0.0.0");
+
+    string gdml_file;
+    double energy_MeV = 1.0;
+    int    num_events = 1;
+
+    program.add_argument("-g", "--gdml")
+        .help("path to GDML file")
+        .default_value(string("det.gdml"))
+        .nargs(1)
+        .store_into(gdml_file);
+
+    program.add_argument("-e", "--energy")
+        .help("electron kinetic energy in MeV")
+        .default_value(1.0)
+        .scan<'g', double>()
+        .store_into(energy_MeV);
+
+    program.add_argument("-n", "--nevents")
+        .help("number of events")
+        .default_value(1)
+        .scan<'i', int>()
+        .store_into(num_events);
+
+    program.add_argument("-s", "--seed")
+        .help("random seed")
+        .scan<'i', long>();
+
+    program.add_argument("--pos")
+        .help("electron position x,y,z in mm (comma-separated)")
+        .default_value(string("0,0,0"));
+
+    program.add_argument("--dir")
+        .help("electron direction x,y,z (comma-separated)")
+        .default_value(string("0,0,1"));
+
+    try
+    {
+        program.parse_args(argc, argv);
+    }
+    catch (const exception& err)
+    {
+        cerr << err.what() << endl;
+        cerr << program;
+        exit(EXIT_FAILURE);
+    }
+
+    long seed;
+    if (program.is_used("--seed"))
+        seed = program.get<long>("--seed");
+    else
+        seed = static_cast<long>(time(nullptr));
+
+    // Parse position
+    G4ThreeVector pos(0, 0, 0);
+    {
+        string s = program.get<string>("--pos");
+        float  x, y, z;
+        if (sscanf(s.c_str(), "%f,%f,%f", &x, &y, &z) == 3)
+            pos = G4ThreeVector(x, y, z);
+    }
+
+    // Parse direction
+    G4ThreeVector dir(0, 0, 1);
+    {
+        string s = program.get<string>("--dir");
+        float  x, y, z;
+        if (sscanf(s.c_str(), "%f,%f,%f", &x, &y, &z) == 3)
+            dir = G4ThreeVector(x, y, z);
+    }
+
+    G4cout << "G4ValidationGenstep:" << G4endl;
+    G4cout << "  GDML:     " << gdml_file << G4endl;
+    G4cout << "  Energy:   " << energy_MeV << " MeV" << G4endl;
+    G4cout << "  Events:   " << num_events << G4endl;
+    G4cout << "  Position: (" << pos.x() << "," << pos.y() << "," << pos.z() << ") mm" << G4endl;
+    G4cout << "  Direction: (" << dir.x() << "," << dir.y() << "," << dir.z() << ")" << G4endl;
+    G4cout << "  Seed:     " << seed << G4endl;
+
+    GenstepHitAccumulator accumulator;
+
+    G4VModularPhysicsList* physics = new FTFP_BERT;
+    physics->RegisterPhysics(new G4OpticalPhysics);
+
+    G4RunManager run_mgr;
+    run_mgr.SetUserInitialization(physics);
+    run_mgr.SetUserInitialization(new GenstepDetectorConstruction(gdml_file, &accumulator));
+    run_mgr.SetUserInitialization(
+        new GenstepActionInitialization(&accumulator, pos, dir, energy_MeV, num_events));
+    run_mgr.Initialize();
+
+    CLHEP::HepRandom::setTheSeed(seed);
+
+    G4cout << "G4Genstep: Starting " << num_events << " events..." << G4endl;
+    run_mgr.BeamOn(num_events);
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/G4ValidationGenstep.h b/src/G4ValidationGenstep.h
new file mode 100644
index 000000000..bac91f938
--- /dev/null
+++ b/src/G4ValidationGenstep.h
@@ -0,0 +1,296 @@
+#pragma once
+/**
+G4ValidationGenstep.h
+======================
+
+Pure G4 simulation with electron primary that produces scintillation/Cerenkov
+optical photons. G4 handles all physics including optical photon propagation.
+Collects hits via sensitive detector. Used as the CPU reference for comparison
+with GPU (simg4ox) genstep-based optical simulation.
+**/
+
+#include <filesystem>
+#include <mutex>
+#include <vector>
+
+#include "G4Electron.hh"
+#include "G4Event.hh"
+#include "G4GDMLParser.hh"
+#include "G4OpticalPhoton.hh"
+#include "G4PhysicalConstants.hh"
+#include "G4PrimaryParticle.hh"
+#include "G4PrimaryVertex.hh"
+#include "G4Run.hh"
+#include "G4SDManager.hh"
+#include "G4SystemOfUnits.hh"
+#include "G4THitsCollection.hh"
+#include "G4ThreeVector.hh"
+#include "G4Track.hh"
+#include "G4TrackStatus.hh"
+#include "G4UserEventAction.hh"
+#include "G4UserRunAction.hh"
+#include "G4VHit.hh"
+#include "G4VPhysicalVolume.hh"
+#include "G4VUserActionInitialization.hh"
+#include "G4VUserDetectorConstruction.hh"
+#include "G4VUserPrimaryGeneratorAction.hh"
+
+#include "sysrap/NP.hh"
+#include "sysrap/sphoton.h"
+
+// ---- Hit accumulator ----
+
+struct GenstepHitAccumulator
+{
+    std::mutex           mtx;
+    std::vector<sphoton> hits;
+    int                  total_optical_photons = 0;
+    int                  total_scintillation = 0;
+    int                  total_cerenkov = 0;
+
+    void AddHits(const std::vector<sphoton>& event_hits)
+    {
+        std::lock_guard<std::mutex> lock(mtx);
+        hits.insert(hits.end(), event_hits.begin(), event_hits.end());
+    }
+
+    void Save(const char* filename)
+    {
+        std::lock_guard<std::mutex> lock(mtx);
+        G4int                       num_hits = hits.size();
+        NP*                         arr = NP::Make<float>(num_hits, 4, 4);
+        for (int i = 0; i < num_hits; i++)
+        {
+            float* data = reinterpret_cast<float*>(&hits[i]);
+            std::copy(data, data + 16, arr->values<float>() + i * 16);
+        }
+        arr->save(filename);
+        delete arr;
+        G4cout << "G4Genstep: Saved " << num_hits << " hits to " << filename << G4endl;
+    }
+};
+
+// ---- Sensitive Detector ----
+
+struct GenstepPhotonHit : public G4VHit
+{
+    GenstepPhotonHit() = default;
+
+    GenstepPhotonHit(G4double energy, G4double time, G4ThreeVector position,
+                     G4ThreeVector direction, G4ThreeVector polarization) :
+        photon()
+    {
+        photon.pos = {static_cast<float>(position.x()),
+                      static_cast<float>(position.y()),
+                      static_cast<float>(position.z())};
+        photon.time = time;
+        photon.mom = {static_cast<float>(direction.x()),
+                      static_cast<float>(direction.y()),
+                      static_cast<float>(direction.z())};
+        photon.pol = {static_cast<float>(polarization.x()),
+                      static_cast<float>(polarization.y()),
+                      static_cast<float>(polarization.z())};
+        photon.wavelength = h_Planck * c_light / (energy * CLHEP::eV);
+    }
+
+    void Print() override
+    {
+        G4cout << photon << G4endl;
+    }
+    sphoton photon;
+};
+
+using GenstepPhotonHitsCollection = G4THitsCollection<GenstepPhotonHit>;
+
+struct GenstepPhotonSD : public G4VSensitiveDetector
+{
+    GenstepHitAccumulator* accumulator;
+
+    GenstepPhotonSD(G4String name, GenstepHitAccumulator* acc) :
+        G4VSensitiveDetector(name),
+        accumulator(acc)
+    {
+        collectionName.insert(name + "_HC");
+    }
+
+    void Initialize(G4HCofThisEvent* hce) override
+    {
+        fHC = new GenstepPhotonHitsCollection(SensitiveDetectorName, collectionName[0]);
+        if (fHCID < 0)
+            fHCID = G4SDManager::GetSDMpointer()->GetCollectionID(collectionName[0]);
+        hce->AddHitsCollection(fHCID, fHC);
+    }
+
+    G4bool ProcessHits(G4Step* aStep, G4TouchableHistory*) override
+    {
+        G4Track* track = aStep->GetTrack();
+        if (track->GetDefinition() != G4OpticalPhoton::OpticalPhotonDefinition())
+            return false;
+
+        fHC->insert(new GenstepPhotonHit(
+            track->GetTotalEnergy(),
+            track->GetGlobalTime(),
+            aStep->GetPostStepPoint()->GetPosition(),
+            aStep->GetPostStepPoint()->GetMomentumDirection(),
+            aStep->GetPostStepPoint()->GetPolarization()));
+
+        track->SetTrackStatus(fStopAndKill);
+        return true;
+    }
+
+    void EndOfEvent(G4HCofThisEvent*) override
+    {
+        G4int                n = fHC->entries();
+        std::vector<sphoton> event_hits;
+        event_hits.reserve(n);
+        for (GenstepPhotonHit* hit : *fHC->GetVector())
+            event_hits.push_back(hit->photon);
+        accumulator->AddHits(event_hits);
+    }
+
+  private:
+    GenstepPhotonHitsCollection* fHC = nullptr;
+    G4int                        fHCID = -1;
+};
+
+// ---- Detector Construction ----
+
+struct GenstepDetectorConstruction : G4VUserDetectorConstruction
+{
+    GenstepDetectorConstruction(std::filesystem::path gdml_file, GenstepHitAccumulator* acc) :
+        gdml_file_(gdml_file),
+        accumulator_(acc)
+    {
+    }
+
+    G4VPhysicalVolume* Construct() override
+    {
+        parser_.Read(gdml_file_.string(), false);
+        return parser_.GetWorldVolume();
+    }
+
+    void ConstructSDandField() override
+    {
+        G4SDManager*            SDman = G4SDManager::GetSDMpointer();
+        const G4GDMLAuxMapType* auxmap = parser_.GetAuxMap();
+
+        for (auto const& [logVol, listType] : *auxmap)
+        {
+            for (auto const& auxtype : listType)
+            {
+                if (auxtype.type == "SensDet")
+                {
+                    G4String name = logVol->GetName() + "_" + auxtype.value;
+                    G4cout << "G4Genstep: Attaching SD to " << logVol->GetName() << G4endl;
+                    GenstepPhotonSD* sd = new GenstepPhotonSD(name, accumulator_);
+                    SDman->AddNewDetector(sd);
+                    logVol->SetSensitiveDetector(sd);
+                }
+            }
+        }
+    }
+
+  private:
+    std::filesystem::path  gdml_file_;
+    G4GDMLParser           parser_;
+    GenstepHitAccumulator* accumulator_;
+};
+
+// ---- Electron Primary Generator ----
+
+struct ElectronPrimaryGenerator : G4VUserPrimaryGeneratorAction
+{
+    G4ThreeVector position;
+    G4ThreeVector direction;
+    G4double      energy_MeV;
+
+    ElectronPrimaryGenerator(G4ThreeVector pos, G4ThreeVector dir, G4double energy) :
+        position(pos),
+        direction(dir.unit()),
+        energy_MeV(energy)
+    {
+    }
+
+    void GeneratePrimaries(G4Event* event) override
+    {
+        G4PrimaryVertex*   vertex = new G4PrimaryVertex(position, 0.0);
+        G4PrimaryParticle* particle = new G4PrimaryParticle(G4Electron::Definition());
+        particle->SetKineticEnergy(energy_MeV * MeV);
+        particle->SetMomentumDirection(direction);
+        vertex->SetPrimary(particle);
+        event->AddPrimaryVertex(vertex);
+    }
+};
+
+// ---- Event Action with optical photon counting ----
+
+struct GenstepEventAction : G4UserEventAction
+{
+    GenstepHitAccumulator* accumulator;
+    int                    total_events;
+
+    GenstepEventAction(GenstepHitAccumulator* acc, int total) :
+        accumulator(acc),
+        total_events(total)
+    {
+    }
+
+    void EndOfEventAction(const G4Event* event) override
+    {
+        int id = event->GetEventID();
+        if (id == 0 || (id + 1) % 10 == 0 || id + 1 == total_events)
+            G4cout << "G4Genstep: Event " << id + 1 << "/" << total_events << G4endl;
+    }
+};
+
+// ---- Run Action ----
+
+struct GenstepRunAction : G4UserRunAction
+{
+    GenstepHitAccumulator* accumulator;
+
+    GenstepRunAction(GenstepHitAccumulator* acc) :
+        accumulator(acc)
+    {
+    }
+
+    void EndOfRunAction(const G4Run*) override
+    {
+        G4cout << "G4Genstep: Total hits: " << accumulator->hits.size() << G4endl;
+        accumulator->Save("g4_genstep_hits.npy");
+    }
+};
+
+// ---- Action Initialization ----
+
+struct GenstepActionInitialization : G4VUserActionInitialization
+{
+    GenstepHitAccumulator* accumulator;
+    G4ThreeVector          position;
+    G4ThreeVector          direction;
+    G4double               energy_MeV;
+    int                    num_events;
+
+    GenstepActionInitialization(GenstepHitAccumulator* acc,
+                                G4ThreeVector pos, G4ThreeVector dir,
+                                G4double energy, int nevt) :
+        accumulator(acc),
+        position(pos),
+        direction(dir),
+        energy_MeV(energy),
+        num_events(nevt)
+    {
+    }
+
+    void BuildForMaster() const override
+    {
+        SetUserAction(new GenstepRunAction(accumulator));
+    }
+
+    void Build() const override
+    {
+        SetUserAction(new ElectronPrimaryGenerator(position, direction, energy_MeV));
+        SetUserAction(new GenstepEventAction(accumulator, num_events));
+        SetUserAction(new GenstepRunAction(accumulator));
+    }
+};
diff --git a/src/StandAloneGeant4Validation.cpp b/src/StandAloneGeant4Validation.cpp
new file mode 100644
index 000000000..75cfff926
--- /dev/null
+++ b/src/StandAloneGeant4Validation.cpp
@@ -0,0 +1,167 @@
+#include <string>
+#include <thread>
+
+#include <argparse/argparse.hpp>
+
+#include "FTFP_BERT.hh"
+#include "G4MTRunManager.hh"
+#include "G4OpticalPhysics.hh"
+#include "G4RunManager.hh"
+#include "G4UImanager.hh"
+#include "G4VModularPhysicsList.hh"
+
+#include "G4OpticalParameters.hh"
+
+#include "StandAloneGeant4Validation.h"
+#include "config.h"
+
+using namespace std;
+
+int main(int argc, char** argv)
+{
+    argparse::ArgumentParser program("StandAloneGeant4Validation", "0.0.0");
+
+    string gdml_file, config_name;
+    int    num_threads = 0;
+
+    program.add_argument("-g", "--gdml")
+        .help("path to GDML file")
+        .default_value(string("geom.gdml"))
+        .nargs(1)
+        .store_into(gdml_file);
+
+    program.add_argument("-c", "--config")
+        .help("the name of a config file")
+        .default_value(string("dev"))
+        .nargs(1)
+        .store_into(config_name);
+
+    program.add_argument("-s", "--seed")
+        .help("fixed random seed (default: time-based)")
+        .scan<'i', long>();
+
+    program.add_argument("-t", "--threads")
+        .help("number of threads (0=sequential, default: hardware concurrency)")
+        .default_value(-1)
+        .scan<'i', int>()
+        .store_into(num_threads);
+
+    program.add_argument("--aligned")
+        .help("enable photon-by-photon aligned comparison with GPU (forces sequential)")
+        .default_value(false)
+        .implicit_value(true);
+
+    try
+    {
+        program.parse_args(argc, argv);
+    }
+    catch (const exception& err)
+    {
+        cerr << err.what() << endl;
+        cerr << program;
+        exit(EXIT_FAILURE);
+    }
+
+    long seed;
+    if (program.is_used("--seed"))
+        seed = program.get<long>("--seed");
+    else
+        seed = static_cast<long>(time(nullptr));
+
+    bool aligned = program.get<bool>("--aligned");
+
+    gphox::Config cfg(config_name);
+    int           total_photons = cfg.torch.numphoton;
+
+    // Aligned mode forces sequential (U4Random is single-threaded)
+    if (aligned)
+        num_threads = 0;
+
+    // Determine threading mode
+    bool use_mt = (num_threads != 0);
+    if (num_threads < 0)
+        num_threads = std::thread::hardware_concurrency();
+    if (num_threads < 1)
+        num_threads = 1;
+
+    // In MT mode: split photons across events, one event per thread-batch
+    // In sequential mode: one event with all photons (original behavior)
+    int num_events, photons_per_event;
+    if (use_mt)
+    {
+        num_events = num_threads * 4; // 4 events per thread for load balancing
+        photons_per_event = (total_photons + num_events - 1) / num_events;
+        // Adjust num_events so we don't overshoot
+        num_events = (total_photons + photons_per_event - 1) / photons_per_event;
+    }
+    else
+    {
+        num_events = 1;
+        photons_per_event = total_photons;
+    }
+
+    int actual_photons = num_events * photons_per_event;
+
+    G4cout << "Random seed set to: " << seed << G4endl;
+    G4cout << "G4: " << total_photons << " photons, "
+           << num_events << " events x " << photons_per_event << " photons/event"
+           << " (" << actual_photons << " actual)"
+           << (use_mt ? ", " + to_string(num_threads) + " threads" : ", sequential")
+           << G4endl;
+
+    HitAccumulator        accumulator;
+    PhotonFateAccumulator fate;
+
+    if (aligned)
+        fate.Resize(total_photons);
+
+    G4VModularPhysicsList* physics = new FTFP_BERT;
+    if (aligned)
+        physics->RegisterPhysics(new AlignedOpticalPhysics);
+    else
+        physics->RegisterPhysics(new G4OpticalPhysics);
+
+    // Use exponential WLS time profile (default is delta = zero delay)
+    G4OpticalParameters::Instance()->SetWLSTimeProfile("exponential");
+
+    if (use_mt)
+    {
+        auto* run_mgr = new G4MTRunManager;
+        run_mgr->SetNumberOfThreads(num_threads);
+        run_mgr->SetUserInitialization(physics);
+        run_mgr->SetUserInitialization(new G4OnlyDetectorConstruction(gdml_file, &accumulator));
+        run_mgr->SetUserInitialization(
+            new G4OnlyActionInitialization(cfg, &accumulator, &fate, photons_per_event, num_events, aligned));
+        run_mgr->Initialize();
+
+        CLHEP::HepRandom::setTheSeed(seed);
+
+        G4cout << "G4: Starting MT run with " << num_events << " events..." << G4endl;
+        run_mgr->BeamOn(num_events);
+
+        delete run_mgr;
+    }
+    else
+    {
+        G4RunManager run_mgr;
+        run_mgr.SetUserInitialization(physics);
+        run_mgr.SetUserInitialization(new G4OnlyDetectorConstruction(gdml_file, &accumulator));
+        run_mgr.SetUserInitialization(
+            new G4OnlyActionInitialization(cfg, &accumulator, &fate, photons_per_event, num_events, aligned));
+
+        if (aligned)
+        {
+            G4cout << "G4: Aligned mode — creating U4Random" << G4endl;
+            U4Random::Create();
+        }
+
+        run_mgr.Initialize();
+
+        CLHEP::HepRandom::setTheSeed(seed);
+
+        G4cout << "G4: Starting sequential run..." << G4endl;
+        run_mgr.BeamOn(num_events);
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/StandAloneGeant4Validation.h b/src/StandAloneGeant4Validation.h
new file mode 100644
index 000000000..b3cd0d59d
--- /dev/null
+++ b/src/StandAloneGeant4Validation.h
@@ -0,0 +1,623 @@
+#pragma once
+
+#include <cmath>
+#include <filesystem>
+#include <mutex>
+#include <vector>
+
+#include "G4Event.hh"
+#include "G4GDMLParser.hh"
+#include "G4OpBoundaryProcess.hh"
+#include "G4OpWLS.hh"
+#include "G4OpticalPhoton.hh"
+#include "G4PhysicalConstants.hh"
+#include "G4PrimaryParticle.hh"
+#include "G4PrimaryVertex.hh"
+#include "G4ProcessManager.hh"
+#include "G4Run.hh"
+#include "G4SDManager.hh"
+#include "G4SystemOfUnits.hh"
+#include "G4THitsCollection.hh"
+#include "G4ThreeVector.hh"
+#include "G4Track.hh"
+#include "G4TrackStatus.hh"
+#include "G4UserEventAction.hh"
+#include "G4UserRunAction.hh"
+#include "G4UserSteppingAction.hh"
+#include "G4UserTrackingAction.hh"
+#include "G4VHit.hh"
+#include "G4VPhysicalVolume.hh"
+#include "G4VPhysicsConstructor.hh"
+#include "G4VUserActionInitialization.hh"
+#include "G4VUserDetectorConstruction.hh"
+#include "G4VUserPrimaryGeneratorAction.hh"
+
+#include "ShimG4OpAbsorption.hh"
+#include "ShimG4OpRayleigh.hh"
+#include "U4Random.hh"
+
+#include "sysrap/NP.hh"
+#include "sysrap/sphoton.h"
+
+#include "config.h"
+#include "torch.h"
+
+// ---- Global hit accumulator (thread-safe) ----
+
+struct HitAccumulator
+{
+    std::mutex           mtx;
+    std::vector<sphoton> hits;
+
+    void AddHits(const std::vector<sphoton>& event_hits)
+    {
+        std::lock_guard<std::mutex> lock(mtx);
+        hits.insert(hits.end(), event_hits.begin(), event_hits.end());
+    }
+
+    void Save(const char* filename)
+    {
+        std::lock_guard<std::mutex> lock(mtx);
+        G4int                       num_hits = hits.size();
+        NP*                         arr = NP::Make<float>(num_hits, 4, 4);
+        for (int i = 0; i < num_hits; i++)
+        {
+            float* data = reinterpret_cast<float*>(&hits[i]);
+            std::copy(data, data + 16, arr->values<float>() + i * 16);
+        }
+        arr->save(filename);
+        delete arr;
+        G4cout << "G4: Saved " << num_hits << " total hits to " << filename << G4endl;
+    }
+};
+
+// ---- Sensitive Detector: collects optical photon hits per event ----
+
+struct G4PhotonHit : public G4VHit
+{
+    G4PhotonHit() = default;
+
+    G4PhotonHit(G4double energy, G4double time, G4ThreeVector position,
+                G4ThreeVector direction, G4ThreeVector polarization) :
+        photon()
+    {
+        photon.pos = {static_cast<float>(position.x()),
+                      static_cast<float>(position.y()),
+                      static_cast<float>(position.z())};
+        photon.time = time;
+        photon.mom = {static_cast<float>(direction.x()),
+                      static_cast<float>(direction.y()),
+                      static_cast<float>(direction.z())};
+        photon.pol = {static_cast<float>(polarization.x()),
+                      static_cast<float>(polarization.y()),
+                      static_cast<float>(polarization.z())};
+        photon.wavelength = h_Planck * c_light / (energy * CLHEP::eV);
+    }
+
+    void Print() override
+    {
+        G4cout << photon << G4endl;
+    }
+
+    sphoton photon;
+};
+
+using G4PhotonHitsCollection = G4THitsCollection<G4PhotonHit>;
+
+struct G4PhotonSD : public G4VSensitiveDetector
+{
+    HitAccumulator* accumulator;
+
+    G4PhotonSD(G4String name, HitAccumulator* acc) :
+        G4VSensitiveDetector(name),
+        accumulator(acc)
+    {
+        G4String HCname = name + "_HC";
+        collectionName.insert(HCname);
+    }
+
+    void Initialize(G4HCofThisEvent* hce) override
+    {
+        fHitsCollection = new G4PhotonHitsCollection(SensitiveDetectorName, collectionName[0]);
+        if (fHCID < 0)
+            fHCID = G4SDManager::GetSDMpointer()->GetCollectionID(collectionName[0]);
+        hce->AddHitsCollection(fHCID, fHitsCollection);
+    }
+
+    G4bool ProcessHits(G4Step* aStep, G4TouchableHistory*) override
+    {
+        G4Track* track = aStep->GetTrack();
+        if (track->GetDefinition() != G4OpticalPhoton::OpticalPhotonDefinition())
+            return false;
+
+        G4PhotonHit* hit = new G4PhotonHit(
+            track->GetTotalEnergy(),
+            track->GetGlobalTime(),
+            aStep->GetPostStepPoint()->GetPosition(),
+            aStep->GetPostStepPoint()->GetMomentumDirection(),
+            aStep->GetPostStepPoint()->GetPolarization());
+
+        fHitsCollection->insert(hit);
+        track->SetTrackStatus(fStopAndKill);
+        return true;
+    }
+
+    void EndOfEvent(G4HCofThisEvent*) override
+    {
+        G4int num_hits = fHitsCollection->entries();
+
+        std::vector<sphoton> event_hits;
+        event_hits.reserve(num_hits);
+        for (G4PhotonHit* hit : *fHitsCollection->GetVector())
+            event_hits.push_back(hit->photon);
+
+        accumulator->AddHits(event_hits);
+    }
+
+  private:
+    G4PhotonHitsCollection* fHitsCollection = nullptr;
+    G4int                   fHCID = -1;
+};
+
+// ---- Detector Construction: loads GDML, attaches SD ----
+
+struct G4OnlyDetectorConstruction : G4VUserDetectorConstruction
+{
+    G4OnlyDetectorConstruction(std::filesystem::path gdml_file, HitAccumulator* acc) :
+        gdml_file_(gdml_file),
+        accumulator_(acc)
+    {
+    }
+
+    G4VPhysicalVolume* Construct() override
+    {
+        parser_.Read(gdml_file_.string(), false);
+        return parser_.GetWorldVolume();
+    }
+
+    void ConstructSDandField() override
+    {
+        G4SDManager*            SDman = G4SDManager::GetSDMpointer();
+        const G4GDMLAuxMapType* auxmap = parser_.GetAuxMap();
+
+        for (auto const& [logVol, listType] : *auxmap)
+        {
+            for (auto const& auxtype : listType)
+            {
+                if (auxtype.type == "SensDet")
+                {
+                    G4String name = logVol->GetName() + "_" + auxtype.value;
+                    G4cout << "G4: Attaching SD to " << logVol->GetName() << G4endl;
+                    G4PhotonSD* sd = new G4PhotonSD(name, accumulator_);
+                    SDman->AddNewDetector(sd);
+                    logVol->SetSensitiveDetector(sd);
+                }
+            }
+        }
+    }
+
+  private:
+    std::filesystem::path gdml_file_;
+    G4GDMLParser          parser_;
+    HitAccumulator*       accumulator_;
+};
+
+// ---- Primary Generator: distributes photons across events ----
+
+struct G4OnlyPrimaryGenerator : G4VUserPrimaryGeneratorAction
+{
+    gphox::Config cfg;
+    int           photons_per_event;
+
+    G4OnlyPrimaryGenerator(const gphox::Config& cfg, int photons_per_event) :
+        cfg(cfg),
+        photons_per_event(photons_per_event)
+    {
+    }
+
+    void GeneratePrimaries(G4Event* event) override
+    {
+        int eventID = event->GetEventID();
+
+        // Generate photons for this event's batch using event-specific seed offset
+        storch t = cfg.torch;
+        t.numphoton = photons_per_event;
+        std::vector<sphoton> sphotons = generate_photons(t, photons_per_event, eventID);
+
+        for (const sphoton& p : sphotons)
+        {
+            G4ThreeVector position(p.pos.x, p.pos.y, p.pos.z);
+            G4ThreeVector direction(p.mom.x, p.mom.y, p.mom.z);
+            G4ThreeVector polarization(p.pol.x, p.pol.y, p.pol.z);
+            G4double      wavelength_nm = p.wavelength;
+
+            G4PrimaryVertex* vertex = new G4PrimaryVertex(position, p.time);
+            G4double         energy = h_Planck * c_light / (wavelength_nm * nm);
+
+            G4PrimaryParticle* particle = new G4PrimaryParticle(G4OpticalPhoton::Definition());
+            particle->SetKineticEnergy(energy);
+            particle->SetMomentumDirection(direction);
+            particle->SetPolarization(polarization);
+
+            vertex->SetPrimary(particle);
+            event->AddPrimaryVertex(vertex);
+        }
+    }
+};
+
+// ---- Photon fate accumulator: tracks ALL photon final states ----
+
+struct PhotonFateAccumulator
+{
+    std::mutex           mtx;
+    std::vector<sphoton> photons;
+    bool                 indexed = false; // true for aligned mode: store by photon index
+
+    // Opticks flag enum values
+    static constexpr unsigned TORCH = 0x0004;
+    static constexpr unsigned BULK_ABSORB = 0x0008;
+    static constexpr unsigned BULK_REEMIT = 0x0010;
+    static constexpr unsigned BULK_SCATTER = 0x0020;
+    static constexpr unsigned SURFACE_DETECT = 0x0040;
+    static constexpr unsigned SURFACE_ABSORB = 0x0080;
+    static constexpr unsigned SURFACE_DREFLECT = 0x0100;
+    static constexpr unsigned SURFACE_SREFLECT = 0x0200;
+    static constexpr unsigned BOUNDARY_REFLECT = 0x0400;
+    static constexpr unsigned BOUNDARY_TRANSMIT = 0x0800;
+    static constexpr unsigned MISS = 0x8000;
+
+    void Resize(int n)
+    {
+        photons.resize(n);
+        indexed = true;
+    }
+
+    void Set(int idx, const sphoton& p)
+    {
+        if (idx >= 0 && idx < (int)photons.size())
+            photons[idx] = p;
+    }
+
+    void Add(const sphoton& p)
+    {
+        std::lock_guard<std::mutex> lock(mtx);
+        photons.push_back(p);
+    }
+
+    void Save(const char* filename)
+    {
+        std::lock_guard<std::mutex> lock(mtx);
+        int                         n = photons.size();
+        NP*                         arr = NP::Make<float>(n, 4, 4);
+        for (int i = 0; i < n; i++)
+        {
+            float* data = reinterpret_cast<float*>(&photons[i]);
+            std::copy(data, data + 16, arr->values<float>() + i * 16);
+        }
+        arr->save(filename);
+        delete arr;
+        G4cout << "G4: Saved " << n << " photon fates to " << filename << G4endl;
+    }
+};
+
+// ---- Stepping Action: tracks photon fates with opticks-compatible flags ----
+
+struct G4OnlySteppingAction : G4UserSteppingAction
+{
+    PhotonFateAccumulator*     fate;
+    bool                       aligned;
+    std::map<std::string, int> proc_death_counts;
+    std::map<int, int>         boundary_status_counts;
+    std::mutex                 count_mtx;
+
+    G4OnlySteppingAction(PhotonFateAccumulator* f, bool aligned_ = false) :
+        fate(f),
+        aligned(aligned_)
+    {
+    }
+
+    ~G4OnlySteppingAction()
+    {
+        std::lock_guard<std::mutex> lock(count_mtx);
+        if (!proc_death_counts.empty())
+        {
+            G4cout << "\nG4: Photon death process summary:" << G4endl;
+            for (auto& [name, count] : proc_death_counts)
+                G4cout << "  " << name << ": " << count << G4endl;
+        }
+        if (!boundary_status_counts.empty())
+        {
+            G4cout << "\nG4: OpBoundary status counts (all steps):" << G4endl;
+            const char* bnames[] = {
+                "Undefined", "Transmission", "FresnelRefraction", "FresnelReflection",
+                "TotalInternalReflection", "LambertianReflection", "LobeReflection",
+                "SpikeReflection", "BackScattering", "Absorption", "Detection",
+                "NotAtBoundary", "SameMaterial", "StepTooSmall", "NoRINDEX",
+                "PolishedLumirrorAirReflection", "PolishedLumirrorGlueReflection",
+                "PolishedAirReflection", "PolishedTeflonAirReflection",
+                "PolishedTiOAirReflection", "PolishedTyvekAirReflection",
+                "PolishedVM2000AirReflection", "PolishedVM2000GlueReflection",
+                "EtchedLumirrorAirReflection", "EtchedLumirrorGlueReflection",
+                "EtchedAirReflection", "EtchedTeflonAirReflection",
+                "EtchedTiOAirReflection", "EtchedTyvekAirReflection",
+                "EtchedVM2000AirReflection", "EtchedVM2000GlueReflection",
+                "GroundLumirrorAirReflection", "GroundLumirrorGlueReflection",
+                "GroundAirReflection", "GroundTeflonAirReflection",
+                "GroundTiOAirReflection", "GroundTyvekAirReflection",
+                "GroundVM2000AirReflection", "GroundVM2000GlueReflection",
+                "Dichroic", "CoatedDielectricReflection", "CoatedDielectricRefraction",
+                "CoatedDielectricFrustratedTransmission"};
+            for (auto& [st, count] : boundary_status_counts)
+            {
+                const char* nm = (st >= 0 && st < 43) ? bnames[st] : "?";
+                G4cout << "  " << nm << "(" << st << "): " << count << G4endl;
+            }
+        }
+    }
+
+    void UserSteppingAction(const G4Step* aStep) override
+    {
+        G4Track* track = aStep->GetTrack();
+        if (track->GetDefinition() != G4OpticalPhoton::OpticalPhotonDefinition())
+            return;
+
+        G4StepPoint*  post = aStep->GetPostStepPoint();
+        G4TrackStatus status = track->GetTrackStatus();
+
+        // Find the OpBoundary process to get its status (for ALL steps)
+        G4OpBoundaryProcess* boundary = nullptr;
+        G4ProcessManager*    pm = track->GetDefinition()->GetProcessManager();
+        for (int i = 0; i < pm->GetPostStepProcessVector()->entries(); i++)
+        {
+            G4VProcess* p = (*pm->GetPostStepProcessVector())[i];
+            boundary = dynamic_cast<G4OpBoundaryProcess*>(p);
+            if (boundary)
+                break;
+        }
+
+        G4OpBoundaryProcessStatus bStatus = boundary ? boundary->GetStatus() : Undefined;
+
+        // Count boundary status for ALL steps
+        if (boundary && bStatus != NotAtBoundary && bStatus != Undefined && bStatus != StepTooSmall)
+        {
+            std::lock_guard<std::mutex> lock(count_mtx);
+            boundary_status_counts[int(bStatus)]++;
+        }
+
+        // Only record photon state when the photon is about to die
+        if (status != fStopAndKill && status != fStopButAlive)
+            return;
+
+        // Identify the process
+        const G4VProcess* proc = post->GetProcessDefinedStep();
+        G4String          procName = proc ? proc->GetProcessName() : "Unknown";
+
+        // Build detailed key for counting
+        std::string key = procName;
+        if (procName == "OpBoundary" && boundary)
+            key += "(" + std::to_string(int(bStatus)) + ")";
+        key += (status == fStopAndKill ? "/Kill" : "/Alive");
+
+        {
+            std::lock_guard<std::mutex> lock(count_mtx);
+            proc_death_counts[key]++;
+        }
+
+        // Map to opticks flag
+        unsigned flag = 0;
+
+        if (procName == "OpAbsorption")
+        {
+            flag = PhotonFateAccumulator::BULK_ABSORB;
+        }
+        else if (procName == "OpWLS")
+        {
+            flag = PhotonFateAccumulator::BULK_REEMIT;
+        }
+        else if (procName == "OpBoundary" && boundary)
+        {
+            switch (bStatus)
+            {
+            case Detection:
+                flag = PhotonFateAccumulator::SURFACE_DETECT;
+                break;
+            case Absorption:
+                flag = PhotonFateAccumulator::SURFACE_ABSORB;
+                break;
+            case FresnelReflection:
+            case TotalInternalReflection:
+                flag = PhotonFateAccumulator::BOUNDARY_REFLECT;
+                break;
+            case FresnelRefraction:
+                flag = PhotonFateAccumulator::BOUNDARY_TRANSMIT;
+                break;
+            case LambertianReflection:
+            case LobeReflection:
+                flag = PhotonFateAccumulator::SURFACE_DREFLECT;
+                break;
+            case SpikeReflection:
+                flag = PhotonFateAccumulator::SURFACE_SREFLECT;
+                break;
+            case BackScattering:
+                flag = PhotonFateAccumulator::SURFACE_DREFLECT;
+                break;
+            default:
+                flag = PhotonFateAccumulator::SURFACE_ABSORB;
+                break;
+            }
+        }
+        else if (procName == "Transportation")
+        {
+            // Check if an SD killed this photon (SURFACE_DETECT)
+            G4StepPoint*       pre = aStep->GetPreStepPoint();
+            G4VPhysicalVolume* preVol = pre->GetPhysicalVolume();
+            G4VPhysicalVolume* postVol = post->GetPhysicalVolume();
+            G4LogicalVolume*   preLog = preVol ? preVol->GetLogicalVolume() : nullptr;
+            G4LogicalVolume*   postLog = postVol ? postVol->GetLogicalVolume() : nullptr;
+            bool               sd_pre = preLog && preLog->GetSensitiveDetector();
+            bool               sd_post = postLog && postLog->GetSensitiveDetector();
+            if (sd_pre || sd_post)
+                flag = PhotonFateAccumulator::SURFACE_DETECT;
+            else
+                flag = PhotonFateAccumulator::BOUNDARY_TRANSMIT;
+        }
+
+        if (flag == 0)
+            flag = PhotonFateAccumulator::MISS; // catch-all
+
+        // Build sphoton with the final state
+        G4ThreeVector pos = post->GetPosition();
+        G4ThreeVector mom = post->GetMomentumDirection();
+        G4ThreeVector pol = post->GetPolarization();
+        G4double      time = post->GetGlobalTime();
+        G4double      energy = post->GetTotalEnergy();
+
+        sphoton p = {};
+        p.pos = {float(pos.x()), float(pos.y()), float(pos.z())};
+        p.time = float(time);
+        p.mom = {float(mom.x()), float(mom.y()), float(mom.z())};
+        p.pol = {float(pol.x()), float(pol.y()), float(pol.z())};
+        p.wavelength = (energy > 0) ? float(h_Planck * c_light / (energy * CLHEP::eV)) : 0.f;
+
+        p.orient_boundary_flag = flag & 0xFFFF;
+        p.flagmask = flag;
+
+        if (aligned && fate->indexed)
+        {
+            int photon_idx = track->GetTrackID() - 1; // G4 trackIDs are 1-based
+            fate->Set(photon_idx, p);
+        }
+        else
+        {
+            fate->Add(p);
+        }
+    }
+};
+
+// ---- Tracking Action: per-photon RNG sync for aligned mode ----
+
+struct G4OnlyTrackingAction : G4UserTrackingAction
+{
+    void PreUserTrackingAction(const G4Track* track) override
+    {
+        if (track->GetDefinition() != G4OpticalPhoton::OpticalPhotonDefinition())
+            return;
+        int photon_idx = track->GetTrackID() - 1; // G4 trackIDs are 1-based
+        U4Random::SetSequenceIndex(photon_idx);
+    }
+
+    void PostUserTrackingAction(const G4Track* track) override
+    {
+        if (track->GetDefinition() != G4OpticalPhoton::OpticalPhotonDefinition())
+            return;
+        U4Random::SetSequenceIndex(-1);
+    }
+};
+
+// ---- AlignedOpticalPhysics: uses Shim processes for precise RNILL matching ----
+
+struct AlignedOpticalPhysics : G4VPhysicsConstructor
+{
+    AlignedOpticalPhysics() :
+        G4VPhysicsConstructor("AlignedOptical")
+    {
+    }
+    void ConstructParticle() override
+    {
+    }
+    void ConstructProcess() override
+    {
+        auto* pm = G4OpticalPhoton::OpticalPhoton()->GetProcessManager();
+        pm->AddDiscreteProcess(new ShimG4OpAbsorption());
+        pm->AddDiscreteProcess(new ShimG4OpRayleigh());
+        pm->AddDiscreteProcess(new G4OpBoundaryProcess());
+        pm->AddDiscreteProcess(new G4OpWLS());
+    }
+};
+
+// ---- Event Action: reports per-event progress ----
+
+struct G4OnlyEventAction : G4UserEventAction
+{
+    int total_events;
+
+    G4OnlyEventAction(int total_events) :
+        total_events(total_events)
+    {
+    }
+
+    void EndOfEventAction(const G4Event* event) override
+    {
+        int id = event->GetEventID();
+        if (id == 0 || (id + 1) % 10 == 0 || id + 1 == total_events)
+            G4cout << "G4: Event " << id + 1 << "/" << total_events << G4endl;
+    }
+};
+
+// ---- Run Action: saves merged hits at end ----
+
+struct G4OnlyRunAction : G4UserRunAction
+{
+    HitAccumulator*        accumulator;
+    PhotonFateAccumulator* fate;
+
+    G4OnlyRunAction(HitAccumulator* acc, PhotonFateAccumulator* f = nullptr) :
+        accumulator(acc),
+        fate(f)
+    {
+    }
+
+    void EndOfRunAction(const G4Run*) override
+    {
+        if (G4Threading::IsMasterThread() || !G4Threading::IsMultithreadedApplication())
+        {
+            G4cout << "G4: Total accumulated hits: " << accumulator->hits.size() << G4endl;
+            accumulator->Save("g4_hits.npy");
+            if (fate)
+            {
+                G4cout << "G4: Total photon fates: " << fate->photons.size() << G4endl;
+                fate->Save("g4_photon.npy");
+            }
+        }
+    }
+};
+
+// ---- Action Initialization (required for MT) ----
+
+struct G4OnlyActionInitialization : G4VUserActionInitialization
+{
+    gphox::Config          cfg;
+    HitAccumulator*        accumulator;
+    PhotonFateAccumulator* fate;
+    int                    photons_per_event;
+    int                    num_events;
+    bool                   aligned;
+
+    G4OnlyActionInitialization(const gphox::Config& cfg, HitAccumulator* acc,
+                               PhotonFateAccumulator* f,
+                               int photons_per_event, int num_events,
+                               bool aligned_ = false) :
+        cfg(cfg),
+        accumulator(acc),
+        fate(f),
+        photons_per_event(photons_per_event),
+        num_events(num_events),
+        aligned(aligned_)
+    {
+    }
+
+    void BuildForMaster() const override
+    {
+        SetUserAction(new G4OnlyRunAction(accumulator, fate));
+    }
+
+    void Build() const override
+    {
+        SetUserAction(new G4OnlyPrimaryGenerator(cfg, photons_per_event));
+        SetUserAction(new G4OnlyEventAction(num_events));
+        SetUserAction(new G4OnlyRunAction(accumulator, fate));
+        SetUserAction(new G4OnlySteppingAction(fate, aligned));
+        if (aligned)
+            SetUserAction(new G4OnlyTrackingAction());
+    }
+};
diff --git a/sysrap/SEventConfig.cc b/sysrap/SEventConfig.cc
index 50eb28428..74d4c9654 100644
--- a/sysrap/SEventConfig.cc
+++ b/sysrap/SEventConfig.cc
@@ -774,7 +774,7 @@ void SEventConfig::LIMIT_Check()
    //assert( _MaxBounce >= 0 && _MaxBounce <  LIMIT ) ;
    // MaxBounce should not in principal be limited
 
-   assert( _MaxRecord >= 0 && _MaxRecord <= RecordLimit() ) ;
+   assert(_MaxRecord >= 0); // RecordLimit relaxed to allow large record arrays for step analysis
    assert( _MaxRec    >= 0 && _MaxRec    <= RecordLimit() ) ;
    assert( _MaxPrd    >= 0 && _MaxPrd    <= RecordLimit() ) ;
 
@@ -1562,7 +1562,8 @@ void SEventConfig::Initialize_Comp_Simulate_(unsigned& gather_mask, unsigned& sa
         else if(IsDebugLite())
         {
             SEventConfig::SetMaxRec(0);
-            SEventConfig::SetMaxRecord(record_limit);
+            int env_max_record = ssys::getenvint(kMaxRecord, 0);
+            SEventConfig::SetMaxRecord(env_max_record > 0 ? env_max_record : record_limit);
             SEventConfig::SetMaxSeq(1);  // formerly incorrectly set to max_bounce+1
         }
 
diff --git a/tools/generate_precooked_rng.cu b/tools/generate_precooked_rng.cu
new file mode 100644
index 000000000..43b750e0d
--- /dev/null
+++ b/tools/generate_precooked_rng.cu
@@ -0,0 +1,139 @@
+/**
+generate_precooked_rng.cu
+==========================
+
+Generates precooked curand Philox sample arrays for U4Random aligned mode.
+Each photon gets its own pre-evaluated random-number stream matching what
+the GPU simulation would draw at the same step.
+
+Relation to QCurandStateMonolithicTest
+---------------------------------------
+This tool is NOT a replacement for QCurandStateMonolithicTest — they
+produce different artifacts for different consumers:
+
+    QCurandStateMonolithicTest (qudarap/tests/)
+        - Output: curand internal STATE structs (Philox4_32_10_t)
+                  ~/.opticks/rngcache/RNG/QCurandState_<N>_0_0.bin
+        - Consumer: GPU simulation initialisation. The kernel picks up a
+                    state and starts drawing from it.
+        - Purpose: pre-seeded RNG state for reproducible GPU runs.
+
+    generate_precooked_rng (this tool)
+        - Output: pre-evaluated random SAMPLES (float32 NumPy array)
+                  ~/.opticks/precooked/QSimTest/rng_sequence/.../
+                      rng_sequence_f_ni<NI>_nj<NJ>_nk<NK>_ioffset000000.npy
+        - Consumer: U4Random in --aligned mode (CPU-side G4). G4 reads
+                    sample i from the array on its i-th draw, byte-for-byte
+                    matching whatever the GPU kernel would have drawn at
+                    that step.
+        - Purpose: photon-by-photon GPU<->G4 alignment.
+
+States vs samples; GPU init vs G4 alignment. Both are needed for the
+full validation workflow.
+
+Build:
+    nvcc -o generate_precooked_rng tools/generate_precooked_rng.cu \
+         -I. -I/opt/eic-opticks/include/eic-opticks -lcurand -std=c++17
+
+Usage:
+    ./generate_precooked_rng [num_photons] [num_randoms_per_photon]
+    Defaults: 100000 photons, 256 randoms each (nj=16, nk=16)
+
+Output:
+    ~/.opticks/precooked/QSimTest/rng_sequence/
+        rng_sequence_f_ni<NI>_nj<NJ>_nk<NK>_tranche<NI>/
+            rng_sequence_f_ni<NI>_nj<NJ>_nk<NK>_ioffset000000.npy
+*/
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <sys/stat.h>
+
+#include <curand_kernel.h>
+#include "sysrap/NP.hh"
+
+__global__ void generate_sequences(float* out, unsigned ni, unsigned nv, unsigned id_offset)
+{
+    unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= ni) return;
+
+    unsigned photon_idx = id_offset + idx;
+
+    // Match GPU simulation: curand_init(seed=0, subsequence=photon_idx, offset=0)
+    curandStatePhilox4_32_10_t rng;
+    curand_init(0ULL, (unsigned long long)photon_idx, 0ULL, &rng);
+
+    float* row = out + idx * nv;
+    for (unsigned j = 0; j < nv; j++)
+        row[j] = curand_uniform(&rng);
+}
+
+static void mkdirp(const char* path)
+{
+    char tmp[1024];
+    snprintf(tmp, sizeof(tmp), "%s", path);
+    for (char* p = tmp + 1; *p; p++)
+    {
+        if (*p == '/') { *p = 0; mkdir(tmp, 0755); *p = '/'; }
+    }
+    mkdir(tmp, 0755);
+}
+
+int main(int argc, char** argv)
+{
+    unsigned ni = 100000;
+    unsigned nj = 16;
+    unsigned nk = 16;
+
+    if (argc > 1) ni = atoi(argv[1]);
+    if (argc > 2)
+    {
+        unsigned total = atoi(argv[2]);
+        nj = 1; nk = total;
+        for (unsigned f = 2; f * f <= total; f++)
+        {
+            if (total % f == 0 && f <= 64) { nj = f; nk = total / f; }
+        }
+    }
+
+    unsigned nv = nj * nk;
+    printf("Generating precooked curand Philox sequences:\n");
+    printf("  photons: %u, randoms/photon: %u (nj=%u, nk=%u), memory: %.1f MB\n",
+           ni, nv, nj, nk, (double)ni * nv * sizeof(float) / (1024 * 1024));
+
+    const char* home = getenv("HOME");
+    char dirpath[512], filename[256], fullpath[768];
+
+    snprintf(dirpath, sizeof(dirpath),
+        "%s/.opticks/precooked/QSimTest/rng_sequence/rng_sequence_f_ni%u_nj%u_nk%u_tranche%u",
+        home, ni, nj, nk, ni);
+    mkdirp(dirpath);
+
+    snprintf(filename, sizeof(filename),
+        "rng_sequence_f_ni%u_nj%u_nk%u_ioffset%06u.npy", ni, nj, nk, 0);
+    snprintf(fullpath, sizeof(fullpath), "%s/%s", dirpath, filename);
+
+    float* d_out = nullptr;
+    cudaMalloc(&d_out, (size_t)ni * nv * sizeof(float));
+
+    unsigned threads = 256;
+    unsigned blocks = (ni + threads - 1) / threads;
+    generate_sequences<<<blocks, threads>>>(d_out, ni, nv, 0);
+    cudaDeviceSynchronize();
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) { fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err)); return 1; }
+
+    NP* seq = NP::Make<float>(ni, nj, nk);
+    cudaMemcpy(seq->values<float>(), d_out, (size_t)ni * nv * sizeof(float), cudaMemcpyDeviceToHost);
+    cudaFree(d_out);
+
+    seq->save(fullpath);
+    printf("Saved: %s\n", fullpath);
+    printf("Set OPTICKS_RANDOM_SEQPATH=%s\n", fullpath);
+
+    delete seq;
+    return 0;
+}