From c526baa3d37f8e69e6e52ae5f44268958e47732b Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Thu, 26 Feb 2026 19:26:11 -0800
Subject: [PATCH 1/2] cache LiDAR point clouds in zarr via sphere geometry
 serialization

Sphere primitives (point clouds) now serialize to/from zarr alongside
triangle meshes and curves. place_pointcloud(s) stores baked data in
_baked_meshes, save_meshes writes type:"sphere" groups with centers/
radii/colors arrays, and load_meshes reconstructs sphere GAS entries.
nyc_lidar.py skips LAZ decompression on subsequent runs via zarr cache.
---
 examples/nyc_lidar.py | 104 +++++++++++++++++
 rtxpy/accessor.py     | 260 +++++++++++++++++++++++++++++++++++++-----
 rtxpy/engine.py       |  31 ++++-
 rtxpy/mesh_store.py   |  84 +++++++++++++-
 4 files changed, 442 insertions(+), 37 deletions(-)
 create mode 100644 examples/nyc_lidar.py

diff --git a/examples/nyc_lidar.py b/examples/nyc_lidar.py
new file mode 100644
index 0000000..60cde30
--- /dev/null
+++ b/examples/nyc_lidar.py
@@ -0,0 +1,104 @@
+"""
+NYC Real LiDAR Point Cloud Demo
+================================
+
+Downloads real USGS 3DEP LiDAR LAZ tiles from the TNM API for
+Manhattan and visualizes them over the NYC DEM using OptiX sphere
+primitives with ASPRS classification coloring.
+
+First run downloads ~2.7 GB of LAZ data; subsequent runs use cache.
+LAZ decompression is parallelized across CPU cores.
+
+LiDAR point clouds and buildings are cached in the zarr store after
+the first run. Subsequent runs skip all LAZ decompression, filtering,
+thinning, and coordinate transforms — loading directly from zarr.
+
+Usage:
+    python examples/nyc_lidar.py
+"""
+
+from pathlib import Path
+
+import numpy as np
+import xarray as xr
+import zarr
+
+import rtxpy
+
+
+def _has_lidar_cache(zarr_path):
+    """Check if the zarr store has cached LiDAR sphere geometries."""
+    try:
+        store = zarr.open(str(zarr_path), mode='r', use_consolidated=False)
+        if 'meshes' not in store:
+            return False
+        mg = store['meshes']
+        for gid in mg:
+            if mg[gid].attrs.get('type', '') == 'sphere':
+                return True
+    except Exception:
+        pass
+    return False
+
+
+def main():
+    # Full NYC DEM extent in WGS84
+    dem_bounds = (-74.26, 40.49, -73.70, 40.92)
+    # Manhattan LiDAR subset
+    lidar_bounds = (-74.02, 40.70, -73.97, 40.88)
+    cache_dir = Path('examples/cache/lidar')
+    zarr_path = 'examples/nyc_dem.zarr'
+
+    # Load NYC DEM (write CRS back — rioxarray doesn't auto-detect from zarr)
+    ds = xr.open_zarr(zarr_path)
+    terrain = ds['elevation'].load().astype(np.float32)
+    terrain = terrain.rio.write_crs('EPSG:32618')
+    terrain = terrain.rtx.to_cupy()
+
+    # Build terrain mesh
+    terrain.rtx.triangulate()
+
+    # Satellite tiles
+    terrain.rtx.place_tiles('satellite')
+
+    # Check for cached lidar + buildings in zarr
+    if _has_lidar_cache(zarr_path):
+        print("Loading cached meshes from zarr...")
+        terrain.rtx.load_meshes(zarr_path)
+    else:
+        # Use cached LAZ files if available, otherwise download
+        cached = sorted(cache_dir.glob('*.laz')) if cache_dir.exists() else []
+        if cached:
+            laz_paths = cached
+            print(f"Using {len(laz_paths)} cached LAZ file(s)")
+        else:
+            laz_paths = rtxpy.fetch_lidar(lidar_bounds, cache_dir=str(cache_dir))
+        print(f"Got {len(laz_paths)} LAZ file(s)")
+
+        # Buildings from Overture Maps (full DEM extent)
+        buildings = rtxpy.fetch_buildings(
+            bounds=dem_bounds, source='overture',
+            cache_path='examples/cache/nyc_lidar_buildings.geojson')
+        terrain.rtx.place_buildings(buildings)
+
+        # Place all LAZ tiles in parallel (threaded LAZ decompression)
+        # ASPRS classes: 1=unclassified, 2=ground, 10=rail, 17=bridge
+        # Exclude: 7=low noise, 9=water, 18=high noise
+        # thin=0.5 removes flight-line overlap striations (1 point per 0.5m cell)
+        terrain.rtx.place_pointclouds(
+            laz_paths,
+            point_size=0.25,
+            color='classification',
+            classification=[1, 2, 10, 17],
+            thin=0.5,
+        )
+
+        # Save all meshes (buildings + lidar) to zarr for next run
+        terrain.rtx.save_meshes(zarr_path)
+
+    # Launch interactive viewer
+    terrain.rtx.explore(width=1600, height=1200)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/rtxpy/accessor.py b/rtxpy/accessor.py
index 688cb8a..120c400 100644
--- a/rtxpy/accessor.py
+++ b/rtxpy/accessor.py
@@ -413,11 +413,16 @@ def save_meshes(self, zarr_path):
             elev_chunks = elev.metadata.chunk_grid.chunk_shape
         store = None
 
-        # Separate triangle meshes (3-tuple) from curves (4-tuple)
+        # Separate triangle meshes, curves, and spheres
         meshes = {}
         curves = {}
+        spheres = {}
         for gid, baked in self._baked_meshes.items():
-            if len(baked) == 4:
+            entry = self._rtx._geom_state.gas_entries.get(gid)
+            if entry and entry.is_sphere:
+                # sphere geometry: (centers, radii, colors, base_z)
+                spheres[gid] = (baked[0], baked[1], baked[2])
+            elif len(baked) == 4:
                 # curve geometry: (verts, widths, indices, base_z)
                 curves[gid] = (baked[0], baked[1], baked[2])
             else:
@@ -431,6 +436,7 @@ def save_meshes(self, zarr_path):
             elevation_shape=elev_shape,
             elevation_chunks=elev_chunks,
             curves=curves,
+            spheres=spheres,
         )
 
     def load_meshes(self, zarr_path, chunks=None):
@@ -450,7 +456,7 @@ def load_meshes(self, zarr_path, chunks=None):
         """
         from .mesh_store import load_meshes_from_zarr
 
-        meshes, colors, meta, curves = load_meshes_from_zarr(
+        meshes, colors, meta, curves, spheres = load_meshes_from_zarr(
             zarr_path, chunks=chunks)
 
         psx, psy = meta['pixel_spacing']
@@ -498,6 +504,29 @@ def load_meshes(self, zarr_path, chunks=None):
                                        indices.copy(), base_z)
             loaded_curves.append(gid)
 
+        # Load sphere geometries (point clouds)
+        loaded_spheres = []
+        total_sphere_pts = 0
+        for gid, (centers, radii, sph_colors) in spheres.items():
+            if len(centers) == 0:
+                continue
+            # Determine uniform vs per-point radii
+            unique_r = np.unique(radii)
+            r_arg = float(unique_r[0]) if len(unique_r) == 1 else radii
+            self._rtx.add_sphere_geometry(
+                gid, centers, r_arg, colors=sph_colors if len(sph_colors) > 0 else None)
+            self._geometry_colors[gid] = colors.get(gid, (0.5, 0.5, 0.5, 5.0))
+
+            # Compute base_z for VE rescaling
+            vx = centers[0::3]
+            vy = centers[1::3]
+            base_z = self._bilinear_terrain_z(
+                terrain_data_np, vx, vy, psx, psy)
+            self._baked_meshes[gid] = (
+                centers.copy(), radii.copy(), sph_colors.copy(), base_z)
+            loaded_spheres.append(gid)
+            total_sphere_pts += len(radii)
+
         self._geometry_colors_dirty = True
         n_tris = sum(len(meshes[g][1]) // 3 for g in loaded)
         n_segs = sum(len(curves[g][2]) for g in loaded_curves)
@@ -506,7 +535,9 @@ def load_meshes(self, zarr_path, chunks=None):
             parts.append(f"{n_tris:,} triangles")
         if loaded_curves:
             parts.append(f"{n_segs:,} curve segments")
-        total = len(loaded) + len(loaded_curves)
+        if loaded_spheres:
+            parts.append(f"{total_sphere_pts:,} sphere points")
+        total = len(loaded) + len(loaded_curves) + len(loaded_spheres)
         print(f"Loaded {total} mesh geometries ({', '.join(parts)}) "
               f"from {zarr_path}")
 
@@ -816,6 +847,7 @@ def place_pointcloud(self, source, geometry_id='pointcloud',
                          point_size=1.0, color='elevation',
                          classification=None, returns=None,
                          bounds=None, subsample=1, max_points=None,
+                         thin=None,
                          snap_to_terrain=False, colormap=None):
         """
         Place a lidar point cloud in the scene as sphere primitives.
@@ -863,12 +895,17 @@ def place_pointcloud(self, source, geometry_id='pointcloud',
             bounds=bounds,
             subsample=subsample,
             max_points=max_points,
+            thin=thin,
         )
 
         if len(centers) == 0:
             return {'num_points': 0, 'geometry_id': geometry_id, 'bounds': None}
 
-        terrain = self._obj.values
+        terrain = self._obj.data
+        try:
+            terrain = terrain.get()  # cupy → numpy
+        except AttributeError:
+            terrain = np.asarray(terrain)
         H, W = terrain.shape
 
         # Convert coordinates to terrain pixel space if needed
@@ -886,24 +923,28 @@ def place_pointcloud(self, source, geometry_id='pointcloud',
             y_coords = self._obj.coords['y'].values
             psy = abs(float(y_coords[1] - y_coords[0]))
 
-        # If coords look like they're in a projected CRS (large values),
-        # convert to pixel space
+        # Convert projected-CRS coordinates to the terrain mesh coordinate
+        # system.  triangulate() builds vertices at (col*psx, row*psy, elev)
+        # so we need world-offset coords, not pixel coords.
         if psx != 1.0 or psy != 1.0:
             x_origin = float(self._obj.coords['x'].values[0])
             y_origin = float(self._obj.coords['y'].values[0])
-            centers[:, 0] = (centers[:, 0] - x_origin) / psx
-            centers[:, 1] = (centers[:, 1] - y_origin) / psy
+            centers[:, 0] = centers[:, 0] - x_origin
+            # Y axis may be inverted (descending = north-up raster convention)
+            y_ascending = (self._obj.coords['y'].values[-1]
+                           > self._obj.coords['y'].values[0])
+            if y_ascending:
+                centers[:, 1] = centers[:, 1] - y_origin
+            else:
+                centers[:, 1] = y_origin - centers[:, 1]
+            # Z stays in raw meters — matches terrain mesh Z
 
         # Snap Z to terrain if requested
         if snap_to_terrain:
             for i in range(len(centers)):
                 vx, vy = centers[i, 0], centers[i, 1]
-                z = self._bilinear_terrain_z(terrain, vx, vy, 1.0, 1.0)
+                z = self._bilinear_terrain_z(terrain, vx, vy, psx, psy)
                 centers[i, 2] = z
-        else:
-            # Scale Z by pixel spacing to match terrain units
-            if psx != 1.0:
-                centers[:, 2] = centers[:, 2] / psx
 
         # Build per-point colors
         point_colors = build_colors(
@@ -926,9 +967,19 @@ def place_pointcloud(self, source, geometry_id='pointcloud',
         if result != 0:
             raise RuntimeError(f"Failed to add sphere geometry '{geometry_id}'")
 
-        # Set geometry color to a marker value so the render kernel
-        # knows to use per-point colors (alpha > 0 triggers solid color)
-        self._geometry_colors[geometry_id] = (0.5, 0.5, 0.5)
+        # Alpha=5.0 signals per-point color lookup in the shade kernel
+        self._geometry_colors[geometry_id] = (0.5, 0.5, 0.5, 5.0)
+
+        # Store baked data for zarr caching and VE rescaling
+        n_pts = len(centers)
+        radii_arr = (np.full(n_pts, radii, dtype=np.float32)
+                     if np.isscalar(radii)
+                     else np.asarray(radii, dtype=np.float32))
+        vx, vy = centers[:, 0], centers[:, 1]
+        base_z = self._bilinear_terrain_z(terrain, vx, vy, psx, psy)
+        self._baked_meshes[geometry_id] = (
+            centers.ravel().copy(), radii_arr.copy(),
+            point_colors.ravel().copy(), base_z)
 
         point_bounds = (
             float(centers[:, 0].min()), float(centers[:, 1].min()),
@@ -943,6 +994,163 @@ def place_pointcloud(self, source, geometry_id='pointcloud',
             'bounds': point_bounds,
         }
 
+    def place_pointclouds(self, sources, geometry_id_prefix='lidar',
+                          point_size=1.0, color='elevation',
+                          classification=None, returns=None,
+                          bounds=None, subsample=1, max_points=None,
+                          thin=None,
+                          snap_to_terrain=False, colormap=None,
+                          workers=None):
+        """
+        Place multiple point cloud files in parallel.
+
+        Decompresses LAZ files concurrently using a process pool, then
+        adds each to the scene sequentially (GPU work is serial).
+
+        Args:
+            sources: List of file paths (str or Path) to LAS/LAZ files.
+            geometry_id_prefix: Prefix for geometry IDs. Each tile gets
+                ``'{prefix}_{i}'``.  Default ``'lidar'``.
+            point_size: Sphere radius in world units.
+            color: Color mode (see ``place_pointcloud``).
+            classification: ASPRS class filter (see ``place_pointcloud``).
+            returns: Return number filter.
+            bounds: Spatial crop (xmin, ymin, xmax, ymax).
+            subsample: Keep every Nth point.
+            max_points: Max points per tile.
+            snap_to_terrain: Replace Z with DEM elevation.
+            colormap: Custom color LUT.
+            workers: Number of parallel loader processes.
+                Default ``None`` uses ``min(len(sources), os.cpu_count())``.
+
+        Returns:
+            List of result dicts (same format as ``place_pointcloud``).
+        """
+        import os
+        from concurrent.futures import ProcessPoolExecutor, as_completed
+        from .pointcloud import load_pointcloud, build_colors
+
+        sources = [str(s) for s in sources]
+        if not sources:
+            return []
+
+        if workers is None:
+            workers = min(len(sources), os.cpu_count() or 4)
+
+        # --- Phase 1: Parallel LAZ decompression (CPU-bound) ---
+        load_kwargs = dict(
+            classification=classification,
+            returns=returns,
+            bounds=bounds,
+            subsample=subsample,
+            max_points=max_points,
+            thin=thin,
+        )
+
+        loaded = [None] * len(sources)
+        print(f"Loading {len(sources)} tiles with {workers} workers...")
+        with ProcessPoolExecutor(max_workers=workers) as pool:
+            futures = {pool.submit(load_pointcloud, s, **load_kwargs): i
+                       for i, s in enumerate(sources)}
+            done = 0
+            for future in as_completed(futures):
+                i = futures[future]
+                loaded[i] = future.result()
+                done += 1
+                if done % 10 == 0 or done == len(sources):
+                    print(f"  Loaded {done}/{len(sources)} tiles")
+
+        # --- Pre-compute coordinate transform params (once) ---
+        terrain = self._obj.data
+        try:
+            terrain = terrain.get()
+        except AttributeError:
+            terrain = np.asarray(terrain)
+
+        psx, psy = 1.0, 1.0
+        if 'x' in self._obj.coords and len(self._obj.coords['x']) > 1:
+            x_coords = self._obj.coords['x'].values
+            psx = abs(float(x_coords[1] - x_coords[0]))
+        if 'y' in self._obj.coords and len(self._obj.coords['y']) > 1:
+            y_coords = self._obj.coords['y'].values
+            psy = abs(float(y_coords[1] - y_coords[0]))
+
+        has_proj = psx != 1.0 or psy != 1.0
+        if has_proj:
+            x_origin = float(self._obj.coords['x'].values[0])
+            y_origin = float(self._obj.coords['y'].values[0])
+            y_ascending = (self._obj.coords['y'].values[-1]
+                           > self._obj.coords['y'].values[0])
+
+        # --- Phase 2: Sequential coord transform + GPU upload ---
+        results = []
+        total_points = 0
+        for i, (centers, attributes) in enumerate(loaded):
+            gid = f"{geometry_id_prefix}_{i}"
+
+            if len(centers) == 0:
+                results.append(
+                    {'num_points': 0, 'geometry_id': gid, 'bounds': None})
+                continue
+
+            # Coordinate conversion
+            if has_proj:
+                centers[:, 0] = centers[:, 0] - x_origin
+                if y_ascending:
+                    centers[:, 1] = centers[:, 1] - y_origin
+                else:
+                    centers[:, 1] = y_origin - centers[:, 1]
+
+            if snap_to_terrain:
+                for j in range(len(centers)):
+                    vx, vy = centers[j, 0], centers[j, 1]
+                    centers[j, 2] = self._bilinear_terrain_z(
+                        terrain, vx, vy, psx, psy)
+
+            point_colors = build_colors(
+                centers, attributes, color_mode=color, colormap=colormap)
+
+            radii = float(point_size) if np.isscalar(point_size) else \
+                np.asarray(point_size, dtype=np.float32)
+
+            res = self._rtx.add_sphere_geometry(
+                gid, centers.ravel(), radii,
+                colors=point_colors.ravel())
+            if res != 0:
+                raise RuntimeError(
+                    f"Failed to add sphere geometry '{gid}'")
+
+            self._geometry_colors[gid] = (0.5, 0.5, 0.5, 5.0)
+            self._geometry_colors_dirty = True
+
+            # Store baked data for zarr caching and VE rescaling
+            n = len(centers)
+            radii_arr = (np.full(n, radii, dtype=np.float32)
+                         if np.isscalar(radii)
+                         else np.asarray(radii, dtype=np.float32))
+            vx, vy = centers[:, 0], centers[:, 1]
+            base_z = self._bilinear_terrain_z(terrain, vx, vy, psx, psy)
+            self._baked_meshes[gid] = (
+                centers.ravel().copy(), radii_arr.copy(),
+                point_colors.ravel().copy(), base_z)
+
+            total_points += n
+            results.append({
+                'num_points': n,
+                'geometry_id': gid,
+                'bounds': (
+                    float(centers[:, 0].min()),
+                    float(centers[:, 1].min()),
+                    float(centers[:, 2].min()),
+                    float(centers[:, 0].max()),
+                    float(centers[:, 1].max()),
+                    float(centers[:, 2].max()),
+                ),
+            })
+
+        print(f"Placed {total_points:,} points from {len(sources)} tiles")
+        return results
+
     _GEOJSON_PALETTE = [
         (0.90, 0.22, 0.20),  # red
         (0.20, 0.56, 0.90),  # blue
@@ -1096,9 +1304,9 @@ def place_geojson(self, geojson, height=10.0,
         # Flat ribbon for LineStrings and Polygon outlines
         use_ribbon = not extrude
         if width is None:
-            width = (self._pixel_spacing_x + self._pixel_spacing_y) * 0.08
+            width = (self._pixel_spacing_x + self._pixel_spacing_y) * 0.04
         # Small hover above terrain; bilinear Z sampling keeps ribbons close
-        ribbon_hover = (self._pixel_spacing_x + self._pixel_spacing_y) * 0.01
+        ribbon_hover = (self._pixel_spacing_x + self._pixel_spacing_y) * 0.005
 
         # Fast path: load pre-computed merged mesh from cache
         if mesh_cache is not None and merge:
@@ -1550,11 +1758,8 @@ def place_buildings(self, geojson, elev_scale=None, default_height_m=8.0,
             GeoJSON FeatureCollection of building footprint polygons
             (e.g. from :func:`rtxpy.fetch_buildings`).
         elev_scale : float, optional
-            Factor applied to real-world heights so they match the scaled
-            terrain.  When *None* (default), auto-computed so that
-            ``default_height_m`` buildings are roughly 2× the pixel
-            spacing — ensuring they are always visible at the terrain's
-            native resolution.
+            Factor applied to real-world heights (metres).  Default is
+            1.0 (no scaling — heights match terrain z units directly).
         default_height_m : float, optional
             Height in metres used when a feature has no ``height`` property.
             Default is 8.0.
@@ -1573,10 +1778,7 @@ def place_buildings(self, geojson, elev_scale=None, default_height_m=8.0,
         from pathlib import Path as _CachePath
 
         if elev_scale is None:
-            avg_spacing = (self._pixel_spacing_x + self._pixel_spacing_y) / 2
-            # Buildings should be ~2× pixel spacing for visibility
-            target_height = avg_spacing * 1
-            elev_scale = max(1.0, target_height / default_height_m)
+            elev_scale = 1.0
 
         # Invalidate mesh cache if height parameters changed
         if mesh_cache is not None:
@@ -1661,7 +1863,7 @@ def place_roads(self, geojson, geometry_id='road', color=None,
                 mesh_cache=mesh_cache,
             )
 
-    def place_water(self, geojson, body_height=0.5, mesh_cache_prefix=None):
+    def place_water(self, geojson, body_height=0.2, mesh_cache_prefix=None):
         """Classify and place water features as coloured geometry on terrain.
 
         Splits the GeoJSON into three categories based on the ``waterway``
diff --git a/rtxpy/engine.py b/rtxpy/engine.py
index 0f51410..145c8ff 100644
--- a/rtxpy/engine.py
+++ b/rtxpy/engine.py
@@ -236,7 +236,7 @@ def _load_chunk(self, cr, cc):
         if (cr, cc) in self._cache:
             return
         from .mesh_store import load_meshes_from_zarr
-        meshes, _, _, curves = load_meshes_from_zarr(
+        meshes, _, _, curves, _spheres = load_meshes_from_zarr(
             self._zarr_path, chunks=[(cr, cc)])
         # Merge curves into the same dict with a marker
         combined = {}
@@ -1211,7 +1211,7 @@ def __init__(self, raster, width: int = 800, height: int = 600,
         self.color_stretch = 'linear'
 
         # Ambient occlusion state
-        self.ao_enabled = False
+        self.ao_enabled = True
         self.ao_radius = None  # auto-computed from scene extent
         self.gi_intensity = 2.0  # GI bounce intensity multiplier
         self.gi_bounces = 1  # Number of GI bounces (1=single, 2-3=multi)
@@ -1221,8 +1221,11 @@ def __init__(self, raster, width: int = 800, height: int = 600,
         self._d_ao_accum = None  # GPU accumulation buffer (H, W, 3) float32
         self._prev_cam_state = None  # (position_tuple, yaw, pitch, fov) for dirty detection
 
+        # Eye Dome Lighting state
+        self.edl_enabled = True
+
         # Denoiser state
-        self.denoise_enabled = False
+        self.denoise_enabled = True
         self._prev_cam_for_flow = None  # (pos, forward, right, up, aspect, fov_scale) from prev frame
         self._d_flow = None  # (H, W, 2) float32 motion vectors
 
@@ -1234,7 +1237,7 @@ def __init__(self, raster, width: int = 800, height: int = 600,
         # Tile overlay settings
         self._tile_service = None
         self._tiles_enabled = False
-        self._geometry_layer_idx = 0  # Start at 'none'
+        self._geometry_layer_idx = 1  # Start at 'all'
 
         # Viewshed settings
         self.viewshed_enabled = False
@@ -3155,6 +3158,16 @@ def _handle_key_press(self, raw_key, key):
             self._toggle_wind()
             return
 
+        # Toggle terrain visibility: Shift+E
+        if raw_key == 'E':
+            entry = self.rtx._geom_state.gas_entries.get('terrain')
+            if entry is not None:
+                vis = not entry.visible
+                self.rtx.set_geometry_visible('terrain', vis)
+                print(f"Terrain {'shown' if vis else 'hidden'}")
+                self._needs_render = True
+            return
+
         # GTFS-RT realtime vehicle toggle: Shift+B
         if raw_key == 'B':
             self._toggle_gtfs_rt()
@@ -3181,6 +3194,13 @@ def _handle_key_press(self, raw_key, key):
             self._update_frame()
             return
 
+        # Eye Dome Lighting toggle: Shift+H
+        if raw_key == 'H':
+            self.edl_enabled = not self.edl_enabled
+            print(f"Eye Dome Lighting: {'ON' if self.edl_enabled else 'OFF'}")
+            self._update_frame()
+            return
+
         # Drone glow toggle: Shift+L
         if raw_key == 'L':
             self._drone_glow = not self._drone_glow
@@ -4814,6 +4834,7 @@ def _render_frame(self):
             focal_distance=dof_focal,
             edge_strength=0.2,
             edge_color=(0.15, 0.13, 0.10),
+            edl=self.edl_enabled,
             bloom=not defer_post,
             tone_map=not defer_post,
             _return_gpu=True,
@@ -5358,6 +5379,7 @@ def _render_help_text(self):
                 ("B", "Toggle TIN / Voxel"),
                 ("T", "Toggle shadows"),
                 ("Shift+T", "Cycle time of day"),
+                ("Shift+E", "Toggle terrain"),
             ]),
             ("DATA LAYERS", [
                 ("Shift+F", "FIRMS fire (7d)"),
@@ -5367,6 +5389,7 @@ def _render_help_text(self):
         col_right = [
             ("RENDERING", [
                 ("0", "Toggle ambient occlusion"),
+                ("Shift+H", "Toggle EDL lighting"),
                 ("Shift+G", "Cycle GI bounces (1-3)"),
                 ("Shift+D", "Toggle AI denoiser"),
                 ("9", "Toggle depth of field"),
diff --git a/rtxpy/mesh_store.py b/rtxpy/mesh_store.py
index d2fd772..62b08ca 100644
--- a/rtxpy/mesh_store.py
+++ b/rtxpy/mesh_store.py
@@ -55,7 +55,7 @@ def chunks_for_pixel_window(yi0, yi1, xi0, xi1, chunk_h, chunk_w):
 
 def save_meshes_to_zarr(zarr_path, meshes, colors, pixel_spacing,
                         elevation_shape, elevation_chunks,
-                        curves=None):
+                        curves=None, spheres=None):
     """Save mesh geometries into a zarr store, spatially partitioned by chunk.
 
     Parameters
@@ -76,6 +76,9 @@ def save_meshes_to_zarr(zarr_path, meshes, colors, pixel_spacing,
     curves : dict, optional
         ``{geometry_id: (vertices_flat, widths_flat, indices_flat)}`` for
         B-spline curve tube geometries (roads, water features).
+    spheres : dict, optional
+        ``{geometry_id: (centers_flat, radii, colors_flat)}`` for sphere
+        primitive geometries (LiDAR point clouds).
     """
     store = zarr.open(str(zarr_path), mode='r+')
 
@@ -216,7 +219,38 @@ def save_meshes_to_zarr(zarr_path, meshes, colors, pixel_spacing,
                     chunks=(len(local_indices),),
                 )
 
-    total = len(meshes) + (len(curves) if curves else 0)
+    # --- Sphere geometries (point clouds) ---
+    if spheres:
+        for gid, (centers, radii, sph_colors) in spheres.items():
+            centers = np.asarray(centers, dtype=np.float32)
+            radii = np.asarray(radii, dtype=np.float32)
+            sph_colors = np.asarray(sph_colors, dtype=np.float32)
+
+            gg = mg.create_group(gid)
+            c = colors.get(gid, (0.5, 0.5, 0.5, 5.0))
+            gg.attrs['color'] = list(c)
+            gg.attrs['type'] = 'sphere'
+
+            if len(centers) == 0:
+                continue
+
+            # Single chunk per geometry (already spatially partitioned by tile)
+            chunk_grp = gg.create_group('0_0')
+            chunk_grp.create_array(
+                'centers', data=centers,
+                chunks=(len(centers),),
+            )
+            chunk_grp.create_array(
+                'radii', data=radii,
+                chunks=(len(radii),),
+            )
+            chunk_grp.create_array(
+                'colors', data=sph_colors,
+                chunks=(len(sph_colors),),
+            )
+
+    total = len(meshes) + (len(curves) if curves else 0) + \
+        (len(spheres) if spheres else 0)
     print(f"Saved {total} mesh geometries to {zarr_path}/meshes/")
 
 
@@ -242,6 +276,9 @@ def load_meshes_from_zarr(zarr_path, chunks=None):
     curves : dict
         ``{geometry_id: (vertices_flat, widths_flat, indices_flat)}`` for
         curve geometries.  Empty dict if none stored.
+    spheres : dict
+        ``{geometry_id: (centers_flat, radii, colors_flat)}`` for sphere
+        primitive geometries.  Empty dict if none stored.
     """
     store = zarr.open(str(zarr_path), mode='r', use_consolidated=False)
     mg = store['meshes']
@@ -259,6 +296,7 @@ def load_meshes_from_zarr(zarr_path, chunks=None):
 
     meshes = {}
     curves = {}
+    spheres = {}
     colors = {}
 
     for gid in mg:
@@ -266,7 +304,45 @@ def load_meshes_from_zarr(zarr_path, chunks=None):
         if not hasattr(gg, 'attrs'):
             continue
         colors[gid] = tuple(gg.attrs.get('color', (0.6, 0.6, 0.6)))
-        is_curve = gg.attrs.get('type', '') == 'curve'
+        geom_type = gg.attrs.get('type', '')
+        is_curve = geom_type == 'curve'
+        is_sphere = geom_type == 'sphere'
+
+        if is_sphere:
+            # Sphere geometry: concatenate centers/radii/colors across chunks
+            all_centers = []
+            all_radii = []
+            all_colors = []
+            for key in sorted(gg):
+                if key in ('centers', 'radii', 'colors'):
+                    continue
+                if chunk_set is not None and key not in chunk_set:
+                    continue
+                cg = gg[key]
+                if 'centers' in cg:
+                    all_centers.append(
+                        np.array(cg['centers'], dtype=np.float32))
+                if 'radii' in cg:
+                    all_radii.append(
+                        np.array(cg['radii'], dtype=np.float32))
+                if 'colors' in cg:
+                    all_colors.append(
+                        np.array(cg['colors'], dtype=np.float32))
+
+            if all_centers:
+                spheres[gid] = (
+                    np.concatenate(all_centers),
+                    np.concatenate(all_radii),
+                    np.concatenate(all_colors) if all_colors else
+                    np.empty(0, dtype=np.float32),
+                )
+            else:
+                spheres[gid] = (
+                    np.empty(0, dtype=np.float32),
+                    np.empty(0, dtype=np.float32),
+                    np.empty(0, dtype=np.float32),
+                )
+            continue
 
         all_verts = []
         all_widths = []
@@ -311,4 +387,4 @@ def load_meshes_from_zarr(zarr_path, chunks=None):
                 meshes[gid] = (np.empty(0, dtype=np.float32),
                                np.empty(0, dtype=np.int32))
 
-    return meshes, colors, meta, curves
+    return meshes, colors, meta, curves, spheres

From ed75ce89c104c0229f571c0dc529b947b020fa28 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Thu, 26 Feb 2026 20:40:56 -0800
Subject: [PATCH 2/2] GPU wind splatting, DOF+AO fix, interactive point cloud
 color cycling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Move wind particle splatting from CPU (np.add.at ~5ms) to GPU via
  Numba CUDA kernel with cuda.atomic.add (~0.1ms). Handle NaN terrain
  for ocean pixels so wind displays over water.
- Fix DOF (key 9) requiring AO to be enabled — decouple progressive
  accumulation from AO so DOF works independently or alongside AO.
- Add interactive point cloud color mode cycling (Shift+C) through
  elevation/intensity/classification/rgb. Store point cloud attributes
  on accessor for runtime recoloring via build_colors().
- Fix nyc_lidar.py paths to resolve relative to __file__ so it runs
  from any working directory.
---
 examples/nyc_lidar.py    |   9 +-
 rtxpy/__init__.py        |   2 +-
 rtxpy/accessor.py        |   9 ++
 rtxpy/analysis/render.py | 114 +++++++++++++-
 rtxpy/engine.py          | 329 ++++++++++++++++++++++++++++++++++++---
 rtxpy/pointcloud.py      |  16 +-
 rtxpy/remote_data.py     | 147 +++++++++++++++++
 rtxpy/rtx.py             |  57 ++++++-
 8 files changed, 646 insertions(+), 37 deletions(-)

diff --git a/examples/nyc_lidar.py b/examples/nyc_lidar.py
index 60cde30..fdfd0c8 100644
--- a/examples/nyc_lidar.py
+++ b/examples/nyc_lidar.py
@@ -42,12 +42,15 @@ def _has_lidar_cache(zarr_path):
 
 
 def main():
+    # Resolve paths relative to this script's directory
+    _here = Path(__file__).resolve().parent
+
     # Full NYC DEM extent in WGS84
     dem_bounds = (-74.26, 40.49, -73.70, 40.92)
     # Manhattan LiDAR subset
     lidar_bounds = (-74.02, 40.70, -73.97, 40.88)
-    cache_dir = Path('examples/cache/lidar')
-    zarr_path = 'examples/nyc_dem.zarr'
+    cache_dir = _here / 'cache' / 'lidar'
+    zarr_path = str(_here / 'nyc_dem.zarr')
 
     # Load NYC DEM (write CRS back — rioxarray doesn't auto-detect from zarr)
     ds = xr.open_zarr(zarr_path)
@@ -78,7 +81,7 @@ def main():
         # Buildings from Overture Maps (full DEM extent)
         buildings = rtxpy.fetch_buildings(
             bounds=dem_bounds, source='overture',
-            cache_path='examples/cache/nyc_lidar_buildings.geojson')
+            cache_path=str(_here / 'cache' / 'nyc_lidar_buildings.geojson'))
         terrain.rtx.place_buildings(buildings)
 
         # Place all LAZ tiles in parallel (threaded LAZ decompression)
diff --git a/rtxpy/__init__.py b/rtxpy/__init__.py
index 9983d99..74e1cc7 100644
--- a/rtxpy/__init__.py
+++ b/rtxpy/__init__.py
@@ -28,7 +28,7 @@
 
 # Optional convenience — network helpers with lazy dependency checks
 try:
-    from .remote_data import fetch_dem, fetch_osm, fetch_buildings, fetch_roads, fetch_water, fetch_wind, fetch_firms, fetch_places, fetch_infrastructure, fetch_land_use, fetch_restaurant_grades, fetch_gtfs
+    from .remote_data import fetch_dem, fetch_lidar, fetch_osm, fetch_buildings, fetch_roads, fetch_water, fetch_wind, fetch_firms, fetch_places, fetch_infrastructure, fetch_land_use, fetch_restaurant_grades, fetch_gtfs
 except ImportError:
     pass
 
diff --git a/rtxpy/accessor.py b/rtxpy/accessor.py
index 120c400..c667bd4 100644
--- a/rtxpy/accessor.py
+++ b/rtxpy/accessor.py
@@ -36,6 +36,9 @@ def __init__(self, xarray_obj):
         self._geometry_colors_gpu = None
         # Baked merged meshes for VE rescaling: {geometry_id: (vertices, indices)}
         self._baked_meshes = {}
+        # Point cloud attributes for interactive color mode cycling
+        # {geometry_id: (centers_N3, attributes_dict)}
+        self._pc_attributes = {}
 
     @property
     def _rtx(self):
@@ -970,6 +973,9 @@ def place_pointcloud(self, source, geometry_id='pointcloud',
         # Alpha=5.0 signals per-point color lookup in the shade kernel
         self._geometry_colors[geometry_id] = (0.5, 0.5, 0.5, 5.0)
 
+        # Store attributes for interactive color mode cycling
+        self._pc_attributes[geometry_id] = (centers.copy(), attributes)
+
         # Store baked data for zarr caching and VE rescaling
         n_pts = len(centers)
         radii_arr = (np.full(n_pts, radii, dtype=np.float32)
@@ -1123,6 +1129,9 @@ def place_pointclouds(self, sources, geometry_id_prefix='lidar',
             self._geometry_colors[gid] = (0.5, 0.5, 0.5, 5.0)
             self._geometry_colors_dirty = True
 
+            # Store attributes for interactive color mode cycling
+            self._pc_attributes[gid] = (centers.copy(), attributes)
+
             # Store baked data for zarr caching and VE rescaling
             n = len(centers)
             radii_arr = (np.full(n, radii, dtype=np.float32)
diff --git a/rtxpy/analysis/render.py b/rtxpy/analysis/render.py
index 10f5400..c1c23d3 100644
--- a/rtxpy/analysis/render.py
+++ b/rtxpy/analysis/render.py
@@ -710,7 +710,7 @@ def _generate_reflection_rays_kernel(reflection_rays, primary_rays, primary_hits
             inst_id = instance_ids[idx]
             if inst_id >= 0 and inst_id < geometry_colors.shape[0]:
                 gc_alpha = geometry_colors[inst_id, 3]
-                if gc_alpha >= 2.0:
+                if gc_alpha >= 2.0 and gc_alpha < 5.0:
                     is_water = True
 
             # Check if terrain hit is NaN ocean
@@ -824,6 +824,7 @@ def _shade_terrain_kernel(
     rgb_texture,
     overlay_data, overlay_alpha, overlay_min, overlay_range,
     instance_ids, geometry_colors,
+    primitive_ids, point_colors, point_color_offsets,
     ao_factor, gi_color, gi_intensity,
     reflection_hits, reflection_rays
 ):
@@ -869,6 +870,7 @@ def _shade_terrain_kernel(
             # Alpha encoding: 0 = no override, (0,1] = normal shaded,
             #                  (1,2) = emissive glow (alpha-1 = min lighting floor),
             #                  >=2   = water shader  (alpha-2 = specular strength)
+            #                  >=5   = per-point color (sphere/point cloud)
             inst_id = instance_ids[idx]
             has_color_override = False
             emissive = 0.0
@@ -876,7 +878,25 @@ def _shade_terrain_kernel(
             water_specular = 0.0
             if inst_id >= 0 and inst_id < geometry_colors.shape[0]:
                 gc_alpha = geometry_colors[inst_id, 3]
-                if gc_alpha > 0.0:
+                if gc_alpha >= 5.0:
+                    # Per-point color lookup for sphere geometries
+                    prim_id = primitive_ids[idx]
+                    if (inst_id < point_color_offsets.shape[0]
+                            and point_color_offsets[inst_id] >= 0
+                            and prim_id >= 0):
+                        pc_idx = (point_color_offsets[inst_id] + prim_id) * 4
+                        if pc_idx + 3 < point_colors.shape[0]:
+                            base_r = point_colors[pc_idx]
+                            base_g = point_colors[pc_idx + 1]
+                            base_b = point_colors[pc_idx + 2]
+                            has_color_override = True
+                    if not has_color_override:
+                        # Fallback to geometry color RGB
+                        base_r = geometry_colors[inst_id, 0]
+                        base_g = geometry_colors[inst_id, 1]
+                        base_b = geometry_colors[inst_id, 2]
+                        has_color_override = True
+                elif gc_alpha > 0.0:
                     base_r = geometry_colors[inst_id, 0]
                     base_g = geometry_colors[inst_id, 1]
                     base_b = geometry_colors[inst_id, 2]
@@ -1281,6 +1301,52 @@ def _edge_outline(output, instance_ids, edge_strength=0.6,
         edge_strength, *edge_color)
 
 
+@cuda.jit
+def _edl_kernel(output, depth, height, width, radius, strength):
+    """Eye Dome Lighting: darken pixels at depth discontinuities."""
+    idx = cuda.grid(1)
+    if idx >= height * width:
+        return
+    py = idx // width
+    px = idx % width
+    center_d = depth[idx]
+    if center_d <= 0.0:
+        return
+    log_center = math.log2(center_d)
+    response = 0.0
+    # Sample 8 directions (pi/4 apart)
+    for i in range(8):
+        angle = i * (math.pi / 4.0)
+        # math.floor(x + 0.5) for correct rounding of negative offsets
+        nx = px + int(math.floor(radius * math.cos(angle) + 0.5))
+        ny = py + int(math.floor(radius * math.sin(angle) + 0.5))
+        if 0 <= nx < width and 0 <= ny < height:
+            nd = depth[ny * width + nx]
+            if nd > 0.0:
+                dd = log_center - math.log2(nd)
+                if dd > 0.0:
+                    response += dd
+            else:
+                # Empty/sky neighbor — silhouette edge, add fixed contribution
+                response += 0.5
+        else:
+            # Out-of-bounds — treat as silhouette edge
+            response += 0.5
+    shade = math.exp(-response * strength)
+    output[py, px, 0] *= shade
+    output[py, px, 1] *= shade
+    output[py, px, 2] *= shade
+
+
+def _edl(output, depth, width, height, radius=2.0, strength=0.7):
+    """Apply Eye Dome Lighting post-process for depth-edge enhancement."""
+    num_pixels = height * width
+    threadsperblock = 256
+    blockspergrid = (num_pixels + threadsperblock - 1) // threadsperblock
+    _edl_kernel[blockspergrid, threadsperblock](
+        output, depth, height, width, radius, strength)
+
+
 @cuda.jit
 def _compute_flow_kernel(flow_out, primary_rays, primary_hits,
                          width, height,
@@ -1474,6 +1540,7 @@ def _shade_terrain(
     overlay_data=None, overlay_alpha=0.5,
     overlay_min=0.0, overlay_range=1.0,
     instance_ids=None, geometry_colors=None,
+    primitive_ids=None, point_colors=None, point_color_offsets=None,
     ao_factor=None, gi_color=None, gi_intensity=2.0,
     reflection_hits=None, reflection_rays=None,
     albedo_out=None,
@@ -1511,6 +1578,24 @@ def _shade_terrain(
             _DUMMY_1x4 = cupy.zeros((1, 4), dtype=np.float32)
         geometry_colors = _DUMMY_1x4
 
+    # Handle per-point colors for sphere geometries
+    global _DUMMY_PC_PRIMS, _DUMMY_PC_PRIMS_SIZE
+    global _DUMMY_PC_COLORS, _DUMMY_PC_OFFSETS
+    if primitive_ids is None:
+        if not hasattr(_shade_terrain, '_dpc_prims') or \
+                _shade_terrain._dpc_prims_size != num_rays:
+            _shade_terrain._dpc_prims = cupy.full(num_rays, -1, dtype=cupy.int32)
+            _shade_terrain._dpc_prims_size = num_rays
+        primitive_ids = _shade_terrain._dpc_prims
+    if point_colors is None:
+        if not hasattr(_shade_terrain, '_dpc_colors'):
+            _shade_terrain._dpc_colors = cupy.zeros(4, dtype=cupy.float32)
+        point_colors = _shade_terrain._dpc_colors
+    if point_color_offsets is None:
+        if not hasattr(_shade_terrain, '_dpc_offsets'):
+            _shade_terrain._dpc_offsets = cupy.full(1, -1, dtype=cupy.int32)
+        point_color_offsets = _shade_terrain._dpc_offsets
+
     # Handle AO factor - cached all-ones when disabled
     global _DUMMY_AO_ONES, _DUMMY_AO_SIZE
     if ao_factor is None:
@@ -1558,6 +1643,7 @@ def _shade_terrain(
         rgb_texture,
         overlay_data, overlay_alpha, overlay_min, overlay_range,
         instance_ids, geometry_colors,
+        primitive_ids, point_colors, point_color_offsets,
         ao_factor, gi_color, np.float32(gi_intensity),
         reflection_hits, reflection_rays
     )
@@ -1598,6 +1684,7 @@ def __init__(self):
         self.output = None
         self.albedo = None
         self.instance_ids = None
+        self.primitive_ids = None
         self.ao_rays = None
         self.ao_hits = None
         self.gi_color = None
@@ -1619,6 +1706,7 @@ def get(self, width, height, shadows, alpha, need_instance_ids, ao=False):
             self.output = cupy.zeros((height, width, num_channels), dtype=np.float32)
             self.albedo = cupy.zeros((height, width, 3), dtype=np.float32)
             self.instance_ids = cupy.full(num_rays, -1, dtype=cupy.int32)
+            self.primitive_ids = cupy.full(num_rays, -1, dtype=cupy.int32)
             if ao:
                 self.ao_rays = cupy.empty((num_rays, 8), dtype=np.float32)
                 self.ao_hits = cupy.empty((num_rays, 4), dtype=np.float32)
@@ -1701,6 +1789,9 @@ def render(
     edge_lines: bool = True,
     edge_strength: float = 0.6,
     edge_color: Tuple[float, float, float] = (0.05, 0.05, 0.05),
+    edl: bool = True,
+    edl_strength: float = 0.7,
+    edl_radius: float = 2.0,
     _return_gpu: bool = False,
 ) -> np.ndarray:
     """Render terrain with a perspective camera for movie-quality visualization.
@@ -1923,8 +2014,10 @@ def render(
 
     # Step 2: Trace primary rays (with instance_ids if geometry_colors provided)
     d_instance_ids = bufs.instance_ids
+    d_primitive_ids = bufs.primitive_ids
     if geometry_colors is not None:
-        optix.trace(d_primary_rays, d_primary_hits, num_rays, instance_ids=d_instance_ids)
+        optix.trace(d_primary_rays, d_primary_hits, num_rays,
+                    instance_ids=d_instance_ids, primitive_ids=d_primitive_ids)
     else:
         optix.trace(d_primary_rays, d_primary_hits, num_rays)
 
@@ -2036,6 +2129,12 @@ def render(
         ov_max = float(cupy.nanmax(d_overlay))
         ov_range = ov_max - ov_min
 
+    # Build per-point color buffers for sphere geometries
+    d_point_colors = None
+    d_point_color_offsets = None
+    if optix is not None:
+        d_point_colors, d_point_color_offsets = optix.build_point_colors_gpu()
+
     # Step 4: Shade terrain
     _shade_terrain(
         d_output, d_primary_rays, d_primary_hits, d_shadow_hits,
@@ -2052,6 +2151,9 @@ def render(
         overlay_data=d_overlay, overlay_alpha=overlay_alpha,
         overlay_min=ov_min, overlay_range=ov_range,
         instance_ids=d_instance_ids, geometry_colors=geometry_colors,
+        primitive_ids=d_primitive_ids,
+        point_colors=d_point_colors,
+        point_color_offsets=d_point_color_offsets,
         ao_factor=d_ao_factor,
         gi_color=bufs.gi_color if ao_samples > 0 else None,
         gi_intensity=gi_intensity,
@@ -2070,6 +2172,12 @@ def render(
     if edge_lines and geometry_colors is not None:
         _edge_outline(d_output, d_instance_ids, edge_strength, edge_color)
 
+    # Eye Dome Lighting (depth-edge enhancement, especially for point clouds)
+    if edl:
+        d_depth_1d = d_primary_hits.reshape(num_rays, 4)[:, 0].copy()
+        _edl(d_output, d_depth_1d, width, height,
+             radius=edl_radius, strength=edl_strength)
+
     # Bloom post-process (before tone mapping so ACES compresses bloom gracefully)
     if bloom:
         _bloom(d_output, bufs.bloom_temp, bufs.bloom_scratch)
diff --git a/rtxpy/engine.py b/rtxpy/engine.py
index 145c8ff..1baa27d 100644
--- a/rtxpy/engine.py
+++ b/rtxpy/engine.py
@@ -5,6 +5,7 @@
 Uses GLFW for windowing/input and ModernGL for GPU texture display.
 """
 
+import math
 import os
 import queue
 import threading
@@ -21,6 +22,7 @@
 
 if has_cupy:
     import cupy as cp
+    from numba import cuda
 
 
 # ---------------------------------------------------------------------------
@@ -48,6 +50,106 @@
 """
 
 
+# ---------------------------------------------------------------------------
+# Numba CUDA kernel for wind particle splatting
+# ---------------------------------------------------------------------------
+if has_cupy:
+    @cuda.jit
+    def _wind_splat_kernel(
+        trails,       # (N*T, 2) float32 — (row, col) per trail point
+        alphas,       # (N*T,) float32 — pre-computed alpha per trail point
+        terrain,      # (tH, tW) float32 — terrain elevation
+        output,       # (sh, sw, 3) float32 — frame buffer (atomic add)
+        # Camera basis — scalar args to avoid tiny GPU allocations
+        cam_x, cam_y, cam_z,
+        fwd_x, fwd_y, fwd_z,
+        rgt_x, rgt_y, rgt_z,
+        up_x, up_y, up_z,
+        # Projection params
+        fov_scale, aspect_ratio,
+        # Terrain/world params
+        psx, psy, ve, subsample_f, min_depth,
+        # Splat params
+        dot_radius,
+        # Wind color
+        color_r, color_g, color_b,
+    ):
+        idx = cuda.grid(1)
+        if idx >= trails.shape[0]:
+            return
+
+        a = alphas[idx]
+        if a < 1e-6:
+            return
+
+        row = trails[idx, 0]
+        col = trails[idx, 1]
+
+        # Terrain Z lookup (nearest-neighbor, clamped)
+        tH = terrain.shape[0]
+        tW = terrain.shape[1]
+        sr = int(row / subsample_f)
+        sc = int(col / subsample_f)
+        if sr < 0:
+            sr = 0
+        elif sr >= tH:
+            sr = tH - 1
+        if sc < 0:
+            sc = 0
+        elif sc >= tW:
+            sc = tW - 1
+        z_raw = terrain[sr, sc]
+        # Ocean/water pixels are NaN — map to 0 (matches CPU path)
+        if z_raw != z_raw:  # NaN check (works under fast math)
+            z_raw = 0.0
+        z_val = z_raw * ve + 3.0
+
+        # World position
+        wx = col * psx
+        wy = row * psy
+
+        # Camera-relative
+        dx = wx - cam_x
+        dy = wy - cam_y
+        dz = z_val - cam_z
+
+        # Depth along forward axis
+        depth = dx * fwd_x + dy * fwd_y + dz * fwd_z
+        if depth <= min_depth:
+            return
+
+        inv_depth = 1.0 / (depth + 1e-10)
+        u_cam = dx * rgt_x + dy * rgt_y + dz * rgt_z
+        v_cam = dx * up_x + dy * up_y + dz * up_z
+        u_ndc = u_cam * inv_depth / (fov_scale * aspect_ratio)
+        v_ndc = v_cam * inv_depth / fov_scale
+
+        sh = output.shape[0]
+        sw = output.shape[1]
+        sx = int((u_ndc + 1.0) * 0.5 * sw)
+        sy = int((1.0 - v_ndc) * 0.5 * sh)
+
+        if sx < 0 or sx >= sw or sy < 0 or sy >= sh:
+            return
+
+        # Circular stamp splat
+        r = dot_radius
+        for offy in range(-r, r + 1):
+            for offx in range(-r, r + 1):
+                dist_sq = offx * offx + offy * offy
+                if dist_sq > r * r:
+                    continue
+                falloff = 1.0 - math.sqrt(dist_sq) / r
+                px = sx + offx
+                py = sy + offy
+                if px < 0 or px >= sw or py < 0 or py >= sh:
+                    continue
+                contrib = a * falloff
+                cuda.atomic.add(output, (py, px, 0), contrib * color_r)
+                cuda.atomic.add(output, (py, px, 1), contrib * color_g)
+                cuda.atomic.add(output, (py, px, 2), contrib * color_b)
+
+
 def _glfw_to_key(glfw_key, mods):
     """Translate a GLFW key code + modifiers to the string format used by
     _handle_key_press / _handle_key_release.
@@ -990,6 +1092,7 @@ class InteractiveViewer:
     - U/Shift+U: Cycle basemap forward/backward (none → satellite → osm)
     - N: Cycle geometry layer (none → all → groups)
     - P: Jump to previous geometry in current group
+    - Shift+C: Cycle point cloud color mode (elevation/intensity/classification/rgb)
     - ,/.: Decrease/increase overlay alpha (transparency)
     - O: Place observer (for viewshed) at look-at point
     - Shift+O: Cycle drone mode (off → 3rd person → FPV → off)
@@ -1234,6 +1337,10 @@ def __init__(self, raster, width: int = 800, height: int = 600,
         self._dof_aperture = 20.0  # lens radius in scene units
         self._dof_focal_distance = 1000.0  # focal plane distance (= look_at distance)
 
+        # Point cloud color mode cycling
+        self._pc_color_modes = ['elevation', 'intensity', 'classification', 'rgb']
+        self._pc_color_mode_idx = 0
+
         # Tile overlay settings
         self._tile_service = None
         self._tiles_enabled = False
@@ -1301,6 +1408,13 @@ def __init__(self, raster, width: int = 800, height: int = 600,
         self._wind_min_visible_age = 6  # Ticks before particle becomes visible (builds trail first)
         self._wind_terrain_np = None  # Cached CPU terrain for wind Z lookup
 
+        # GPU wind splatting buffers
+        self._d_wind_trails = None    # (N*T, 2) float32 GPU buffer
+        self._d_wind_alpha = None     # (N*T,) float32 GPU buffer
+        self._d_base_frame = None     # (H, W, 3) float32 — post-processed frame before wind
+        self._d_wind_scratch = None   # (H, W, 3) float32 — scratch for idle replay
+        self._wind_done_event = None  # CUDA event for stream sync
+
         # GTFS-RT realtime vehicle overlay state
         self._gtfs_rt_url = None
         self._gtfs_rt_enabled = False
@@ -1726,6 +1840,8 @@ def _rebuild_at_resolution(self, factor):
 
         self.raster = sub
         self._wind_terrain_np = None  # invalidate cached terrain
+        self._d_base_frame = None     # invalidate GPU wind buffers
+        self._d_wind_scratch = None
         H, W = sub.shape
         self.terrain_shape = (H, W)
 
@@ -2796,7 +2912,6 @@ def _draw_wind_on_frame(self, img):
             return
 
         from .analysis.render import _compute_camera_basis
-        import math
 
         sh, sw = img.shape[:2]
         N = self._wind_particles.shape[0]
@@ -2915,6 +3030,101 @@ def _draw_wind_on_frame(self, img):
         np.clip(img, 0, 1, out=img)
         return img
 
+    def _splat_wind_gpu(self, d_frame):
+        """Project and splat wind particles on GPU via Numba CUDA kernel.
+
+        Parameters
+        ----------
+        d_frame : cupy.ndarray, shape (H, W, 3)
+            GPU frame buffer (float32 0-1). Modified in-place via atomic add.
+        """
+        if self._wind_particles is None or self._wind_trails is None:
+            return
+
+        from .analysis.render import _compute_camera_basis
+
+        sh, sw = d_frame.shape[:2]
+        N = self._wind_particles.shape[0]
+        trail_len = self._wind_trail_len
+
+        # Camera basis
+        cam_pos = self.position
+        look_at = self._get_look_at()
+        forward, right, cam_up = _compute_camera_basis(
+            tuple(cam_pos), tuple(look_at), (0, 0, 1),
+        )
+        fov_scale = math.tan(math.radians(self.fov) / 2.0)
+        aspect_ratio = sw / sh
+
+        # Pre-compute alpha on CPU (vectorized — same logic as _draw_wind_on_frame)
+        ages = self._wind_ages
+        lifetimes = self._wind_lifetimes
+        trail_idx = np.tile(np.arange(trail_len, dtype=np.float32), N)
+        ages_rep = np.repeat(ages, trail_len)
+        lifetimes_rep = np.repeat(lifetimes, trail_len)
+        age_ok = ages_rep > trail_idx
+
+        mva = self._wind_min_visible_age
+        fade_in = np.clip((ages_rep - mva) / 10.0, 0, 1)
+        fade_out = np.clip((lifetimes_rep - ages_rep) / 20.0, 0, 1)
+        trail_fade = 1.0 - (trail_idx / trail_len)
+        alpha = self._wind_alpha * fade_in * fade_out * trail_fade
+        alpha[~age_ok] = 0.0
+
+        # Flatten trails: (N, T, 2) → (N*T, 2)
+        all_pts = self._wind_trails.reshape(-1, 2).astype(np.float32)
+        alpha = alpha.astype(np.float32)
+        total = N * trail_len
+
+        # Upload to reusable GPU buffers
+        if self._d_wind_trails is None or self._d_wind_trails.shape[0] != total:
+            self._d_wind_trails = cp.empty((total, 2), dtype=cp.float32)
+            self._d_wind_alpha = cp.empty(total, dtype=cp.float32)
+        self._d_wind_trails.set(all_pts)
+        self._d_wind_alpha.set(alpha)
+
+        # GPU terrain — use raster.data directly
+        terrain_data = self.raster.data
+        if not isinstance(terrain_data, cp.ndarray):
+            terrain_data = cp.asarray(terrain_data)
+
+        # Kernel launch
+        threadsperblock = 256
+        blockspergrid = (total + threadsperblock - 1) // threadsperblock
+
+        f = float(self.subsample_factor)
+        psx = float(self._base_pixel_spacing_x)
+        psy = float(self._base_pixel_spacing_y)
+        ve = float(self.vertical_exaggeration)
+        min_depth = float(self._wind_min_depth)
+        r = int(self._wind_dot_radius)
+
+        _wind_splat_kernel[blockspergrid, threadsperblock](
+            self._d_wind_trails,
+            self._d_wind_alpha,
+            terrain_data,
+            d_frame,
+            # Camera position
+            float(cam_pos[0]), float(cam_pos[1]), float(cam_pos[2]),
+            # Forward
+            float(forward[0]), float(forward[1]), float(forward[2]),
+            # Right
+            float(right[0]), float(right[1]), float(right[2]),
+            # Up
+            float(cam_up[0]), float(cam_up[1]), float(cam_up[2]),
+            # Projection
+            float(fov_scale), float(aspect_ratio),
+            # Terrain/world
+            psx, psy, ve, f, min_depth,
+            # Splat
+            r,
+            # Color (teal)
+            0.3, 0.9, 0.8,
+        )
+
+        # Clamp output
+        cp.clip(d_frame, 0, 1, out=d_frame)
+
     # ------------------------------------------------------------------
     # GTFS-RT realtime vehicle overlay
     # ------------------------------------------------------------------
@@ -3173,6 +3383,11 @@ def _handle_key_press(self, raw_key, key):
             self._toggle_gtfs_rt()
             return
 
+        # Point cloud color mode cycle: Shift+C
+        if raw_key == 'C':
+            self._cycle_pointcloud_colors()
+            return
+
         # Denoiser toggle: Shift+D (before movement keys capture 'd')
         if raw_key == 'D':
             self.denoise_enabled = not self.denoise_enabled
@@ -3511,6 +3726,8 @@ def _check_terrain_reload(self):
         self._base_raster = new_raster
         self.raster = new_raster
         self._wind_terrain_np = None  # invalidate cached terrain
+        self._d_base_frame = None     # invalidate GPU wind buffers
+        self._d_wind_scratch = None
 
         # Update coordinate tracking
         self._coord_origin_x = new_origin_x
@@ -3730,8 +3947,8 @@ def _tick(self):
             if self._chunk_manager.update(self.position[0], self.position[1], self):
                 self._geometry_colors_builder = self._accessor._build_geometry_colors_gpu
                 self._render_needed = True
-        # AO: keep accumulating samples when camera is stationary
-        if (self.ao_enabled and not self._held_keys
+        # AO/DOF: keep accumulating samples when camera is stationary
+        if ((self.ao_enabled or self.dof_enabled) and not self._held_keys
                 and not self._mouse_dragging
                 and self._ao_frame_count < self._ao_max_frames):
             self._render_needed = True
@@ -3739,10 +3956,13 @@ def _tick(self):
         if self._render_needed:
             self._update_frame()
             self._render_needed = False
-        elif self._wind_enabled and self._wind_particles is not None and self._pinned_frame is not None:
+        elif self._wind_enabled and self._wind_particles is not None and self._d_base_frame is not None:
             # Wind is on but camera didn't move — skip the expensive ray
-            # trace and just re-advect particles + re-composite overlays.
+            # trace and just re-advect particles + GPU splat on fresh copy.
             self._update_wind_particles()
+            cp.copyto(self._d_wind_scratch, self._d_base_frame)
+            self._splat_wind_gpu(self._d_wind_scratch)
+            self._d_wind_scratch.get(out=self._pinned_frame)
             self._composite_overlays()
 
     def _cycle_terrain_layer(self):
@@ -3844,6 +4064,51 @@ def _cycle_geometry_layer(self):
         self._current_geom_idx = 0
         self._update_frame()
 
+    def _cycle_pointcloud_colors(self):
+        """Cycle point cloud color mode: elevation → intensity → classification → rgb."""
+        acc = self._accessor
+        if acc is None or not hasattr(acc, '_pc_attributes') or not acc._pc_attributes:
+            print("No point cloud geometries in scene")
+            return
+
+        from .pointcloud import build_colors
+
+        self._pc_color_mode_idx = (self._pc_color_mode_idx + 1) % len(self._pc_color_modes)
+        mode = self._pc_color_modes[self._pc_color_mode_idx]
+
+        updated = 0
+        for gid, (centers, attributes) in acc._pc_attributes.items():
+            # Check the geometry still exists
+            if self.rtx is None or not self.rtx.has_geometry(gid):
+                continue
+
+            # Build new colors
+            new_colors = build_colors(centers, attributes, color_mode=mode)
+            colors_flat = new_colors.ravel().astype(np.float32)
+
+            # Update per-point colors on the RTX geometry state
+            gs = self.rtx._geom_state
+            gs.point_colors_per_gas[gid] = colors_flat
+            # Invalidate concatenated GPU buffer so it gets rebuilt
+            gs.point_colors = None
+            gs.point_color_offsets = None
+
+            # Update baked mesh colors (3rd element of 4-tuple)
+            if gid in acc._baked_meshes:
+                baked = acc._baked_meshes[gid]
+                acc._baked_meshes[gid] = (baked[0], baked[1], colors_flat.copy(), baked[3])
+
+            updated += 1
+
+        if updated > 0:
+            print(f"Point cloud color: {mode}")
+            self._d_ao_accum = None
+            self._ao_frame_count = 0
+            self._prev_cam_state = None
+            self._update_frame()
+        else:
+            print(f"No active point cloud geometries to recolor")
+
     def _jump_to_geometry(self, direction):
         """Jump camera to next/previous geometry in current layer.
 
@@ -4681,8 +4946,8 @@ def _save_screenshot(self):
             geometry_colors=geometry_colors,
         )
 
-        # Accumulated multi-frame screenshot when AO is enabled
-        num_frames = 64 if self.ao_enabled else 1
+        # Accumulated multi-frame screenshot when AO or DOF is enabled
+        num_frames = 64 if (self.ao_enabled or self.dof_enabled) else 1
 
         if num_frames > 1:
             import cupy
@@ -4776,16 +5041,20 @@ def _render_frame(self):
         if builder is not None:
             geometry_colors = builder()
 
+        # Progressive accumulation is needed for AO convergence and/or
+        # DOF (thin-lens jitter needs multi-frame averaging to converge).
+        needs_accum = self.ao_enabled or self.dof_enabled
+
         # AO parameters: multiple samples per frame for smooth early results,
         # with progressive accumulation across frames for further refinement
         ao_samples = self._ao_samples_per_frame if self.ao_enabled else 0
         ao_seed = self._ao_frame_count if self.ao_enabled else 0
 
         # When progressive accumulation is active, pass frame seed for AA + soft shadows + DOF
-        frame_seed = self._ao_frame_count + 1 if self.ao_enabled else 0
+        frame_seed = self._ao_frame_count + 1 if needs_accum else 0
 
-        # Depth of field (requires progressive accumulation via AO)
-        if self.dof_enabled and self.ao_enabled:
+        # Depth of field
+        if self.dof_enabled:
             dof_aperture = self._dof_aperture
             dof_focal = self._dof_focal_distance
         else:
@@ -4851,8 +5120,9 @@ def _update_frame(self):
         d_output = self._render_frame()
         self.frame_count += 1
 
-        # Progressive AO accumulation
-        if self.ao_enabled:
+        # Progressive accumulation (needed for AO convergence and/or DOF)
+        needs_accum = self.ao_enabled or self.dof_enabled
+        if needs_accum:
             from .analysis.render import _bloom, _tone_map_aces, _render_buffers
 
             # Check if camera moved — compare current state to previous
@@ -4876,11 +5146,11 @@ def _update_frame(self):
             d_display = d_output
 
         # Deferred post-processing: denoise → bloom → tone map.
-        # These are deferred when AO or denoiser is active so they
+        # These are deferred when AO, DOF, or denoiser is active so they
         # operate on the clean / averaged signal.
-        defer_post = self.ao_enabled or self.denoise_enabled
+        defer_post = needs_accum or self.denoise_enabled
         if defer_post:
-            if not self.ao_enabled:
+            if not needs_accum:
                 from .analysis.render import _bloom, _tone_map_aces, _render_buffers
 
             if self.denoise_enabled:
@@ -4940,12 +5210,26 @@ def _update_frame(self):
                 self._pinned_mem, dtype=np.float32, count=d_display.size
             ).reshape(d_display.shape)
 
-        # Start async D2H copy on non-blocking stream
-        d_display.get(out=self._pinned_frame, stream=self._readback_stream)
+        # Save clean post-processed frame for idle wind replay
+        if self._wind_enabled:
+            if self._d_base_frame is None or self._d_base_frame.shape != d_display.shape:
+                self._d_base_frame = cp.empty_like(d_display)
+                self._d_wind_scratch = cp.empty_like(d_display)
+            cp.copyto(self._d_base_frame, d_display)
 
-        # CPU work while DMA runs
+        # GPU wind: advect on CPU, splat on GPU, then readback
         if self._wind_enabled and self._wind_particles is not None:
             self._update_wind_particles()
+            self._splat_wind_gpu(d_display)
+
+            # Sync: wind kernel runs on stream 0, readback on non-blocking stream
+            if self._wind_done_event is None:
+                self._wind_done_event = cp.cuda.Event()
+            self._wind_done_event.record()
+            self._readback_stream.wait_event(self._wind_done_event)
+
+        # Async D2H copy on non-blocking stream
+        d_display.get(out=self._pinned_frame, stream=self._readback_stream)
 
         # Wait for DMA to complete
         self._readback_stream.synchronize()
@@ -5000,8 +5284,7 @@ def _composite_overlays(self):
 
         # Build display frame (copy if we need overlays, else use pinned directly)
         needs_overlay = (
-            (self._wind_enabled and self._wind_particles is not None)
-            or (self._gtfs_rt_enabled and self._gtfs_rt_vehicles is not None)
+            (self._gtfs_rt_enabled and self._gtfs_rt_vehicles is not None)
             or self.show_minimap
             or self._title_overlay_rgba is not None
             or self._legend_rgba is not None
@@ -5012,10 +5295,6 @@ def _composite_overlays(self):
         else:
             img = self._pinned_frame
 
-        # Wind overlay
-        if self._wind_enabled and self._wind_particles is not None:
-            self._draw_wind_on_frame(img)
-
         # GTFS-RT vehicle overlay
         if self._gtfs_rt_enabled and self._gtfs_rt_vehicles is not None:
             self._draw_gtfs_rt_on_frame(img)
@@ -5399,6 +5678,7 @@ def _render_help_text(self):
             ("GEOMETRY", [
                 ("N", "Cycle geometry layer"),
                 ("P", "Prev geometry in group"),
+                ("Shift+C", "Cycle point cloud colors"),
                 ("Right-Click", "Pick geometry"),
             ]),
             ("OBSERVERS", [
@@ -5962,6 +6242,7 @@ def explore(raster, width: int = 800, height: int = 600,
     - U/Shift+U: Cycle basemap forward/backward (none → satellite → osm)
     - N: Cycle geometry layer (none → all → groups)
     - P: Jump to previous geometry in current group
+    - Shift+C: Cycle point cloud color mode (elevation/intensity/classification/rgb)
     - ,/.: Decrease/increase overlay alpha (transparency)
     - O: Place observer (for viewshed) at look-at point
     - Shift+O: Cycle drone mode (off → 3rd person → FPV → off)
diff --git a/rtxpy/pointcloud.py b/rtxpy/pointcloud.py
index d23398d..ce0204d 100644
--- a/rtxpy/pointcloud.py
+++ b/rtxpy/pointcloud.py
@@ -33,7 +33,8 @@
 
 
 def load_pointcloud(source, classification=None, returns=None,
-                    bounds=None, subsample=1, max_points=None):
+                    bounds=None, subsample=1, max_points=None,
+                    thin=None):
     """
     Load a point cloud from a file path or numpy array.
 
@@ -46,6 +47,9 @@ def load_pointcloud(source, classification=None, returns=None,
         returns: Optional 'first', 'last', or 'all' to filter by return number.
         bounds: Optional (xmin, ymin, xmax, ymax) spatial crop.
         subsample: Keep every Nth point (default 1 = all).
+        thin: Grid cell size for spatial thinning (in source CRS units,
+            typically metres). Keeps one point per XY grid cell, removing
+            density striations from overlapping flight lines. None disables.
         max_points: Maximum number of points to keep (random sample if exceeded).
 
     Returns:
@@ -104,6 +108,16 @@ def load_pointcloud(source, classification=None, returns=None,
         centers = centers[::subsample]
         attributes = {k: v[::subsample] for k, v in attributes.items()}
 
+    # Spatial grid thinning — one point per XY cell
+    if thin is not None and thin > 0 and len(centers) > 0:
+        cell_x = (centers[:, 0] / thin).astype(np.int64)
+        cell_y = (centers[:, 1] / thin).astype(np.int64)
+        cell_keys = cell_x + cell_y * 2_000_000_003  # large prime avoids collisions
+        _, keep = np.unique(cell_keys, return_index=True)
+        keep.sort()
+        centers = centers[keep]
+        attributes = {k: v[keep] for k, v in attributes.items()}
+
     # Cap total points
     if max_points is not None and len(centers) > max_points:
         rng = np.random.default_rng(42)
diff --git a/rtxpy/remote_data.py b/rtxpy/remote_data.py
index 1581ec5..0a23cc4 100644
--- a/rtxpy/remote_data.py
+++ b/rtxpy/remote_data.py
@@ -474,6 +474,153 @@ def fetch_dem(bounds, output_path, source="copernicus", crs=None, cache_dir=None
     return merged
 
 
+def _query_usgs_lidar_tiles(bounds):
+    """Query USGS TNM API for LiDAR Point Cloud (LPC) LAZ tiles.
+
+    Parameters
+    ----------
+    bounds : tuple of float
+        (west, south, east, north) in WGS84 degrees.
+
+    Returns
+    -------
+    list of (tile_name, url, size_bytes)
+        Sorted by newest ``publicationDate`` first, deduplicated by tile name.
+    """
+    try:
+        import requests
+    except ImportError:
+        raise ImportError(
+            "requests is required for fetch_lidar(). "
+            "Install it with: pip install requests"
+        )
+
+    west, south, east, north = bounds
+    all_items = []
+    offset = 0
+
+    while True:
+        params = {
+            "datasets": "Lidar Point Cloud (LPC)",
+            "bbox": f"{west},{south},{east},{north}",
+            "prodFormats": "LAZ",
+            "max": 100,
+            "offset": offset,
+        }
+        resp = requests.get(_TNM_API_URL, params=params, timeout=60)
+        resp.raise_for_status()
+        data = resp.json()
+
+        items = data.get("items", [])
+        all_items.extend(items)
+        if len(items) < 100:
+            break
+        offset += 100
+
+    # Prefer newer publications when tiles overlap
+    all_items.sort(
+        key=lambda item: item.get("publicationDate", ""),
+        reverse=True,
+    )
+
+    seen_names = set()
+    tiles = []
+    for item in all_items:
+        url = item.get("downloadURL", "")
+        if not url:
+            continue
+
+        tile_name = item.get("title", url.split("/")[-1])
+        # Sanitize tile name for use as filename
+        tile_name = re.sub(r'[^\w\-.]', '_', tile_name)
+
+        if tile_name in seen_names:
+            continue
+        seen_names.add(tile_name)
+
+        size_bytes = item.get("sizeInBytes", 0)
+        tiles.append((tile_name, url, size_bytes))
+
+    return tiles
+
+
+def fetch_lidar(bounds, cache_dir=None, max_tiles=None):
+    """Download USGS 3DEP LiDAR LAZ tiles for a bounding box.
+
+    Queries the USGS TNM API for Lidar Point Cloud (LPC) tiles in LAZ
+    format, downloads them with caching, and returns paths to the local
+    files.  The LAZ files can be passed directly to
+    ``place_pointcloud()`` which handles coordinate conversion.
+
+    Parameters
+    ----------
+    bounds : tuple of float
+        (west, south, east, north) in WGS84 degrees.
+    cache_dir : str or Path, optional
+        Directory for cached LAZ files.  Defaults to
+        ``~/.cache/rtxpy/lidar/``.
+    max_tiles : int, optional
+        Limit the number of tiles downloaded.  Tiles are sorted by
+        newest publication date first, so the most recent data is
+        preferred.  ``None`` downloads all available tiles.
+
+    Returns
+    -------
+    list of Path
+        Paths to downloaded LAZ files.
+    """
+    try:
+        import requests  # noqa: F401 — validate dependency early
+    except ImportError:
+        raise ImportError(
+            "requests is required for fetch_lidar(). "
+            "Install it with: pip install requests"
+        )
+
+    if cache_dir is None:
+        cache_dir = Path.home() / ".cache" / "rtxpy" / "lidar"
+    else:
+        cache_dir = Path(cache_dir)
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    tiles = _query_usgs_lidar_tiles(bounds)
+    if not tiles:
+        print("No LiDAR tiles found for the given bounds.")
+        return []
+
+    total_mb = sum(s for _, _, s in tiles) / (1 << 20)
+    print(f"Found {len(tiles)} LiDAR tile(s) ({total_mb:.0f} MB total)")
+
+    if max_tiles is not None and max_tiles < len(tiles):
+        print(f"  Limiting to {max_tiles} of {len(tiles)} tile(s)")
+        tiles = tiles[:max_tiles]
+
+    laz_paths = []
+    for tile_name, url, size_bytes in tiles:
+        # Ensure .laz extension
+        fname = tile_name if tile_name.endswith(".laz") else f"{tile_name}.laz"
+        tile_path = cache_dir / fname
+
+        if tile_path.exists() and tile_path.stat().st_size > 0:
+            print(f"  Using cached {tile_name}")
+        else:
+            size_mb = size_bytes / (1 << 20)
+            print(f"  Downloading {tile_name} ({size_mb:.1f} MB)...")
+            try:
+                _download_tile(url, tile_path)
+            except Exception as e:
+                print(f"  Warning: Failed to download {tile_name}: {e}")
+                continue
+
+        laz_paths.append(tile_path)
+
+    if not laz_paths:
+        raise RuntimeError("Failed to download any LiDAR tiles")
+
+    print(f"  {len(laz_paths)} LAZ file(s) ready")
+    return laz_paths
+
+
 def fetch_osm(bounds, tags=None, crs=None, cache_path=None):
     """Download OpenStreetMap features for a bounding box.
 
diff --git a/rtxpy/rtx.py b/rtxpy/rtx.py
index 04b7d6b..10bc5de 100644
--- a/rtxpy/rtx.py
+++ b/rtxpy/rtx.py
@@ -168,8 +168,10 @@ def __init__(self):
         self.hf_tile_size = 32
         self.hf_num_tiles_x = 0
 
-        # Point cloud state
-        self.point_colors = None  # GPU buffer (cupy) for per-point RGBA colors
+        # Point cloud state — per-GAS colors keyed by geometry_id
+        self.point_colors_per_gas = {}  # {geometry_id: np.ndarray (N*4,)}
+        self.point_colors = None  # concatenated GPU buffer (built on demand)
+        self.point_color_offsets = None  # GPU int32 per-instance offsets
 
         # Device buffers for CPU->GPU transfers (per-instance)
         self.d_rays = None
@@ -202,7 +204,9 @@ def clear(self):
         self.hf_num_tiles_x = 0
 
         # Clear point cloud state
+        self.point_colors_per_gas = {}
         self.point_colors = None
+        self.point_color_offsets = None
 
         # Reset to single-GAS mode
         self.single_gas_mode = True
@@ -2408,10 +2412,13 @@ def add_sphere_geometry(self, geometry_id: str, centers, radii,
             is_sphere=True,
         )
 
-        # Store per-point colors if provided
+        # Store per-point colors for this GAS
         if colors is not None:
-            self._geom_state.point_colors = cupy.asarray(
-                colors, dtype=cupy.float32)
+            self._geom_state.point_colors_per_gas[geometry_id] = np.asarray(
+                colors, dtype=np.float32)
+            # Invalidate concatenated buffer so it gets rebuilt
+            self._geom_state.point_colors = None
+            self._geom_state.point_color_offsets = None
 
         self._geom_state.ias_dirty = True
         return 0
@@ -2478,6 +2485,46 @@ def get_geometry_count(self) -> int:
         """
         return len(self._geom_state.gas_entries)
 
+    def build_point_colors_gpu(self):
+        """Build concatenated per-point color buffer and per-instance offsets.
+
+        Returns (point_colors, point_color_offsets) as cupy arrays, or
+        (None, None) if no sphere geometries have per-point colors.
+
+        point_colors: (total_points * 4,) float32 — RGBA per point
+        point_color_offsets: (num_instances,) int32 — offset into
+            point_colors for each IAS instance (-1 = no per-point colors)
+        """
+        gs = self._geom_state
+        if not gs.point_colors_per_gas:
+            return None, None
+
+        # Return cached if still valid
+        if gs.point_colors is not None and gs.point_color_offsets is not None:
+            return gs.point_colors, gs.point_color_offsets
+
+        geom_ids = list(gs.gas_entries.keys())
+        n_instances = len(geom_ids)
+        offsets = np.full(n_instances, -1, dtype=np.int32)
+        parts = []
+        cumulative = 0
+
+        for i, gid in enumerate(geom_ids):
+            if gid in gs.point_colors_per_gas:
+                colors_flat = gs.point_colors_per_gas[gid]
+                n_points = len(colors_flat) // 4
+                offsets[i] = cumulative
+                parts.append(colors_flat)
+                cumulative += n_points
+
+        if not parts:
+            return None, None
+
+        all_colors = np.concatenate(parts)
+        gs.point_colors = cupy.asarray(all_colors, dtype=cupy.float32)
+        gs.point_color_offsets = cupy.asarray(offsets, dtype=cupy.int32)
+        return gs.point_colors, gs.point_color_offsets
+
     def has_geometry(self, geometry_id: str) -> bool:
         """
         Check if a geometry with the given ID exists.