diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..7635a93
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,48 @@
+# pytorch/pytorch images ship with Python, pip, and PyTorch pre-installed,
+# so no NVIDIA registry auth or manual CUDA installation is needed.
+FROM pytorch/pytorch:2.9.1-cuda12.8-cudnn9-devel
+
+# Build-arg (not ENV) so noninteractive apt doesn't leak into the runtime env.
+ARG DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+ENV PYTORCH_ALLOC_CONF=expandable_segments:True
+
+# LOW_VRAM_MODE=1: cast aggregator to bf16 on CPU before GPU transfer (~2-3 GB VRAM savings).
+# Per original authors: "no measurable quality change" for the aggregator trunk,
+# but the scale-phase RoPE computations are affected on very small GPUs.
+# Build with: docker build --build-arg LOW_VRAM_MODE=1 -t lingbot-map-demo-light .
+ARG LOW_VRAM_MODE=0
+ENV LOW_VRAM_MODE=${LOW_VRAM_MODE}
+
+# System dependencies (Python/pip already present in base image)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    wget \
+    curl \
+    libglib2.0-0 \
+    libgl1 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy source and install lingbot-map with visualization extras
+WORKDIR /app
+COPY . .
+RUN pip install --no-cache-dir -e ".[vis]" && \ + pip install --no-cache-dir onnxruntime + +# FlashInfer for efficient KV-cache attention (falls back to SDPA if unavailable) +RUN pip install --no-cache-dir flashinfer-python \ + -i https://flashinfer.ai/whl/cu128/torch2.9/ || \ + echo "WARNING: FlashInfer not installed — demo will use --use_sdpa fallback" + +RUN mkdir -p /model /data/images + +COPY docker/entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +EXPOSE 8080 + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/README.md b/README.md index d73b1c6..0fc7f90 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,151 @@ python demo.py --model_path /path/to/checkpoint.pt \ `--camera_num_iterations` defaults to `4`; setting it to `1` skips three refinement passes in the camera head (and shrinks its KV cache by 4×). +# 🐳 Docker + +Run the full demo — including model download, inference, and 3D viewer — without any local Python or CUDA setup. + +## Image Design + +| Layer | Detail | +|:---|:---| +| Base image | `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-devel` (public, no auth required) | +| Attention backend | [FlashInfer](https://github.com/flashinfer-ai/flashinfer) for paged KV-cache; auto-falls back to PyTorch SDPA if unavailable | +| Visualisation | [viser](https://github.com/nerfstudio-project/viser) web viewer exposed on port **8080** | +| Model resolution | `docker/entrypoint.sh` checks `/model/` at startup and auto-downloads from HuggingFace when no `.pt` file is found | +| Data access | Images and model weights are provided via **volume mounts** — nothing user-specific is baked into the image | + +``` +lingbot-map-demo +├── /app/ ← source code + built-in example scenes +│ └── example/{church,oxford,university,loop}/ +├── /model/ ← mount a host directory here to cache the model +└── /data/ ← mount your images or video here +``` + +## Prerequisites + +- [Docker](https://docs.docker.com/get-docker/) with the [NVIDIA Container 
Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) +- An NVIDIA GPU (CUDA 12.8 driver) + +## Build + +```bash +git clone https://github.com/YoshiRi/lingbot-map-docker.git +cd lingbot-map-docker +docker build -t lingbot-map-demo . +``` + +## Try the Built-in Example Scenes + +The four example scenes from `example/` are already baked into the image at `/app/example/`. +No extra data mount is needed — just provide a writable directory for the model cache. + +```bash +# Church (outdoor, sky masking recommended) +docker run --gpus all \ + -v $(pwd)/model:/model \ + -p 8080:8080 \ + lingbot-map-demo \ + --image_folder /app/example/church --mask_sky + +# Oxford +docker run --gpus all \ + -v $(pwd)/model:/model \ + -p 8080:8080 \ + lingbot-map-demo \ + --image_folder /app/example/oxford --mask_sky + +# University +docker run --gpus all \ + -v $(pwd)/model:/model \ + -p 8080:8080 \ + lingbot-map-demo \ + --image_folder /app/example/university --mask_sky + +# Loop (loop-closure trajectory, no sky masking needed) +docker run --gpus all \ + -v $(pwd)/model:/model \ + -p 8080:8080 \ + lingbot-map-demo \ + --image_folder /app/example/loop +``` + +On **first run** the model is downloaded from HuggingFace and cached in `./model/`; subsequent runs start immediately. +Open **http://localhost:8080** in your browser once inference completes. 
+ +## Run with Your Own Images + +Place your images (`.jpg` / `.png`) in a local folder, then mount it: + +```bash +docker run --gpus all \ + -v /path/to/your/images:/data/images \ + -v $(pwd)/model:/model \ + -p 8080:8080 \ + lingbot-map-demo \ + --image_folder /data/images +``` + +## Run with a Video File + +```bash +docker run --gpus all \ + -v /path/to/video.mp4:/data/video.mp4 \ + -v $(pwd)/model:/model \ + -p 8080:8080 \ + lingbot-map-demo \ + --video_path /data/video.mp4 --fps 10 +``` + +## docker-compose + +Edit `docker-compose.yml` to set your image folder and model variant, then: + +```bash +# Put your images in ./images/ +docker compose up +``` + +## Environment Variables + +| Variable | Default | Description | +|:---|:---|:---| +| `HF_MODEL_NAME` | `lingbot-map` | Checkpoint to download: `lingbot-map`, `lingbot-map-long`, or `lingbot-map-stage1` | +| `MODEL_PATH` | *(auto)* | Explicit path to a `.pt` file inside the container (skips auto-download) | +| `MODEL_CACHE_DIR` | `/model` | Directory where the downloaded model is stored | +| `HUGGING_FACE_HUB_TOKEN` | *(none)* | HuggingFace token for gated repos | + +## Tips + +**Use a pre-downloaded model** (avoids HuggingFace download at runtime): +```bash +docker run --gpus all \ + -v /path/to/checkpoint.pt:/model/lingbot-map.pt \ + -v $(pwd)/images:/data/images \ + -p 8080:8080 \ + lingbot-map-demo \ + --image_folder /data/images +``` + +**Limited GPU memory** — add one or both flags: +```bash + --num_scale_frames 2 # reduces activation peak of the initial scale phase + --keyframe_interval 6 # keeps only every 6th frame in KV cache +``` + +**Long sequences (> 3000 frames)** — use windowed mode: +```bash + --mode windowed --window_size 128 +``` + +**Faster inference** — reduce camera head iterations (small accuracy trade-off): +```bash + --camera_num_iterations 1 +``` + +--- + # 📜 License This project is released under the Apache License 2.0. See [LICENSE](LICENSE.txt) file for details. 
diff --git a/demo.py b/demo.py index a8c8636..faa1dc1 100644 --- a/demo.py +++ b/demo.py @@ -20,7 +20,9 @@ import argparse import glob +import json import os +import struct import time # Must be set before `import torch` / any CUDA init. Reduces the reserved-vs-allocated @@ -127,7 +129,7 @@ def load_model(args, device): if args.model_path: print(f"Loading checkpoint: {args.model_path}") - ckpt = torch.load(args.model_path, map_location=device, weights_only=False) + ckpt = torch.load(args.model_path, map_location="cpu", weights_only=False) state_dict = ckpt.get("model", ckpt) missing, unexpected = model.load_state_dict(state_dict, strict=False) if missing: @@ -136,6 +138,13 @@ def load_model(args, device): print(f" Unexpected keys: {len(unexpected)}") print(" Checkpoint loaded.") + if os.environ.get("LOW_VRAM_MODE", "0") == "1" and \ + device.type == "cuda" and getattr(model, "aggregator", None) is not None: + cap = torch.cuda.get_device_capability(device) + dtype = torch.bfloat16 if cap[0] >= 8 else torch.float16 + print(f"[LOW_VRAM_MODE] Casting aggregator to {dtype} on CPU before GPU transfer") + model.aggregator = model.aggregator.to(dtype=dtype) + return model.to(device).eval() @@ -227,6 +236,107 @@ def prepare_for_visualization(predictions, images=None): return vis_predictions +# ============================================================================= +# Export +# ============================================================================= + +def export_results(predictions, images_cpu, output_dir, conf_threshold=0.0, ply_stride=1): + """Save inference results to output_dir. 
+ + Writes three files: + predictions.npz – raw numpy arrays (depth, world_points, extrinsic, intrinsic, images) + pointcloud.ply – merged, confidence-filtered point cloud (binary PLY) + cameras.json – per-frame camera-to-world poses and intrinsics + """ + os.makedirs(output_dir, exist_ok=True) + + # ── NPZ ────────────────────────────────────────────────────────────────── + npz_path = os.path.join(output_dir, "predictions.npz") + save_dict = {} + for k, v in predictions.items(): + if isinstance(v, torch.Tensor): + save_dict[k] = v.cpu().numpy() + elif isinstance(v, np.ndarray): + save_dict[k] = v + if isinstance(images_cpu, torch.Tensor): + images_arr = images_cpu.numpy() + elif isinstance(images_cpu, np.ndarray): + images_arr = images_cpu + else: + images_arr = None + if images_arr is not None: + if images_arr.ndim == 5 and images_arr.shape[0] == 1: + images_arr = images_arr[0] # (1,S,C,H,W) → (S,C,H,W) + save_dict["images"] = images_arr + np.savez_compressed(npz_path, **save_dict) + print(f" Saved predictions → {npz_path}") + + # ── PLY ────────────────────────────────────────────────────────────────── + world_points = save_dict.get("world_points") # (S, H, W, 3) + depth = save_dict.get("depth") # (S, H, W, 1) fallback + depth_conf = save_dict.get("depth_conf") # (S, H, W) + images_np = save_dict.get("images") # (S, 3, H, W) + + if world_points is None and depth is not None: + from lingbot_map.utils.geometry import unproject_depth_map_to_point_map + world_points = unproject_depth_map_to_point_map( + depth, save_dict["extrinsic"], save_dict["intrinsic"] + ) + + if world_points is not None and images_np is not None: + S, H, W = world_points.shape[:3] + colors = images_np.transpose(0, 2, 3, 1) # (S, H, W, 3) + pts_all, col_all = [], [] + st = max(1, int(ply_stride)) + for i in range(S): + pts = world_points[i, ::st, ::st].reshape(-1, 3) + col = (colors[i, ::st, ::st].reshape(-1, 3) * 255).clip(0, 255).astype(np.uint8) + valid = np.isfinite(pts).all(axis=1) + if 
depth_conf is not None: + valid &= depth_conf[i, ::st, ::st].reshape(-1) > conf_threshold + pts_all.append(pts[valid]) + col_all.append(col[valid]) + pts_merged = np.concatenate(pts_all, axis=0).astype(np.float32) + col_merged = np.concatenate(col_all, axis=0) + n = len(pts_merged) + + ply_path = os.path.join(output_dir, "pointcloud.ply") + header = ( + "ply\nformat binary_little_endian 1.0\n" + f"element vertex {n}\n" + "property float x\nproperty float y\nproperty float z\n" + "property uchar red\nproperty uchar green\nproperty uchar blue\n" + "end_header\n" + ).encode() + with open(ply_path, "wb") as f: + f.write(header) + # interleave xyz + rgb tightly + data = np.empty(n, dtype=[("x","f4"),("y","f4"),("z","f4"), + ("r","u1"),("g","u1"),("b","u1")]) + data["x"], data["y"], data["z"] = pts_merged[:,0], pts_merged[:,1], pts_merged[:,2] + data["r"], data["g"], data["b"] = col_merged[:,0], col_merged[:,1], col_merged[:,2] + f.write(data.tobytes()) + print(f" Saved point cloud ({n:,} pts) → {ply_path}") + else: + print(" Skipping PLY export (world_points or images not available)") + + # ── cameras.json ───────────────────────────────────────────────────────── + extrinsic = save_dict.get("extrinsic") # (S, 3, 4) c2w + intrinsic = save_dict.get("intrinsic") # (S, 3, 3) + if extrinsic is not None and intrinsic is not None: + cameras = [] + for i in range(len(extrinsic)): + cameras.append({ + "frame": i, + "c2w": extrinsic[i].tolist(), + "K": intrinsic[i].tolist(), + }) + cam_path = os.path.join(output_dir, "cameras.json") + with open(cam_path, "w") as f: + json.dump(cameras, f, indent=2) + print(f" Saved cameras ({len(cameras)} frames) → {cam_path}") + + # ============================================================================= # Main # ============================================================================= @@ -291,6 +401,12 @@ def main(): parser.add_argument("--export_preprocessed", type=str, default=None, help="Export stride-sampled, resized/cropped images 
to this folder") + # Output + parser.add_argument("--output_dir", type=str, default="/data/output", + help="Directory for exported results (predictions.npz, pointcloud.ply, cameras.json)") + parser.add_argument("--no_viewer", action="store_true", + help="Skip the interactive viewer (export only)") + args = parser.parse_args() assert args.image_folder or args.video_path, \ "Provide --image_folder or --video_path" @@ -398,6 +514,15 @@ def main(): predictions, images_cpu = postprocess(predictions, images_for_post) + # ── Export ─────────────────────────────────────────────────────────────── + print(f"Exporting results to {args.output_dir} ...") + export_results(predictions, images_cpu, args.output_dir, + conf_threshold=args.conf_threshold, ply_stride=args.downsample_factor) + + if args.no_viewer: + print("Viewer skipped (--no_viewer). Done.") + return + # ── Visualize ──────────────────────────────────────────────────────────── try: from lingbot_map.vis import PointCloudViewer diff --git a/docker-compose.lowvram.yml b/docker-compose.lowvram.yml new file mode 100644 index 0000000..634091b --- /dev/null +++ b/docker-compose.lowvram.yml @@ -0,0 +1,29 @@ +services: + lingbot-map: + build: + context: . 
+ args: + LOW_VRAM_MODE: "1" # cast aggregator to bf16 before GPU transfer + image: lingbot-map-demo-light + runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - HF_MODEL_NAME=lingbot-map + - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-} + volumes: + - ${IMAGE_HOST_PATH:-./example/oxford}:/data/images # override: IMAGE_HOST_PATH=~/photos docker compose up + - ./model:/model # model cache (downloaded on first run) + - ./output:/data/output # exported results (PLY / NPZ / JSON) + ports: + - "8080:8080" + command: > + --image_folder /data/images + --output_dir /data/output + --mode windowed + --window_size 16 + --overlap_size 8 + --num_scale_frames 4 + --conf_threshold 2.0 + --downsample_factor 4 + --mask_sky + --point_size 0.005 diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..0cac8ba --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,21 @@ +services: + lingbot-map: + build: + context: . + args: + LOW_VRAM_MODE: "0" + image: lingbot-map-demo + runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - HF_MODEL_NAME=lingbot-map # lingbot-map | lingbot-map-long | lingbot-map-stage1 + - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-} + volumes: + - ${IMAGE_HOST_PATH:-./example/oxford}:/data/images # override: IMAGE_HOST_PATH=~/photos docker compose up + - ./model:/model # model cache (downloaded on first run) + - ./output:/data/output # exported results (PLY / NPZ / JSON) + ports: + - "8080:8080" + command: > + --image_folder /data/images + --output_dir /data/output diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh new file mode 100644 index 0000000..da8d123 --- /dev/null +++ b/docker/entrypoint.sh @@ -0,0 +1,54 @@ +#!/bin/bash +set -e + +MODEL_CACHE_DIR="${MODEL_CACHE_DIR:-/model}" +HF_MODEL_NAME="${HF_MODEL_NAME:-lingbot-map}" +MODEL_PATH="${MODEL_PATH:-}" + +# ── Resolve model path ──────────────────────────────────────────────────────── +if [ -z "$MODEL_PATH" ]; then + # Look for a matching 
.pt file already in the cache volume
+  MODEL_FILE=$(find "$MODEL_CACHE_DIR" -name "${HF_MODEL_NAME}.pt" -print -quit 2>/dev/null || true)
+  if [ -z "$MODEL_FILE" ]; then
+    MODEL_FILE=$(find "$MODEL_CACHE_DIR" -name "*.pt" -print -quit 2>/dev/null || true)
+  fi
+
+  if [ -z "$MODEL_FILE" ]; then
+    echo "Model not found in ${MODEL_CACHE_DIR}. Downloading '${HF_MODEL_NAME}' from HuggingFace..."
+    # NOTE(review): the heredoc body was lost when this patch was mangled
+    # ("python - </dev/null" is the residue of "python - <<'PYEOF' ... 2>/dev/null").
+    # Reconstructed below — verify repo_id against the project's actual HF namespace.
+    python - <<'PYEOF'
+import os
+from huggingface_hub import hf_hub_download
+
+name = os.environ.get("HF_MODEL_NAME", "lingbot-map")
+cache_dir = os.environ.get("MODEL_CACHE_DIR", "/model")
+token = os.environ.get("HUGGING_FACE_HUB_TOKEN") or None
+hf_hub_download(repo_id=f"lingbot/{name}", filename=f"{name}.pt",
+                local_dir=cache_dir, token=token)
+PYEOF
+    # Re-scan the cache after the download attempt.
+    MODEL_FILE=$(find "$MODEL_CACHE_DIR" -name "${HF_MODEL_NAME}.pt" -print -quit 2>/dev/null || \
+      find "$MODEL_CACHE_DIR" -name "*.pt" -print -quit 2>/dev/null || true)
+  fi
+
+  MODEL_PATH="$MODEL_FILE"
+fi
+
+if [ -z "$MODEL_PATH" ] || [ ! -f "$MODEL_PATH" ]; then
+  echo "ERROR: No model .pt file found. Either:"
+  echo " - Mount a pre-downloaded model: -v /path/to/model.pt:/model/lingbot-map.pt"
+  echo " - Or let auto-download run (requires internet access)"
+  exit 1
+fi
+
+echo "Using model: ${MODEL_PATH}"
+
+# ── Check FlashInfer availability ─────────────────────────────────────────────
+EXTRA_ARGS=""
+python -c "import flashinfer" 2>/dev/null || EXTRA_ARGS="--use_sdpa"
+if [ -n "$EXTRA_ARGS" ]; then
+  echo "FlashInfer not available, using SDPA backend."
+fi + +# ── Launch demo ─────────────────────────────────────────────────────────────── +exec python /app/demo.py \ + --model_path "$MODEL_PATH" \ + $EXTRA_ARGS \ + "$@" diff --git a/lingbot_map/vis/point_cloud_viewer.py b/lingbot_map/vis/point_cloud_viewer.py index a1d698c..87e9ab4 100644 --- a/lingbot_map/vis/point_cloud_viewer.py +++ b/lingbot_map/vis/point_cloud_viewer.py @@ -97,7 +97,7 @@ def __init__( self.size = size self.state_args = state_args self.server = viser.ViserServer(host="0.0.0.0", port=port) - self.server.gui.configure_theme(titlebar_content=None, control_layout="collapsible") + self.server.gui.configure_theme(titlebar_content=None, control_layout="fixed") self.device = device self.conf_list = conf_list self.vis_threshold = vis_threshold @@ -415,8 +415,8 @@ def _(event: viser.GuiEvent) -> None: "Show Camera", initial_value=self.show_camera ) self.vis_threshold_slider = self.server.gui.add_slider( - "Visibility Threshold", min=1.0, max=5.0, step=0.01, - initial_value=self.vis_threshold, + "Visibility Threshold", min=0.0, max=5.0, step=0.01, + initial_value=max(self.vis_threshold, 0.0), ) self.camera_downsample_slider = self.server.gui.add_slider( "Camera Downsample Factor", min=1, max=50, step=1, initial_value=1 diff --git a/tools/analyze_predictions.py b/tools/analyze_predictions.py new file mode 100644 index 0000000..cf3f9d4 --- /dev/null +++ b/tools/analyze_predictions.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +"""Diagnose predictions.npz from demo.py. 
+ +Usage: + python tools/analyze_predictions.py output/predictions.npz + python tools/analyze_predictions.py output/predictions.npz --cameras output/cameras.json + python tools/analyze_predictions.py output/predictions.npz --save report.png +""" +import argparse +import json +import sys +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec + + +def stat(arr, name): + if arr is None: + print(f" {name}: missing") + return + finite = np.isfinite(arr) + n_inf = np.sum(~finite) + valid = arr[finite] + print(f" {name}: shape={arr.shape} dtype={arr.dtype}") + if valid.size: + print(f" range=[{valid.min():.4g}, {valid.max():.4g}] mean={valid.mean():.4g} " + f"nan/inf={n_inf} ({100*n_inf/arr.size:.1f}%)") + else: + print(f" ALL VALUES INVALID (nan/inf)") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("npz", help="Path to predictions.npz") + parser.add_argument("--cameras", default=None, help="Path to cameras.json (optional)") + parser.add_argument("--save", default=None, help="Save figure to file") + parser.add_argument("--conf_threshold", type=float, default=None, + help="Confidence threshold to simulate (default: show distribution)") + args = parser.parse_args() + + print(f"Loading {args.npz} ...") + d = np.load(args.npz, allow_pickle=False) + keys = list(d.keys()) + print(f"Keys: {keys}\n") + + world_points = d.get("world_points") # (S, H, W, 3) + depth_conf = d.get("depth_conf") if "depth_conf" in keys else ( + d.get("world_points_conf") if "world_points_conf" in keys else None) + images = d.get("images") # (S, 3, H, W) or (S, H, W, 3) + extrinsic = d.get("extrinsic") # (S, 3, 4) + + print("=== Array stats ===") + stat(world_points, "world_points") + stat(depth_conf, "depth_conf / world_points_conf") + stat(images, "images") + stat(extrinsic, "extrinsic") + print() + + if world_points is None: + print("No world_points found — cannot analyse point cloud.") + sys.exit(1) + + S, H, W = 
world_points.shape[:3] + pts_flat = world_points.reshape(-1, 3) + finite_mask = np.isfinite(pts_flat).all(axis=1) + + print("=== Point cloud sanity ===") + print(f" Total pixels : {len(pts_flat):,}") + print(f" Finite points: {finite_mask.sum():,} ({100*finite_mask.mean():.1f}%)") + + if depth_conf is not None: + conf_flat = depth_conf.reshape(-1) + print(f"\n Confidence stats (all):") + pcts = [0, 1, 5, 25, 50, 75, 90, 95, 99, 100] + vals = np.nanpercentile(conf_flat, pcts) + for p, v in zip(pcts, vals): + print(f" p{p:3d}: {v:.4f}") + + for thr in [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0]: + n_pass = int((conf_flat[finite_mask] > thr).sum()) + print(f" conf > {thr:.1f}: {n_pass:>8,} pts ({100*n_pass/max(finite_mask.sum(),1):.1f}% of finite)") + + pts_valid = pts_flat[finite_mask] + if len(pts_valid): + print(f"\n X range: [{pts_valid[:,0].min():.4f}, {pts_valid[:,0].max():.4f}]") + print(f" Y range: [{pts_valid[:,1].min():.4f}, {pts_valid[:,1].max():.4f}]") + print(f" Z range: [{pts_valid[:,2].min():.4f}, {pts_valid[:,2].max():.4f}]") + dist = np.linalg.norm(pts_valid, axis=1) + print(f" Distance from origin: min={dist.min():.4f} median={np.median(dist):.4f} " + f"p99={np.percentile(dist,99):.4f} max={dist.max():.4f}") + + if images is not None: + print(f"\n Image pixel range: [{images.min():.4f}, {images.max():.4f}]") + if images.max() <= 1.01: + print(" → pixels appear to be in [0,1] — color mapping should be fine") + elif images.max() <= 255.5: + print(" → pixels appear to be in [0,255] — need /255 before color mapping (possible color bug!)") + else: + print(" WARNING: pixel values outside expected range — color mapping will be wrong") + + # ── Figures ────────────────────────────────────────────────────────────── + cam_positions = None + if args.cameras: + with open(args.cameras) as f: + cams = json.load(f) + cam_positions = np.array([[c["c2w"][0][3], c["c2w"][1][3], c["c2w"][2][3]] for c in cams]) + + if extrinsic is not None and cam_positions is None: + 
cam_positions = extrinsic[:, :3, 3] # translation from c2w + + fig = plt.figure(figsize=(14, 10)) + gs = gridspec.GridSpec(2, 3, figure=fig) + + # Sample points for plotting (avoid OOM on huge arrays) + MAX_PLOT = 50_000 + if depth_conf is not None: + conf_thr = args.conf_threshold if args.conf_threshold is not None else 2.0 + sel = finite_mask & (depth_conf.reshape(-1) > conf_thr) + else: + sel = finite_mask + pts_sel = pts_flat[sel] + if len(pts_sel) > MAX_PLOT: + idx = np.random.choice(len(pts_sel), MAX_PLOT, replace=False) + pts_sel = pts_sel[idx] + print(f"\n Plotting {len(pts_sel):,} points (conf>{args.conf_threshold if args.conf_threshold else 2.0:.1f})") + + # Top-down view (X-Z) + ax1 = fig.add_subplot(gs[0, 0]) + if len(pts_sel): + ax1.scatter(pts_sel[:, 0], pts_sel[:, 2], s=0.5, alpha=0.3, c="steelblue") + if cam_positions is not None: + ax1.plot(cam_positions[:, 0], cam_positions[:, 2], "r-", lw=1, label="cameras") + ax1.scatter(cam_positions[0, 0], cam_positions[0, 2], c="lime", s=60, zorder=5) + ax1.scatter(cam_positions[-1, 0], cam_positions[-1, 2], c="red", s=60, zorder=5) + ax1.legend(fontsize=7) + ax1.set_xlabel("X"); ax1.set_ylabel("Z") + ax1.set_title("Top-down (X-Z)") + ax1.set_aspect("equal") + + # Side view (X-Y) + ax2 = fig.add_subplot(gs[0, 1]) + if len(pts_sel): + ax2.scatter(pts_sel[:, 0], pts_sel[:, 1], s=0.5, alpha=0.3, c="steelblue") + if cam_positions is not None: + ax2.plot(cam_positions[:, 0], cam_positions[:, 1], "r-", lw=1) + ax2.scatter(cam_positions[0, 0], cam_positions[0, 1], c="lime", s=60, zorder=5) + ax2.scatter(cam_positions[-1, 0], cam_positions[-1, 1], c="red", s=60, zorder=5) + ax2.set_xlabel("X"); ax2.set_ylabel("Y") + ax2.set_title("Side (X-Y)") + ax2.set_aspect("equal") + + # Front view (Y-Z) + ax3 = fig.add_subplot(gs[0, 2]) + if len(pts_sel): + ax3.scatter(pts_sel[:, 2], pts_sel[:, 1], s=0.5, alpha=0.3, c="steelblue") + if cam_positions is not None: + ax3.plot(cam_positions[:, 2], cam_positions[:, 1], "r-", lw=1) + 
ax3.set_xlabel("Z"); ax3.set_ylabel("Y") + ax3.set_title("Front (Z-Y)") + ax3.set_aspect("equal") + + # Confidence histogram + ax4 = fig.add_subplot(gs[1, 0]) + if depth_conf is not None: + cf = depth_conf.reshape(-1) + cf_finite = cf[np.isfinite(cf)] + ax4.hist(cf_finite, bins=100, color="steelblue", alpha=0.7) + for thr in [1.0, 2.0, 3.0]: + ax4.axvline(thr, color="red", lw=1, linestyle="--", label=f"thr={thr}") + ax4.set_xlabel("confidence"); ax4.set_ylabel("count") + ax4.set_title("Confidence distribution") + ax4.legend(fontsize=7) + else: + ax4.text(0.5, 0.5, "No confidence data", ha="center", va="center", transform=ax4.transAxes) + + # Distance histogram + ax5 = fig.add_subplot(gs[1, 1]) + if len(pts_valid): + dist = np.linalg.norm(pts_valid, axis=1) + p99 = np.percentile(dist, 99) + ax5.hist(dist[dist < p99 * 2], bins=100, color="darkorange", alpha=0.7) + ax5.axvline(p99, color="red", lw=1, linestyle="--", label=f"p99={p99:.2f}") + ax5.set_xlabel("distance from origin"); ax5.set_ylabel("count") + ax5.set_title("Point distance distribution") + ax5.legend(fontsize=7) + + # Point count per frame + ax6 = fig.add_subplot(gs[1, 2]) + if depth_conf is not None: + conf_thr = args.conf_threshold if args.conf_threshold is not None else 2.0 + counts = [] + for i in range(S): + pts_i = world_points[i].reshape(-1, 3) + fin = np.isfinite(pts_i).all(axis=1) + cnf = depth_conf[i].reshape(-1) > conf_thr + counts.append(int((fin & cnf).sum())) + ax6.bar(range(S), counts, color="mediumseagreen", alpha=0.8) + ax6.set_xlabel("frame"); ax6.set_ylabel("valid points") + ax6.set_title(f"Valid points per frame (conf>{conf_thr:.1f})") + else: + ax6.text(0.5, 0.5, "No confidence data", ha="center", va="center", transform=ax6.transAxes) + + plt.suptitle(f"LingBot-Map predictions analysis (S={S}, H={H}, W={W})", fontsize=12) + plt.tight_layout() + + if args.save: + plt.savefig(args.save, dpi=150) + print(f"\nSaved → {args.save}") + else: + plt.show() + + +if __name__ == "__main__": + 
main() diff --git a/tools/debug_reconstruction.py b/tools/debug_reconstruction.py new file mode 100644 index 0000000..24388b1 --- /dev/null +++ b/tools/debug_reconstruction.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python3 +"""Debug reconstruction quality from predictions.npz. + +Checks: + 1. Are world_points in front of or behind each camera? + 2. Do reprojected points align with the original image? + 3. Depth map plausibility per frame. + +Usage: + python tools/debug_reconstruction.py output/predictions.npz + python tools/debug_reconstruction.py output/predictions.npz --frames 0 10 50 100 + python tools/debug_reconstruction.py output/predictions.npz --save debug/ +""" +import argparse +import os +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec + + +def c2w_to_w2c(c2w_3x4): + """Invert c2w (3x4) → w2c (3x4).""" + R = c2w_3x4[:3, :3] + t = c2w_3x4[:3, 3] + R_inv = R.T + t_inv = -R_inv @ t + w2c = np.eye(4) + w2c[:3, :3] = R_inv + w2c[:3, 3] = t_inv + return w2c[:3, :] # (3, 4) + + +def project_to_camera(world_pts, w2c_3x4, K_3x3): + """world_pts (N,3) → pixel coords (N,2) and depth (N,).""" + R, t = w2c_3x4[:3, :3], w2c_3x4[:3, 3] + cam_pts = (R @ world_pts.T).T + t # (N, 3) in camera space + depth_cam = cam_pts[:, 2] + fx, fy = K_3x3[0, 0], K_3x3[1, 1] + cx, cy = K_3x3[0, 2], K_3x3[1, 2] + u = fx * cam_pts[:, 0] / (cam_pts[:, 2] + 1e-8) + cx + v = fy * cam_pts[:, 1] / (cam_pts[:, 2] + 1e-8) + cy + return np.stack([u, v], axis=1), depth_cam + + +def analyze_frame(frame_idx, world_points, extrinsic, intrinsic, images, + depth_conf, conf_threshold=2.0): + """Return dict of diagnostics for one frame.""" + pts = world_points[frame_idx].reshape(-1, 3) # (H*W, 3) + c2w = np.eye(4); c2w[:3, :] = extrinsic[frame_idx] # (4, 4) + w2c = c2w_to_w2c(extrinsic[frame_idx]) + K = intrinsic[frame_idx] + H, W = world_points.shape[1:3] + + # Camera position and forward direction + cam_pos = extrinsic[frame_idx][:3, 3] + cam_forward = 
extrinsic[frame_idx][:3, 2] # 3rd column of R = Z axis + + # Transform points to camera space + _, depth_cam = project_to_camera(pts, w2c, K) + + finite = np.isfinite(pts).all(axis=1) + if depth_conf is not None: + conf_mask = depth_conf[frame_idx].reshape(-1) > conf_threshold + else: + conf_mask = np.ones(len(pts), dtype=bool) + valid = finite & conf_mask + + n_valid = valid.sum() + n_front = (depth_cam[valid] > 0).sum() + n_behind = (depth_cam[valid] <= 0).sum() + pct_front = 100 * n_front / max(n_valid, 1) + pct_behind = 100 * n_behind / max(n_valid, 1) + depth_median = float(np.median(depth_cam[valid])) if n_valid else float("nan") + depth_p5 = float(np.percentile(depth_cam[valid], 5)) if n_valid else float("nan") + depth_p95 = float(np.percentile(depth_cam[valid], 95)) if n_valid else float("nan") + + # Image for display (C,H,W) → (H,W,C), clip to [0,1] + img_display = None + if images is not None: + img = images[frame_idx] + if img.shape[0] == 3: # (3,H,W) → (H,W,3) + img = img.transpose(1, 2, 0) + img_display = np.clip(img, 0, 1) + + # Reprojection: project valid pts back and compare to pixel grid + uv, _ = project_to_camera(pts[valid], w2c, K) + + return dict( + frame=frame_idx, + cam_pos=cam_pos, + cam_forward=cam_forward, + n_valid=n_valid, + pct_front=pct_front, + pct_behind=pct_behind, + depth_median=depth_median, + depth_p5=depth_p5, + depth_p95=depth_p95, + depth_cam_valid=depth_cam[valid], + world_pts_valid=pts[valid], + uv_reprojected=uv, + img_display=img_display, + H=H, W=W, + ) + + +def plot_frame(ax_row, diag): + """Fill one row of subplots for a single frame.""" + ax_img, ax_depth, ax_reproj, ax_text = ax_row + f = diag["frame"] + + # ── image ────────────────────────────────────────────────────── + if diag["img_display"] is not None: + ax_img.imshow(diag["img_display"]) + ax_img.set_title(f"frame {f}: input image", fontsize=8) + ax_img.axis("off") + + # ── depth histogram ──────────────────────────────────────────── + dc = 
diag["depth_cam_valid"] + if len(dc): + p1, p99 = np.percentile(dc, 1), np.percentile(dc, 99) + ax_depth.hist(np.clip(dc, p1 * 1.5, p99 * 1.5), bins=80, + color="steelblue" if diag["pct_front"] > 90 else "tomato", + alpha=0.8) + ax_depth.axvline(0, color="red", lw=1.5, label="camera plane") + ax_depth.set_xlabel("depth in camera space", fontsize=7) + ax_depth.set_title( + f"front {diag['pct_front']:.0f}% behind {diag['pct_behind']:.0f}%\n" + f"median={diag['depth_median']:.2f} p5={diag['depth_p5']:.2f} p95={diag['depth_p95']:.2f}", + fontsize=7) + ax_depth.legend(fontsize=6) + ax_depth.tick_params(labelsize=6) + + # ── reprojection scatter ─────────────────────────────────────── + uv = diag["uv_reprojected"] + H, W = diag["H"], diag["W"] + if diag["img_display"] is not None: + ax_reproj.imshow(diag["img_display"], alpha=0.5) + in_frame = ((uv[:, 0] >= 0) & (uv[:, 0] < W) & + (uv[:, 1] >= 0) & (uv[:, 1] < H)) + MAX_PTS = 2000 + if in_frame.sum(): + idx = np.random.choice(in_frame.sum(), + min(MAX_PTS, in_frame.sum()), replace=False) + ax_reproj.scatter(uv[in_frame][idx, 0], uv[in_frame][idx, 1], + s=0.3, alpha=0.4, c="lime") + pct_in = 100 * in_frame.mean() + ax_reproj.set_xlim(0, W); ax_reproj.set_ylim(H, 0) + ax_reproj.set_title(f"reprojection {pct_in:.0f}% in frame", fontsize=8) + ax_reproj.axis("off") + + # ── text summary ─────────────────────────────────────────────── + pos = diag["cam_pos"] + fwd = diag["cam_forward"] + txt = (f"frame {f}\n" + f"pos [{pos[0]:.2f}, {pos[1]:.2f}, {pos[2]:.2f}]\n" + f"fwd [{fwd[0]:.2f}, {fwd[1]:.2f}, {fwd[2]:.2f}]\n" + f"valid pts: {diag['n_valid']:,}\n" + f"front: {diag['pct_front']:.1f}%\n" + f"behind: {diag['pct_behind']:.1f}%") + color = "limegreen" if diag["pct_front"] > 90 else \ + "orange" if diag["pct_front"] > 50 else "red" + ax_text.text(0.05, 0.95, txt, transform=ax_text.transAxes, + fontsize=8, va="top", fontfamily="monospace", + bbox=dict(boxstyle="round", facecolor=color, alpha=0.3)) + ax_text.axis("off") + + +def 
main(): + parser = argparse.ArgumentParser() + parser.add_argument("npz", help="Path to predictions.npz") + parser.add_argument("--frames", nargs="+", type=int, default=None, + help="Frame indices to inspect (default: evenly spaced 5 frames)") + parser.add_argument("--conf_threshold", type=float, default=2.0) + parser.add_argument("--save", default=None, + help="Directory to save debug figures") + args = parser.parse_args() + + print(f"Loading {args.npz} ...") + d = np.load(args.npz, allow_pickle=False) + + world_points = d["world_points"] # (S, H, W, 3) + extrinsic = d["extrinsic"] # (S, 3, 4) c2w + intrinsic = d["intrinsic"] # (S, 3, 3) + images = d.get("images") # (S, 3, H, W) or None + depth_conf = (d.get("depth_conf") if "depth_conf" in d + else d.get("world_points_conf") if "world_points_conf" in d + else None) + chunk_scales = d["chunk_scales"] if "chunk_scales" in d else None # (num_windows,) + + S = world_points.shape[0] + + # ── Window alignment scales ──────────────────────────────────── + if chunk_scales is not None: + cs = chunk_scales.reshape(-1) + print(f"\n=== Window alignment scales (chunk_scales) ===") + print(f" num windows : {len(cs)}") + print(f" values : {np.array2string(cs, precision=4, separator=', ')}") + bad = (cs < 0.01) | (cs > 100) + if bad.any(): + print(f" WARNING: windows {np.where(bad)[0].tolist()} have extreme scales " + f"(clamped to 1e-3 or 1e3) — alignment failed for these windows") + else: + print(f" All scales look reasonable (range [{cs.min():.4f}, {cs.max():.4f}])") + + # ── Full-sequence front% scan ───────────────────────────────── + print(f"\n=== Front% scan (every 10th frame) ===") + stride = max(1, S // 50) + scan_frames = list(range(0, S, stride)) + front_pcts = [] + for fi in scan_frames: + pts = world_points[fi].reshape(-1, 3) + c2w = extrinsic[fi] + w2c = c2w_to_w2c(c2w) + _, dc = project_to_camera(pts, w2c, intrinsic[fi]) + finite = np.isfinite(pts).all(axis=1) + conf_ok = depth_conf[fi].reshape(-1) > 
args.conf_threshold if depth_conf is not None else finite + valid = finite & conf_ok + pct = 100 * (dc[valid] > 0).sum() / max(valid.sum(), 1) + front_pcts.append(float(pct)) + + front_arr = np.array(front_pcts) + bad_frames = [scan_frames[i] for i, p in enumerate(front_pcts) if p < 50] + good_frames = [scan_frames[i] for i, p in enumerate(front_pcts) if p >= 90] + print(f" Frames with <50% front (flipped): {bad_frames}") + print(f" Frames with >=90% front (correct): count={len(good_frames)}/{len(scan_frames)}") + + frames = args.frames or [0, S // 4, S // 2, 3 * S // 4, S - 1] + frames = [min(f, S - 1) for f in frames] + print(f"\nDetail frames: {frames}") + + # ── Per-frame text summary ───────────────────────────────────── + print(f"\n{'frame':>6} {'front%':>7} {'behind%':>8} {'depth_median':>12} " + f"{'cam_pos':>30} {'cam_fwd':>30}") + diags = [] + for fi in frames: + diag = analyze_frame(fi, world_points, extrinsic, intrinsic, + images, depth_conf, args.conf_threshold) + diags.append(diag) + fwd = diag["cam_forward"] + print(f"{fi:6d} {diag['pct_front']:7.1f} {diag['pct_behind']:8.1f} " + f"{diag['depth_median']:12.3f} " + f"[{diag['cam_pos'][0]:5.2f},{diag['cam_pos'][1]:5.2f},{diag['cam_pos'][2]:5.2f}] " + f"[{fwd[0]:5.2f},{fwd[1]:5.2f},{fwd[2]:5.2f}]") + + # ── Diagnosis ───────────────────────────────────────────────── + print() + avg_front = np.mean([d["pct_front"] for d in diags]) + pct_bad_windows = 100 * len(bad_frames) / max(len(scan_frames), 1) + if avg_front > 90: + print("✓ Points are mostly in front of cameras — geometry looks correct.") + print(" Blank viewer / bad PLY is likely a density/scale issue, not a logic bug.") + elif pct_bad_windows > 10 and chunk_scales is not None and ((chunk_scales.reshape(-1) < 0.01).any()): + print("✗ Window scale clamped to minimum — depth-ratio alignment failed.") + print(" Likely cause: near-zero or negative depth in overlap frames.") + print(" Fix: increase --overlap_size or --num_scale_frames.") + elif 
pct_bad_windows > 10: + print("✗ Many frames have points behind cameras.") + print(" Pattern: check if bad frames cluster at window boundaries.") + print(" If clustered → windowed stitching issue (overlap too small).") + print(" If scattered → model output inconsistency (try --mode streaming).") + else: + print("△ Partial front/behind mix — possible coordinate convention mismatch.") + + # ── Figures ─────────────────────────────────────────────────── + n = len(diags) + fig, axes = plt.subplots(n, 4, figsize=(16, 4 * n), + gridspec_kw={"width_ratios": [2, 2, 2, 1]}) + if n == 1: + axes = [axes] + for row, diag in zip(axes, diags): + plot_frame(row, diag) + + plt.suptitle("Reconstruction debug: per-frame geometry check", fontsize=11) + plt.tight_layout() + + if args.save: + os.makedirs(args.save, exist_ok=True) + path = os.path.join(args.save, "debug_frames.png") + plt.savefig(path, dpi=120) + print(f"\nSaved → {path}") + else: + plt.show() + + +if __name__ == "__main__": + main() diff --git a/tools/visualize_cameras.py b/tools/visualize_cameras.py new file mode 100644 index 0000000..a790fa1 --- /dev/null +++ b/tools/visualize_cameras.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +"""Visualize camera trajectory from cameras.json produced by demo.py. 
+ +Usage: + python tools/visualize_cameras.py output/cameras.json + python tools/visualize_cameras.py output/cameras.json --save trajectory.png +""" +import argparse +import json +import numpy as np +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D # noqa: F401 + + +def draw_camera(ax, c2w, scale=0.05, color="steelblue"): + """Draw a small camera frustum: 3 axes + pyramid outline.""" + origin = c2w[:3, 3] + # Unit axes in camera space (right=x, up=-y, forward=z) + axes = c2w[:3, :3] @ np.array([[1, 0, 0], [0, -1, 0], [0, 0, 1]], dtype=float).T + colors = ["red", "green", color] + labels = ["X", "Y", "Z"] + for i, (col, lbl) in enumerate(zip(colors, labels)): + tip = origin + axes[:, i] * scale + ax.plot(*zip(origin, tip), color=col, linewidth=1) + + # Frustum corners (simplified: just 4 corner rays) + corners_cam = np.array([[1, 1, 2], [-1, 1, 2], [-1, -1, 2], [1, -1, 2]], dtype=float) * scale * 0.5 + corners_world = (c2w[:3, :3] @ corners_cam.T).T + origin + for corner in corners_world: + ax.plot(*zip(origin, corner), color=color, linewidth=0.5, alpha=0.5) + # Close the frustum rectangle + rect = np.vstack([corners_world, corners_world[0]]) + ax.plot(rect[:, 0], rect[:, 1], rect[:, 2], color=color, linewidth=0.5, alpha=0.5) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("cameras_json", help="Path to cameras.json") + parser.add_argument("--save", default=None, help="Save figure to file instead of showing") + parser.add_argument("--skip", type=int, default=1, help="Draw every N-th camera (default 1 = all)") + parser.add_argument("--frustum_scale", type=float, default=None, help="Frustum size (auto if unset)") + args = parser.parse_args() + + with open(args.cameras_json) as f: + cameras = json.load(f) + + positions = np.array([c["c2w"][i][3] for c in cameras for i in range(3)]).reshape(-1, 3) + positions = np.array([[c["c2w"][0][3], c["c2w"][1][3], c["c2w"][2][3]] for c in cameras]) + + span = np.max(positions, 
axis=0) - np.min(positions, axis=0) + scale = float(np.max(span)) * 0.04 if args.frustum_scale is None else args.frustum_scale + + fig = plt.figure(figsize=(10, 8)) + ax = fig.add_subplot(111, projection="3d") + + # Trajectory line + ax.plot(positions[:, 0], positions[:, 1], positions[:, 2], + color="gray", linewidth=1, alpha=0.6, label="trajectory") + + # Start / end markers + ax.scatter(*positions[0], color="lime", s=80, zorder=5, label="start") + ax.scatter(*positions[-1], color="red", s=80, zorder=5, label="end") + + # Camera frustums + for i, cam in enumerate(cameras): + if i % args.skip != 0: + continue + c2w = np.array(cam["c2w"]) # (3, 4) + c2w_4x4 = np.eye(4) + c2w_4x4[:3, :] = c2w + t = i / max(len(cameras) - 1, 1) + color = plt.cm.cool(t) + draw_camera(ax, c2w_4x4, scale=scale, color=color) + + ax.set_xlabel("X") + ax.set_ylabel("Y") + ax.set_zlabel("Z") + ax.set_title(f"Camera trajectory ({len(cameras)} frames)") + ax.legend() + + # Equal aspect ratio + center = positions.mean(axis=0) + half = float(np.max(span)) * 0.55 + ax.set_xlim(center[0] - half, center[0] + half) + ax.set_ylim(center[1] - half, center[1] + half) + ax.set_zlim(center[2] - half, center[2] + half) + + plt.tight_layout() + if args.save: + plt.savefig(args.save, dpi=150) + print(f"Saved → {args.save}") + else: + plt.show() + + +if __name__ == "__main__": + main()