From 4ea6db3d36e109354e6d8a9984291d1aaae2cb36 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 18 Apr 2026 22:51:36 +0000 Subject: [PATCH 01/24] Add Docker image for streaming 3D reconstruction demo Enables running the lingbot-map demo with user-provided images on Docker without any local Python/CUDA setup: - Dockerfile: CUDA 12.8 + PyTorch 2.9.1 + lingbot-map[vis] + FlashInfer - docker/entrypoint.sh: auto-downloads model from HuggingFace on first run, falls back to --use_sdpa if FlashInfer is unavailable - docker-compose.yml: mounts ./images and ./model, exposes port 8080 - README.md: Docker Quick Start section https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- Dockerfile | 48 +++++++++++++++++++++++++++++ README.md | 73 ++++++++++++++++++++++++++++++++++++++++++++ docker-compose.yml | 24 +++++++++++++++ docker/entrypoint.sh | 54 ++++++++++++++++++++++++++++++++ 4 files changed, 199 insertions(+) create mode 100644 Dockerfile create mode 100644 docker-compose.yml create mode 100644 docker/entrypoint.sh diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2ec7b1c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,48 @@ +FROM nvidia/cuda:12.8.0-cudnn9-devel-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 +ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +# System dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3.10 \ + python3.10-dev \ + python3-pip \ + git \ + wget \ + curl \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender-dev \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \ + ln -sf /usr/bin/pip3 /usr/bin/pip + +# PyTorch 2.9.1 + CUDA 12.8 +RUN pip install --no-cache-dir \ + torch==2.9.1 torchvision==0.24.1 \ + --index-url https://download.pytorch.org/whl/cu128 + +# Copy source and install lingbot-map with visualization extras +WORKDIR /app +COPY . . 
+RUN pip install --no-cache-dir -e ".[vis]" && \ + pip install --no-cache-dir onnxruntime + +# FlashInfer for efficient KV-cache attention (falls back to SDPA if unavailable) +RUN pip install --no-cache-dir flashinfer-python \ + -i https://flashinfer.ai/whl/cu128/torch2.9/ || \ + echo "WARNING: FlashInfer not installed — demo will use --use_sdpa fallback" + +RUN mkdir -p /model /data/images + +COPY docker/entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +EXPOSE 8080 + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/README.md b/README.md index d73b1c6..83996c5 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,79 @@ python demo.py --model_path /path/to/checkpoint.pt \ `--camera_num_iterations` defaults to `4`; setting it to `1` skips three refinement passes in the camera head (and shrinks its KV cache by 4×). +# 🐳 Docker Quick Start + +Run the demo on your own images without a local Python environment. + +### Prerequisites +- [Docker](https://docs.docker.com/get-docker/) with [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) +- An NVIDIA GPU with CUDA 12.8 support + +### 1. Build the image + +```bash +docker build -t lingbot-map-demo . +``` + +### 2. Run with your images + +Place your images in a local `images/` folder, then: + +```bash +docker run --gpus all \ + -v $(pwd)/images:/data/images \ + -v $(pwd)/model:/model \ + -p 8080:8080 \ + lingbot-map-demo \ + --image_folder /data/images +``` + +Open **http://localhost:8080** in your browser to see the interactive 3D viewer. + +The model is **downloaded automatically** on the first run and cached in `./model/`. 
+ +### Run with a video file + +```bash +docker run --gpus all \ + -v /path/to/video.mp4:/data/video.mp4 \ + -v $(pwd)/model:/model \ + -p 8080:8080 \ + lingbot-map-demo \ + --video_path /data/video.mp4 --fps 10 +``` + +### Using docker-compose + +```bash +# Put your images in ./images/, then: +docker compose up +``` + +Edit `docker-compose.yml` to change the model variant (`HF_MODEL_NAME`) or other options. + +### Environment variables + +| Variable | Default | Description | +|:---|:---|:---| +| `HF_MODEL_NAME` | `lingbot-map` | Model variant to download (`lingbot-map`, `lingbot-map-long`, `lingbot-map-stage1`) | +| `MODEL_PATH` | *(auto)* | Full path to a pre-downloaded `.pt` file inside the container | +| `MODEL_CACHE_DIR` | `/model` | Where the model is stored/cached | +| `HUGGING_FACE_HUB_TOKEN` | *(none)* | HuggingFace token if needed | + +### Use a pre-downloaded model + +```bash +docker run --gpus all \ + -v /path/to/checkpoint.pt:/model/lingbot-map.pt \ + -v $(pwd)/images:/data/images \ + -p 8080:8080 \ + lingbot-map-demo \ + --image_folder /data/images +``` + +--- + # 📜 License This project is released under the Apache License 2.0. See [LICENSE](LICENSE.txt) file for details. diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..14eca69 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,24 @@ +services: + lingbot-map: + build: . 
+ image: lingbot-map-demo + runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + # Set to lingbot-map-long for long sequences, lingbot-map-stage1 for bidirectional + - HF_MODEL_NAME=lingbot-map + # Optional: set to a .pt path inside the container to skip auto-download + # - MODEL_PATH=/model/lingbot-map.pt + # Optional: HuggingFace token for private/gated repos + - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-} + volumes: + # Mount your images here + - ./images:/data/images + # Model cache (downloaded on first run, reused afterwards) + - ./model:/model + ports: + - "8080:8080" + # Default: run on the mounted images folder + command: --image_folder /data/images + # Uncomment to run on a video file (also mount the file above): + # command: --video_path /data/video.mp4 --fps 10 diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh new file mode 100644 index 0000000..da8d123 --- /dev/null +++ b/docker/entrypoint.sh @@ -0,0 +1,54 @@ +#!/bin/bash +set -e + +MODEL_CACHE_DIR="${MODEL_CACHE_DIR:-/model}" +HF_MODEL_NAME="${HF_MODEL_NAME:-lingbot-map}" +MODEL_PATH="${MODEL_PATH:-}" + +# ── Resolve model path ──────────────────────────────────────────────────────── +if [ -z "$MODEL_PATH" ]; then + # Look for a matching .pt file already in the cache volume + MODEL_FILE=$(find "$MODEL_CACHE_DIR" -name "${HF_MODEL_NAME}.pt" -print -quit 2>/dev/null || true) + if [ -z "$MODEL_FILE" ]; then + MODEL_FILE=$(find "$MODEL_CACHE_DIR" -name "*.pt" -print -quit 2>/dev/null || true) + fi + + if [ -z "$MODEL_FILE" ]; then + echo "Model not found in ${MODEL_CACHE_DIR}. Downloading '${HF_MODEL_NAME}' from HuggingFace..." + python - </dev/null || \ + find "$MODEL_CACHE_DIR" -name "*.pt" -print -quit 2>/dev/null || true) + fi + + MODEL_PATH="$MODEL_FILE" +fi + +if [ -z "$MODEL_PATH" ] || [ ! -f "$MODEL_PATH" ]; then + echo "ERROR: No model .pt file found. 
Either:" + echo " - Mount a pre-downloaded model: -v /path/to/model.pt:/model/lingbot-map.pt" + echo " - Or let auto-download run (requires internet access)" + exit 1 +fi + +echo "Using model: ${MODEL_PATH}" + +# ── Check FlashInfer availability ───────────────────────────────────────────── +EXTRA_ARGS="" +python -c "import flashinfer" 2>/dev/null || EXTRA_ARGS="--use_sdpa" +if [ -n "$EXTRA_ARGS" ]; then + echo "FlashInfer not available, using SDPA backend." +fi + +# ── Launch demo ─────────────────────────────────────────────────────────────── +exec python /app/demo.py \ + --model_path "$MODEL_PATH" \ + $EXTRA_ARGS \ + "$@" From e775d870b8db3b4054e282464731643b5ad6c98c Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 18 Apr 2026 23:23:34 +0000 Subject: [PATCH 02/24] Fix base image: switch to pytorch/pytorch to avoid NVIDIA registry issues nvidia/cuda tags can be unavailable without Docker Hub auth or may not exist for CUDA 12.8 with the cudnn9 suffix. pytorch/pytorch official images ship PyTorch + CUDA pre-installed and are publicly accessible without auth. https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- Dockerfile | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2ec7b1c..ac966b6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,13 @@ -FROM nvidia/cuda:12.8.0-cudnn9-devel-ubuntu22.04 +# pytorch/pytorch images ship with Python, pip, and PyTorch pre-installed, +# so no NVIDIA registry auth or manual CUDA installation is needed. 
+FROM pytorch/pytorch:2.9.1-cuda12.8-cudnn9-devel ENV DEBIAN_FRONTEND=noninteractive ENV PYTHONUNBUFFERED=1 ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True -# System dependencies +# System dependencies (Python/pip already present in base image) RUN apt-get update && apt-get install -y --no-install-recommends \ - python3.10 \ - python3.10-dev \ - python3-pip \ git \ wget \ curl \ @@ -19,14 +18,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libgomp1 \ && rm -rf /var/lib/apt/lists/* -RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \ - ln -sf /usr/bin/pip3 /usr/bin/pip - -# PyTorch 2.9.1 + CUDA 12.8 -RUN pip install --no-cache-dir \ - torch==2.9.1 torchvision==0.24.1 \ - --index-url https://download.pytorch.org/whl/cu128 - # Copy source and install lingbot-map with visualization extras WORKDIR /app COPY . . From 63f0f5b2d802e523993d0ab8b4dc66771257ed9e Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 18 Apr 2026 23:44:03 +0000 Subject: [PATCH 03/24] Rewrite Docker README section with design overview and demo examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add image design table (base image, backends, ports, volume layout) - Add directory structure diagram showing /app, /model, /data - Add dedicated section for built-in example scenes (church/oxford/university/loop) - Clarify that examples are baked into the image — no extra data mount needed - Expand tips section: pre-downloaded model, GPU memory, long sequences, fast inference https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- README.md | 118 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 95 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 83996c5..0fc7f90 100644 --- a/README.md +++ b/README.md @@ -208,38 +208,93 @@ python demo.py --model_path /path/to/checkpoint.pt \ `--camera_num_iterations` defaults to `4`; setting it to `1` skips three refinement passes in the camera head 
(and shrinks its KV cache by 4×). -# 🐳 Docker Quick Start +# 🐳 Docker -Run the demo on your own images without a local Python environment. +Run the full demo — including model download, inference, and 3D viewer — without any local Python or CUDA setup. -### Prerequisites -- [Docker](https://docs.docker.com/get-docker/) with [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) -- An NVIDIA GPU with CUDA 12.8 support +## Image Design -### 1. Build the image +| Layer | Detail | +|:---|:---| +| Base image | `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-devel` (public, no auth required) | +| Attention backend | [FlashInfer](https://github.com/flashinfer-ai/flashinfer) for paged KV-cache; auto-falls back to PyTorch SDPA if unavailable | +| Visualisation | [viser](https://github.com/nerfstudio-project/viser) web viewer exposed on port **8080** | +| Model resolution | `docker/entrypoint.sh` checks `/model/` at startup and auto-downloads from HuggingFace when no `.pt` file is found | +| Data access | Images and model weights are provided via **volume mounts** — nothing user-specific is baked into the image | + +``` +lingbot-map-demo +├── /app/ ← source code + built-in example scenes +│ └── example/{church,oxford,university,loop}/ +├── /model/ ← mount a host directory here to cache the model +└── /data/ ← mount your images or video here +``` + +## Prerequisites + +- [Docker](https://docs.docker.com/get-docker/) with the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) +- An NVIDIA GPU (CUDA 12.8 driver) + +## Build ```bash +git clone https://github.com/YoshiRi/lingbot-map-docker.git +cd lingbot-map-docker docker build -t lingbot-map-demo . ``` -### 2. Run with your images +## Try the Built-in Example Scenes -Place your images in a local `images/` folder, then: +The four example scenes from `example/` are already baked into the image at `/app/example/`. 
+No extra data mount is needed — just provide a writable directory for the model cache. ```bash +# Church (outdoor, sky masking recommended) docker run --gpus all \ - -v $(pwd)/images:/data/images \ -v $(pwd)/model:/model \ -p 8080:8080 \ lingbot-map-demo \ - --image_folder /data/images + --image_folder /app/example/church --mask_sky + +# Oxford +docker run --gpus all \ + -v $(pwd)/model:/model \ + -p 8080:8080 \ + lingbot-map-demo \ + --image_folder /app/example/oxford --mask_sky + +# University +docker run --gpus all \ + -v $(pwd)/model:/model \ + -p 8080:8080 \ + lingbot-map-demo \ + --image_folder /app/example/university --mask_sky + +# Loop (loop-closure trajectory, no sky masking needed) +docker run --gpus all \ + -v $(pwd)/model:/model \ + -p 8080:8080 \ + lingbot-map-demo \ + --image_folder /app/example/loop ``` -Open **http://localhost:8080** in your browser to see the interactive 3D viewer. +On **first run** the model is downloaded from HuggingFace and cached in `./model/`; subsequent runs start immediately. +Open **http://localhost:8080** in your browser once inference completes. -The model is **downloaded automatically** on the first run and cached in `./model/`. +## Run with Your Own Images -### Run with a video file +Place your images (`.jpg` / `.png`) in a local folder, then mount it: + +```bash +docker run --gpus all \ + -v /path/to/your/images:/data/images \ + -v $(pwd)/model:/model \ + -p 8080:8080 \ + lingbot-map-demo \ + --image_folder /data/images +``` + +## Run with a Video File ```bash docker run --gpus all \ @@ -250,26 +305,27 @@ docker run --gpus all \ --video_path /data/video.mp4 --fps 10 ``` -### Using docker-compose +## docker-compose + +Edit `docker-compose.yml` to set your image folder and model variant, then: ```bash -# Put your images in ./images/, then: +# Put your images in ./images/ docker compose up ``` -Edit `docker-compose.yml` to change the model variant (`HF_MODEL_NAME`) or other options. 
- -### Environment variables +## Environment Variables | Variable | Default | Description | |:---|:---|:---| -| `HF_MODEL_NAME` | `lingbot-map` | Model variant to download (`lingbot-map`, `lingbot-map-long`, `lingbot-map-stage1`) | -| `MODEL_PATH` | *(auto)* | Full path to a pre-downloaded `.pt` file inside the container | -| `MODEL_CACHE_DIR` | `/model` | Where the model is stored/cached | -| `HUGGING_FACE_HUB_TOKEN` | *(none)* | HuggingFace token if needed | +| `HF_MODEL_NAME` | `lingbot-map` | Checkpoint to download: `lingbot-map`, `lingbot-map-long`, or `lingbot-map-stage1` | +| `MODEL_PATH` | *(auto)* | Explicit path to a `.pt` file inside the container (skips auto-download) | +| `MODEL_CACHE_DIR` | `/model` | Directory where the downloaded model is stored | +| `HUGGING_FACE_HUB_TOKEN` | *(none)* | HuggingFace token for gated repos | -### Use a pre-downloaded model +## Tips +**Use a pre-downloaded model** (avoids HuggingFace download at runtime): ```bash docker run --gpus all \ -v /path/to/checkpoint.pt:/model/lingbot-map.pt \ @@ -279,6 +335,22 @@ docker run --gpus all \ --image_folder /data/images ``` +**Limited GPU memory** — add one or both flags: +```bash + --num_scale_frames 2 # reduces activation peak of the initial scale phase + --keyframe_interval 6 # keeps only every 6th frame in KV cache +``` + +**Long sequences (> 3000 frames)** — use windowed mode: +```bash + --mode windowed --window_size 128 +``` + +**Faster inference** — reduce camera head iterations (small accuracy trade-off): +```bash + --camera_num_iterations 1 +``` + --- # 📜 License From 56f6b2232bc60b1edf13e52f65ad61bb796cbc12 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 19 Apr 2026 00:14:27 +0000 Subject: [PATCH 04/24] Fix cv2 ImportError: add libgl1 to apt packages opencv-python requires libGL.so.1 which is not present in the pytorch/pytorch base image. Adding libgl1 resolves the error. 
https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index ac966b6..ec8211b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,6 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ wget \ curl \ libglib2.0-0 \ + libgl1 \ libsm6 \ libxext6 \ libxrender-dev \ From 288830f650d60105e0c234fa93ed2c4e4196f185 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 19 Apr 2026 00:36:19 +0000 Subject: [PATCH 05/24] Fix OOM on small GPUs and deprecated env var demo.py: load checkpoint to CPU before GPU transfer to avoid holding both the state dict and the model weights in GPU memory simultaneously. Cast the aggregator (DINOv2 trunk) to bfloat16 on CPU before model.to(device) to halve its VRAM footprint (~2-3 GB saved). Heads remain in fp32. Dockerfile: rename PYTORCH_CUDA_ALLOC_CONF to PYTORCH_ALLOC_CONF (deprecated in newer PyTorch versions). https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- Dockerfile | 2 +- demo.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index ec8211b..4adbd4c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ FROM pytorch/pytorch:2.9.1-cuda12.8-cudnn9-devel ENV DEBIAN_FRONTEND=noninteractive ENV PYTHONUNBUFFERED=1 -ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +ENV PYTORCH_ALLOC_CONF=expandable_segments:True # System dependencies (Python/pip already present in base image) RUN apt-get update && apt-get install -y --no-install-recommends \ diff --git a/demo.py b/demo.py index a8c8636..104a7fb 100644 --- a/demo.py +++ b/demo.py @@ -127,7 +127,9 @@ def load_model(args, device): if args.model_path: print(f"Loading checkpoint: {args.model_path}") - ckpt = torch.load(args.model_path, map_location=device, weights_only=False) + # Always load to CPU first; moving a fp32 state dict directly to the + # target GPU risks OOM on cards with limited VRAM (e.g. 6 GB). 
+ ckpt = torch.load(args.model_path, map_location="cpu", weights_only=False) state_dict = ckpt.get("model", ckpt) missing, unexpected = model.load_state_dict(state_dict, strict=False) if missing: @@ -136,6 +138,16 @@ def load_model(args, device): print(f" Unexpected keys: {len(unexpected)}") print(" Checkpoint loaded.") + # Cast the aggregator (DINOv2 trunk) to bfloat16 on CPU before the GPU + # transfer. This halves the trunk's VRAM footprint (~2-3 GB saved) and + # avoids OOM on GPUs with ≤8 GB. Heads are kept in fp32 as the original + # code documents. The matching cast in main() becomes a no-op. + if device.type == "cuda" and getattr(model, "aggregator", None) is not None: + cap = torch.cuda.get_device_capability(device) + dtype = torch.bfloat16 if cap[0] >= 8 else torch.float16 + print(f"Pre-casting aggregator to {dtype} on CPU before GPU transfer") + model.aggregator = model.aggregator.to(dtype=dtype) + return model.to(device).eval() From 368ad15a4f155a3ec0d933a5021d5d7d22f62ebf Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 19 Apr 2026 00:41:35 +0000 Subject: [PATCH 06/24] Revert demo.py OOM workaround (reverts bf16 pre-cast) Restores original load_model behaviour: checkpoint loaded directly to the target device in fp32. The bfloat16 pre-cast degraded model accuracy and is not appropriate outside of memory-constrained environments. https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- demo.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/demo.py b/demo.py index 104a7fb..a8c8636 100644 --- a/demo.py +++ b/demo.py @@ -127,9 +127,7 @@ def load_model(args, device): if args.model_path: print(f"Loading checkpoint: {args.model_path}") - # Always load to CPU first; moving a fp32 state dict directly to the - # target GPU risks OOM on cards with limited VRAM (e.g. 6 GB). 
- ckpt = torch.load(args.model_path, map_location="cpu", weights_only=False) + ckpt = torch.load(args.model_path, map_location=device, weights_only=False) state_dict = ckpt.get("model", ckpt) missing, unexpected = model.load_state_dict(state_dict, strict=False) if missing: @@ -138,16 +136,6 @@ def load_model(args, device): print(f" Unexpected keys: {len(unexpected)}") print(" Checkpoint loaded.") - # Cast the aggregator (DINOv2 trunk) to bfloat16 on CPU before the GPU - # transfer. This halves the trunk's VRAM footprint (~2-3 GB saved) and - # avoids OOM on GPUs with ≤8 GB. Heads are kept in fp32 as the original - # code documents. The matching cast in main() becomes a no-op. - if device.type == "cuda" and getattr(model, "aggregator", None) is not None: - cap = torch.cuda.get_device_capability(device) - dtype = torch.bfloat16 if cap[0] >= 8 else torch.float16 - print(f"Pre-casting aggregator to {dtype} on CPU before GPU transfer") - model.aggregator = model.aggregator.to(dtype=dtype) - return model.to(device).eval() From 4600f3e756f22dd403078b217060cc7acece6d95 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 19 Apr 2026 03:21:32 +0000 Subject: [PATCH 07/24] Fix double GPU allocation in load_model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit map_location=device caused the state dict to be loaded onto GPU while the model itself was still on CPU. load_state_dict then performed a D2H copy, leaving the GPU state dict alive until the function returned — at which point model.to(device) also needed GPU space, doubling peak VRAM with no benefit. Changing to map_location="cpu" ensures a single H2D transfer via model.to(device). 
https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- demo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo.py b/demo.py index a8c8636..a1e7c09 100644 --- a/demo.py +++ b/demo.py @@ -127,7 +127,7 @@ def load_model(args, device): if args.model_path: print(f"Loading checkpoint: {args.model_path}") - ckpt = torch.load(args.model_path, map_location=device, weights_only=False) + ckpt = torch.load(args.model_path, map_location="cpu", weights_only=False) state_dict = ckpt.get("model", ckpt) missing, unexpected = model.load_state_dict(state_dict, strict=False) if missing: From 6ee9da589a8f2db436f84d0c45b1842e28f4ec3b Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 19 Apr 2026 23:35:12 +0000 Subject: [PATCH 08/24] Cast aggregator to bf16 on CPU before GPU transfer (low-VRAM mode) The DINOv2 aggregator trunk accounts for ~2-3 GB of the fp32 model. Casting it on CPU before model.to(device) avoids the temporary fp32+bf16 coexistence on GPU that would OOM on cards with <=6 GB VRAM. Per the original authors (demo.py:329-336): "no measurable quality change". Heads remain in fp32; the matching cast in main() becomes a no-op. To revert: git revert HEAD or switch back to claude/docker-image-stream-demo-Lgnfl. https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- demo.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/demo.py b/demo.py index a1e7c09..9439728 100644 --- a/demo.py +++ b/demo.py @@ -136,6 +136,18 @@ def load_model(args, device): print(f" Unexpected keys: {len(unexpected)}") print(" Checkpoint loaded.") + # Cast aggregator to bfloat16 on CPU before GPU transfer. + # The aggregator (DINOv2 trunk) is the bulk of the model (~2-3 GB in fp32). + # Casting on CPU avoids needing a temporary fp32+bf16 copy on GPU, + # which would OOM on cards with <=6 GB VRAM. + # Per the original authors: "no measurable quality change". + # To revert: remove this block and checkout the parent branch. 
+ if device.type == "cuda" and getattr(model, "aggregator", None) is not None: + cap = torch.cuda.get_device_capability(device) + dtype = torch.bfloat16 if cap[0] >= 8 else torch.float16 + print(f"Casting aggregator to {dtype} on CPU before GPU transfer (low-VRAM mode)") + model.aggregator = model.aggregator.to(dtype=dtype) + return model.to(device).eval() From ca1452a8881b2439525647764a807bba267e72e6 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 01:07:05 +0000 Subject: [PATCH 09/24] Switch low-VRAM bf16 cast to build-arg / env var Instead of hardcoding the aggregator bf16 cast, gate it behind LOW_VRAM_MODE. Both image variants now build from a single Dockerfile on a single branch: # Standard (full precision) docker build -t lingbot-map-demo . # Low VRAM (~2-3 GB savings, aggregator in bf16) docker build --build-arg LOW_VRAM_MODE=1 -t lingbot-map-demo-light . https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- Dockerfile | 7 +++++++ demo.py | 11 +++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4adbd4c..7635a93 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,13 @@ ENV DEBIAN_FRONTEND=noninteractive ENV PYTHONUNBUFFERED=1 ENV PYTORCH_ALLOC_CONF=expandable_segments:True +# LOW_VRAM_MODE=1: cast aggregator to bf16 on CPU before GPU transfer (~2-3 GB VRAM savings). +# Per original authors: "no measurable quality change" for the aggregator trunk, +# but the scale-phase RoPE computations are affected on very small GPUs. +# Build with: docker build --build-arg LOW_VRAM_MODE=1 -t lingbot-map-demo-light . 
+ARG LOW_VRAM_MODE=0 +ENV LOW_VRAM_MODE=${LOW_VRAM_MODE} + # System dependencies (Python/pip already present in base image) RUN apt-get update && apt-get install -y --no-install-recommends \ git \ diff --git a/demo.py b/demo.py index 9439728..d3b033f 100644 --- a/demo.py +++ b/demo.py @@ -136,16 +136,11 @@ def load_model(args, device): print(f" Unexpected keys: {len(unexpected)}") print(" Checkpoint loaded.") - # Cast aggregator to bfloat16 on CPU before GPU transfer. - # The aggregator (DINOv2 trunk) is the bulk of the model (~2-3 GB in fp32). - # Casting on CPU avoids needing a temporary fp32+bf16 copy on GPU, - # which would OOM on cards with <=6 GB VRAM. - # Per the original authors: "no measurable quality change". - # To revert: remove this block and checkout the parent branch. - if device.type == "cuda" and getattr(model, "aggregator", None) is not None: + if os.environ.get("LOW_VRAM_MODE", "0") == "1" and \ + device.type == "cuda" and getattr(model, "aggregator", None) is not None: cap = torch.cuda.get_device_capability(device) dtype = torch.bfloat16 if cap[0] >= 8 else torch.float16 - print(f"Casting aggregator to {dtype} on CPU before GPU transfer (low-VRAM mode)") + print(f"[LOW_VRAM_MODE] Casting aggregator to {dtype} on CPU before GPU transfer") model.aggregator = model.aggregator.to(dtype=dtype) return model.to(device).eval() From 83f1382e6fb725327e378241627e4a838fd31c21 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 21 Apr 2026 06:23:44 +0000 Subject: [PATCH 10/24] Fix vis_threshold slider: lower min from 1.0 to 0.0 Passing --conf_threshold <1.0 caused a viser error because the slider initial_value fell below the min constraint. Also, the 1.0 floor made it impossible to view low-confidence reconstructions (e.g. with --num_scale_frames 2) since all points were filtered out in the UI. 
https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- lingbot_map/vis/point_cloud_viewer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lingbot_map/vis/point_cloud_viewer.py b/lingbot_map/vis/point_cloud_viewer.py index a1d698c..7b1523a 100644 --- a/lingbot_map/vis/point_cloud_viewer.py +++ b/lingbot_map/vis/point_cloud_viewer.py @@ -415,8 +415,8 @@ def _(event: viser.GuiEvent) -> None: "Show Camera", initial_value=self.show_camera ) self.vis_threshold_slider = self.server.gui.add_slider( - "Visibility Threshold", min=1.0, max=5.0, step=0.01, - initial_value=self.vis_threshold, + "Visibility Threshold", min=0.0, max=5.0, step=0.01, + initial_value=max(self.vis_threshold, 0.0), ) self.camera_downsample_slider = self.server.gui.add_slider( "Camera Downsample Factor", min=1, max=50, step=1, initial_value=1 From 840aa6a73b5e3fcac4d99a39e6b5039adc13535c Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 21 Apr 2026 07:09:39 +0000 Subject: [PATCH 11/24] Add debug output to _process_pred_dict for visibility diagnosis Prints world_points finite ratio and confidence score statistics to help diagnose why nothing appears in the viser viewer. 
https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- lingbot_map/vis/point_cloud_viewer.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lingbot_map/vis/point_cloud_viewer.py b/lingbot_map/vis/point_cloud_viewer.py index 7b1523a..ece1b07 100644 --- a/lingbot_map/vis/point_cloud_viewer.py +++ b/lingbot_map/vis/point_cloud_viewer.py @@ -213,6 +213,17 @@ def _process_pred_dict( if depth_stride > 1: print(f' depth_stride={depth_stride}: projecting {S - skipped}/{S} frames, skipping {skipped}') + # Debug: report data validity + sample_pts = world_points.reshape(-1, 3) + finite_mask = np.isfinite(sample_pts).all(axis=1) + finite_ratio = finite_mask.mean() + if conf is not None: + conf_vals = conf[finite_mask.reshape(conf.shape)] if conf.shape == finite_mask.reshape(conf.shape).shape else conf.reshape(-1) + print(f"[vis debug] world_points finite={finite_ratio:.1%} " + f"conf min={conf.min():.3f} mean={conf.mean():.3f} max={conf.max():.3f}") + else: + print(f"[vis debug] world_points finite={finite_ratio:.1%} conf=None (all set to 1.0)") + # Create camera dictionary (all frames keep cameras) cam_to_world_mat = closed_form_inverse_se3(extrinsics_cam) cam_dict = { From 70db59404153e044e7b15a27506f738e735f814d Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 21 Apr 2026 10:07:30 +0000 Subject: [PATCH 12/24] Fix viser control panel not visible: change layout to fixed collapsible layout hides the control panel by default, requiring users to find and click a toggle button. Switch to fixed so sliders/buttons are always visible on the side. 
https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- lingbot_map/vis/point_cloud_viewer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingbot_map/vis/point_cloud_viewer.py b/lingbot_map/vis/point_cloud_viewer.py index ece1b07..7a2ea92 100644 --- a/lingbot_map/vis/point_cloud_viewer.py +++ b/lingbot_map/vis/point_cloud_viewer.py @@ -97,7 +97,7 @@ def __init__( self.size = size self.state_args = state_args self.server = viser.ViserServer(host="0.0.0.0", port=port) - self.server.gui.configure_theme(titlebar_content=None, control_layout="collapsible") + self.server.gui.configure_theme(titlebar_content=None, control_layout="fixed") self.device = device self.conf_list = conf_list self.vis_threshold = vis_threshold From 19f94b2132ab0bacb45576b999039ec043590344 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 21 Apr 2026 11:42:48 +0000 Subject: [PATCH 13/24] Add file export and docker-compose.lowvram.yml demo.py: - Add --output_dir (default /data/output) and --no_viewer flag - export_results() writes three files after every inference run: predictions.npz raw numpy arrays (world_points, depth, extrinsic, intrinsic, images) pointcloud.ply confidence-filtered merged point cloud (binary PLY) cameras.json per-frame c2w poses and intrinsic matrices - Export runs unconditionally; viewer is launched unless --no_viewer is set docker-compose.yml: - Add ./output:/data/output volume mount - Pass --output_dir /data/output in default command docker-compose.lowvram.yml: - New file for 8 GB VRAM machines (LOW_VRAM_MODE=1 build) - Pre-configured windowed inference defaults https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- demo.py | 110 +++++++++++++++++++++++++++++++++++++ docker-compose.lowvram.yml | 27 +++++++++ docker-compose.yml | 25 ++++----- 3 files changed, 148 insertions(+), 14 deletions(-) create mode 100644 docker-compose.lowvram.yml diff --git a/demo.py b/demo.py index d3b033f..6a2aa35 100644 --- a/demo.py +++ b/demo.py @@ -20,7 +20,9 @@ 
import argparse import glob +import json import os +import struct import time # Must be set before `import torch` / any CUDA init. Reduces the reserved-vs-allocated @@ -234,6 +236,100 @@ def prepare_for_visualization(predictions, images=None): return vis_predictions +# ============================================================================= +# Export +# ============================================================================= + +def export_results(predictions, images_cpu, output_dir, conf_threshold=0.0): + """Save inference results to output_dir. + + Writes three files: + predictions.npz – raw numpy arrays (depth, world_points, extrinsic, intrinsic, images) + pointcloud.ply – merged, confidence-filtered point cloud (binary PLY) + cameras.json – per-frame camera-to-world poses and intrinsics + """ + os.makedirs(output_dir, exist_ok=True) + + # ── NPZ ────────────────────────────────────────────────────────────────── + npz_path = os.path.join(output_dir, "predictions.npz") + save_dict = {} + for k, v in predictions.items(): + if isinstance(v, torch.Tensor): + save_dict[k] = v.cpu().numpy() + elif isinstance(v, np.ndarray): + save_dict[k] = v + if isinstance(images_cpu, torch.Tensor): + save_dict["images"] = images_cpu.numpy() + elif isinstance(images_cpu, np.ndarray): + save_dict["images"] = images_cpu + np.savez_compressed(npz_path, **save_dict) + print(f" Saved predictions → {npz_path}") + + # ── PLY ────────────────────────────────────────────────────────────────── + world_points = save_dict.get("world_points") # (S, H, W, 3) + depth = save_dict.get("depth") # (S, H, W, 1) fallback + depth_conf = save_dict.get("depth_conf") # (S, H, W) + images_np = save_dict.get("images") # (S, 3, H, W) + + if world_points is None and depth is not None: + from lingbot_map.utils.geometry import unproject_depth_map_to_point_map + world_points = unproject_depth_map_to_point_map( + depth, save_dict["extrinsic"], save_dict["intrinsic"] + ) + + if world_points is not None and 
images_np is not None: + S, H, W = world_points.shape[:3] + colors = images_np.transpose(0, 2, 3, 1) # (S, H, W, 3) + pts_all, col_all = [], [] + for i in range(S): + pts = world_points[i].reshape(-1, 3) + col = (colors[i].reshape(-1, 3) * 255).clip(0, 255).astype(np.uint8) + valid = np.isfinite(pts).all(axis=1) + if depth_conf is not None: + valid &= depth_conf[i].reshape(-1) > conf_threshold + pts_all.append(pts[valid]) + col_all.append(col[valid]) + pts_merged = np.concatenate(pts_all, axis=0).astype(np.float32) + col_merged = np.concatenate(col_all, axis=0) + n = len(pts_merged) + + ply_path = os.path.join(output_dir, "pointcloud.ply") + header = ( + "ply\nformat binary_little_endian 1.0\n" + f"element vertex {n}\n" + "property float x\nproperty float y\nproperty float z\n" + "property uchar red\nproperty uchar green\nproperty uchar blue\n" + "end_header\n" + ).encode() + with open(ply_path, "wb") as f: + f.write(header) + # interleave xyz + rgb tightly + data = np.empty(n, dtype=[("x","f4"),("y","f4"),("z","f4"), + ("r","u1"),("g","u1"),("b","u1")]) + data["x"], data["y"], data["z"] = pts_merged[:,0], pts_merged[:,1], pts_merged[:,2] + data["r"], data["g"], data["b"] = col_merged[:,0], col_merged[:,1], col_merged[:,2] + f.write(data.tobytes()) + print(f" Saved point cloud ({n:,} pts) → {ply_path}") + else: + print(" Skipping PLY export (world_points or images not available)") + + # ── cameras.json ───────────────────────────────────────────────────────── + extrinsic = save_dict.get("extrinsic") # (S, 3, 4) c2w + intrinsic = save_dict.get("intrinsic") # (S, 3, 3) + if extrinsic is not None and intrinsic is not None: + cameras = [] + for i in range(len(extrinsic)): + cameras.append({ + "frame": i, + "c2w": extrinsic[i].tolist(), + "K": intrinsic[i].tolist(), + }) + cam_path = os.path.join(output_dir, "cameras.json") + with open(cam_path, "w") as f: + json.dump(cameras, f, indent=2) + print(f" Saved cameras ({len(cameras)} frames) → {cam_path}") + + # 
============================================================================= # Main # ============================================================================= @@ -298,6 +394,12 @@ def main(): parser.add_argument("--export_preprocessed", type=str, default=None, help="Export stride-sampled, resized/cropped images to this folder") + # Output + parser.add_argument("--output_dir", type=str, default="/data/output", + help="Directory for exported results (predictions.npz, pointcloud.ply, cameras.json)") + parser.add_argument("--no_viewer", action="store_true", + help="Skip the interactive viewer (export only)") + args = parser.parse_args() assert args.image_folder or args.video_path, \ "Provide --image_folder or --video_path" @@ -405,6 +507,14 @@ def main(): predictions, images_cpu = postprocess(predictions, images_for_post) + # ── Export ─────────────────────────────────────────────────────────────── + print(f"Exporting results to {args.output_dir} ...") + export_results(predictions, images_cpu, args.output_dir, conf_threshold=args.conf_threshold) + + if args.no_viewer: + print("Viewer skipped (--no_viewer). Done.") + return + # ── Visualize ──────────────────────────────────────────────────────────── try: from lingbot_map.vis import PointCloudViewer diff --git a/docker-compose.lowvram.yml b/docker-compose.lowvram.yml new file mode 100644 index 0000000..f522320 --- /dev/null +++ b/docker-compose.lowvram.yml @@ -0,0 +1,27 @@ +services: + lingbot-map: + build: + context: . 
+ args: + LOW_VRAM_MODE: "1" # cast aggregator to bf16 before GPU transfer + image: lingbot-map-demo-light + runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - HF_MODEL_NAME=lingbot-map + - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-} + volumes: + - ./images:/data/images # input images + - ./model:/model # model cache (downloaded on first run) + - ./output:/data/output # exported results (PLY / NPZ / JSON) + ports: + - "8080:8080" + command: > + --image_folder /data/images + --output_dir /data/output + --mode windowed + --window_size 16 + --overlap_size 4 + --num_scale_frames 2 + --conf_threshold 0.0 + --point_size 0.001 diff --git a/docker-compose.yml b/docker-compose.yml index 14eca69..9ab7cb7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,24 +1,21 @@ services: lingbot-map: - build: . + build: + context: . + args: + LOW_VRAM_MODE: "0" image: lingbot-map-demo runtime: nvidia environment: - NVIDIA_VISIBLE_DEVICES=all - # Set to lingbot-map-long for long sequences, lingbot-map-stage1 for bidirectional - - HF_MODEL_NAME=lingbot-map - # Optional: set to a .pt path inside the container to skip auto-download - # - MODEL_PATH=/model/lingbot-map.pt - # Optional: HuggingFace token for private/gated repos + - HF_MODEL_NAME=lingbot-map # lingbot-map | lingbot-map-long | lingbot-map-stage1 - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-} volumes: - # Mount your images here - - ./images:/data/images - # Model cache (downloaded on first run, reused afterwards) - - ./model:/model + - ./images:/data/images # input images + - ./model:/model # model cache (downloaded on first run) + - ./output:/data/output # exported results (PLY / NPZ / JSON) ports: - "8080:8080" - # Default: run on the mounted images folder - command: --image_folder /data/images - # Uncomment to run on a video file (also mount the file above): - # command: --video_path /data/video.mp4 --fps 10 + command: > + --image_folder /data/images + --output_dir /data/output From 
81e0ef77679a1357cfe744b7c58fbacd64af10fa Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 21 Apr 2026 13:35:48 +0000 Subject: [PATCH 14/24] Remove temporary vis debug print from point_cloud_viewer https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- lingbot_map/vis/point_cloud_viewer.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/lingbot_map/vis/point_cloud_viewer.py b/lingbot_map/vis/point_cloud_viewer.py index 7a2ea92..87e9ab4 100644 --- a/lingbot_map/vis/point_cloud_viewer.py +++ b/lingbot_map/vis/point_cloud_viewer.py @@ -213,17 +213,6 @@ def _process_pred_dict( if depth_stride > 1: print(f' depth_stride={depth_stride}: projecting {S - skipped}/{S} frames, skipping {skipped}') - # Debug: report data validity - sample_pts = world_points.reshape(-1, 3) - finite_mask = np.isfinite(sample_pts).all(axis=1) - finite_ratio = finite_mask.mean() - if conf is not None: - conf_vals = conf[finite_mask.reshape(conf.shape)] if conf.shape == finite_mask.reshape(conf.shape).shape else conf.reshape(-1) - print(f"[vis debug] world_points finite={finite_ratio:.1%} " - f"conf min={conf.min():.3f} mean={conf.mean():.3f} max={conf.max():.3f}") - else: - print(f"[vis debug] world_points finite={finite_ratio:.1%} conf=None (all set to 1.0)") - # Create camera dictionary (all frames keep cameras) cam_to_world_mat = closed_form_inverse_se3(extrinsics_cam) cam_dict = { From 8fc554c83db8c46ce02ef165969a86a9546b3093 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 21 Apr 2026 14:02:32 +0000 Subject: [PATCH 15/24] Add IMAGE_HOST_PATH and IMAGE_FOLDER env vars to docker-compose IMAGE_HOST_PATH: changes the host-side volume mount source (default ./images) IMAGE_FOLDER: changes the container-side path passed to --image_folder (default /data/images) Examples: IMAGE_HOST_PATH=~/photos docker compose up # mount custom host dir IMAGE_FOLDER=/app/example/oxford docker compose up # use built-in sample https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- 
docker-compose.lowvram.yml | 8 ++++---- docker-compose.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docker-compose.lowvram.yml b/docker-compose.lowvram.yml index f522320..9cfdbb5 100644 --- a/docker-compose.lowvram.yml +++ b/docker-compose.lowvram.yml @@ -11,13 +11,13 @@ services: - HF_MODEL_NAME=lingbot-map - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-} volumes: - - ./images:/data/images # input images - - ./model:/model # model cache (downloaded on first run) - - ./output:/data/output # exported results (PLY / NPZ / JSON) + - ${IMAGE_HOST_PATH:-./images}:/data/images # override: IMAGE_HOST_PATH=~/photos docker compose up + - ./model:/model # model cache (downloaded on first run) + - ./output:/data/output # exported results (PLY / NPZ / JSON) ports: - "8080:8080" command: > - --image_folder /data/images + --image_folder ${IMAGE_FOLDER:-/data/images} --output_dir /data/output --mode windowed --window_size 16 diff --git a/docker-compose.yml b/docker-compose.yml index 9ab7cb7..e2fd482 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,11 +11,11 @@ services: - HF_MODEL_NAME=lingbot-map # lingbot-map | lingbot-map-long | lingbot-map-stage1 - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-} volumes: - - ./images:/data/images # input images - - ./model:/model # model cache (downloaded on first run) - - ./output:/data/output # exported results (PLY / NPZ / JSON) + - ${IMAGE_HOST_PATH:-./images}:/data/images # override: IMAGE_HOST_PATH=~/photos docker compose up + - ./model:/model # model cache (downloaded on first run) + - ./output:/data/output # exported results (PLY / NPZ / JSON) ports: - "8080:8080" command: > - --image_folder /data/images + --image_folder ${IMAGE_FOLDER:-/data/images} --output_dir /data/output From 3ab8374b0247bc270cdb74c22d61cbb462329c13 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 21 Apr 2026 14:25:22 +0000 Subject: [PATCH 16/24] Simplify image path: remove IMAGE_FOLDER, default 
IMAGE_HOST_PATH to ./example/oxford Single variable IMAGE_HOST_PATH controls the host-side mount. Default points to the bundled sample so bare `docker compose up` works out of the box. https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- docker-compose.lowvram.yml | 8 ++++---- docker-compose.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docker-compose.lowvram.yml b/docker-compose.lowvram.yml index 9cfdbb5..ea9f85b 100644 --- a/docker-compose.lowvram.yml +++ b/docker-compose.lowvram.yml @@ -11,13 +11,13 @@ services: - HF_MODEL_NAME=lingbot-map - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-} volumes: - - ${IMAGE_HOST_PATH:-./images}:/data/images # override: IMAGE_HOST_PATH=~/photos docker compose up - - ./model:/model # model cache (downloaded on first run) - - ./output:/data/output # exported results (PLY / NPZ / JSON) + - ${IMAGE_HOST_PATH:-./example/oxford}:/data/images # override: IMAGE_HOST_PATH=~/photos docker compose up + - ./model:/model # model cache (downloaded on first run) + - ./output:/data/output # exported results (PLY / NPZ / JSON) ports: - "8080:8080" command: > - --image_folder ${IMAGE_FOLDER:-/data/images} + --image_folder /data/images --output_dir /data/output --mode windowed --window_size 16 diff --git a/docker-compose.yml b/docker-compose.yml index e2fd482..0cac8ba 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,11 +11,11 @@ services: - HF_MODEL_NAME=lingbot-map # lingbot-map | lingbot-map-long | lingbot-map-stage1 - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-} volumes: - - ${IMAGE_HOST_PATH:-./images}:/data/images # override: IMAGE_HOST_PATH=~/photos docker compose up - - ./model:/model # model cache (downloaded on first run) - - ./output:/data/output # exported results (PLY / NPZ / JSON) + - ${IMAGE_HOST_PATH:-./example/oxford}:/data/images # override: IMAGE_HOST_PATH=~/photos docker compose up + - ./model:/model # model cache (downloaded on first run) + - ./output:/data/output 
# exported results (PLY / NPZ / JSON) ports: - "8080:8080" command: > - --image_folder ${IMAGE_FOLDER:-/data/images} + --image_folder /data/images --output_dir /data/output From 5802ff15ec9de017fff57d37120550cca74e08d1 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 21 Apr 2026 14:33:01 +0000 Subject: [PATCH 17/24] Fix export_results crash: squeeze batch dim from images tensor images_cpu has shape (1,S,C,H,W) but PLY export expected (S,C,H,W). Drop leading batch dim when present before saving. https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- demo.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/demo.py b/demo.py index 6a2aa35..4a5d818 100644 --- a/demo.py +++ b/demo.py @@ -259,9 +259,15 @@ def export_results(predictions, images_cpu, output_dir, conf_threshold=0.0): elif isinstance(v, np.ndarray): save_dict[k] = v if isinstance(images_cpu, torch.Tensor): - save_dict["images"] = images_cpu.numpy() + images_arr = images_cpu.numpy() elif isinstance(images_cpu, np.ndarray): - save_dict["images"] = images_cpu + images_arr = images_cpu + else: + images_arr = None + if images_arr is not None: + if images_arr.ndim == 5 and images_arr.shape[0] == 1: + images_arr = images_arr[0] # (1,S,C,H,W) → (S,C,H,W) + save_dict["images"] = images_arr np.savez_compressed(npz_path, **save_dict) print(f" Saved predictions → {npz_path}") From 50636089dae5cf37de8898259c922e2a60298447 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 00:32:24 +0000 Subject: [PATCH 18/24] Raise default conf_threshold to 2.0 in lowvram compose conf mean was 2.9 on oxford; threshold 0.0 produced ~1GB PLY. 2.0 filters low-confidence points while retaining the majority. 
https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- docker-compose.lowvram.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.lowvram.yml b/docker-compose.lowvram.yml index ea9f85b..082c25c 100644 --- a/docker-compose.lowvram.yml +++ b/docker-compose.lowvram.yml @@ -23,5 +23,5 @@ services: --window_size 16 --overlap_size 4 --num_scale_frames 2 - --conf_threshold 0.0 + --conf_threshold 2.0 --point_size 0.001 From c8ec8c8448d65c24c86ef3c17064a705dff7c404 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 07:05:00 +0000 Subject: [PATCH 19/24] Add tools/visualize_cameras.py: 3D camera trajectory viewer Loads cameras.json, draws frustums colored by frame order (cool colormap), trajectory line, start/end markers. Requires only numpy + matplotlib. https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- tools/visualize_cameras.py | 97 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 tools/visualize_cameras.py diff --git a/tools/visualize_cameras.py b/tools/visualize_cameras.py new file mode 100644 index 0000000..a790fa1 --- /dev/null +++ b/tools/visualize_cameras.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +"""Visualize camera trajectory from cameras.json produced by demo.py. 
+ +Usage: + python tools/visualize_cameras.py output/cameras.json + python tools/visualize_cameras.py output/cameras.json --save trajectory.png +""" +import argparse +import json +import numpy as np +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D # noqa: F401 + + +def draw_camera(ax, c2w, scale=0.05, color="steelblue"): + """Draw a small camera frustum: 3 axes + pyramid outline.""" + origin = c2w[:3, 3] + # Unit axes in camera space (right=x, up=-y, forward=z) + axes = c2w[:3, :3] @ np.array([[1, 0, 0], [0, -1, 0], [0, 0, 1]], dtype=float).T + colors = ["red", "green", color] + labels = ["X", "Y", "Z"] + for i, (col, lbl) in enumerate(zip(colors, labels)): + tip = origin + axes[:, i] * scale + ax.plot(*zip(origin, tip), color=col, linewidth=1) + + # Frustum corners (simplified: just 4 corner rays) + corners_cam = np.array([[1, 1, 2], [-1, 1, 2], [-1, -1, 2], [1, -1, 2]], dtype=float) * scale * 0.5 + corners_world = (c2w[:3, :3] @ corners_cam.T).T + origin + for corner in corners_world: + ax.plot(*zip(origin, corner), color=color, linewidth=0.5, alpha=0.5) + # Close the frustum rectangle + rect = np.vstack([corners_world, corners_world[0]]) + ax.plot(rect[:, 0], rect[:, 1], rect[:, 2], color=color, linewidth=0.5, alpha=0.5) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("cameras_json", help="Path to cameras.json") + parser.add_argument("--save", default=None, help="Save figure to file instead of showing") + parser.add_argument("--skip", type=int, default=1, help="Draw every N-th camera (default 1 = all)") + parser.add_argument("--frustum_scale", type=float, default=None, help="Frustum size (auto if unset)") + args = parser.parse_args() + + with open(args.cameras_json) as f: + cameras = json.load(f) + + positions = np.array([c["c2w"][i][3] for c in cameras for i in range(3)]).reshape(-1, 3) + positions = np.array([[c["c2w"][0][3], c["c2w"][1][3], c["c2w"][2][3]] for c in cameras]) + + span = np.max(positions, 
axis=0) - np.min(positions, axis=0) + scale = float(np.max(span)) * 0.04 if args.frustum_scale is None else args.frustum_scale + + fig = plt.figure(figsize=(10, 8)) + ax = fig.add_subplot(111, projection="3d") + + # Trajectory line + ax.plot(positions[:, 0], positions[:, 1], positions[:, 2], + color="gray", linewidth=1, alpha=0.6, label="trajectory") + + # Start / end markers + ax.scatter(*positions[0], color="lime", s=80, zorder=5, label="start") + ax.scatter(*positions[-1], color="red", s=80, zorder=5, label="end") + + # Camera frustums + for i, cam in enumerate(cameras): + if i % args.skip != 0: + continue + c2w = np.array(cam["c2w"]) # (3, 4) + c2w_4x4 = np.eye(4) + c2w_4x4[:3, :] = c2w + t = i / max(len(cameras) - 1, 1) + color = plt.cm.cool(t) + draw_camera(ax, c2w_4x4, scale=scale, color=color) + + ax.set_xlabel("X") + ax.set_ylabel("Y") + ax.set_zlabel("Z") + ax.set_title(f"Camera trajectory ({len(cameras)} frames)") + ax.legend() + + # Equal aspect ratio + center = positions.mean(axis=0) + half = float(np.max(span)) * 0.55 + ax.set_xlim(center[0] - half, center[0] + half) + ax.set_ylim(center[1] - half, center[1] + half) + ax.set_zlim(center[2] - half, center[2] + half) + + plt.tight_layout() + if args.save: + plt.savefig(args.save, dpi=150) + print(f"Saved → {args.save}") + else: + plt.show() + + +if __name__ == "__main__": + main() From cf71338aad5770364300407c7c3c536b53ccbeea Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 11:39:42 +0000 Subject: [PATCH 20/24] Add tools/analyze_predictions.py: point cloud diagnostic tool Reports: shape/dtype, NaN/Inf ratio, coordinate bounds, distance distribution, confidence percentiles, point counts per threshold, image pixel range. Plots: top-down/side/front 2D projections, confidence histogram, distance histogram, valid-points-per-frame bar chart. 
https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- tools/analyze_predictions.py | 214 +++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 tools/analyze_predictions.py diff --git a/tools/analyze_predictions.py b/tools/analyze_predictions.py new file mode 100644 index 0000000..cf3f9d4 --- /dev/null +++ b/tools/analyze_predictions.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +"""Diagnose predictions.npz from demo.py. + +Usage: + python tools/analyze_predictions.py output/predictions.npz + python tools/analyze_predictions.py output/predictions.npz --cameras output/cameras.json + python tools/analyze_predictions.py output/predictions.npz --save report.png +""" +import argparse +import json +import sys +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec + + +def stat(arr, name): + if arr is None: + print(f" {name}: missing") + return + finite = np.isfinite(arr) + n_inf = np.sum(~finite) + valid = arr[finite] + print(f" {name}: shape={arr.shape} dtype={arr.dtype}") + if valid.size: + print(f" range=[{valid.min():.4g}, {valid.max():.4g}] mean={valid.mean():.4g} " + f"nan/inf={n_inf} ({100*n_inf/arr.size:.1f}%)") + else: + print(f" ALL VALUES INVALID (nan/inf)") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("npz", help="Path to predictions.npz") + parser.add_argument("--cameras", default=None, help="Path to cameras.json (optional)") + parser.add_argument("--save", default=None, help="Save figure to file") + parser.add_argument("--conf_threshold", type=float, default=None, + help="Confidence threshold to simulate (default: show distribution)") + args = parser.parse_args() + + print(f"Loading {args.npz} ...") + d = np.load(args.npz, allow_pickle=False) + keys = list(d.keys()) + print(f"Keys: {keys}\n") + + world_points = d.get("world_points") # (S, H, W, 3) + depth_conf = d.get("depth_conf") if "depth_conf" in keys else ( + d.get("world_points_conf") if 
"world_points_conf" in keys else None) + images = d.get("images") # (S, 3, H, W) or (S, H, W, 3) + extrinsic = d.get("extrinsic") # (S, 3, 4) + + print("=== Array stats ===") + stat(world_points, "world_points") + stat(depth_conf, "depth_conf / world_points_conf") + stat(images, "images") + stat(extrinsic, "extrinsic") + print() + + if world_points is None: + print("No world_points found — cannot analyse point cloud.") + sys.exit(1) + + S, H, W = world_points.shape[:3] + pts_flat = world_points.reshape(-1, 3) + finite_mask = np.isfinite(pts_flat).all(axis=1) + + print("=== Point cloud sanity ===") + print(f" Total pixels : {len(pts_flat):,}") + print(f" Finite points: {finite_mask.sum():,} ({100*finite_mask.mean():.1f}%)") + + if depth_conf is not None: + conf_flat = depth_conf.reshape(-1) + print(f"\n Confidence stats (all):") + pcts = [0, 1, 5, 25, 50, 75, 90, 95, 99, 100] + vals = np.nanpercentile(conf_flat, pcts) + for p, v in zip(pcts, vals): + print(f" p{p:3d}: {v:.4f}") + + for thr in [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0]: + n_pass = int((conf_flat[finite_mask] > thr).sum()) + print(f" conf > {thr:.1f}: {n_pass:>8,} pts ({100*n_pass/max(finite_mask.sum(),1):.1f}% of finite)") + + pts_valid = pts_flat[finite_mask] + if len(pts_valid): + print(f"\n X range: [{pts_valid[:,0].min():.4f}, {pts_valid[:,0].max():.4f}]") + print(f" Y range: [{pts_valid[:,1].min():.4f}, {pts_valid[:,1].max():.4f}]") + print(f" Z range: [{pts_valid[:,2].min():.4f}, {pts_valid[:,2].max():.4f}]") + dist = np.linalg.norm(pts_valid, axis=1) + print(f" Distance from origin: min={dist.min():.4f} median={np.median(dist):.4f} " + f"p99={np.percentile(dist,99):.4f} max={dist.max():.4f}") + + if images is not None: + print(f"\n Image pixel range: [{images.min():.4f}, {images.max():.4f}]") + if images.max() <= 1.01: + print(" → pixels appear to be in [0,1] — color mapping should be fine") + elif images.max() <= 255.5: + print(" → pixels appear to be in [0,255] — need /255 before color mapping 
(possible color bug!)") + else: + print(" WARNING: pixel values outside expected range — color mapping will be wrong") + + # ── Figures ────────────────────────────────────────────────────────────── + cam_positions = None + if args.cameras: + with open(args.cameras) as f: + cams = json.load(f) + cam_positions = np.array([[c["c2w"][0][3], c["c2w"][1][3], c["c2w"][2][3]] for c in cams]) + + if extrinsic is not None and cam_positions is None: + cam_positions = extrinsic[:, :3, 3] # translation from c2w + + fig = plt.figure(figsize=(14, 10)) + gs = gridspec.GridSpec(2, 3, figure=fig) + + # Sample points for plotting (avoid OOM on huge arrays) + MAX_PLOT = 50_000 + if depth_conf is not None: + conf_thr = args.conf_threshold if args.conf_threshold is not None else 2.0 + sel = finite_mask & (depth_conf.reshape(-1) > conf_thr) + else: + sel = finite_mask + pts_sel = pts_flat[sel] + if len(pts_sel) > MAX_PLOT: + idx = np.random.choice(len(pts_sel), MAX_PLOT, replace=False) + pts_sel = pts_sel[idx] + print(f"\n Plotting {len(pts_sel):,} points (conf>{args.conf_threshold if args.conf_threshold else 2.0:.1f})") + + # Top-down view (X-Z) + ax1 = fig.add_subplot(gs[0, 0]) + if len(pts_sel): + ax1.scatter(pts_sel[:, 0], pts_sel[:, 2], s=0.5, alpha=0.3, c="steelblue") + if cam_positions is not None: + ax1.plot(cam_positions[:, 0], cam_positions[:, 2], "r-", lw=1, label="cameras") + ax1.scatter(cam_positions[0, 0], cam_positions[0, 2], c="lime", s=60, zorder=5) + ax1.scatter(cam_positions[-1, 0], cam_positions[-1, 2], c="red", s=60, zorder=5) + ax1.legend(fontsize=7) + ax1.set_xlabel("X"); ax1.set_ylabel("Z") + ax1.set_title("Top-down (X-Z)") + ax1.set_aspect("equal") + + # Side view (X-Y) + ax2 = fig.add_subplot(gs[0, 1]) + if len(pts_sel): + ax2.scatter(pts_sel[:, 0], pts_sel[:, 1], s=0.5, alpha=0.3, c="steelblue") + if cam_positions is not None: + ax2.plot(cam_positions[:, 0], cam_positions[:, 1], "r-", lw=1) + ax2.scatter(cam_positions[0, 0], cam_positions[0, 1], c="lime", 
s=60, zorder=5) + ax2.scatter(cam_positions[-1, 0], cam_positions[-1, 1], c="red", s=60, zorder=5) + ax2.set_xlabel("X"); ax2.set_ylabel("Y") + ax2.set_title("Side (X-Y)") + ax2.set_aspect("equal") + + # Front view (Y-Z) + ax3 = fig.add_subplot(gs[0, 2]) + if len(pts_sel): + ax3.scatter(pts_sel[:, 2], pts_sel[:, 1], s=0.5, alpha=0.3, c="steelblue") + if cam_positions is not None: + ax3.plot(cam_positions[:, 2], cam_positions[:, 1], "r-", lw=1) + ax3.set_xlabel("Z"); ax3.set_ylabel("Y") + ax3.set_title("Front (Z-Y)") + ax3.set_aspect("equal") + + # Confidence histogram + ax4 = fig.add_subplot(gs[1, 0]) + if depth_conf is not None: + cf = depth_conf.reshape(-1) + cf_finite = cf[np.isfinite(cf)] + ax4.hist(cf_finite, bins=100, color="steelblue", alpha=0.7) + for thr in [1.0, 2.0, 3.0]: + ax4.axvline(thr, color="red", lw=1, linestyle="--", label=f"thr={thr}") + ax4.set_xlabel("confidence"); ax4.set_ylabel("count") + ax4.set_title("Confidence distribution") + ax4.legend(fontsize=7) + else: + ax4.text(0.5, 0.5, "No confidence data", ha="center", va="center", transform=ax4.transAxes) + + # Distance histogram + ax5 = fig.add_subplot(gs[1, 1]) + if len(pts_valid): + dist = np.linalg.norm(pts_valid, axis=1) + p99 = np.percentile(dist, 99) + ax5.hist(dist[dist < p99 * 2], bins=100, color="darkorange", alpha=0.7) + ax5.axvline(p99, color="red", lw=1, linestyle="--", label=f"p99={p99:.2f}") + ax5.set_xlabel("distance from origin"); ax5.set_ylabel("count") + ax5.set_title("Point distance distribution") + ax5.legend(fontsize=7) + + # Point count per frame + ax6 = fig.add_subplot(gs[1, 2]) + if depth_conf is not None: + conf_thr = args.conf_threshold if args.conf_threshold is not None else 2.0 + counts = [] + for i in range(S): + pts_i = world_points[i].reshape(-1, 3) + fin = np.isfinite(pts_i).all(axis=1) + cnf = depth_conf[i].reshape(-1) > conf_thr + counts.append(int((fin & cnf).sum())) + ax6.bar(range(S), counts, color="mediumseagreen", alpha=0.8) + ax6.set_xlabel("frame"); 
ax6.set_ylabel("valid points") + ax6.set_title(f"Valid points per frame (conf>{conf_thr:.1f})") + else: + ax6.text(0.5, 0.5, "No confidence data", ha="center", va="center", transform=ax6.transAxes) + + plt.suptitle(f"LingBot-Map predictions analysis (S={S}, H={H}, W={W})", fontsize=12) + plt.tight_layout() + + if args.save: + plt.savefig(args.save, dpi=150) + print(f"\nSaved → {args.save}") + else: + plt.show() + + +if __name__ == "__main__": + main() From 7363f3f7d26162a3fb283ca1ebc8a3b22dc2fb00 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 23 Apr 2026 02:21:21 +0000 Subject: [PATCH 21/24] Add downsample_factor 4 to lowvram compose: reduce PLY from 32M to ~2M points Without downsampling, conf>2.0 still produces ~490MB/32M points which overwhelms MeshLab and WebGL. Factor 4 brings it to ~2M points (~30MB). Increase point_size to 0.005 to compensate for sparser sampling. https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- docker-compose.lowvram.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker-compose.lowvram.yml b/docker-compose.lowvram.yml index 082c25c..e6f843c 100644 --- a/docker-compose.lowvram.yml +++ b/docker-compose.lowvram.yml @@ -24,4 +24,5 @@ services: --overlap_size 4 --num_scale_frames 2 --conf_threshold 2.0 - --point_size 0.001 + --downsample_factor 4 + --point_size 0.005 From 08adf8234a12b26cb569b15c5da7fea427f39bcb Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 23 Apr 2026 02:27:19 +0000 Subject: [PATCH 22/24] Improve quality and fix PLY size: ply_stride, num_scale_frames 4, overlap 8, mask_sky MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - export_results: apply downsample_factor as spatial stride on PLY export (was only affecting viser viewer; now 32M pts → ~2M pts with factor=4) - num_scale_frames 2→4: better global scale estimation - overlap_size 4→8: smoother window-to-window stitching - mask_sky: remove sky points (outdoor/driving sequences) 
https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- demo.py | 12 +++++++----- docker-compose.lowvram.yml | 5 +++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/demo.py b/demo.py index 4a5d818..faa1dc1 100644 --- a/demo.py +++ b/demo.py @@ -240,7 +240,7 @@ def prepare_for_visualization(predictions, images=None): # Export # ============================================================================= -def export_results(predictions, images_cpu, output_dir, conf_threshold=0.0): +def export_results(predictions, images_cpu, output_dir, conf_threshold=0.0, ply_stride=1): """Save inference results to output_dir. Writes three files: @@ -287,12 +287,13 @@ def export_results(predictions, images_cpu, output_dir, conf_threshold=0.0): S, H, W = world_points.shape[:3] colors = images_np.transpose(0, 2, 3, 1) # (S, H, W, 3) pts_all, col_all = [], [] + st = max(1, int(ply_stride)) for i in range(S): - pts = world_points[i].reshape(-1, 3) - col = (colors[i].reshape(-1, 3) * 255).clip(0, 255).astype(np.uint8) + pts = world_points[i, ::st, ::st].reshape(-1, 3) + col = (colors[i, ::st, ::st].reshape(-1, 3) * 255).clip(0, 255).astype(np.uint8) valid = np.isfinite(pts).all(axis=1) if depth_conf is not None: - valid &= depth_conf[i].reshape(-1) > conf_threshold + valid &= depth_conf[i, ::st, ::st].reshape(-1) > conf_threshold pts_all.append(pts[valid]) col_all.append(col[valid]) pts_merged = np.concatenate(pts_all, axis=0).astype(np.float32) @@ -515,7 +516,8 @@ def main(): # ── Export ─────────────────────────────────────────────────────────────── print(f"Exporting results to {args.output_dir} ...") - export_results(predictions, images_cpu, args.output_dir, conf_threshold=args.conf_threshold) + export_results(predictions, images_cpu, args.output_dir, + conf_threshold=args.conf_threshold, ply_stride=args.downsample_factor) if args.no_viewer: print("Viewer skipped (--no_viewer). 
Done.") diff --git a/docker-compose.lowvram.yml b/docker-compose.lowvram.yml index e6f843c..634091b 100644 --- a/docker-compose.lowvram.yml +++ b/docker-compose.lowvram.yml @@ -21,8 +21,9 @@ services: --output_dir /data/output --mode windowed --window_size 16 - --overlap_size 4 - --num_scale_frames 2 + --overlap_size 8 + --num_scale_frames 4 --conf_threshold 2.0 --downsample_factor 4 + --mask_sky --point_size 0.005 From a776640680cfa6c1fee596e43e3e6664e9e6c5d8 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 23 Apr 2026 08:54:03 +0000 Subject: [PATCH 23/24] Add tools/debug_reconstruction.py: per-frame geometry debugger Checks if world_points are in front of / behind each camera by transforming to camera space. Reports front% per frame and flags coordinate convention bugs. Also shows depth histogram, reprojection overlay, and camera forward vector. https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- tools/debug_reconstruction.py | 240 ++++++++++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 tools/debug_reconstruction.py diff --git a/tools/debug_reconstruction.py b/tools/debug_reconstruction.py new file mode 100644 index 0000000..d3cdbc6 --- /dev/null +++ b/tools/debug_reconstruction.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +"""Debug reconstruction quality from predictions.npz. + +Checks: + 1. Are world_points in front of or behind each camera? + 2. Do reprojected points align with the original image? + 3. Depth map plausibility per frame. 
+ +Usage: + python tools/debug_reconstruction.py output/predictions.npz + python tools/debug_reconstruction.py output/predictions.npz --frames 0 10 50 100 + python tools/debug_reconstruction.py output/predictions.npz --save debug/ +""" +import argparse +import os +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec + + +def c2w_to_w2c(c2w_3x4): + """Invert c2w (3x4) → w2c (3x4).""" + R = c2w_3x4[:3, :3] + t = c2w_3x4[:3, 3] + R_inv = R.T + t_inv = -R_inv @ t + w2c = np.eye(4) + w2c[:3, :3] = R_inv + w2c[:3, 3] = t_inv + return w2c[:3, :] # (3, 4) + + +def project_to_camera(world_pts, w2c_3x4, K_3x3): + """world_pts (N,3) → pixel coords (N,2) and depth (N,).""" + R, t = w2c_3x4[:3, :3], w2c_3x4[:3, 3] + cam_pts = (R @ world_pts.T).T + t # (N, 3) in camera space + depth_cam = cam_pts[:, 2] + fx, fy = K_3x3[0, 0], K_3x3[1, 1] + cx, cy = K_3x3[0, 2], K_3x3[1, 2] + u = fx * cam_pts[:, 0] / (cam_pts[:, 2] + 1e-8) + cx + v = fy * cam_pts[:, 1] / (cam_pts[:, 2] + 1e-8) + cy + return np.stack([u, v], axis=1), depth_cam + + +def analyze_frame(frame_idx, world_points, extrinsic, intrinsic, images, + depth_conf, conf_threshold=2.0): + """Return dict of diagnostics for one frame.""" + pts = world_points[frame_idx].reshape(-1, 3) # (H*W, 3) + c2w = np.eye(4); c2w[:3, :] = extrinsic[frame_idx] # (4, 4) + w2c = c2w_to_w2c(extrinsic[frame_idx]) + K = intrinsic[frame_idx] + H, W = world_points.shape[1:3] + + # Camera position and forward direction + cam_pos = extrinsic[frame_idx][:3, 3] + cam_forward = extrinsic[frame_idx][:3, 2] # 3rd column of R = Z axis + + # Transform points to camera space + _, depth_cam = project_to_camera(pts, w2c, K) + + finite = np.isfinite(pts).all(axis=1) + if depth_conf is not None: + conf_mask = depth_conf[frame_idx].reshape(-1) > conf_threshold + else: + conf_mask = np.ones(len(pts), dtype=bool) + valid = finite & conf_mask + + n_valid = valid.sum() + n_front = (depth_cam[valid] > 0).sum() + n_behind = 
(depth_cam[valid] <= 0).sum() + pct_front = 100 * n_front / max(n_valid, 1) + pct_behind = 100 * n_behind / max(n_valid, 1) + depth_median = float(np.median(depth_cam[valid])) if n_valid else float("nan") + depth_p5 = float(np.percentile(depth_cam[valid], 5)) if n_valid else float("nan") + depth_p95 = float(np.percentile(depth_cam[valid], 95)) if n_valid else float("nan") + + # Image for display (C,H,W) → (H,W,C), clip to [0,1] + img_display = None + if images is not None: + img = images[frame_idx] + if img.shape[0] == 3: # (3,H,W) → (H,W,3) + img = img.transpose(1, 2, 0) + img_display = np.clip(img, 0, 1) + + # Reprojection: project valid pts back and compare to pixel grid + uv, _ = project_to_camera(pts[valid], w2c, K) + + return dict( + frame=frame_idx, + cam_pos=cam_pos, + cam_forward=cam_forward, + n_valid=n_valid, + pct_front=pct_front, + pct_behind=pct_behind, + depth_median=depth_median, + depth_p5=depth_p5, + depth_p95=depth_p95, + depth_cam_valid=depth_cam[valid], + world_pts_valid=pts[valid], + uv_reprojected=uv, + img_display=img_display, + H=H, W=W, + ) + + +def plot_frame(ax_row, diag): + """Fill one row of subplots for a single frame.""" + ax_img, ax_depth, ax_reproj, ax_text = ax_row + f = diag["frame"] + + # ── image ────────────────────────────────────────────────────── + if diag["img_display"] is not None: + ax_img.imshow(diag["img_display"]) + ax_img.set_title(f"frame {f}: input image", fontsize=8) + ax_img.axis("off") + + # ── depth histogram ──────────────────────────────────────────── + dc = diag["depth_cam_valid"] + if len(dc): + p1, p99 = np.percentile(dc, 1), np.percentile(dc, 99) + ax_depth.hist(np.clip(dc, p1 * 1.5, p99 * 1.5), bins=80, + color="steelblue" if diag["pct_front"] > 90 else "tomato", + alpha=0.8) + ax_depth.axvline(0, color="red", lw=1.5, label="camera plane") + ax_depth.set_xlabel("depth in camera space", fontsize=7) + ax_depth.set_title( + f"front {diag['pct_front']:.0f}% behind {diag['pct_behind']:.0f}%\n" + 
f"median={diag['depth_median']:.2f} p5={diag['depth_p5']:.2f} p95={diag['depth_p95']:.2f}", + fontsize=7) + ax_depth.legend(fontsize=6) + ax_depth.tick_params(labelsize=6) + + # ── reprojection scatter ─────────────────────────────────────── + uv = diag["uv_reprojected"] + H, W = diag["H"], diag["W"] + if diag["img_display"] is not None: + ax_reproj.imshow(diag["img_display"], alpha=0.5) + in_frame = ((uv[:, 0] >= 0) & (uv[:, 0] < W) & + (uv[:, 1] >= 0) & (uv[:, 1] < H)) + MAX_PTS = 2000 + if in_frame.sum(): + idx = np.random.choice(in_frame.sum(), + min(MAX_PTS, in_frame.sum()), replace=False) + ax_reproj.scatter(uv[in_frame][idx, 0], uv[in_frame][idx, 1], + s=0.3, alpha=0.4, c="lime") + pct_in = 100 * in_frame.mean() + ax_reproj.set_xlim(0, W); ax_reproj.set_ylim(H, 0) + ax_reproj.set_title(f"reprojection {pct_in:.0f}% in frame", fontsize=8) + ax_reproj.axis("off") + + # ── text summary ─────────────────────────────────────────────── + pos = diag["cam_pos"] + fwd = diag["cam_forward"] + txt = (f"frame {f}\n" + f"pos [{pos[0]:.2f}, {pos[1]:.2f}, {pos[2]:.2f}]\n" + f"fwd [{fwd[0]:.2f}, {fwd[1]:.2f}, {fwd[2]:.2f}]\n" + f"valid pts: {diag['n_valid']:,}\n" + f"front: {diag['pct_front']:.1f}%\n" + f"behind: {diag['pct_behind']:.1f}%") + color = "limegreen" if diag["pct_front"] > 90 else \ + "orange" if diag["pct_front"] > 50 else "red" + ax_text.text(0.05, 0.95, txt, transform=ax_text.transAxes, + fontsize=8, va="top", fontfamily="monospace", + bbox=dict(boxstyle="round", facecolor=color, alpha=0.3)) + ax_text.axis("off") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("npz", help="Path to predictions.npz") + parser.add_argument("--frames", nargs="+", type=int, default=None, + help="Frame indices to inspect (default: 0, S//4, S//2, 3*S//4)") + parser.add_argument("--conf_threshold", type=float, default=2.0) + parser.add_argument("--save", default=None, + help="Directory to save per-frame debug figures") + args = parser.parse_args() + + 
print(f"Loading {args.npz} ...") + d = np.load(args.npz, allow_pickle=False) + + world_points = d["world_points"] # (S, H, W, 3) + extrinsic = d["extrinsic"] # (S, 3, 4) c2w + intrinsic = d["intrinsic"] # (S, 3, 3) + images = d.get("images") # (S, 3, H, W) or None + depth_conf = (d.get("depth_conf") if "depth_conf" in d + else d.get("world_points_conf") if "world_points_conf" in d + else None) + + S = world_points.shape[0] + frames = args.frames or [0, S // 4, S // 2, 3 * S // 4, S - 1] + frames = [min(f, S - 1) for f in frames] + print(f"Analysing frames: {frames}\n") + + # ── Per-frame text summary ───────────────────────────────────── + print(f"{'frame':>6} {'front%':>7} {'behind%':>8} {'depth_median':>12} {'cam_pos':>30}") + diags = [] + for fi in frames: + diag = analyze_frame(fi, world_points, extrinsic, intrinsic, + images, depth_conf, args.conf_threshold) + diags.append(diag) + print(f"{fi:6d} {diag['pct_front']:7.1f} {diag['pct_behind']:8.1f} " + f"{diag['depth_median']:12.3f} " + f"[{diag['cam_pos'][0]:.2f}, {diag['cam_pos'][1]:.2f}, {diag['cam_pos'][2]:.2f}]") + + # ── Diagnosis ───────────────────────────────────────────────── + print() + avg_front = np.mean([d["pct_front"] for d in diags]) + if avg_front > 90: + print("✓ Points are mostly in front of cameras — geometry looks correct.") + print(" Blank viewer / bad PLY is likely a density/scale issue, not a logic bug.") + elif avg_front > 50: + print("△ Partial front/behind mix — possible coordinate convention mismatch.") + print(" Check if extrinsic is truly c2w (camera-to-world).") + else: + print("✗ Most points are BEHIND cameras — likely a coordinate convention bug.") + print(" The c2w→w2c inversion or world_point projection may be flipped.") + + # ── Figures ─────────────────────────────────────────────────── + n = len(diags) + fig, axes = plt.subplots(n, 4, figsize=(16, 4 * n), + gridspec_kw={"width_ratios": [2, 2, 2, 1]}) + if n == 1: + axes = [axes] + for row, diag in zip(axes, diags): + 
plot_frame(row, diag) + + plt.suptitle("Reconstruction debug: per-frame geometry check", fontsize=11) + plt.tight_layout() + + if args.save: + os.makedirs(args.save, exist_ok=True) + path = os.path.join(args.save, "debug_frames.png") + plt.savefig(path, dpi=120) + print(f"\nSaved → {path}") + else: + plt.show() + + +if __name__ == "__main__": + main() From fe6e4816ede97ee636d1aed2ccf8d5477015da0f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 23 Apr 2026 10:06:50 +0000 Subject: [PATCH 24/24] Extend debug_reconstruction: chunk_scales check, full-sequence front% scan, cam_fwd output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Print chunk_scales per window — flags clamped (1e-3/1e3) alignment failures - Scan every Nth frame for front% to reveal which windows are flipped - Print camera forward vector alongside cam_pos in per-frame table https://claude.ai/code/session_012nvgo5ETaSxQ7AjxLpPMfx --- tools/debug_reconstruction.py | 66 +++++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/tools/debug_reconstruction.py b/tools/debug_reconstruction.py index d3cdbc6..24388b1 100644 --- a/tools/debug_reconstruction.py +++ b/tools/debug_reconstruction.py @@ -169,10 +169,10 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("npz", help="Path to predictions.npz") parser.add_argument("--frames", nargs="+", type=int, default=None, - help="Frame indices to inspect (default: 0, S//4, S//2, 3*S//4)") + help="Frame indices to inspect (default: evenly spaced 5 frames)") parser.add_argument("--conf_threshold", type=float, default=2.0) parser.add_argument("--save", default=None, - help="Directory to save per-frame debug figures") + help="Directory to save debug figures") args = parser.parse_args() print(f"Loading {args.npz} ...") @@ -185,35 +185,81 @@ def main(): depth_conf = (d.get("depth_conf") if "depth_conf" in d else d.get("world_points_conf") if "world_points_conf" in d 
                   else None)
+    chunk_scales = d["chunk_scales"] if "chunk_scales" in d else None  # (num_windows,)
 
     S = world_points.shape[0]
+
+    # ── Window alignment scales ────────────────────────────────────
+    if chunk_scales is not None:
+        cs = chunk_scales.reshape(-1)
+        print(f"\n=== Window alignment scales (chunk_scales) ===")
+        print(f"  num windows : {len(cs)}")
+        print(f"  values      : {np.array2string(cs, precision=4, separator=', ')}")
+        bad = (cs < 0.01) | (cs > 100)
+        if bad.any():
+            print(f"  WARNING: windows {np.where(bad)[0].tolist()} have extreme scales "
+                  f"(outside [0.01, 100]; clamp limits 1e-3/1e3) — alignment likely failed for these windows")
+        else:
+            print(f"  All scales look reasonable (range [{cs.min():.4f}, {cs.max():.4f}])")
+
+    # ── Full-sequence front% scan ─────────────────────────────────
+    stride = max(1, S // 50)
+    print(f"\n=== Front% scan (every {stride} frame(s), ~50 samples) ===")
+    scan_frames = list(range(0, S, stride))
+    front_pcts = []
+    for fi in scan_frames:
+        pts = world_points[fi].reshape(-1, 3)
+        c2w = extrinsic[fi]
+        w2c = c2w_to_w2c(c2w)
+        _, dc = project_to_camera(pts, w2c, intrinsic[fi])
+        finite = np.isfinite(pts).all(axis=1)
+        conf_ok = depth_conf[fi].reshape(-1) > args.conf_threshold if depth_conf is not None else finite
+        valid = finite & conf_ok
+        pct = 100 * (dc[valid] > 0).sum() / max(valid.sum(), 1)
+        front_pcts.append(float(pct))
+
+    front_arr = np.array(front_pcts)
+    bad_frames = [scan_frames[i] for i, p in enumerate(front_pcts) if p < 50]
+    good_frames = [scan_frames[i] for i, p in enumerate(front_pcts) if p >= 90]
+    print(f"  Frames with <50% front (flipped): {bad_frames}")
+    print(f"  Frames with >=90% front (correct): count={len(good_frames)}/{len(scan_frames)}")
+
     frames = args.frames or [0, S // 4, S // 2, 3 * S // 4, S - 1]
     frames = [min(f, S - 1) for f in frames]
-    print(f"Analysing frames: {frames}\n")
+    print(f"\nDetail frames: {frames}")
 
     # ── Per-frame text summary ─────────────────────────────────────
-    print(f"{'frame':>6} {'front%':>7} {'behind%':>8} {'depth_median':>12} {'cam_pos':>30}")
+    print(f"\n{'frame':>6} {'front%':>7} {'behind%':>8} {'depth_median':>12} "
+          f"{'cam_pos':>30} {'cam_fwd':>30}")
     diags = []
     for fi in frames:
         diag = analyze_frame(fi, world_points, extrinsic, intrinsic,
                              images, depth_conf, args.conf_threshold)
         diags.append(diag)
+        fwd = diag["cam_forward"]
         print(f"{fi:6d} {diag['pct_front']:7.1f} {diag['pct_behind']:8.1f} "
               f"{diag['depth_median']:12.3f} "
-              f"[{diag['cam_pos'][0]:.2f}, {diag['cam_pos'][1]:.2f}, {diag['cam_pos'][2]:.2f}]")
+              f"[{diag['cam_pos'][0]:5.2f},{diag['cam_pos'][1]:5.2f},{diag['cam_pos'][2]:5.2f}] "
+              f"[{fwd[0]:5.2f},{fwd[1]:5.2f},{fwd[2]:5.2f}]")
 
     # ── Diagnosis ─────────────────────────────────────────────────
     print()
     avg_front = np.mean([d["pct_front"] for d in diags])
+    pct_bad_windows = 100 * len(bad_frames) / max(len(scan_frames), 1)
     if avg_front > 90:
         print("✓ Points are mostly in front of cameras — geometry looks correct.")
         print("  Blank viewer / bad PLY is likely a density/scale issue, not a logic bug.")
-    elif avg_front > 50:
-        print("△ Partial front/behind mix — possible coordinate convention mismatch.")
-        print("  Check if extrinsic is truly c2w (camera-to-world).")
+    elif pct_bad_windows > 10 and chunk_scales is not None and ((chunk_scales.reshape(-1) < 0.01).any()):
+        print("✗ Window scale clamped to minimum — depth-ratio alignment failed.")
+        print("  Likely cause: near-zero or negative depth in overlap frames.")
+        print("  Fix: increase --overlap_size or --num_scale_frames.")
+    elif pct_bad_windows > 10:
+        print("✗ Many frames have points behind cameras.")
+        print("  Pattern: check if bad frames cluster at window boundaries.")
+        print("  If clustered → windowed stitching issue (overlap too small).")
+        print("  If scattered → model output inconsistency (try --mode streaming).")
     else:
-        print("✗ Most points are BEHIND cameras — likely a coordinate convention bug.")
-        print("    The c2w→w2c inversion or world_point projection may be flipped.")
+        print("△ Partial front/behind mix — possible coordinate convention mismatch.")
 
     # ── Figures ───────────────────────────────────────────────────
     n = len(diags)