Changes from all commits (24 commits)
- `4ea6db3` Add Docker image for streaming 3D reconstruction demo (claude, Apr 18, 2026)
- `e775d87` Fix base image: switch to pytorch/pytorch to avoid NVIDIA registry is… (claude, Apr 18, 2026)
- `63f0f5b` Rewrite Docker README section with design overview and demo examples (claude, Apr 18, 2026)
- `56f6b22` Fix cv2 ImportError: add libgl1 to apt packages (claude, Apr 19, 2026)
- `288830f` Fix OOM on small GPUs and deprecated env var (claude, Apr 19, 2026)
- `368ad15` Revert demo.py OOM workaround (reverts bf16 pre-cast) (claude, Apr 19, 2026)
- `4600f3e` Fix double GPU allocation in load_model (claude, Apr 19, 2026)
- `6ee9da5` Cast aggregator to bf16 on CPU before GPU transfer (low-VRAM mode) (claude, Apr 19, 2026)
- `ca1452a` Switch low-VRAM bf16 cast to build-arg / env var (claude, Apr 20, 2026)
- `83f1382` Fix vis_threshold slider: lower min from 1.0 to 0.0 (claude, Apr 21, 2026)
- `840aa6a` Add debug output to _process_pred_dict for visibility diagnosis (claude, Apr 21, 2026)
- `70db594` Fix viser control panel not visible: change layout to fixed (claude, Apr 21, 2026)
- `19f94b2` Add file export and docker-compose.lowvram.yml (claude, Apr 21, 2026)
- `81e0ef7` Remove temporary vis debug print from point_cloud_viewer (claude, Apr 21, 2026)
- `8fc554c` Add IMAGE_HOST_PATH and IMAGE_FOLDER env vars to docker-compose (claude, Apr 21, 2026)
- `3ab8374` Simplify image path: remove IMAGE_FOLDER, default IMAGE_HOST_PATH to … (claude, Apr 21, 2026)
- `5802ff1` Fix export_results crash: squeeze batch dim from images tensor (claude, Apr 21, 2026)
- `5063608` Raise default conf_threshold to 2.0 in lowvram compose (claude, Apr 22, 2026)
- `c8ec8c8` Add tools/visualize_cameras.py: 3D camera trajectory viewer (claude, Apr 22, 2026)
- `cf71338` Add tools/analyze_predictions.py: point cloud diagnostic tool (claude, Apr 22, 2026)
- `7363f3f` Add downsample_factor 4 to lowvram compose: reduce PLY from 32M to ~2… (claude, Apr 23, 2026)
- `08adf82` Improve quality and fix PLY size: ply_stride, num_scale_frames 4, ove… (claude, Apr 23, 2026)
- `a776640` Add tools/debug_reconstruction.py: per-frame geometry debugger (claude, Apr 23, 2026)
- `fe6e481` Extend debug_reconstruction: chunk_scales check, full-sequence front%… (claude, Apr 23, 2026)
47 changes: 47 additions & 0 deletions Dockerfile
@@ -0,0 +1,47 @@
# pytorch/pytorch images ship with Python, pip, and PyTorch pre-installed,
# so no NVIDIA registry auth or manual CUDA installation is needed.
FROM pytorch/pytorch:2.9.1-cuda12.8-cudnn9-devel

ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV PYTORCH_ALLOC_CONF=expandable_segments:True

# LOW_VRAM_MODE=1: cast aggregator to bf16 on CPU before GPU transfer (~2-3 GB VRAM savings).
# Per original authors: "no measurable quality change" for the aggregator trunk,
# but the scale-phase RoPE computations are affected on very small GPUs.
# Build with: docker build --build-arg LOW_VRAM_MODE=1 -t lingbot-map-demo-light .
ARG LOW_VRAM_MODE=0
ENV LOW_VRAM_MODE=${LOW_VRAM_MODE}

# System dependencies (Python/pip already present in base image)
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
wget \
curl \
libglib2.0-0 \
libgl1 \
libsm6 \
libxext6 \
libxrender-dev \
libgomp1 \
&& rm -rf /var/lib/apt/lists/*

# Copy source and install lingbot-map with visualization extras
WORKDIR /app
COPY . .
RUN pip install --no-cache-dir -e ".[vis]" && \
pip install --no-cache-dir onnxruntime

# FlashInfer for efficient KV-cache attention (falls back to SDPA if unavailable)
RUN pip install --no-cache-dir flashinfer-python \
-i https://flashinfer.ai/whl/cu128/torch2.9/ || \
echo "WARNING: FlashInfer not installed — demo will use --use_sdpa fallback"

RUN mkdir -p /model /data/images

COPY docker/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

EXPOSE 8080

ENTRYPOINT ["/entrypoint.sh"]
145 changes: 145 additions & 0 deletions README.md
@@ -208,6 +208,151 @@ python demo.py --model_path /path/to/checkpoint.pt \

`--camera_num_iterations` defaults to `4`; setting it to `1` skips three refinement passes in the camera head (and shrinks its KV cache by 4×).

# 🐳 Docker

Run the full demo — including model download, inference, and 3D viewer — without any local Python or CUDA setup.

## Image Design

| Layer | Detail |
|:---|:---|
| Base image | `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-devel` (public, no auth required) |
| Attention backend | [FlashInfer](https://github.com/flashinfer-ai/flashinfer) for paged KV-cache; auto-falls back to PyTorch SDPA if unavailable |
| Visualisation | [viser](https://github.com/nerfstudio-project/viser) web viewer exposed on port **8080** |
| Model resolution | `docker/entrypoint.sh` checks `/model/` at startup and auto-downloads from HuggingFace when no `.pt` file is found |
| Data access | Images and model weights are provided via **volume mounts** — nothing user-specific is baked into the image |

```
lingbot-map-demo
├── /app/ ← source code + built-in example scenes
│ └── example/{church,oxford,university,loop}/
├── /model/ ← mount a host directory here to cache the model
└── /data/ ← mount your images or video here
```
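
The model-resolution behavior described above can be sketched as a small shell helper (a hypothetical reconstruction for illustration; `resolve_model` and its details are illustrative, not the actual contents of `docker/entrypoint.sh`):

```shell
#!/bin/sh
# Hypothetical sketch of the entrypoint's model lookup.
# Assumed behavior: prefer an explicit MODEL_PATH, otherwise use the first
# .pt file found in the model cache directory; fail if neither exists.
resolve_model() {
    dir="$1"
    if [ -n "$MODEL_PATH" ]; then
        echo "$MODEL_PATH"
        return 0
    fi
    found=$(find "$dir" -maxdepth 1 -name '*.pt' 2>/dev/null | head -n 1)
    if [ -n "$found" ]; then
        echo "$found"
        return 0
    fi
    return 1
}
```

When no checkpoint is found, the real entrypoint downloads `${HF_MODEL_NAME}` from HuggingFace into `${MODEL_CACHE_DIR}` before launching the demo, which is why only the first run is slow.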

## Prerequisites

- [Docker](https://docs.docker.com/get-docker/) with the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)
- An NVIDIA GPU with a driver that supports CUDA 12.8

## Build

```bash
git clone https://github.com/YoshiRi/lingbot-map-docker.git
cd lingbot-map-docker
docker build -t lingbot-map-demo .
```

## Try the Built-in Example Scenes

The four example scenes from `example/` are already baked into the image at `/app/example/`.
No extra data mount is needed — just provide a writable directory for the model cache.

```bash
# Church (outdoor, sky masking recommended)
docker run --gpus all \
-v $(pwd)/model:/model \
-p 8080:8080 \
lingbot-map-demo \
--image_folder /app/example/church --mask_sky

# Oxford
docker run --gpus all \
-v $(pwd)/model:/model \
-p 8080:8080 \
lingbot-map-demo \
--image_folder /app/example/oxford --mask_sky

# University
docker run --gpus all \
-v $(pwd)/model:/model \
-p 8080:8080 \
lingbot-map-demo \
--image_folder /app/example/university --mask_sky

# Loop (loop-closure trajectory, no sky masking needed)
docker run --gpus all \
-v $(pwd)/model:/model \
-p 8080:8080 \
lingbot-map-demo \
--image_folder /app/example/loop
```

On **first run** the model is downloaded from HuggingFace and cached in `./model/`; subsequent runs start immediately.
Open **http://localhost:8080** in your browser once inference completes.

## Run with Your Own Images

Place your images (`.jpg` / `.png`) in a local folder, then mount it:

```bash
docker run --gpus all \
-v /path/to/your/images:/data/images \
-v $(pwd)/model:/model \
-p 8080:8080 \
lingbot-map-demo \
--image_folder /data/images
```

## Run with a Video File

```bash
docker run --gpus all \
-v /path/to/video.mp4:/data/video.mp4 \
-v $(pwd)/model:/model \
-p 8080:8080 \
lingbot-map-demo \
--video_path /data/video.mp4 --fps 10
```

## docker-compose

Edit `docker-compose.yml` to set your image folder and model variant, then:

```bash
# Put your images in ./images/
docker compose up
```

## Environment Variables

| Variable | Default | Description |
|:---|:---|:---|
| `HF_MODEL_NAME` | `lingbot-map` | Checkpoint to download: `lingbot-map`, `lingbot-map-long`, or `lingbot-map-stage1` |
| `MODEL_PATH` | *(auto)* | Explicit path to a `.pt` file inside the container (skips auto-download) |
| `MODEL_CACHE_DIR` | `/model` | Directory where the downloaded model is stored |
| `HUGGING_FACE_HUB_TOKEN` | *(none)* | HuggingFace token for gated repos |

## Tips

**Use a pre-downloaded model** (avoids HuggingFace download at runtime):
```bash
docker run --gpus all \
-v /path/to/checkpoint.pt:/model/lingbot-map.pt \
-v $(pwd)/images:/data/images \
-p 8080:8080 \
lingbot-map-demo \
--image_folder /data/images
```

**Limited GPU memory** — add one or both flags:
```bash
--num_scale_frames 2 # reduces activation peak of the initial scale phase
--keyframe_interval 6 # keeps only every 6th frame in KV cache
```
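
Both flags are ordinary demo arguments, so they can be appended to any of the `docker run` commands above, for example (reusing the built-in Oxford scene):

```shell
docker run --gpus all \
    -v $(pwd)/model:/model \
    -p 8080:8080 \
    lingbot-map-demo \
    --image_folder /app/example/oxford --mask_sky \
    --num_scale_frames 2 --keyframe_interval 6
```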

**Long sequences (> 3000 frames)** — use windowed mode:
```bash
--mode windowed --window_size 128
```

**Faster inference** — reduce camera head iterations (small accuracy trade-off):
```bash
--camera_num_iterations 1
```
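
**Inspect exported results**: when `--output_dir` is writable (the compose files map it to `./output`), the demo writes `predictions.npz`, `pointcloud.ply`, and `cameras.json`. A minimal standalone reader for the point cloud is sketched below; it assumes the binary little-endian layout of float32 `x/y/z` plus uint8 `red/green/blue` that `export_results` declares in its PLY header, and `read_ply_xyzrgb` is an illustrative helper, not part of the shipped tooling:

```python
import numpy as np

def read_ply_xyzrgb(path):
    """Read a binary little-endian PLY whose vertices are
    float32 x/y/z followed by uint8 red/green/blue."""
    with open(path, "rb") as f:
        # Consume the ASCII header up to and including "end_header".
        header_lines = []
        while True:
            line = f.readline().decode("ascii")
            header_lines.append(line)
            if line.strip() == "end_header":
                break
        n = int(next(l for l in header_lines
                     if l.startswith("element vertex")).split()[-1])
        dtype = np.dtype([("x", "<f4"), ("y", "<f4"), ("z", "<f4"),
                          ("r", "u1"), ("g", "u1"), ("b", "u1")])
        data = np.frombuffer(f.read(n * dtype.itemsize), dtype=dtype)
    points = np.stack([data["x"], data["y"], data["z"]], axis=1)  # (N, 3) float32
    colors = np.stack([data["r"], data["g"], data["b"]], axis=1)  # (N, 3) uint8
    return points, colors
```

The other two exports need no special handling: `predictions.npz` loads with `np.load(...)` and `cameras.json` with `json.load(...)`.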

---

# 📜 License

This project is released under the Apache License 2.0. See the [LICENSE](LICENSE.txt) file for details.
127 changes: 126 additions & 1 deletion demo.py
@@ -20,7 +20,9 @@

import argparse
import glob
import json
import os
import struct
import time

# Must be set before `import torch` / any CUDA init. Reduces the reserved-vs-allocated
@@ -127,7 +129,7 @@ def load_model(args, device):

if args.model_path:
print(f"Loading checkpoint: {args.model_path}")
ckpt = torch.load(args.model_path, map_location=device, weights_only=False)
ckpt = torch.load(args.model_path, map_location="cpu", weights_only=False)
state_dict = ckpt.get("model", ckpt)
missing, unexpected = model.load_state_dict(state_dict, strict=False)
if missing:
@@ -136,6 +138,13 @@
print(f" Unexpected keys: {len(unexpected)}")
print(" Checkpoint loaded.")

if os.environ.get("LOW_VRAM_MODE", "0") == "1" and \
device.type == "cuda" and getattr(model, "aggregator", None) is not None:
cap = torch.cuda.get_device_capability(device)
dtype = torch.bfloat16 if cap[0] >= 8 else torch.float16
print(f"[LOW_VRAM_MODE] Casting aggregator to {dtype} on CPU before GPU transfer")
model.aggregator = model.aggregator.to(dtype=dtype)

return model.to(device).eval()


@@ -227,6 +236,107 @@ def prepare_for_visualization(predictions, images=None):
return vis_predictions


# =============================================================================
# Export
# =============================================================================

def export_results(predictions, images_cpu, output_dir, conf_threshold=0.0, ply_stride=1):
"""Save inference results to output_dir.

Writes three files:
predictions.npz – raw numpy arrays (depth, world_points, extrinsic, intrinsic, images)
pointcloud.ply – merged, confidence-filtered point cloud (binary PLY)
cameras.json – per-frame camera-to-world poses and intrinsics
"""
os.makedirs(output_dir, exist_ok=True)

# ── NPZ ──────────────────────────────────────────────────────────────────
npz_path = os.path.join(output_dir, "predictions.npz")
save_dict = {}
for k, v in predictions.items():
if isinstance(v, torch.Tensor):
save_dict[k] = v.cpu().numpy()
elif isinstance(v, np.ndarray):
save_dict[k] = v
if isinstance(images_cpu, torch.Tensor):
images_arr = images_cpu.numpy()
elif isinstance(images_cpu, np.ndarray):
images_arr = images_cpu
else:
images_arr = None
if images_arr is not None:
if images_arr.ndim == 5 and images_arr.shape[0] == 1:
images_arr = images_arr[0] # (1,S,C,H,W) → (S,C,H,W)
save_dict["images"] = images_arr
np.savez_compressed(npz_path, **save_dict)
print(f" Saved predictions → {npz_path}")

# ── PLY ──────────────────────────────────────────────────────────────────
world_points = save_dict.get("world_points") # (S, H, W, 3)
depth = save_dict.get("depth") # (S, H, W, 1) fallback
depth_conf = save_dict.get("depth_conf") # (S, H, W)
images_np = save_dict.get("images") # (S, 3, H, W)

if world_points is None and depth is not None:
from lingbot_map.utils.geometry import unproject_depth_map_to_point_map
world_points = unproject_depth_map_to_point_map(
depth, save_dict["extrinsic"], save_dict["intrinsic"]
)

if world_points is not None and images_np is not None:
S, H, W = world_points.shape[:3]
colors = images_np.transpose(0, 2, 3, 1) # (S, H, W, 3)
pts_all, col_all = [], []
st = max(1, int(ply_stride))
for i in range(S):
pts = world_points[i, ::st, ::st].reshape(-1, 3)
col = (colors[i, ::st, ::st].reshape(-1, 3) * 255).clip(0, 255).astype(np.uint8)
valid = np.isfinite(pts).all(axis=1)
if depth_conf is not None:
valid &= depth_conf[i, ::st, ::st].reshape(-1) > conf_threshold
pts_all.append(pts[valid])
col_all.append(col[valid])
pts_merged = np.concatenate(pts_all, axis=0).astype(np.float32)
col_merged = np.concatenate(col_all, axis=0)
n = len(pts_merged)

ply_path = os.path.join(output_dir, "pointcloud.ply")
header = (
"ply\nformat binary_little_endian 1.0\n"
f"element vertex {n}\n"
"property float x\nproperty float y\nproperty float z\n"
"property uchar red\nproperty uchar green\nproperty uchar blue\n"
"end_header\n"
).encode()
with open(ply_path, "wb") as f:
f.write(header)
# interleave xyz + rgb tightly
data = np.empty(n, dtype=[("x","f4"),("y","f4"),("z","f4"),
("r","u1"),("g","u1"),("b","u1")])
data["x"], data["y"], data["z"] = pts_merged[:,0], pts_merged[:,1], pts_merged[:,2]
data["r"], data["g"], data["b"] = col_merged[:,0], col_merged[:,1], col_merged[:,2]
f.write(data.tobytes())
print(f" Saved point cloud ({n:,} pts) → {ply_path}")
else:
print(" Skipping PLY export (world_points or images not available)")

# ── cameras.json ─────────────────────────────────────────────────────────
extrinsic = save_dict.get("extrinsic") # (S, 3, 4) c2w
intrinsic = save_dict.get("intrinsic") # (S, 3, 3)
if extrinsic is not None and intrinsic is not None:
cameras = []
for i in range(len(extrinsic)):
cameras.append({
"frame": i,
"c2w": extrinsic[i].tolist(),
"K": intrinsic[i].tolist(),
})
cam_path = os.path.join(output_dir, "cameras.json")
with open(cam_path, "w") as f:
json.dump(cameras, f, indent=2)
print(f" Saved cameras ({len(cameras)} frames) → {cam_path}")


# =============================================================================
# Main
# =============================================================================
@@ -291,6 +401,12 @@ def main():
parser.add_argument("--export_preprocessed", type=str, default=None,
help="Export stride-sampled, resized/cropped images to this folder")

# Output
parser.add_argument("--output_dir", type=str, default="/data/output",
help="Directory for exported results (predictions.npz, pointcloud.ply, cameras.json)")
parser.add_argument("--no_viewer", action="store_true",
help="Skip the interactive viewer (export only)")

args = parser.parse_args()
assert args.image_folder or args.video_path, \
"Provide --image_folder or --video_path"
@@ -398,6 +514,15 @@

predictions, images_cpu = postprocess(predictions, images_for_post)

# ── Export ───────────────────────────────────────────────────────────────
print(f"Exporting results to {args.output_dir} ...")
export_results(predictions, images_cpu, args.output_dir,
conf_threshold=args.conf_threshold, ply_stride=args.downsample_factor)

if args.no_viewer:
print("Viewer skipped (--no_viewer). Done.")
return

# ── Visualize ────────────────────────────────────────────────────────────
try:
from lingbot_map.vis import PointCloudViewer
29 changes: 29 additions & 0 deletions docker-compose.lowvram.yml
@@ -0,0 +1,29 @@
services:
lingbot-map:
build:
context: .
args:
LOW_VRAM_MODE: "1" # cast aggregator to bf16 before GPU transfer
image: lingbot-map-demo-light
runtime: nvidia
environment:
- NVIDIA_VISIBLE_DEVICES=all
- HF_MODEL_NAME=lingbot-map
- HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
volumes:
- ${IMAGE_HOST_PATH:-./example/oxford}:/data/images # override: IMAGE_HOST_PATH=~/photos docker compose up
- ./model:/model # model cache (downloaded on first run)
- ./output:/data/output # exported results (PLY / NPZ / JSON)
ports:
- "8080:8080"
command: >
--image_folder /data/images
--output_dir /data/output
--mode windowed
--window_size 16
--overlap_size 8
--num_scale_frames 4
--conf_threshold 2.0
--downsample_factor 4
--mask_sky
--point_size 0.005