diff --git a/.gitignore b/.gitignore index ccd784f..36a793e 100644 --- a/.gitignore +++ b/.gitignore @@ -209,3 +209,10 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + +# model files +*.pt +*.pth +*.onnx +*.engine +*.ts diff --git a/README.md b/README.md index 49392b8..21352a8 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,8 @@ ROS2 perception stack for generalist robotics. ## Packages - **track_anything** - EdgeTAM tracking + 3D segmentation with RGBD +- **detect_anything** - YOLO-E detection node and utilities +- **semantic_mapping** - semantic 3D object mapping with VLM integration (work in progress) - **vector_perception_utils** - Image and point cloud utilities ## Installation @@ -21,6 +23,7 @@ pip install -r requirements.txt # Build source /opt/ros/jazzy/setup.bash colcon build +``` diff --git a/detect_anything/CMakeLists.txt b/detect_anything/CMakeLists.txt new file mode 100644 index 0000000..c8e9188 --- /dev/null +++ b/detect_anything/CMakeLists.txt @@ -0,0 +1,47 @@ +cmake_minimum_required(VERSION 3.10) +project(detect_anything) + +find_package(ament_cmake REQUIRED) +find_package(rclpy REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(std_msgs REQUIRED) +find_package(vision_msgs REQUIRED) +find_package(cv_bridge REQUIRED) +find_package(rosidl_default_generators REQUIRED) +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +rosidl_generate_interfaces(${PROJECT_NAME} + "msg/DetectionResult.msg" + DEPENDENCIES std_msgs sensor_msgs +) + +set(PYTHON_INSTALL_DIR "lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages") + +install( + DIRECTORY detect_anything + DESTINATION ${PYTHON_INSTALL_DIR} +) + +install( + PROGRAMS + scripts/detection_node + DESTINATION lib/${PROJECT_NAME} +) + +install( + FILES resource/detect_anything + DESTINATION share/${PROJECT_NAME}/resource +) + +install( + DIRECTORY config + DESTINATION share/${PROJECT_NAME} +) + +set(ENV_HOOK "${CMAKE_CURRENT_SOURCE_DIR}/env-hooks/venv_pythonpath.sh.in") +set(ENV_HOOK_OUT "${CMAKE_CURRENT_BINARY_DIR}/ament_cmake_environment_hooks/venv_pythonpath.sh") +configure_file(${ENV_HOOK} ${ENV_HOOK_OUT} @ONLY) +ament_environment_hooks(${ENV_HOOK_OUT}) + +ament_export_dependencies(rosidl_default_runtime) +ament_package() diff --git a/detect_anything/README.md b/detect_anything/README.md new file mode 100644 index 0000000..1e94a06 --- /dev/null +++ b/detect_anything/README.md @@ -0,0 +1,28 @@ +# detect_anything + +YOLO-E detection node that publishes `DetectionResult` with cropped masks and an annotated overlay topic. + +## What’s inside +- `detect_anything/detection.py`: detection results container and Ultralytics parsing helpers. +- `detect_anything/yoloe.py`: YOLO-E wrapper with prompt support and basic filtering. +- `detect_anything/detection_node.py`: ROS2 node wiring the detector to `DetectionResult`. +- `msg/DetectionResult.msg`: compressed image + cropped mask array. + +## Quick start +```bash +source ~/vector_venv/bin/activate +source /opt/ros/jazzy/setup.bash +colcon build --packages-select detect_anything +source install/setup.bash + +ros2 run detect_anything detection_node \ --ros-args -p model_path:=/path/to/yoloe/models \ -p model_name:=yoloe-11s-seg-pf.pt -p prompt_mode:=lrpc \ -p conf:=0.6 \ -p max_area_ratio:=0.3 \ -p image_topic:=/camera/image +``` + +Topics: +- Publishes `/detection_result` (`detect_anything/DetectionResult`) and `/annotated_image_detection` (`sensor_msgs/Image`). +- Subscribes to `/camera/image` (or `/camera/image/compressed` if `use_compressed:=true`).
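For downstream consumers, a minimal subscriber sketch (node name and queue depth are illustrative, not part of the package): `DetectionResults.from_msg` from `detect_anything/detection.py` re-expands the bbox-cropped masks back to full image size.

```python
import rclpy
from rclpy.node import Node

from detect_anything.detection import DetectionResults
from detect_anything.msg import DetectionResult


class DetectionConsumer(Node):
    """Example consumer that rebuilds full-size masks from DetectionResult."""

    def __init__(self) -> None:
        super().__init__("detection_consumer")
        self.create_subscription(DetectionResult, "/detection_result", self._on_detections, 10)

    def _on_detections(self, msg: DetectionResult) -> None:
        results = DetectionResults.from_msg(msg)  # masks placed back at their bbox locations
        for det in results.detections:
            self.get_logger().info(f"{det.label} ({det.confidence:.2f}) id={det.track_id}")


def main() -> None:
    rclpy.init()
    rclpy.spin(DetectionConsumer())


if __name__ == "__main__":
    main()
```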
diff --git a/detect_anything/config/objects.yaml b/detect_anything/config/objects.yaml new file mode 100644 index 0000000..371f186 --- /dev/null +++ b/detect_anything/config/objects.yaml @@ -0,0 +1,46 @@ +# Simple list of object names for prompting YOLO-E +- chair +- desk +- tv_monitor +- sofa +- unknown +- printer +- coffee machine +- refrigerator +- trash can +- shoe +- sink +- table +- oven +- bed +- painting +- bulletin board +- plant +- vase +- cabinet +- shelf +- book +- cup +- sculpture +- keyboard +- mouse +- clock +- phone +- toilet +- bathtub +- microwave oven +- pan +- suitcase +- light +- curtain +- whiteboard +- shower knob +- bottle +- water dispenser +- vending machine +- laptop +- bag +- locker +- picture +- cardboard +- extinguisher diff --git a/detect_anything/detect_anything/__init__.py b/detect_anything/detect_anything/__init__.py new file mode 100644 index 0000000..9c61aab --- /dev/null +++ b/detect_anything/detect_anything/__init__.py @@ -0,0 +1 @@ +"""detect_anything package.""" diff --git a/detect_anything/detect_anything/detection.py b/detect_anything/detect_anything/detection.py new file mode 100644 index 0000000..1d64d76 --- /dev/null +++ b/detect_anything/detect_anything/detection.py @@ -0,0 +1,256 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import List, Optional, Sequence + +from cv_bridge import CvBridge +import cv2 +import numpy as np + +from detect_anything.msg import DetectionResult + +_BRIDGE = CvBridge() + + +@dataclass +class SingleDetectionResult: + """Simple container for a YOLO/Ultralytics detection with a segmentation mask.""" + + bbox: tuple[float, float, float, float] + track_id: int + class_id: int + label: str + confidence: float + mask: np.ndarray + + def area(self) -> float: + x1, y1, x2, y2 = self.bbox + return max(0.0, (x2 - x1)) * max(0.0, (y2 - y1)) + + +@dataclass +class DetectionResults: + """Container for detections plus the source image.""" + + image: np.ndarray + detections: List[SingleDetectionResult] + + def overlay(self, alpha: float = 0.4, thickness: int = 2) -> np.ndarray: + """Draw bboxes and masks on the image.""" + output = self.image.copy() + for det in self.detections: + rng = np.random.default_rng(det.track_id) + color = tuple(int(c) for c in rng.integers(50, 255, size=3)) + + if det.mask.size: + mask_bool = det.mask.astype(bool) + if mask_bool.shape[:2] == output.shape[:2]: + colored_mask = np.zeros_like(output) + colored_mask[mask_bool] = color + output[mask_bool] = cv2.addWeighted( + output[mask_bool], 1 - alpha, colored_mask[mask_bool], alpha, 0 + ) + contours, _ = cv2.findContours(det.mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(output, contours, -1, color, thickness) + + x1, y1, x2, y2 = map(int, det.bbox) + cv2.rectangle(output, (x1, y1), (x2, y2), color, thickness) + + label_text = f"{det.label} {det.confidence:.2f}" + label_text = f"{label_text} | id {det.track_id}" + + (text_w, text_h), baseline = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) + cv2.rectangle( + output, + (x1, max(0, y1 - text_h - baseline - 2)), + (x1 + text_w, max(0, y1)), + color, + -1, + ) + cv2.putText( + output, + label_text, + (x1, max(0, y1 - baseline - 2)), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + cv2.LINE_AA, + ) + return output + + def to_msg(self, header) -> DetectionResult: + """ + Convert detections into a DetectionResult ROS message. 
+ """ + msg = DetectionResult() + msg.header = header + msg.image = _BRIDGE.cv2_to_compressed_imgmsg(self.image, dst_format="jpeg") + msg.image.header = header + + for det in self.detections: + msg.track_id.append(int(det.track_id)) + msg.x1.append(float(det.bbox[0])) + msg.y1.append(float(det.bbox[1])) + msg.x2.append(float(det.bbox[2])) + msg.y2.append(float(det.bbox[3])) + msg.label.append(str(det.label)) + msg.confidence.append(float(det.confidence)) + + mask = det.mask + # Crop mask to bbox and align origin to bbox top-left for transport. + x1, y1, x2, y2 = [int(v) for v in det.bbox] + h, w = self.image.shape[:2] + x1 = max(0, min(x1, w)) + x2 = max(0, min(x2, w)) + y1 = max(0, min(y1, h)) + y2 = max(0, min(y2, h)) + crop_h = max(1, y2 - y1) + crop_w = max(1, x2 - x1) + if mask.size == 0: + mask = np.zeros((crop_h, crop_w), dtype=np.uint8) + else: + mask = mask[y1:y2, x1:x2] + if mask.shape != (crop_h, crop_w): + mask = cv2.resize( + mask.astype(np.uint8), + (crop_w, crop_h), + interpolation=cv2.INTER_NEAREST, + ) + + mask_msg = _BRIDGE.cv2_to_compressed_imgmsg(mask, dst_format="png") + mask_msg.header = header + msg.masks.append(mask_msg) + + return msg + + @classmethod + def from_msg(cls, msg: DetectionResult) -> DetectionResults: + """ + Construct DetectionResults from a DetectionResult ROS message. + Masks are converted back to full-size overlays at bbox locations. + """ + image = _BRIDGE.compressed_imgmsg_to_cv2(msg.image) + detections: List[SingleDetectionResult] = [] + + for i in range(len(msg.track_id)): + bbox = ( + float(msg.x1[i]), + float(msg.y1[i]), + float(msg.x2[i]), + float(msg.y2[i]), + ) + mask_img = _BRIDGE.compressed_imgmsg_to_cv2(msg.masks[i], desired_encoding="mono8") + + full_mask = np.zeros(image.shape[:2], dtype=np.uint8) + x1, y1, x2, y2 = [int(v) for v in bbox] + h, w = image.shape[:2] + x1 = max(0, min(x1, w)) + x2 = max(0, min(x2, w)) + y1 = max(0, min(y1, h)) + y2 = max(0, min(y2, h)) + crop_h = max(1, y2 - y1) + crop_w = max(1, x2 - x1) + if mask_img.shape[:2] != (crop_h, crop_w): + mask_img = cv2.resize(mask_img, (crop_w, crop_h), interpolation=cv2.INTER_NEAREST) + full_mask[y1:y2, x1:x2] = mask_img + + detections.append( + SingleDetectionResult( + bbox=bbox, + track_id=int(msg.track_id[i]), + class_id=0, # class_id is not transmitted in DetectionResult + label=str(msg.label[i]), + confidence=float(msg.confidence[i]), + mask=full_mask, + ) + ) + + return cls(image=image, detections=detections) + + +def _from_ultralytics_result( + result, + idx: int, + image_shape: Optional[tuple[int, int]] = None, + names: Optional[Sequence[str]] = None, +) -> SingleDetectionResult: + """Convert a single Ultralytics Results entry into a SingleDetectionResult.""" + from ultralytics.engine.results import Results # local import + + if not isinstance(result, Results): + raise TypeError("result must be an ultralytics Results object") + if result.boxes is None: + raise ValueError("Result has no boxes") + + bbox_array = result.boxes.xyxy[idx].cpu().numpy() + bbox = ( + float(bbox_array[0]), + float(bbox_array[1]), + float(bbox_array[2]), + float(bbox_array[3]), + ) + + confidence = float(result.boxes.conf[idx].cpu()) + class_id = int(result.boxes.cls[idx].cpu()) + + resolved_names = names + if resolved_names is None and hasattr(result, "names"): + resolved_names = result.names # type: ignore[assignment] + + label = f"class_{class_id}" + if resolved_names is not None: + if isinstance(resolved_names, dict): + label = resolved_names.get(class_id, label) + elif 
isinstance(resolved_names, (list, tuple)) and class_id < len(resolved_names): + label = resolved_names[class_id] + + track_id = idx + if hasattr(result.boxes, "id") and result.boxes.id is not None: + track_id = int(result.boxes.id[idx].cpu()) + + target_h, target_w = (image_shape or (None, None)) + if target_h is None or target_w is None: + if hasattr(result, "orig_shape") and result.orig_shape: + target_h, target_w = result.orig_shape[:2] + + mask = np.zeros((target_h, target_w), dtype=np.uint8) if target_h and target_w else np.zeros((0, 0), dtype=np.uint8) + if result.masks is not None and idx < len(result.masks.data): + mask_tensor = result.masks.data[idx] + mask_np = mask_tensor.cpu().numpy() + + if mask_np.ndim == 3: + mask_np = mask_np.squeeze() + + if target_h and target_w and mask_np.shape != (target_h, target_w): + mask_np = cv2.resize(mask_np.astype(np.float32), (target_w, target_h), interpolation=cv2.INTER_LINEAR) + + mask = (mask_np > 0.5).astype(np.uint8) * 255 # type: ignore[assignment] + + return SingleDetectionResult( + bbox=bbox, + track_id=track_id, + class_id=class_id, + label=label, + confidence=confidence, + mask=mask, + ) + + +def from_ultralytics_results( + results, + image_shape: Optional[tuple[int, int]] = None, + names: Optional[Sequence[str]] = None, +) -> List[SingleDetectionResult]: + """Convert a list of Ultralytics Results objects into detections with masks.""" + detections: List[SingleDetectionResult] = [] + for result in results: + if result.boxes is None: + continue + num_detections = len(result.boxes.xyxy) + for i in range(num_detections): + detections.append(_from_ultralytics_result(result, i, image_shape=image_shape, names=names)) + return detections + + +__all__ = ["DetectionResults", "SingleDetectionResult", "from_ultralytics_results"] diff --git a/detect_anything/detect_anything/detection_node.py b/detect_anything/detect_anything/detection_node.py new file mode 100644 index 0000000..aebd0e8 --- /dev/null +++ b/detect_anything/detect_anything/detection_node.py @@ -0,0 +1,169 @@ +from __future__ import annotations + +from pathlib import Path +from typing import List, Optional + +import yaml +import rclpy +from cv_bridge import CvBridge +from sensor_msgs.msg import CompressedImage, Image +from rclpy.node import Node +from rclpy.qos import DurabilityPolicy, QoSProfile, ReliabilityPolicy + +from detect_anything.detection import DetectionResults +from detect_anything.yoloe import Yoloe2DDetector, YoloePromptMode +from detect_anything.msg import DetectionResult + + +class DetectionNode(Node): + """ROS2 node that runs YOLO-E detection and publishes DetectionResult.""" + + def __init__(self) -> None: + super().__init__("detect_anything_detection_node") + self.bridge = CvBridge() + + # Parameters + defaults = self._default_params() + self.declare_parameter("image_topic", defaults["image_topic"]) + self.declare_parameter("compressed_image_topic", defaults["compressed_image_topic"]) + self.declare_parameter("use_compressed", defaults["use_compressed"]) + self.declare_parameter("model_path", defaults["model_path"]) + self.declare_parameter("model_name", defaults["model_name"]) + self.declare_parameter("device", defaults["device"]) + self.declare_parameter("prompt_mode", defaults["prompt_mode"]) + self.declare_parameter("objects_file", defaults["objects_file"]) + self.declare_parameter("conf", defaults["conf"]) + self.declare_parameter("max_area_ratio", defaults["max_area_ratio"]) + + params = self._default_params() + image_topic = 
str(self.get_parameter("image_topic").value) + compressed_image_topic = str(self.get_parameter("compressed_image_topic").value) + self.use_compressed = bool(self.get_parameter("use_compressed").value) + + model_path_val = self.get_parameter("model_path").value + model_path = str(model_path_val) if model_path_val not in (None, "") else params["model_path"] + if str(model_path).lower() == "none": + model_path = params["model_path"] + + model_name = str(self.get_parameter("model_name").value or params["model_name"]) + device = str(self.get_parameter("device").value or params["device"]) + prompt_mode_str = str(self.get_parameter("prompt_mode").value or params["prompt_mode"]).lower() + objects_file = Path(str(self.get_parameter("objects_file").value or params["objects_file"])) + conf = float(self.get_parameter("conf").value) + max_area_ratio = float(self.get_parameter("max_area_ratio").value) + + prompt_mode = YoloePromptMode(prompt_mode_str) + + # Auto-pick the right default weight if user didn't override + if prompt_mode == YoloePromptMode.LRPC and ("seg.pt" in model_name and "seg-pf.pt" not in model_name): + model_name = "yoloe-11l-seg-pf.pt" + self.get_logger().info(f"Prompt-free mode: switching model_name to {model_name}") + if prompt_mode == YoloePromptMode.PROMPT and "seg-pf.pt" in model_name: + model_name = "yoloe-11l-seg.pt" + self.get_logger().info(f"Prompt mode: switching model_name to {model_name}") + + self.detector = Yoloe2DDetector( + model_path=model_path, + model_name=model_name, + device=device or None, + prompt_mode=prompt_mode, + max_area_ratio=max_area_ratio, + conf=conf, + ) + + prompts = self._load_prompts(objects_file) + if prompts and prompt_mode == YoloePromptMode.PROMPT: + self.detector.set_prompts(text=prompts) + self.get_logger().info(f"Loaded {len(prompts)} prompts from {objects_file}") + + qos = QoSProfile( + depth=10, + reliability=ReliabilityPolicy.RELIABLE, + durability=DurabilityPolicy.VOLATILE, + ) + if self.use_compressed: + self.create_subscription( + CompressedImage, + compressed_image_topic, + self._compressed_image_callback, + qos, + ) + else: + self.create_subscription( + Image, + image_topic, + self._image_callback, + qos, + ) + + self.detection_pub = self.create_publisher(DetectionResult, "/detection_result", qos) + self.overlay_pub = self.create_publisher(Image, "/annotated_image_detection", qos) + + self.get_logger().info("detect_anything detection node started") + + def _default_params(self) -> dict: + base_share = Path(__file__).resolve().parent.parent + return { + "image_topic": "/camera/image", + "compressed_image_topic": "/camera/image/compressed", + "use_compressed": True, + "model_path": str(base_share / "models"), + "model_name": "yoloe-11l-seg.pt", + "device": "cuda", + "prompt_mode": YoloePromptMode.PROMPT.value, + "objects_file": str(base_share / "config" / "objects.yaml"), + "conf": 0.25, + "max_area_ratio": 0.3, + } + + def _load_prompts(self, objects_file: Path) -> List[str]: + if not objects_file.exists(): + return [] + try: + with open(objects_file, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) or [] + if isinstance(data, list): + return [str(p) for p in data] + except Exception as exc: + self.get_logger().warn(f"Failed to read prompts from {objects_file}: {exc}") + return [] + + def _image_callback(self, msg: Image) -> None: + cv_image = self.bridge.imgmsg_to_cv2(msg, desired_encoding="bgr8") + self._process_image(cv_image, msg.header) + + def _compressed_image_callback(self, msg: CompressedImage) -> None: + cv_image = 
self.bridge.compressed_imgmsg_to_cv2(msg, desired_encoding="bgr8") + self._process_image(cv_image, msg.header) + + def _process_image(self, cv_image, header) -> None: + detections: DetectionResults = self.detector.process_image(cv_image) + + detection_msg = detections.to_msg(header) + self.detection_pub.publish(detection_msg) + + overlay = detections.overlay() + overlay_msg = self.bridge.cv2_to_imgmsg(overlay, encoding="bgr8") + overlay_msg.header = header + self.overlay_pub.publish(overlay_msg) + + def destroy_node(self) -> bool: + if hasattr(self, "detector") and self.detector: + self.detector.stop() + return super().destroy_node() + + +def main(args: Optional[list[str]] = None) -> None: + rclpy.init(args=args) + node = DetectionNode() + try: + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/detect_anything/detect_anything/yoloe.py b/detect_anything/detect_anything/yoloe.py new file mode 100644 index 0000000..303afb9 --- /dev/null +++ b/detect_anything/detect_anything/yoloe.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +import threading +from typing import Any, List, Optional, Sequence + +import numpy as np +import torch +from ultralytics import YOLOE # type: ignore[attr-defined, import-not-found] +from ultralytics.utils import SETTINGS + +from detect_anything.detection import ( + DetectionResults, + SingleDetectionResult, + from_ultralytics_results, +) + + +class YoloePromptMode(Enum): + """YOLO-E prompt modes.""" + + LRPC = "lrpc" + PROMPT = "prompt" + + +class Yoloe2DDetector: + """Thin wrapper around YOLO-E for 2D detection + tracking with masks.""" + + def __init__( + self, + model_path: str | None = None, + model_name: str | None = None, + device: str | None = None, + prompt_mode: YoloePromptMode = YoloePromptMode.LRPC, + max_area_ratio: Optional[float] = 0.3, + conf: float = 0.6, + iou: float = 0.6, + ) -> None: + if model_name is None: + model_name = "yoloe-11s-seg-pf.pt" if prompt_mode == YoloePromptMode.LRPC else "yoloe-11s-seg.pt" + + model_dir = Path(model_path) if model_path else Path("models/yoloe") + model_dir = model_dir.expanduser().resolve() + SETTINGS["weights_dir"] = str(model_dir) + self.model = YOLOE(model_dir / model_name) + self.prompt_mode = prompt_mode + self._visual_prompts: dict[str, np.ndarray] | None = None + self.max_area_ratio = max_area_ratio + self._lock = threading.Lock() + self.conf = conf + self.iou = iou + self._names_override: Sequence[str] | None = None + + if prompt_mode == YoloePromptMode.PROMPT: + self.set_prompts(text=["nothing"]) + + if self.max_area_ratio is not None and not (0.0 < self.max_area_ratio <= 1.0): + raise ValueError("max_area_ratio must be in the range (0, 1].") + + if device: + self.device = device + elif torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + + def set_prompts( + self, + text: Optional[List[str]] = None, + bboxes: Optional[np.ndarray] = None, + ) -> None: + """Set prompts for detection. Provide either text or bboxes, not both.""" + if self.prompt_mode == YoloePromptMode.LRPC: + # Prompt-free model cannot accept text/visual prompts. 
+ return + if text is not None and bboxes is not None: + raise ValueError("Provide either text or bboxes, not both.") + if text is None and bboxes is None: + raise ValueError("Must provide either text or bboxes.") + + with self._lock: + self.model.predictor = None + if text is not None: + self.model.set_classes(text, self.model.get_text_pe(text)) # type: ignore[no-untyped-call] + self._visual_prompts = None + self._names_override = list(text) + else: + cls = np.arange(len(bboxes), dtype=np.int16) # type: ignore[arg-type] + self._visual_prompts = {"bboxes": bboxes, "cls": cls} # type: ignore[dict-item] + self._names_override = None + + def process_image(self, image: np.ndarray) -> DetectionResults: + """Process an image and return detection results.""" + track_kwargs: dict[str, Any] = { + "source": image, + "device": self.device, + "conf": self.conf, + "iou": self.iou, + "persist": True, + "verbose": False, + } + + with self._lock: + if self._visual_prompts is not None: + track_kwargs["visual_prompts"] = self._visual_prompts + + results = self.model.track(**track_kwargs) # type: ignore[arg-type] + + detections = from_ultralytics_results(results, image_shape=image.shape[:2], names=self._names_override) + filtered = self._apply_filters(image, detections) + return DetectionResults(image=image, detections=filtered) + + def _apply_filters( + self, + image: np.ndarray, + detections: List[SingleDetectionResult], + ) -> List[SingleDetectionResult]: + if self.max_area_ratio is None: + return detections + + image_area = float(image.shape[0] * image.shape[1]) + if image_area <= 0: + return detections + + return [det for det in detections if (det.area() / image_area) <= self.max_area_ratio] + + def stop(self) -> None: + """Clean up resources used by the detector.""" + if hasattr(self.model, "predictor") and self.model.predictor is not None: + predictor = self.model.predictor + if hasattr(predictor, "trackers") and predictor.trackers: + for tracker in predictor.trackers: + if hasattr(tracker, "tracker") and hasattr(tracker.tracker, "gmc"): + gmc = tracker.tracker.gmc + if hasattr(gmc, "executor") and gmc.executor is not None: + gmc.executor.shutdown(wait=True) + self.model.predictor = None + + +__all__ = ["Yoloe2DDetector", "YoloePromptMode"] diff --git a/detect_anything/env-hooks/venv_pythonpath.sh.in b/detect_anything/env-hooks/venv_pythonpath.sh.in new file mode 100644 index 0000000..9212bac --- /dev/null +++ b/detect_anything/env-hooks/venv_pythonpath.sh.in @@ -0,0 +1 @@ +ament_prepend_unique_value PYTHONPATH "@PYTHON_INSTALL_DIR@" diff --git a/detect_anything/msg/DetectionResult.msg b/detect_anything/msg/DetectionResult.msg new file mode 100644 index 0000000..766d5f1 --- /dev/null +++ b/detect_anything/msg/DetectionResult.msg @@ -0,0 +1,10 @@ +std_msgs/Header header +int32[] track_id +float32[] x1 +float32[] y1 +float32[] x2 +float32[] y2 +string[] label +float32[] confidence +sensor_msgs/CompressedImage image +sensor_msgs/CompressedImage[] masks # cropped to bbox; aligned to bbox top-left; encoding: mono8 diff --git a/detect_anything/package.xml b/detect_anything/package.xml new file mode 100644 index 0000000..a7699ae --- /dev/null +++ b/detect_anything/package.xml @@ -0,0 +1,26 @@ + + + + detect_anything + 0.0.1 + YOLO-E detection node and utilities. 
+ Alex Lin + MIT + + ament_cmake + + rosidl_default_generators + rosidl_default_runtime + rosidl_interface_packages + + rclpy + std_msgs + sensor_msgs + vision_msgs + cv_bridge + vector_perception_utils + + + ament_cmake + + diff --git a/detect_anything/resource/detect_anything b/detect_anything/resource/detect_anything new file mode 100644 index 0000000..eac574f --- /dev/null +++ b/detect_anything/resource/detect_anything @@ -0,0 +1 @@ +detect_anything diff --git a/detect_anything/scripts/detection_node b/detect_anything/scripts/detection_node new file mode 100755 index 0000000..822cacf --- /dev/null +++ b/detect_anything/scripts/detection_node @@ -0,0 +1,5 @@ +#!/usr/bin/env python3 +from detect_anything.detection_node import main + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index 7b6c110..67894e1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,13 +4,14 @@ opencv-python scipy pillow tqdm +pyyaml +lark # 3D processing open3d -# deep learning -torch<=2.4.0 -torchvision<=1.19.0 +# vision +ultralytics # SAM2 and EdgeTAM dependencies sam2 diff --git a/rviz/semantic_mapping.rviz b/rviz/semantic_mapping.rviz new file mode 100644 index 0000000..d56223b --- /dev/null +++ b/rviz/semantic_mapping.rviz @@ -0,0 +1,267 @@ +Panels: + - Class: rviz_common/Displays + Help Height: 78 + Name: Displays + Property Tree Widget: + Expanded: + - /Global Options1 + - /Status1 + - /Odometry1 + Splitter Ratio: 0.5 + Tree Height: 413 + - Class: rviz_common/Selection + Name: Selection + - Class: rviz_common/Tool Properties + Expanded: + - /2D Goal Pose1 + - /Publish Point1 + Name: Tool Properties + Splitter Ratio: 0.5886790156364441 + - Class: rviz_common/Views + Expanded: + - /Current View1 + Name: Views + Splitter Ratio: 0.5 + - Class: rviz_common/Time + Experimental: false + Name: Time + SyncMode: 0 + SyncSource: PointCloud2 +Visualization Manager: + Class: "" + Displays: + - Alpha: 0.5 + Cell Size: 1 + Class: rviz_default_plugins/Grid + Color: 160; 160; 164 + Enabled: true + Line Style: + Line Width: 0.029999999329447746 + Value: Lines + Name: Grid + Normal Cell Count: 0 + Offset: + X: 0 + Y: 0 + Z: 0 + Plane: XY + Plane Cell Count: 10 + Reference Frame: + Value: true + - Class: rviz_default_plugins/Image + Enabled: true + Max Value: 1 + Median window: 5 + Min Value: 0 + Name: Image + Normalize Range: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /annotated_image_detection + Value: true + - Angle Tolerance: 0.10000000149011612 + Class: rviz_default_plugins/Odometry + Covariance: + Orientation: + Alpha: 0.5 + Color: 255; 255; 127 + Color Style: Unique + Frame: Local + Offset: 1 + Scale: 1 + Value: true + Position: + Alpha: 0.30000001192092896 + Color: 204; 51; 204 + Scale: 1 + Value: true + Value: false + Enabled: true + Keep: 5 + Name: Odometry + Position Tolerance: 0.10000000149011612 + Shape: + Alpha: 1 + Axes Length: 1 + Axes Radius: 0.10000000149011612 + Color: 255; 25; 0 + Head Length: 0.30000001192092896 + Head Radius: 0.10000000149011612 + Shaft Length: 1 + Shaft Radius: 0.05000000074505806 + Value: Arrow + Topic: + Depth: 5 + Durability Policy: Volatile + Filter size: 10 + History Policy: Keep Last + Reliability Policy: Reliable + Value: /state_estimation + Value: true + - Alpha: 0.10000000149011612 + Autocompute Intensity Bounds: true + Autocompute Value Bounds: + Max Value: 10 + Min Value: -10 + Value: true + Axis: Z + Channel Name: intensity + Class: 
rviz_default_plugins/PointCloud2 + Color: 255; 255; 255 + Color Transformer: FlatColor + Decay Time: 3 + Enabled: true + Invert Rainbow: false + Max Color: 255; 255; 255 + Max Intensity: 165 + Min Color: 0; 0; 0 + Min Intensity: 0 + Name: PointCloud2 + Position Transformer: XYZ + Selectable: true + Size (Pixels): 3 + Size (m): 0.004999999888241291 + Style: Flat Squares + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /registered_scan + Use Fixed Frame: true + Use rainbow: true + Value: true + - Class: rviz_default_plugins/MarkerArray + Enabled: true + Name: MarkerArray + Namespaces: + bbox: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /obj_boxes + Value: true + - Alpha: 1 + Autocompute Intensity Bounds: true + Autocompute Value Bounds: + Max Value: 10 + Min Value: -10 + Value: true + Axis: Z + Channel Name: intensity + Class: rviz_default_plugins/PointCloud2 + Color: 255; 255; 255 + Color Transformer: RGB8 + Decay Time: 0 + Enabled: true + Invert Rainbow: false + Max Color: 255; 255; 255 + Max Intensity: 4096 + Min Color: 0; 0; 0 + Min Intensity: 0 + Name: PointCloud2 + Position Transformer: XYZ + Selectable: true + Size (Pixels): 3 + Size (m): 0.009999999776482582 + Style: Flat Squares + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /obj_points + Use Fixed Frame: true + Use rainbow: true + Value: true + Enabled: true + Global Options: + Background Color: 48; 48; 48 + Fixed Frame: map + Frame Rate: 30 + Name: root + Tools: + - Class: rviz_default_plugins/Interact + Hide Inactive Objects: true + - Class: rviz_default_plugins/MoveCamera + - Class: rviz_default_plugins/Select + - Class: rviz_default_plugins/FocusCamera + - Class: rviz_default_plugins/Measure + Line color: 128; 128; 0 + - Class: rviz_default_plugins/SetInitialPose + Covariance x: 0.25 + Covariance y: 0.25 + Covariance yaw: 0.06853891909122467 + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /initialpose + - Class: rviz_default_plugins/SetGoal + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /goal_pose + - Class: rviz_default_plugins/PublishPoint + Single click: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /clicked_point + Transformation: + Current: + Class: rviz_default_plugins/TF + Value: true + Views: + Current: + Class: rviz_default_plugins/Orbit + Distance: 21.720943450927734 + Enable Stereo Rendering: + Stereo Eye Separation: 0.05999999865889549 + Stereo Focal Distance: 1 + Swap Stereo Eyes: false + Value: false + Focal Point: + X: 3.1760172843933105 + Y: 11.240063667297363 + Z: 2.561012029647827 + Focal Shape Fixed Size: false + Focal Shape Size: 0.05000000074505806 + Invert Z Axis: false + Name: Current View + Near Clip Distance: 0.009999999776482582 + Pitch: 1.1497966051101685 + Target Frame: + Value: Orbit (rviz_default_plugins) + Yaw: 3.4885518550872803 + Saved: ~ +Window Geometry: + Displays: + collapsed: false + Height: 996 + Hide Left Dock: false + Hide Right Dock: false + Image: + collapsed: false + QMainWindow State: 
000000ff00000000fd00000004000000000000027b00000342fc0200000009fb0000001200530065006c0065006300740069006f006e00000001e10000009b0000005d00fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000a3fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c006100790073010000003f0000022b000000cc00fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261fb0000000a0049006d0061006700650100000270000001110000001700ffffff000000010000010f00000342fc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073010000003f00000342000000a900fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e10000019700000003000006b90000003efc0100000002fb0000000800540069006d00650100000000000006b90000026f00fffffffb0000000800540069006d00650100000000000004500000000000000000000003230000034200000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000 + Selection: + collapsed: false + Time: + collapsed: false + Tool Properties: + collapsed: false + Views: + collapsed: false + Width: 1721 + X: 3248 + Y: 331 diff --git a/semantic_mapping/CMakeLists.txt b/semantic_mapping/CMakeLists.txt new file mode 100644 index 0000000..7266468 --- /dev/null +++ b/semantic_mapping/CMakeLists.txt @@ -0,0 +1,52 @@ +cmake_minimum_required(VERSION 3.10) +project(semantic_mapping) + +find_package(ament_cmake REQUIRED) +find_package(rclpy REQUIRED) +find_package(std_msgs REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(geometry_msgs REQUIRED) +find_package(nav_msgs REQUIRED) +find_package(visualization_msgs REQUIRED) +find_package(cv_bridge REQUIRED) +find_package(rosidl_default_generators REQUIRED) +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +rosidl_generate_interfaces(${PROJECT_NAME} + "msg/ObjectNode.msg" + "msg/ObjectType.msg" + "msg/TargetObjectInstruction.msg" + "msg/ViewpointRep.msg" + DEPENDENCIES std_msgs sensor_msgs geometry_msgs +) + +set(PYTHON_INSTALL_DIR "lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages") + +install( + DIRECTORY semantic_mapping + DESTINATION ${PYTHON_INSTALL_DIR} +) + +install( + PROGRAMS + scripts/semantic_mapping_node + DESTINATION lib/${PROJECT_NAME} +) + +install( + FILES resource/semantic_mapping + DESTINATION share/${PROJECT_NAME}/resource +) + +install( + DIRECTORY config + DESTINATION share/${PROJECT_NAME} +) + +install( + DIRECTORY launch + DESTINATION share/${PROJECT_NAME} +) + +ament_export_dependencies(rosidl_default_runtime) +ament_package() diff --git a/semantic_mapping/README.md b/semantic_mapping/README.md new file mode 100644 index 0000000..62b78e5 --- /dev/null +++ b/semantic_mapping/README.md @@ -0,0 +1,152 @@ +# Semantic Mapping + +ROS2 package for semantic 3D object mapping with VLM integration. 
+ +## Overview + +This package builds a persistent semantic map of objects detected by `detect_anything`, fusing 2D detections with LiDAR point clouds to create 3D object representations. It publishes `ObjectNode` messages compatible with the tare_planner exploration system. +The pipeline stores voxelized geometry, merges observations, and exports planner‑friendly ROS messages. + +## Status: Work in Progress + +| Component | File | Status | +|-----------|------|--------| +| Messages | `msg/*.msg` | Complete | +| Data Models | `map_object.py` | Complete | +| Matching | `utils/matching.py` | Complete | +| Object Mapper | `mapper.py` | Complete | +| Cloud Projection | `vector_perception_utils/pointcloud_utils.py` | Complete | +| Publishers | `utils/map_object_utils.py` | Complete | +| Image Storage | `utils/storage.py` | Complete | +| Mapping Node | `mapping_node.py` | Complete | +| Launch Files | `launch/` | Complete | + +## Messages + +### ObjectNode.msg +Main output message for planner integration: +- `object_id[]`: Track IDs (may contain multiple merged IDs) +- `label`: Dominant class label +- `position`: Object centroid in world frame +- `bbox3d[8]`: 3D bounding box corners +- `cloud`: Object point cloud +- `status`: Lifecycle status ("new", "updated", "unchanged") +- `img_path`: Path to best saved image crop +- `is_asked_vlm`: VLM query flag +- `viewpoint_id`: Associated viewpoint + +### ObjectType.msg +VLM query/answer for object relabeling. + +### TargetObjectInstruction.msg +User instruction for target object search. + +### ViewpointRep.msg +Viewpoint trigger for image saving. + +Note: `detect_anything/DetectionResult` masks are cropped to the bbox and aligned to +the bbox top-left (as documented in `detect_anything/msg/DetectionResult.msg`). +The mapping node reconstructs full-image masks before projection. + +## Architecture + +``` +detect_anything/DetectionResult + │ + ▼ +┌───────────────────────────────────────────────┐ +│ semantic_mapping_node │ +│ │ +│ ┌────────────────────┐ ┌──────────────┐ │ +│ │ pointcloud_utils │ │ ObjectMapper │ │ +│ │ (projection + mask)├──▶│ (pending/ │ │ +│ │ │ │ permanent) │ │ +│ └────────────────────┘ └──────────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌────────────────┐ ┌──────────────┐ │ +│ │ MapObject │ │ map_object_ │ │ +│ │ (voxels/votes) ├─▶│ utils │ │ +│ └────────────────┘ └──────────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ utils/storage.py mapping_node.py │ +└───────────────────────────────────────────────┘ + │ + ▼ +/object_nodes, /obj_points, /obj_boxes, /obj_labels +``` + +Key components: +- `map_object.py`: Object state, voxel voting, pose de‑duplication, cached geometry. +- `utils/matching.py`: Track‑id and spatial matching with IoU/ratio heuristics. +- `mapper.py`: Pending/permanent tiers, merge rules, and pruning. +- `vector_perception_utils/pointcloud_utils.py`: Projection + mask segmentation utilities. +- `utils/map_object_utils.py`: ROS message construction helpers. +- `utils/storage.py`: Async image save queue. +- `mapping_node.py`: ROS2 orchestration (subscriptions, sync, publish). 
+ +## Dependencies + +- `detect_anything`: Detection results with masks +- `vector_perception_utils`: Point cloud and image utilities +- ROS2 messages: `sensor_msgs`, `geometry_msgs`, `nav_msgs`, `visualization_msgs` + +## Usage + +```bash +# Build +colcon build --packages-select semantic_mapping + +# Run (after implementation complete) +ros2 launch semantic_mapping semantic_mapping.launch.py +``` + +## Configuration + +- `config/params.yaml`: Default node parameters for `semantic_mapping_node`. +- `config/sensor_to_camera_tf.yaml`: Static sensor→camera transform for projection. +- `config/objects.yaml`: Object taxonomy for detection labels. + +Key runtime params: +- `regularize_voxel_size`: Connectivity voxel size used in shape regularization. +- `ground_filter`, `ground_radius`, `ground_z_clearance`: Simple ground clamp around robot. +- `visualization_max_objects`: Limits RViz markers/pointcloud to most recent objects. + +## Topics + +### Subscriptions +| Topic | Type | Description | +|-------|------|-------------| +| `/detection_result` | `detect_anything/DetectionResult` | 2D detections with masks | +| `/registered_scan` | `sensor_msgs/PointCloud2` | LiDAR point cloud | +| `/state_estimation` | `nav_msgs/Odometry` | Robot odometry | +| `/object_type_answer` | `ObjectType` | VLM relabeling response | +| `/target_object_instruction` | `TargetObjectInstruction` | Target object search | +| `/viewpoint_rep_header` | `ViewpointRep` | Viewpoint trigger | + +### Publications +| Topic | Type | Description | +|-------|------|-------------| +| `/object_nodes` | `ObjectNode` | Semantic object map (for planner) | +| `/obj_points` | `sensor_msgs/PointCloud2` | Colored object point cloud | +| `/obj_boxes` | `visualization_msgs/MarkerArray` | 3D bounding box wireframes | +| `/obj_labels` | `visualization_msgs/MarkerArray` | Object label text | +| `/object_type_query` | `ObjectType` | VLM query for relabeling | + +## References + +- Full migration plan: [`docs/SEMANTIC_MAPPING_MIGRATION.md`](docs/SEMANTIC_MAPPING_MIGRATION.md) +- Original implementation: `VLM_ROS/src/semantic_mapping/` (main branch) + - SAM2 for segmentation + - External Gemini API for VLM reasoning +- Detection: `detect_anything` package (decoupled) +- dimos patterns: `dimos/perception/detection/objectDB.py` + +## Handoff Notes + +- Regularization uses vote-weighted filtering + voxel connected components; adjust + `regularize_voxel_size` and the regularization percentile (in `MapObject`) if + thin structures are dropped. +- Mapping node logs summary stats and a single mapping loop time per cycle. +- Permanent objects persist; only pending objects time out via `pending_ttl_s`. 
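As a sanity check of the `/object_nodes` output, a minimal planner-side listener sketch (node name and queue depth are illustrative); it only reads fields documented in `ObjectNode.msg`:

```python
import rclpy
from rclpy.node import Node

from semantic_mapping.msg import ObjectNode


class ObjectNodeLogger(Node):
    """Example listener that logs semantic map updates from /object_nodes."""

    def __init__(self) -> None:
        super().__init__("object_node_logger")
        self.create_subscription(ObjectNode, "/object_nodes", self._on_object, 10)

    def _on_object(self, msg: ObjectNode) -> None:
        p = msg.position
        self.get_logger().info(
            f"{msg.label} [{msg.status}] ids={list(msg.object_id)} "
            f"at ({p.x:.2f}, {p.y:.2f}, {p.z:.2f}) img={msg.img_path or 'n/a'}"
        )


def main() -> None:
    rclpy.init()
    rclpy.spin(ObjectNodeLogger())


if __name__ == "__main__":
    main()
```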
diff --git a/semantic_mapping/config/params.yaml b/semantic_mapping/config/params.yaml new file mode 100644 index 0000000..0749d09 --- /dev/null +++ b/semantic_mapping/config/params.yaml @@ -0,0 +1,48 @@ +semantic_mapping_node: + ros__parameters: + detection_topic: /detection_result + cloud_topic: /registered_scan + odom_topic: /state_estimation + viewpoint_topic: /viewpoint_rep_header + object_type_answer_topic: /object_type_answer + object_type_query_topic: /object_type_query + target_object_instruction_topic: /target_object_instruction + object_nodes_topic: /object_nodes + obj_points_topic: /obj_points + obj_boxes_topic: /obj_boxes + obj_labels_topic: /obj_labels + map_frame: map + + confidence_threshold: 0.35 + voxel_size: 0.05 + regularize_voxel_size: 0.2 + distance_threshold: 0.3 + iou_threshold: 0.3 + min_detections_for_permanent: 4 + pending_ttl_s: 5.0 + num_angle_bins: 15 + + detection_linear_state_time_bias: 0.0 + detection_angular_state_time_bias: 0.0 + cloud_window_before: 0.5 + cloud_window_after: 0.1 + + projection: equirect + axis_mode: xz + image_width: 1920 + image_height: 640 + depth_filter: true + depth_filter_margin: 0.15 + depth_jump_threshold: 0.3 + sensor_to_camera_tf: config/sensor_to_camera_tf.yaml + mask_erode_kernel: 5 + mask_erode_iterations: 2 + max_cloud_distance: 8.0 + visualization_max_objects: 50 + ground_filter: true + ground_radius: 2.5 + ground_z_clearance: 0.0 + + target_object: refrigerator + save_png: True + output_dir: output/object_images diff --git a/semantic_mapping/config/sensor_to_camera_tf.yaml b/semantic_mapping/config/sensor_to_camera_tf.yaml new file mode 100644 index 0000000..e48c06f --- /dev/null +++ b/semantic_mapping/config/sensor_to_camera_tf.yaml @@ -0,0 +1,11 @@ +sensor_to_camera: + parent_frame: lidar + child_frame: camera + translation: + x: -0.12 + y: -0.075 + z: 0.265 + rotation_rpy: + roll: -1.5707963 + pitch: 0.0 + yaw: -1.5707963 diff --git a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md new file mode 100644 index 0000000..3f22372 --- /dev/null +++ b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md @@ -0,0 +1,487 @@ +# Semantic Mapping Migration Plan + +## Overview + +This document outlines the comprehensive plan for porting the `semantic_mapping` package from `VLM_ROS` into `vector_perception_ros/semantic_mapping`, following the coding standards of `vector_perception_ros` while preserving functional compatibility with downstream consumers (particularly the tare_planner). + +> **Note**: Detection is decoupled into `detect_anything` package. VLM reasoning uses external Gemini API. + +## Design Principles + +1. **Logic cleanup, not restructure** - Preserve the core algorithms (voxel voting, observation angles, shape regularization, IoU/distance matching) while cleaning up dead code and improving organization. +2. **Same I/O contract** - Maintain the same ROS topic names, message types, and semantics so existing planner integration continues to work. +3. **vector_perception_ros coding standards** - Full type hints, Google-style docstrings, dataclass containers, proper ROS2 patterns. +4. **ObjectDB-style patterns** - Adopt the two-tier (pending/permanent) architecture and dual matching (track_id + distance) from dimos where it improves clarity. 
+ +--- + +## Implementation Progress + +| Step | Component | Status | +|------|-----------|--------| +| 1 | Package Setup & Messages | **COMPLETED** | +| 2 | Data Models (`map_object.py`) | **COMPLETED** | +| 2.1 | Refactor to use shared utilities | **COMPLETED** | +| 3 | Matching Utilities (`utils/matching.py`) | **COMPLETED** | +| 4 | Object Mapper (`mapper.py`) | **COMPLETED** | +| 5 | Cloud-Image Projection (`pointcloud_utils.py`) | **COMPLETED** | +| 6 | Message Publishers (`utils/map_object_utils.py`) | **COMPLETED** | +| 7 | Image Storage (`utils/storage.py`) | **COMPLETED** | +| 8 | Mapping Node (`mapping_node.py`) | **COMPLETED** | +| 9 | Launch Files & Config | **COMPLETED** | +| 10 | Integration Testing | **COMPLETED** | + +### Step 2.1: Refactored map_object.py to use shared utilities + +Added to `vector_perception_utils/transform_utils.py`: +- `normalize_angle()` - now supports both scalar and array input +- `discretize_angles()` - discretize angles into bin indices + +Added to `vector_perception_utils/pointcloud_utils.py`: +- `get_oriented_bbox()` - returns (center, corners) tuple from minimal OBB + +Refactored in `map_object.py`: +- Removed duplicate `_normalize_angles_to_pi` and `_discretize_angles` functions +- Simplified `_compute_geometry()` from 43 lines to 15 using `get_oriented_bbox()` +- Use `create_pointcloud()` in `regularize_shape()` instead of manual o3d creation +- Use `compute_centroid()` instead of `np.mean(..., axis=0)` + +--- + +## VLM_ROS Reference (Source Implementation) + +Key files in `VLM_ROS/src/semantic_mapping/`: +- `semantic_map_new.py`: `ObjMapper` manages the persistent map +- `single_object_new.py`: `SingleObject` holds per-object state +- `semantic_mapping_node.py`: `MappingNode` orchestrates SAM2 masks, LiDAR projection, odom transforms + +### ObjMapper Core Flow +- Input: dict with `bboxes`, `labels`, `confidences`, `ids`, `masks` (from SAM2), plus LiDAR cloud and odom +- Memory fix: Uses `.copy()` on detection data extraction to prevent memory leaks +- VLM query threshold: Objects need `best_image_score > 500` before querying VLM +- `update_map()` steps: + 1. Transform incoming cloud into body/world frame using odom + 2. Filter detections by confidence threshold + 3. For each detection: reconstruct full-image masks (bbox-cropped in `detect_anything/msg/DetectionResult.msg`), project masked cloud, build per-object voxels, then either merge or create a `SingleObject` + 4. Merge heuristics: match by tracker `obj_id`, 3D IoU, distance thresholds, and optional primitive groups (e.g., chair/sofa) + 5. Maintain delete lists, save queues (best image/mask per object) + 6. 
Publish ROS messages: `ObjectNode`, bbox markers, label markers, colored point cloud + +### SingleObject Highlights +- Keeps class_id→counts/confidence, track_id list, voxel grid (`VoxelFeatureManager`), robot poses, masks, best image saving +- Computes 3D bbox (axis-aligned + oriented), centroids, clustering flags, and validity caches +- Manages persistence (`life`, `inactive_frame`, `publish_status`) and VLM interaction state + +### dimos Patterns to Adopt +- `ObjMapper` ⇄ `ObjectDB`: explicit pending/permanent tiers, track_id + spatial matching, promotion thresholds +- `SingleObject` ⇄ `Object`: cleaner update patterns, separation of core geometry vs VLM metadata + +--- + +## Phase 1: Package Structure & Messages (COMPLETED) + +### 1.1 Package Structure + +``` +semantic_mapping/ +├── CMakeLists.txt # Hybrid CMake for message generation + Python install +├── package.xml # Dependencies: detect_anything, vector_perception_utils, ROS msgs +├── msg/ +│ ├── ObjectNode.msg +│ ├── ObjectType.msg +│ ├── TargetObjectInstruction.msg +│ └── ViewpointRep.msg +├── config/ +│ ├── objects.yaml # Object taxonomy (to be copied from VLM_ROS) +│ └── params.yaml # Default node parameters +├── launch/ +│ └── semantic_mapping.launch.py +├── semantic_mapping/ +│ ├── __init__.py +│ ├── mapping_node.py # ROS2 node (orchestration) +│ ├── mapper.py # ObjectMapper class (core logic) +│ ├── map_object.py # Data classes: MapObject, Observation, VoxelManager +│ ├── utils/ +│ │ ├── matching.py # Object matching utilities (IoU, distance, track_id) +│ │ ├── map_object_utils.py # ROS message conversion helpers +│ │ └── storage.py # Async image saving queue +│ ├── (moved to vector_perception_utils) # Cloud-image projection helpers +├── docs/ +│ └── SEMANTIC_MAPPING_MIGRATION.md +└── resource/ + └── semantic_mapping # ament resource marker +``` + +### 1.2 Message Definitions + +**ObjectNode.msg** (preserves exact schema for planner compatibility): +``` +std_msgs/Header header +int32[] object_id +string label +geometry_msgs/Point position +geometry_msgs/Point[8] bbox3d # 8 corner points in world frame +sensor_msgs/PointCloud2 cloud +string status # "new", "updated", "unchanged" +string img_path +bool is_asked_vlm +int32 viewpoint_id +``` + +**ObjectType.msg**: VLM query/answer for object relabeling +**TargetObjectInstruction.msg**: User-set target/anchor objects +**ViewpointRep.msg**: Viewpoint trigger message + +--- + +## Phase 2: Data Models (COMPLETED) + +### 2.1 ObjectStatus Enum + +```python +class ObjectStatus(Enum): + """Object lifecycle status (ObjectDB-style tiering).""" + PENDING = "pending" # < min_detections threshold + PERMANENT = "permanent" # >= threshold, confirmed object + DELETED = "deleted" # Marked for removal +``` + +### 2.2 Observation + +```python +@dataclass +class Observation: + """A single detection observation to update a MapObject.""" + track_id: int + label: str + confidence: float + points: np.ndarray # (N, 3) world-frame points + robot_pose: dict # {R: (3,3), t: (3,)} + mask: np.ndarray | None = None # 2D segmentation mask + image: np.ndarray | None = None # RGB crop for best-image selection + timestamp: float = 0.0 +``` + +### 2.3 VoxelFeatureManager + +```python +@dataclass +class VoxelFeatureManager: + """Manages voxelized point cloud with observation angle voting.""" + voxels: np.ndarray # (N, 3) world coords + votes: np.ndarray # (N,) detection count per voxel + observation_angles: np.ndarray # (N, num_bins) angle histogram + regularized_mask: np.ndarray | None # (N,) valid voxel mask 
after clustering + voxel_size: float = 0.03 + num_angle_bins: int = 20 + + def update(self, new_points: np.ndarray, robot_position: np.ndarray) -> None: + """Add new points with KDTree-based merge and angle voting.""" + + def get_valid_voxels(self, min_votes: int = 1) -> np.ndarray: + """Return voxels passing vote threshold and regularization mask.""" +``` + +### 2.4 MapObject + +```python +@dataclass +class MapObject: + """Represents a tracked object in the semantic map.""" + # Identity + object_id: str # UUID for this object + track_ids: list[int] # Associated tracker IDs (can merge) + + # Classification (voted) + class_votes: dict[str, int] # label -> observation count + confidence_scores: dict[str, float] # label -> best confidence + + # Geometry + voxel_manager: VoxelFeatureManager + robot_poses: list[dict] # [{R, t}, ...] observation poses + + # Lifecycle (ObjectDB-style) + detection_count: int = 0 + inactive_frames: int = 0 + status: ObjectStatus = ObjectStatus.PENDING + + # VLM interaction + is_asked_vlm: bool = False + updated_by_vlm: bool = False + best_image_path: str | None = None + best_image_score: float = 0.0 + + @property + def label(self) -> str: ... + @property + def centroid(self) -> np.ndarray: ... + @property + def bbox3d(self) -> np.ndarray: ... + def update(self, observation: Observation) -> None: ... + def merge(self, other: MapObject) -> None: ... + def regularize_shape(self, percentile: float = 0.85) -> None: ... +``` + +--- + +## Phase 3: Matching Utilities (utils/matching.py) + +```python +def compute_iou_3d(bbox1: np.ndarray, bbox2: np.ndarray) -> float: + """Compute 3D IoU between two axis-aligned bounding boxes.""" + +def compute_center_distance(obj1: MapObject, obj2: MapObject) -> float: + """Compute Euclidean distance between object centroids.""" + +def are_merge_compatible(label1: str, label2: str, merge_groups: list[set[str]]) -> bool: + """Check if two labels can be merged (same class or in same merge group).""" + +def find_best_match( + obs: Observation, + candidates: list[MapObject], + distance_threshold: float, + iou_threshold: float, +) -> MapObject | None: + """Find best matching object by distance and IoU criteria.""" +``` + +--- + +## Phase 4: Object Mapper (mapper.py) + +```python +class ObjectMapper: + """ + Manages the semantic object map with two-tier state management. + Pending objects are promoted to permanent after detection_count >= threshold. + """ + + def __init__( + self, + voxel_size: float = 0.03, + confidence_threshold: float = 0.30, + distance_threshold: float = 0.5, + iou_threshold: float = 0.2, + min_detections_for_permanent: int = 3, + pending_ttl_s: float = 10.0, + max_inactive_frames: int = 20, + ) -> None: + self._pending_objects: dict[str, MapObject] = {} + self._permanent_objects: dict[str, MapObject] = {} + self._track_id_map: dict[int, str] = {} + self._lock = threading.RLock() + self._merge_groups: list[set[str]] = [ + {"chair", "sofa", "couch", "armchair"}, + {"table", "desk"}, + ] + + def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats: ... + def _match_observation(self, obs: Observation) -> MapObject | None: ... + def _create_object(self, obs: Observation) -> MapObject: ... + def _check_promotion(self, obj: MapObject) -> None: ... + def _try_merge_objects(self) -> None: ... + def _prune_inactive(self, target_object: str | None) -> list[MapObject]: ... + def get_objects(self, include_pending: bool = False) -> list[MapObject]: ... 
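# Illustrative usage within one mapping cycle (hypothetical call sites;
# `observations` is built by the node from a synchronized detection, cloud,
# and odometry frame; `map_object_to_object_node` is the Phase 7 helper):
#
#   stats = mapper.update(observations, target_object="refrigerator")
#   for obj in mapper.get_objects(include_pending=False):
#       node_msg = map_object_to_object_node(obj, header, status="updated")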
+``` + +--- + +## Phase 5: Mapping Node (mapping_node.py) + +### 5.1 Node Structure + +```python +class SemanticMappingNode(Node): + """ROS2 node for semantic 3D object mapping.""" + + def __init__(self) -> None: + super().__init__("semantic_mapping_node") + self._declare_parameters() + self.mapper = ObjectMapper(...) + self.projection = pointcloud_utils + self.image_saver = ImageSaveQueue(...) + + # Data buffers (with locks) + self._cloud_buffer: deque[tuple[float, np.ndarray]] = deque(maxlen=50) + self._odom_buffer: deque[tuple[float, Pose]] = deque(maxlen=100) + self._lock = threading.Lock() + + self._create_subscriptions() + self._create_publishers() + self.create_timer(0.5, self._mapping_callback) +``` + +### 5.2 Subscriptions + +| Topic | Type | QoS | +|-------|------|-----| +| `/detection_result` | `detect_anything/DetectionResult` | Reliable | +| `/registered_scan` | `PointCloud2` | Best Effort | +| `/state_estimation` | `Odometry` | Best Effort | +| `/object_type_answer` | `ObjectType` | Reliable | +| `/target_object_instruction` | `TargetObjectInstruction` | Reliable | +| `/viewpoint_rep_header` | `ViewpointRep` | Reliable | + +### 5.3 Publishers + +| Topic | Type | Description | +|-------|------|-------------| +| `/object_nodes` | `ObjectNode` | Main output for planner | +| `/obj_points` | `PointCloud2` | Colored object cloud | +| `/obj_boxes` | `MarkerArray` | Bbox wireframes | +| `/obj_labels` | `MarkerArray` | Text labels | +| `/annotated_image` | `Image` | Debug visualization | +| `/object_type_query` | `ObjectType` | VLM query | + +--- + +## Phase 6: Cloud-Image Projection (pointcloud_utils.py) + +```python +def project_pointcloud_to_image(...): ... +def segment_pointcloud_by_mask(...): ... +def segment_pointclouds_from_projection(...): ... +def transform_segmented_clouds_to_world(...): ... +``` + +--- + +## Phase 7: Message Publishers (utils/map_object_utils.py) + +```python +def map_object_to_object_node(obj: MapObject, header: Header, status: str, target_object: str | None = None) -> ObjectNode | None: ... +def create_bbox_markers(objects: list[MapObject], header: Header) -> MarkerArray: ... +def create_label_markers(objects: list[MapObject], header: Header) -> MarkerArray: ... +def create_object_pointcloud(objects: list[MapObject], header: Header) -> PointCloud2 | None: ... +def create_delete_markers(deleted_ids: list[int], header: Header, namespace: str) -> MarkerArray: ... +``` + +Publisher helpers: +- `map_object_to_object_node` converts a `MapObject` into an `ObjectNode` with centroid, bbox corners, and point cloud. +- Marker helpers emit bbox wireframes and text labels for RViz. +- `create_object_pointcloud` builds a colored aggregate cloud from object voxels. + +--- + +## Phase 8: Image Storage (utils/storage.py) + +```python +class ImageSaveQueue: + """Background worker for saving object images to disk.""" + + def __init__(self, output_dir: Path, max_queue_size: int = 100) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... + def queue_save(self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None) -> str: ... +``` + +Image storage: +- Async worker writes `.npy` (and optional `.png`) crops to disk. +- Queue is bounded to avoid blocking the main mapping loop. 
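A usage sketch for the save queue, based only on the signatures above (the import path, object id, and image are placeholder assumptions):

```python
from pathlib import Path

import numpy as np

from semantic_mapping.utils.storage import ImageSaveQueue

saver = ImageSaveQueue(output_dir=Path("output/object_images"), max_queue_size=100)
saver.start()

# Queue a crop for an example object; queue_save returns the save path (per the
# signature above) while the background worker performs the write asynchronously.
crop = np.zeros((64, 64, 3), dtype=np.uint8)
img_path = saver.queue_save("example-object-id", crop)

saver.stop()  # shut down the background worker
```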
+ +--- + +## Phase 9: Launch Files + +```python +# semantic_mapping.launch.py +DeclareLaunchArgument('platform', default_value='mecanum_sim') +DeclareLaunchArgument('detection_topic', default_value='/detection_result') +DeclareLaunchArgument('cloud_topic', default_value='/registered_scan') +DeclareLaunchArgument('odom_topic', default_value='/state_estimation') +DeclareLaunchArgument('voxel_size', default_value='0.03') +DeclareLaunchArgument('confidence_threshold', default_value='0.30') +DeclareLaunchArgument('target_object', default_value='') +``` + +--- + +## Implementation Checklist + +### Completed +- [x] Create `CMakeLists.txt` for message generation +- [x] Port message definitions from tare_planner +- [x] Set up `package.xml` with dependencies +- [x] Implement `map_object.py` with VoxelFeatureManager, MapObject, Observation +- [x] Port lazy computation patterns (centroid, bbox3d, regularization) +- [x] Refactor `map_object.py` to use shared utilities from `vector_perception_utils` +- [x] Add `normalize_angle`, `discretize_angles` to `transform_utils.py` +- [x] Add `get_oriented_bbox` to `pointcloud_utils.py` +- [x] Simplify `_compute_geometry()` using Open3D oriented bbox +- [x] Implement `utils/matching.py` with IoU, distance, merge-compatibility functions +- [x] Implement `mapper.py` with ObjectDB-style tiering +- [x] Implement cloud-image projection helpers in `vector_perception_utils/pointcloud_utils.py` +- [x] Implement `utils/map_object_utils.py` message conversion functions +- [x] Implement `utils/storage.py` with async queue +- [x] Implement `mapping_node.py` following detect_anything patterns + +### Remaining +- [x] Create launch files and params.yaml +- [x] Integration testing with bag files + +--- + +## Code Cleanup Items + +Items from VLM_ROS to preserve: +- Memory fixes with `.copy()` on detection data extraction +- VLM query threshold (`best_image_score > 500`) + +Items to remove: +1. Unused publishers: `/cloud_image` +2. Unused methods: `serialize_objs_to_bag()`, `is_merge_allowed()`, `handle_object_query()` +3. Review constants: `BACKGROUND_OBJECTS`, `VERTICAL_OBJECTS`, `BIG_OBJECTS`, `SMALL_OBJECTS` +4. Don't port: `byte_track`, `Grounded-SAM-2`, `mmconfig` directories + +--- + +## Compatibility Checklist + +- [x] `/object_nodes` publishes ObjectNode with exact schema +- [x] `/obj_points` publishes colored PointCloud2 +- [x] `/obj_boxes` publishes MarkerArray wireframes +- [x] `/obj_labels` publishes MarkerArray text +- [x] `/object_type_query` publishes ObjectType for VLM +- [x] Subscribes to `/detection_result` from detect_anything +- [x] Subscribes to `/registered_scan` PointCloud2 +- [x] Subscribes to `/state_estimation` Odometry +- [x] Subscribes to `/target_object_instruction` +- [x] Subscribes to `/viewpoint_rep_header` +- [x] Subscribes to `/object_type_answer` +- [x] Object positions, bbox corners, status flags match expected semantics +- [x] Delete markers properly remove objects from RViz + +--- + +## Recent Updates (Jan 2026) + +- Unified launch to `semantic_mapping.launch.py` with mode handling, detect_anything integration, and bagfile defaults. +- Fixed cloud/image projection transform to match legacy `CloudImageFusion` math. +- Added mask erosion, depth-jump pruning, distance gating, and light outlier filtering for cleaner object clouds. +- Added reprojected observation-angle voting for better multi-view aggregation. +- Fixed bbox ordering and switched `/obj_boxes` to axis-aligned markers (via `get_oriented_bbox(axis_aligned=True)`). 
+- Replaced DBSCAN regularization with vote-weighted filtering + voxel connected components. +- Added `regularize_voxel_size` to grow connectivity for thin structures. +- Simplified ground filtering to radius-based Z clamp around robot. +- Added `visualization_max_objects` to keep recent objects visible in RViz. +- Logging: mapping stats (total/total_deleted/total_permanent) + mapping loop time. +- Fixed image saving color order and ensured output dir is cleared on start. +- Set MobileCLIP weights cache to `detect_anything/models` and removed corrupted caches. + +## Temporary Handoff Notes + +- Regularization is now connected-components; tune `regularize_voxel_size` and keep ratio + (`MapObject.regularize_shape` percentile) if legs/sparse parts are dropped. +- Ground filter uses `ground_filter`, `ground_radius`, `ground_z_clearance`. +- If bagfile sync issues persist, consider switching mapping inputs to `/sensor_scan` + `/state_estimation_at_scan`. +- Verify VLM objects.yaml path in VLM_ROS (`vlm_reasoning_node` still expects old path). + +--- + +## Planner Dependencies + +The exploration planner (`sensor_coverage_planner_ground.cpp` in tare_planner) is tightly coupled to `ObjectNode`: +- Subscribes to `/object_nodes` and stores per-object reps +- Uses object positions, bbox corners, clouds, status, `is_asked_vlm`, and `img_path` for navigation decisions + +**Implication**: Keep the `ObjectNode` schema stable. Any new fields from the `MapObject` model can be mapped into extensions, but existing fields and semantics must remain for planner compatibility. diff --git a/semantic_mapping/launch/semantic_mapping.launch.py b/semantic_mapping/launch/semantic_mapping.launch.py new file mode 100644 index 0000000..0b7d8ab --- /dev/null +++ b/semantic_mapping/launch/semantic_mapping.launch.py @@ -0,0 +1,137 @@ +"""Launch semantic mapping and detect_anything with a mode selector.""" + +import os + +from ament_index_python.packages import get_package_share_directory +from launch import LaunchDescription +from launch.actions import DeclareLaunchArgument, OpaqueFunction +from launch.substitutions import LaunchConfiguration +from launch_ros.actions import Node + + +def _parse_bool(value: str, default: bool) -> bool: + if value == "": + return default + return value.lower() in ("true", "1", "yes") + + +def _launch_nodes(context): + mode = LaunchConfiguration("mode").perform(context).lower() + use_sim_time_arg = LaunchConfiguration("use_sim_time").perform(context) + use_compressed_arg = LaunchConfiguration("use_compressed").perform(context) + odom_topic_arg = LaunchConfiguration("odom_topic").perform(context) + + use_sim_time = _parse_bool(use_sim_time_arg, default=mode == "bagfile") + use_compressed = _parse_bool(use_compressed_arg, default=mode in ("bagfile", "sim")) + + if odom_topic_arg: + odom_topic = odom_topic_arg + else: + odom_topic = "/aft_mapped_to_init_incremental" if mode == "bagfile" else "/state_estimation" + + params_file = LaunchConfiguration("params_file").perform(context) + sensor_to_camera_tf = LaunchConfiguration("sensor_to_camera_tf").perform(context) + detection_topic = LaunchConfiguration("detection_topic").perform(context) + cloud_topic = LaunchConfiguration("cloud_topic").perform(context) + viewpoint_topic = LaunchConfiguration("viewpoint_topic").perform(context) + object_type_answer_topic = LaunchConfiguration("object_type_answer_topic").perform(context) + object_type_query_topic = LaunchConfiguration("object_type_query_topic").perform(context) + target_object_instruction_topic = 
LaunchConfiguration("target_object_instruction_topic").perform( + context + ) + object_nodes_topic = LaunchConfiguration("object_nodes_topic").perform(context) + obj_points_topic = LaunchConfiguration("obj_points_topic").perform(context) + obj_boxes_topic = LaunchConfiguration("obj_boxes_topic").perform(context) + obj_labels_topic = LaunchConfiguration("obj_labels_topic").perform(context) + map_frame = LaunchConfiguration("map_frame").perform(context) + target_object = LaunchConfiguration("target_object").perform(context) + save_png = _parse_bool(LaunchConfiguration("save_png").perform(context), default=False) + output_dir = LaunchConfiguration("output_dir").perform(context) + image_topic = LaunchConfiguration("image_topic").perform(context) + compressed_image_topic = LaunchConfiguration("compressed_image_topic").perform(context) + conf = float(LaunchConfiguration("conf").perform(context)) + max_area_ratio = float(LaunchConfiguration("max_area_ratio").perform(context)) + + detect_anything_node = Node( + package="detect_anything", + executable="detection_node", + name="detect_anything_detection_node", + output="screen", + parameters=[ + { + "use_sim_time": use_sim_time, + "use_compressed": use_compressed, + "image_topic": image_topic, + "compressed_image_topic": compressed_image_topic, + "conf": conf, + "max_area_ratio": max_area_ratio, + } + ], + ) + + semantic_mapping_node = Node( + package="semantic_mapping", + executable="semantic_mapping_node", + name="semantic_mapping_node", + output="screen", + parameters=[ + params_file, + { + "use_sim_time": use_sim_time, + "sensor_to_camera_tf": sensor_to_camera_tf, + "detection_topic": detection_topic, + "cloud_topic": cloud_topic, + "odom_topic": odom_topic, + "viewpoint_topic": viewpoint_topic, + "object_type_answer_topic": object_type_answer_topic, + "object_type_query_topic": object_type_query_topic, + "target_object_instruction_topic": target_object_instruction_topic, + "object_nodes_topic": object_nodes_topic, + "obj_points_topic": obj_points_topic, + "obj_boxes_topic": obj_boxes_topic, + "obj_labels_topic": obj_labels_topic, + "map_frame": map_frame, + "target_object": target_object, + "save_png": save_png, + "output_dir": output_dir, + }, + ], + ) + + return [detect_anything_node, semantic_mapping_node] + + +def generate_launch_description() -> LaunchDescription: + pkg_share = get_package_share_directory("semantic_mapping") + default_params = os.path.join(pkg_share, "config", "params.yaml") + default_sensor_tf = os.path.join(pkg_share, "config", "sensor_to_camera_tf.yaml") + + return LaunchDescription( + [ + DeclareLaunchArgument("mode", default_value="sim"), + DeclareLaunchArgument("use_sim_time", default_value=""), + DeclareLaunchArgument("use_compressed", default_value=""), + DeclareLaunchArgument("params_file", default_value=default_params), + DeclareLaunchArgument("sensor_to_camera_tf", default_value=default_sensor_tf), + DeclareLaunchArgument("detection_topic", default_value="/detection_result"), + DeclareLaunchArgument("cloud_topic", default_value="/registered_scan"), + DeclareLaunchArgument("odom_topic", default_value=""), + DeclareLaunchArgument("viewpoint_topic", default_value="/viewpoint_rep_header"), + DeclareLaunchArgument("object_type_answer_topic", default_value="/object_type_answer"), + DeclareLaunchArgument("object_type_query_topic", default_value="/object_type_query"), + DeclareLaunchArgument("target_object_instruction_topic", default_value="/target_object_instruction"), + DeclareLaunchArgument("object_nodes_topic", 
default_value="/object_nodes"), + DeclareLaunchArgument("obj_points_topic", default_value="/obj_points"), + DeclareLaunchArgument("obj_boxes_topic", default_value="/obj_boxes"), + DeclareLaunchArgument("obj_labels_topic", default_value="/obj_labels"), + DeclareLaunchArgument("map_frame", default_value="map"), + DeclareLaunchArgument("target_object", default_value="refrigerator"), + DeclareLaunchArgument("save_png", default_value="false"), + DeclareLaunchArgument("output_dir", default_value="output/object_images"), + DeclareLaunchArgument("image_topic", default_value="/camera/image"), + DeclareLaunchArgument("compressed_image_topic", default_value="/camera/image/compressed"), + DeclareLaunchArgument("conf", default_value="0.35"), + DeclareLaunchArgument("max_area_ratio", default_value="0.3"), + OpaqueFunction(function=_launch_nodes), + ] + ) diff --git a/semantic_mapping/msg/ObjectNode.msg b/semantic_mapping/msg/ObjectNode.msg new file mode 100644 index 0000000..b2d7686 --- /dev/null +++ b/semantic_mapping/msg/ObjectNode.msg @@ -0,0 +1,35 @@ +# Semantic object representation for planner integration. +# +# Each message represents a single object in the semantic map with its +# 3D bounding box, point cloud, and metadata for VLM interaction. + +std_msgs/Header header + +# Object identity - may contain multiple merged track IDs +int32[] object_id + +# Dominant class label (by weighted vote) +string label + +# Object centroid in world frame +geometry_msgs/Point position + +# 3D bounding box corners (8 points in world frame) +# Order: [min_x,min_y,min_z], [max_x,min_y,min_z], [max_x,max_y,min_z], [min_x,max_y,min_z], +# [min_x,min_y,max_z], [max_x,min_y,max_z], [max_x,max_y,max_z], [min_x,max_y,max_z] +geometry_msgs/Point[8] bbox3d + +# Object point cloud in world frame +sensor_msgs/PointCloud2 cloud + +# Lifecycle status: "new", "updated", "unchanged" +string status + +# Path to best saved image crop of this object +string img_path + +# VLM interaction flags +bool is_asked_vlm + +# Associated viewpoint ID (for viewpoint-triggered saves) +int32 viewpoint_id diff --git a/semantic_mapping/msg/ObjectType.msg b/semantic_mapping/msg/ObjectType.msg new file mode 100644 index 0000000..ab02ea6 --- /dev/null +++ b/semantic_mapping/msg/ObjectType.msg @@ -0,0 +1,17 @@ +# VLM query/answer message for object relabeling. +# +# Used bidirectionally: +# - Query (mapping -> VLM): object_id, img_path, labels[] populated +# - Answer (VLM -> mapping): final_label populated with VLM decision + +# Object ID to query/update +int32 object_id + +# Path to object image for VLM analysis +string img_path + +# VLM-determined final label (populated in answer) +string final_label + +# Candidate labels from detection (populated in query) +string[] labels diff --git a/semantic_mapping/msg/TargetObjectInstruction.msg b/semantic_mapping/msg/TargetObjectInstruction.msg new file mode 100644 index 0000000..1a88005 --- /dev/null +++ b/semantic_mapping/msg/TargetObjectInstruction.msg @@ -0,0 +1,22 @@ +# User instruction for target object search. +# +# Specifies the target object to find and optional spatial/attribute +# conditions relative to an anchor object. 
+
+# Primary target object label to search for
+string target_object
+
+# Room condition (e.g., "in the kitchen")
+string room_condition
+
+# Spatial condition relative to anchor (e.g., "next to", "on top of")
+string spatial_condition
+
+# Attribute condition for target (e.g., "red", "large")
+string attribute_condition
+
+# Anchor object for spatial reference
+string anchor_object
+
+# Attribute condition for anchor object
+string attribute_condition_anchor
diff --git a/semantic_mapping/msg/ViewpointRep.msg b/semantic_mapping/msg/ViewpointRep.msg
new file mode 100644
index 0000000..5bb1bfe
--- /dev/null
+++ b/semantic_mapping/msg/ViewpointRep.msg
@@ -0,0 +1,8 @@
+# Viewpoint trigger message for image saving.
+#
+# Published by planner to request saving images at specific viewpoints.
+
+std_msgs/Header header
+
+# Unique viewpoint identifier
+int32 viewpoint_id
diff --git a/semantic_mapping/package.xml b/semantic_mapping/package.xml
new file mode 100644
index 0000000..4804d1a
--- /dev/null
+++ b/semantic_mapping/package.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+  <name>semantic_mapping</name>
+  <version>0.0.1</version>
+  <description>Semantic 3D object mapping node with VLM integration.</description>
+  <maintainer>Alex Lin</maintainer>
+  <license>MIT</license>
+
+  <buildtool_depend>ament_cmake</buildtool_depend>
+
+  <buildtool_depend>rosidl_default_generators</buildtool_depend>
+  <exec_depend>rosidl_default_runtime</exec_depend>
+  <member_of_group>rosidl_interface_packages</member_of_group>
+
+  <depend>rclpy</depend>
+  <depend>std_msgs</depend>
+  <depend>sensor_msgs</depend>
+  <depend>geometry_msgs</depend>
+  <depend>nav_msgs</depend>
+  <depend>visualization_msgs</depend>
+  <depend>cv_bridge</depend>
+  <depend>detect_anything</depend>
+  <depend>vector_perception_utils</depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/semantic_mapping/resource/semantic_mapping b/semantic_mapping/resource/semantic_mapping
new file mode 100644
index 0000000..e69de29
diff --git a/semantic_mapping/scripts/semantic_mapping_node b/semantic_mapping/scripts/semantic_mapping_node
new file mode 100755
index 0000000..0312e88
--- /dev/null
+++ b/semantic_mapping/scripts/semantic_mapping_node
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+"""Entry point for semantic_mapping_node."""
+
+from semantic_mapping.mapping_node import main
+
+if __name__ == "__main__":
+    main()
diff --git a/semantic_mapping/semantic_mapping/__init__.py b/semantic_mapping/semantic_mapping/__init__.py
new file mode 100644
index 0000000..8df6f20
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/__init__.py
@@ -0,0 +1 @@
+"""Semantic 3D object mapping for vector_perception_ros."""
diff --git a/semantic_mapping/semantic_mapping/map_object.py b/semantic_mapping/semantic_mapping/map_object.py
new file mode 100644
index 0000000..a9e3633
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/map_object.py
@@ -0,0 +1,705 @@
+"""Data models for semantic mapping.
+ +This module defines the core data structures for the semantic mapping system: +- VoxelFeatureManager: Manages voxelized point clouds with observation angle voting +- MapObject: Represents a tracked object in the semantic map +- Observation: A single detection observation +- ObjectStatus: Enum for object lifecycle status +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum +from typing import Optional + +import numpy as np +from scipy.spatial import cKDTree + +from vector_perception_utils.transform_utils import discretize_angles, normalize_angle +from vector_perception_utils.pointcloud_utils import ( + compute_centroid, + create_pointcloud, +) + + +class ObjectStatus(Enum): + """Object lifecycle status (ObjectDB-style tiering).""" + + PENDING = "pending" # < min_detections threshold + PERMANENT = "permanent" # >= threshold, confirmed object + DELETED = "deleted" # Marked for removal + + +@dataclass +class Observation: + """A single detection observation to update a MapObject. + + Attributes: + track_id: Tracker ID from detection. + label: Class label string. + confidence: Detection confidence score. + points: World-frame point cloud (N, 3). + robot_R: Robot rotation matrix (3, 3) at observation time. + robot_t: Robot translation vector (3,) at observation time. + mask: Optional 2D segmentation mask. + image: Optional RGB image crop for best-image selection. + timestamp: Observation timestamp in seconds. + """ + + track_id: int + label: str + confidence: float + points: np.ndarray + robot_R: np.ndarray + robot_t: np.ndarray + mask: Optional[np.ndarray] = None + image: Optional[np.ndarray] = None + timestamp: float = 0.0 + + +class VoxelFeatureManager: + """Manages voxelized point cloud with observation angle voting. + + This class tracks 3D voxels representing an object, along with: + - Vote counts per voxel (how many times each voxel was observed) + - Observation angles (which directions each voxel was seen from) + - Regularization mask (which voxels pass shape filtering) + + The observation angle voting helps distinguish between reliable points + (seen from multiple angles) and spurious points (seen from only one angle). + + Attributes: + voxels: Point coordinates (N, 3) in world frame. + votes: Detection count per voxel (N,). + observation_angles: Binary angle histogram per voxel (N, num_bins). + regularized_mask: Boolean mask for valid voxels after clustering (N,). + voxel_size: Size of voxel grid cells in meters. + num_angle_bins: Number of discretized angle bins. + """ + + def __init__( + self, + points: np.ndarray, + voxel_size: float, + robot_R: np.ndarray, + robot_t: np.ndarray, + num_angle_bins: int = 15, + ) -> None: + """Initialize VoxelFeatureManager with initial points. + + Args: + points: Initial point cloud (N, 3) in world frame. + voxel_size: Voxel grid cell size in meters. + robot_R: Robot rotation matrix (3, 3) at observation time. + robot_t: Robot translation vector (3,) at observation time. + num_angle_bins: Number of angle bins for observation tracking. 
+ """ + num_points = points.shape[0] + + # Compute observation angles from robot position + vectors = points - robot_t + obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0]) + obs_bins = discretize_angles(obs_angles, num_angle_bins) + + self.voxels = points.copy() + self.voxel_size = voxel_size + self.num_angle_bins = num_angle_bins + self._tree = cKDTree(points) + + # Initialize vote counts + self.votes = np.ones(num_points, dtype=np.float32) + + # Initialize observation angle histogram (binary: seen/not seen from each angle) + self.observation_angles = np.zeros((num_points, num_angle_bins), dtype=np.uint8) + self.observation_angles[np.arange(num_points), obs_bins] = 1 + + # Regularization mask (all False until regularize_shape is called) + self.regularized_mask = np.zeros(num_points, dtype=bool) + + # Remove vote for dynamic object clearing (not commonly used) + self._remove_votes = np.zeros(num_points, dtype=np.int32) + + def update( + self, + new_points: np.ndarray, + robot_R: np.ndarray, + robot_t: np.ndarray, + ) -> None: + """Add new points with KDTree-based merge and angle voting. + + Points within voxel_size of existing points are merged (votes incremented). + Points farther away are added as new voxels. + + Args: + new_points: New point cloud (M, 3) in world frame. + robot_R: Robot rotation matrix (3, 3) at observation time. + robot_t: Robot translation vector (3,) at observation time. + """ + if new_points.shape[0] == 0: + return + + # Compute observation angles for new points + vectors = new_points - robot_t + obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0]) + obs_bins = discretize_angles(obs_angles, self.num_angle_bins) + + # Find nearest existing voxels + distances, indices = self._tree.query(new_points) + + # Merge points within voxel_size + merge_mask = distances < self.voxel_size + merge_indices = indices[merge_mask] + merge_bins = obs_bins[merge_mask] + + self.votes[merge_indices] += 1 + self.observation_angles[merge_indices, merge_bins] = 1 + + # Add new points that don't merge + new_mask = ~merge_mask + if np.any(new_mask): + new_voxels = new_points[new_mask] + new_bins = obs_bins[new_mask] + num_new = new_voxels.shape[0] + + # Extend arrays + self.voxels = np.vstack([self.voxels, new_voxels]) + self.votes = np.concatenate([self.votes, np.ones(num_new, dtype=np.float32)]) + + new_obs = np.zeros((num_new, self.num_angle_bins), dtype=np.uint8) + new_obs[np.arange(num_new), new_bins] = 1 + self.observation_angles = np.vstack([self.observation_angles, new_obs]) + + self.regularized_mask = np.concatenate( + [self.regularized_mask, np.zeros(num_new, dtype=bool)] + ) + self._remove_votes = np.concatenate( + [self._remove_votes, np.zeros(num_new, dtype=np.int32)] + ) + + # Rebuild KDTree + self._tree = cKDTree(self.voxels) + + def update_observation_angles(self, voxel_indices: np.ndarray, obs_bins: np.ndarray) -> None: + """Update observation angle histogram for selected voxels.""" + if voxel_indices.size == 0: + return + self.observation_angles[voxel_indices, obs_bins] = 1 + + def merge_from(self, other: VoxelFeatureManager) -> None: + """Merge voxels from another VoxelFeatureManager. + + Args: + other: Another VoxelFeatureManager to merge from. 
+ """ + if other.voxels.shape[0] == 0: + return + + distances, indices = self._tree.query(other.voxels) + + # Merge overlapping voxels + merge_mask = distances < self.voxel_size + merge_indices = indices[merge_mask] + + self.votes[merge_indices] += other.votes[merge_mask] + self.observation_angles[merge_indices] = np.logical_or( + self.observation_angles[merge_indices], + other.observation_angles[merge_mask], + ).astype(np.uint8) + + # Add non-overlapping voxels + new_mask = ~merge_mask + if np.any(new_mask): + self.voxels = np.vstack([self.voxels, other.voxels[new_mask]]) + self.votes = np.concatenate([self.votes, other.votes[new_mask]]) + self.observation_angles = np.vstack( + [self.observation_angles, other.observation_angles[new_mask]] + ) + self.regularized_mask = np.concatenate( + [self.regularized_mask, other.regularized_mask[new_mask]] + ) + self._remove_votes = np.concatenate( + [self._remove_votes, np.zeros(np.sum(new_mask), dtype=np.int32)] + ) + self._tree = cKDTree(self.voxels) + + def get_valid_indices( + self, + diversity_percentile: float = 0.3, + use_regularized: bool = True, + ) -> np.ndarray: + """Get indices of valid voxels based on observation angle diversity. + + Voxels are ranked by their observation angle diversity (number of angles + from which they were observed), weighted by vote count. The top percentile + of voxels are returned. + + Args: + diversity_percentile: Fraction of total weight to include (0-1). + use_regularized: If True, only consider voxels in regularized_mask. + + Returns: + Indices of valid voxels. + """ + if use_regularized: + mask = self.regularized_mask + if not np.any(mask): + return np.array([], dtype=int) + obs_angles = self.observation_angles[mask] + votes = self.votes[mask] + else: + obs_angles = self.observation_angles + votes = self.votes + + if len(obs_angles) == 0: + return np.array([], dtype=int) + + # Compute diversity score (number of angles observed) + angle_diversity = np.sum(obs_angles, axis=1) + + # Sort by diversity (ascending) + sorted_indices = np.argsort(angle_diversity) + sorted_diversity = angle_diversity[sorted_indices] + sorted_weights = votes[sorted_indices] + + # Weight by diversity * votes + weighted = sorted_diversity * sorted_weights + total_weight = np.sum(weighted) + + if total_weight == 0: + return np.array([], dtype=int) + + # Find cutoff index for percentile + target_weight = (1 - diversity_percentile) * total_weight + cumsum = np.cumsum(weighted) + cutoff_idx = np.searchsorted(cumsum, target_weight) + + # Return indices above cutoff (higher diversity) + return sorted_indices[cutoff_idx:] + + def get_valid_voxels( + self, + diversity_percentile: float = 0.3, + use_regularized: bool = True, + ) -> np.ndarray: + """Get valid voxel coordinates. + + Args: + diversity_percentile: Fraction of total weight to include (0-1). + use_regularized: If True, only consider voxels in regularized_mask. + + Returns: + Valid voxel coordinates (M, 3). 
+ """ + indices = self.get_valid_indices(diversity_percentile, use_regularized) + if use_regularized and np.any(self.regularized_mask): + voxels = self.voxels[self.regularized_mask] + else: + voxels = self.voxels + + if len(indices) == 0: + return np.empty((0, 3), dtype=np.float32) + return voxels[indices] + + @property + def num_voxels(self) -> int: + """Total number of voxels.""" + return self.voxels.shape[0] + + @property + def num_regularized(self) -> int: + """Number of voxels passing regularization.""" + return int(np.sum(self.regularized_mask)) + + +@dataclass +class MapObject: + """Represents a tracked object in the semantic map. + + This class combines patterns from VLM_ROS SingleObject and dimos Object, + with ObjectDB-style lifecycle management. + + Attributes: + object_id: Unique identifier for this object (UUID string). + track_ids: List of associated tracker IDs (can merge multiple). + voxel_manager: Manages the object's voxelized point cloud. + class_votes: Label -> observation count mapping. + confidence_scores: Label -> best confidence score mapping. + points_count: Label -> total points count mapping. + robot_poses: List of robot poses when object was observed. + detection_count: Total number of detections. + inactive_frames: Frames since last update. + last_update_time: Timestamp of last update. + status: Object lifecycle status. + is_asked_vlm: Whether VLM has been queried for this object. + updated_by_vlm: Whether object was relabeled by VLM. + best_image_path: Path to best saved image crop. + best_image_score: Score of best saved image. + centroid: Cached centroid position (None if needs recompute). + bbox3d_corners: Cached 3D bbox corners (None if needs recompute). + """ + + object_id: str + track_ids: list[int] + voxel_manager: VoxelFeatureManager + + # Classification (voted) + class_votes: dict[str, int] = field(default_factory=dict) + confidence_scores: dict[str, float] = field(default_factory=dict) + points_count: dict[str, int] = field(default_factory=dict) + + # Robot poses at observation times + robot_poses: list[dict] = field(default_factory=list) + + # Lifecycle (ObjectDB-style) + detection_count: int = 1 + inactive_frames: int = 0 + last_update_time: float = 0.0 + status: ObjectStatus = ObjectStatus.PENDING + + # VLM interaction + is_asked_vlm: bool = False + updated_by_vlm: bool = False + + # Image storage + best_image_path: Optional[str] = None + best_image_score: float = 0.0 + + # Cached geometry (None means needs recompute) + _centroid: Optional[np.ndarray] = field(default=None, repr=False) + _bbox3d_corners: Optional[np.ndarray] = field(default=None, repr=False) + _needs_recompute: bool = field(default=True, repr=False) + + # Configuration + _diversity_percentile: float = field(default=0.85, repr=False) + _angle_threshold: float = field(default=np.deg2rad(5), repr=False) + _distance_threshold: float = field(default=0.3, repr=False) + _needs_regularization: bool = field(default=True, repr=False) + _regularize_voxel_size: float = field(default=0.06, repr=False) + + @classmethod + def from_observation( + cls, + object_id: str, + obs: Observation, + voxel_size: float = 0.03, + regularize_voxel_size: float = 0.06, + num_angle_bins: int = 15, + ) -> MapObject: + """Create a new MapObject from an initial observation. + + Args: + object_id: Unique identifier for this object. + obs: Initial observation. + voxel_size: Voxel grid cell size in meters. + regularize_voxel_size: Voxel size for regularization clustering. 
+ num_angle_bins: Number of angle bins for observation tracking. + + Returns: + New MapObject instance. + """ + voxel_manager = VoxelFeatureManager( + points=obs.points, + voxel_size=voxel_size, + robot_R=obs.robot_R, + robot_t=obs.robot_t, + num_angle_bins=num_angle_bins, + ) + + obj = cls( + object_id=object_id, + track_ids=[obs.track_id], + voxel_manager=voxel_manager, + class_votes={obs.label: 1}, + confidence_scores={obs.label: obs.confidence}, + points_count={obs.label: len(obs.points)}, + robot_poses=[{"R": obs.robot_R.copy(), "t": obs.robot_t.copy()}], + last_update_time=obs.timestamp, + _regularize_voxel_size=regularize_voxel_size, + ) + return obj + + def _invalidate_cache(self) -> None: + """Mark cached geometry as needing recompute.""" + self._needs_recompute = True + self._centroid = None + self._bbox3d_corners = None + + def _is_similar_pose(self, new_R: np.ndarray, new_t: np.ndarray) -> bool: + """Check if a new pose is similar to existing poses. + + Used to avoid redundant updates from the same viewpoint. + Compares observation angle and distance to object center. + + Args: + new_R: New robot rotation matrix (3, 3). Not used but kept for API. + new_t: New robot translation vector (3,). + + Returns: + True if pose is similar to an existing pose. + """ + _ = new_R # Unused but kept for API consistency + obj_center = self.centroid + if obj_center is None: + obj_center = compute_centroid(self.voxel_manager.voxels) + + for pose in self.robot_poses: + old_t = pose["t"] # R not used for similarity check + + old_to_obj = obj_center - old_t + new_to_obj = obj_center - new_t + + old_angle = np.arctan2(old_to_obj[1], old_to_obj[0]) + new_angle = np.arctan2(new_to_obj[1], new_to_obj[0]) + angle_diff = abs(normalize_angle(np.array([new_angle - old_angle]))[0]) + + old_dist = np.linalg.norm(old_to_obj) + new_dist = np.linalg.norm(new_to_obj) + dist_diff = abs(new_dist - old_dist) + + if angle_diff < self._angle_threshold and dist_diff < self._distance_threshold: + return True + + return False + + def update(self, obs: Observation) -> None: + """Update object with a new observation. + + Args: + obs: New observation to incorporate. + """ + # Update voxel manager + self.voxel_manager.update(obs.points, obs.robot_R, obs.robot_t) + self.detection_count += 1 + self.inactive_frames = 0 + self.last_update_time = obs.timestamp + self._needs_regularization = True + + # Check if pose is new (not similar to existing) + is_similar = self._is_similar_pose(obs.robot_R, obs.robot_t) + + if not is_similar: + # Update class statistics + self.class_votes[obs.label] = self.class_votes.get(obs.label, 0) + 1 + self.confidence_scores[obs.label] = max( + self.confidence_scores.get(obs.label, 0), obs.confidence + ) + self.points_count[obs.label] = ( + self.points_count.get(obs.label, 0) + len(obs.points) + ) + + # Add pose + self.robot_poses.append({"R": obs.robot_R.copy(), "t": obs.robot_t.copy()}) + + # Add track ID if new + if obs.track_id not in self.track_ids: + self.track_ids.append(obs.track_id) + + self._invalidate_cache() + + def merge(self, other: MapObject) -> None: + """Merge another MapObject into this one. + + Args: + other: Object to merge from. 
+ """ + # Merge track IDs + for tid in other.track_ids: + if tid not in self.track_ids: + self.track_ids.append(tid) + + # Merge voxel managers + self.voxel_manager.merge_from(other.voxel_manager) + self._needs_regularization = True + + # Merge class statistics + for label, count in other.class_votes.items(): + self.class_votes[label] = self.class_votes.get(label, 0) + count + self.confidence_scores[label] = max( + self.confidence_scores.get(label, 0), + other.confidence_scores.get(label, 0), + ) + self.points_count[label] = ( + self.points_count.get(label, 0) + other.points_count.get(label, 0) + ) + + # Merge poses (only unique ones) + for pose in other.robot_poses: + if not self._is_similar_pose(pose["R"], pose["t"]): + self.robot_poses.append(pose) + + # Update lifecycle + self.detection_count += other.detection_count + self.last_update_time = max(self.last_update_time, other.last_update_time) + self.is_asked_vlm = self.is_asked_vlm or other.is_asked_vlm + + self._invalidate_cache() + + @property + def label(self) -> str: + """Get dominant class label by weighted vote. + + Returns: + Most likely class label based on points_count * confidence. + """ + if not self.class_votes: + return "unknown" + + # Compute weighted scores + weighted_scores = {} + for lbl in self.class_votes: + pts = self.points_count.get(lbl, 0) + conf = self.confidence_scores.get(lbl, 0) + weighted_scores[lbl] = pts * conf + + return max(weighted_scores, key=weighted_scores.get) + + @property + def confidence(self) -> float: + """Get confidence of the dominant label.""" + return self.confidence_scores.get(self.label, 0.0) + + @property + def centroid(self) -> Optional[np.ndarray]: + """Compute centroid weighted by observation angles. + + Returns: + Centroid position (3,) or None if no valid voxels. + """ + if self._needs_recompute or self._centroid is None: + self._compute_geometry() + return self._centroid + + @property + def bbox3d_corners(self) -> Optional[np.ndarray]: + """Compute axis-aligned 3D bounding box corners. + + Returns: + 8 corner points (8, 3) or None if no valid voxels. + """ + if self._needs_recompute or self._bbox3d_corners is None: + self._compute_geometry() + return self._bbox3d_corners + + def _compute_geometry(self) -> None: + """Compute cached centroid and bbox from valid voxels using Open3D. + + Returns: + None. Updates cached centroid and bbox in-place. 
+ """ + # Get valid voxels (regularized first, then fallback to all) + valid_voxels = self.voxel_manager.get_valid_voxels( + diversity_percentile=self._diversity_percentile, + use_regularized=True, + ) + if len(valid_voxels) == 0: + valid_voxels = self.voxel_manager.get_valid_voxels( + diversity_percentile=self._diversity_percentile, + use_regularized=False, + ) + + if len(valid_voxels) < 4: + self._centroid = compute_centroid(valid_voxels) if len(valid_voxels) > 0 else None + self._bbox3d_corners = None + else: + pcd = create_pointcloud(valid_voxels) + obb = pcd.get_minimal_oriented_bounding_box() + self._centroid = np.asarray(obb.center) + half_extent = 0.5 * np.asarray(obb.extent) + local_corners = np.array( + [ + [-half_extent[0], -half_extent[1], -half_extent[2]], + [half_extent[0], -half_extent[1], -half_extent[2]], + [half_extent[0], half_extent[1], -half_extent[2]], + [-half_extent[0], half_extent[1], -half_extent[2]], + [-half_extent[0], -half_extent[1], half_extent[2]], + [half_extent[0], -half_extent[1], half_extent[2]], + [half_extent[0], half_extent[1], half_extent[2]], + [-half_extent[0], half_extent[1], half_extent[2]], + ], + dtype=float, + ) + self._bbox3d_corners = local_corners @ np.asarray(obb.R).T + np.asarray(obb.center) + + self._needs_recompute = False + + def regularize_shape(self, percentile: float = 0.85) -> None: + """Apply vote-weighted regularization with light outlier removal.""" + if not self._needs_regularization: + return + voxels = self.voxel_manager.voxels + if len(voxels) < 3: + self.voxel_manager.regularized_mask = np.ones(len(voxels), dtype=bool) + self._needs_regularization = False + self._invalidate_cache() + return + + weights = np.sum(self.voxel_manager.observation_angles, axis=1) * self.voxel_manager.votes + keep_ratio = percentile if percentile is not None else 1.0 + if keep_ratio >= 1.0: + keep_mask = np.ones(voxels.shape[0], dtype=bool) + else: + cutoff = float(np.quantile(weights, 1.0 - keep_ratio)) + keep_mask = weights >= cutoff + + if not np.any(keep_mask): + keep_mask = np.ones(voxels.shape[0], dtype=bool) + + selected_indices = np.nonzero(keep_mask)[0] + points_sel = voxels[selected_indices] + weights_sel = weights[selected_indices] + if points_sel.shape[0] >= 3: + voxel_size = self._regularize_voxel_size + keys = np.floor(points_sel / voxel_size).astype(np.int32) + voxel_map: dict[tuple[int, int, int], list[int]] = {} + for idx, key in enumerate(map(tuple, keys)): + voxel_map.setdefault(key, []).append(idx) + + neighbor_offsets = [ + (dx, dy, dz) + for dx in (-1, 0, 1) + for dy in (-1, 0, 1) + for dz in (-1, 0, 1) + if not (dx == 0 and dy == 0 and dz == 0) + ] + visited: set[tuple[int, int, int]] = set() + components: list[tuple[np.ndarray, float, np.ndarray]] = [] + for key in voxel_map: + if key in visited: + continue + stack = [key] + visited.add(key) + comp_indices: list[int] = [] + while stack: + current = stack.pop() + comp_indices.extend(voxel_map[current]) + for dx, dy, dz in neighbor_offsets: + neighbor = (current[0] + dx, current[1] + dy, current[2] + dz) + if neighbor in voxel_map and neighbor not in visited: + visited.add(neighbor) + stack.append(neighbor) + + comp_idx = np.array(comp_indices, dtype=int) + comp_weight = float(np.sum(weights_sel[comp_idx])) + comp_centroid = compute_centroid(points_sel[comp_idx]) + components.append((comp_idx, comp_weight, comp_centroid)) + + if components: + main_idx = int(np.argmax([c[1] for c in components])) + main_points = points_sel[components[main_idx][0]] + main_centroid = 
components[main_idx][2] + extent = np.max(main_points, axis=0) - np.min(main_points, axis=0) + dist_thresh = max(0.3, 0.5 * float(np.max(extent))) + + keep_sel_mask = np.zeros(points_sel.shape[0], dtype=bool) + for comp_idx, _, comp_centroid in components: + if np.linalg.norm(comp_centroid - main_centroid) <= dist_thresh: + keep_sel_mask[comp_idx] = True + if np.any(keep_sel_mask): + keep_mask = np.zeros(voxels.shape[0], dtype=bool) + keep_mask[selected_indices[keep_sel_mask]] = True + + self.voxel_manager.regularized_mask = keep_mask + + self._needs_regularization = False + self._invalidate_cache() diff --git a/semantic_mapping/semantic_mapping/mapper.py b/semantic_mapping/semantic_mapping/mapper.py new file mode 100644 index 0000000..cc67cf1 --- /dev/null +++ b/semantic_mapping/semantic_mapping/mapper.py @@ -0,0 +1,275 @@ +"""Object map manager for semantic mapping.""" + +from __future__ import annotations + +from dataclasses import dataclass +import threading +import time +import uuid + +from .utils.matching import compute_bbox, compute_iou_3d_with_ratios, find_best_match +from .map_object import MapObject, ObjectStatus, Observation + + +@dataclass +class UpdateStats: + """Summary of updates applied in a mapper step.""" + + created: int = 0 + updated: int = 0 + merged: int = 0 + deleted: int = 0 + promoted: int = 0 + total: int = 0 + + +class ObjectMapper: + """Manages the semantic object map with two-tier state management.""" + + def __init__( + self, + voxel_size: float = 0.03, + regularize_voxel_size: float = 0.06, + confidence_threshold: float = 0.30, + distance_threshold: float = 0.5, + iou_threshold: float = 0.2, + min_detections_for_permanent: int = 3, + pending_ttl_s: float = 10.0, + num_angle_bins: int = 15, + ) -> None: + """Initialize the object mapper and its thresholds. + + Args: + voxel_size: Voxel grid cell size in meters. + regularize_voxel_size: Voxel size for regularization clustering. + confidence_threshold: Minimum detection confidence to accept. + distance_threshold: Max centroid distance for spatial matching. + iou_threshold: IoU threshold for merge checks. + min_detections_for_permanent: Detections before promotion. + pending_ttl_s: Time-to-live for pending objects. + num_angle_bins: Observation angle bins for voxel manager. + """ + self._pending_objects: dict[str, MapObject] = {} + self._permanent_objects: dict[str, MapObject] = {} + self._track_id_map: dict[int, str] = {} + self._lock = threading.RLock() + + self._voxel_size = voxel_size + self._regularize_voxel_size = regularize_voxel_size + self._confidence_threshold = confidence_threshold + self._distance_threshold = distance_threshold + self._iou_threshold = iou_threshold + self._min_detections_for_permanent = min_detections_for_permanent + self._pending_ttl_s = pending_ttl_s + self._num_angle_bins = num_angle_bins + + def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats: + """Update the map with new observations. + + Args: + observations: List of new observations to integrate. + target_object: Optional label to protect from pruning. + + Returns: + UpdateStats summarizing changes applied. 
+ """ + stats = UpdateStats() + now = max((obs.timestamp for obs in observations), default=time.time()) + updated_ids: set[str] = set() + regularized_ids: set[str] = set() + + with self._lock: + for obs in observations: + if obs.confidence < self._confidence_threshold: + continue + + obj = self._match_observation(obs) + if obj is None: + obj = self._create_object(obs) + self._pending_objects[obj.object_id] = obj + stats.created += 1 + else: + obj.update(obs) + stats.updated += 1 + + updated_ids.add(obj.object_id) + for track_id in obj.track_ids: + self._track_id_map[track_id] = obj.object_id + + if obj.status == ObjectStatus.PENDING: + if self._check_promotion(obj): + stats.promoted += 1 + + if obj.object_id not in regularized_ids and obj.inactive_frames == 0: + obj.regularize_shape(percentile=obj._diversity_percentile) + regularized_ids.add(obj.object_id) + + for obj in list(self._pending_objects.values()) + list(self._permanent_objects.values()): + if obj.object_id not in updated_ids: + obj.inactive_frames += 1 + + deleted = self._prune_inactive(target_object, now) + stats.deleted = len(deleted) + stats.merged += self.merge_objects() + stats.total = len(self._permanent_objects) + + return stats + + def _match_observation(self, obs: Observation) -> MapObject | None: + """Match an observation to an existing object.""" + if obs.track_id in self._track_id_map: + obj_id = self._track_id_map[obs.track_id] + if obj_id in self._permanent_objects: + return self._permanent_objects[obj_id] + if obj_id in self._pending_objects: + return self._pending_objects[obj_id] + + candidates = list(self._permanent_objects.values()) + list(self._pending_objects.values()) + return find_best_match( + obs=obs, + candidates=candidates, + distance_threshold=self._distance_threshold, + iou_threshold=self._iou_threshold, + ) + + def _create_object(self, obs: Observation) -> MapObject: + """Create a new MapObject from an observation.""" + obj = MapObject.from_observation( + object_id=str(uuid.uuid4()), + obs=obs, + voxel_size=self._voxel_size, + regularize_voxel_size=self._regularize_voxel_size, + num_angle_bins=self._num_angle_bins, + ) + self._track_id_map[obs.track_id] = obj.object_id + return obj + + def _check_promotion(self, obj: MapObject) -> bool: + """Promote object from pending to permanent if threshold met.""" + if obj.status != ObjectStatus.PENDING: + return False + if obj.detection_count < self._min_detections_for_permanent: + return False + if obj.object_id in self._pending_objects: + self._pending_objects.pop(obj.object_id) + obj.status = ObjectStatus.PERMANENT + self._permanent_objects[obj.object_id] = obj + return True + + def merge_objects(self) -> int: + """Merge overlapping permanent objects. + + Returns: + Number of merges performed. 
+ """ + merged = 0 + objs = list(self._permanent_objects.values()) + i = 0 + while i < len(objs): + obj = objs[i] + j = i + 1 + while j < len(objs): + other = objs[j] + should_merge = False + + if set(obj.track_ids) & set(other.track_ids): + should_merge = True + else: + if obj.centroid is None or other.centroid is None: + j += 1 + continue + dist = float(((obj.centroid - other.centroid) ** 2).sum() ** 0.5) + if dist > self._distance_threshold: + j += 1 + continue + + obj_points = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=True, + ) + if len(obj_points) == 0: + obj_points = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=False, + ) + other_points = other.voxel_manager.get_valid_voxels( + diversity_percentile=other._diversity_percentile, + use_regularized=True, + ) + if len(other_points) == 0: + other_points = other.voxel_manager.get_valid_voxels( + diversity_percentile=other._diversity_percentile, + use_regularized=False, + ) + + obj_bbox = compute_bbox(obj_points) if len(obj_points) >= 3 else None + other_bbox = compute_bbox(other_points) if len(other_points) >= 3 else None + obj_corners = obj_bbox[0] if obj_bbox is not None else None + other_corners = other_bbox[0] if other_bbox is not None else None + obj_extent = obj_bbox[1] if obj_bbox is not None else None + other_extent = other_bbox[1] if other_bbox is not None else None + + dist_merge = dist < 0.25 + if obj_extent is not None and other_extent is not None: + dist_thresh = (((obj_extent / 2 + other_extent / 2) / 2) ** 2).sum() ** 0.5 + dist_merge = dist_merge or dist < dist_thresh * 0.5 + + if dist_merge: + should_merge = True + elif obj_corners is not None and other_corners is not None: + iou, ratio_obj, ratio_other = compute_iou_3d_with_ratios( + obj_corners, other_corners + ) + if ( + iou > self._iou_threshold + or ratio_obj > 0.4 + or ratio_other > 0.4 + ): + should_merge = True + + if should_merge: + obj.merge(other) + for track_id in other.track_ids: + self._track_id_map[track_id] = obj.object_id + self._permanent_objects.pop(other.object_id, None) + objs.pop(j) + merged += 1 + else: + j += 1 + i += 1 + + return merged + + def _prune_inactive(self, target_object: str | None, now: float) -> list[MapObject]: + """Remove inactive pending/permanent objects.""" + deleted: list[MapObject] = [] + + for obj_id, obj in list(self._pending_objects.items()): + if (now - obj.last_update_time) > self._pending_ttl_s: + deleted.append(self._pending_objects.pop(obj_id)) + + for obj_id, obj in list(self._permanent_objects.items()): + if target_object and obj.label == target_object: + continue + if obj.status == ObjectStatus.DELETED: + deleted.append(self._permanent_objects.pop(obj_id)) + + for obj in deleted: + for track_id in obj.track_ids: + if self._track_id_map.get(track_id) == obj.object_id: + self._track_id_map.pop(track_id, None) + + return deleted + + def get_objects(self, include_pending: bool = False) -> list[MapObject]: + """Return current objects. + + Args: + include_pending: If True, include pending objects. + + Returns: + List of MapObject instances. 
+ """ + if include_pending: + return list(self._permanent_objects.values()) + list(self._pending_objects.values()) + return list(self._permanent_objects.values()) diff --git a/semantic_mapping/semantic_mapping/mapping_node.py b/semantic_mapping/semantic_mapping/mapping_node.py new file mode 100644 index 0000000..16b5649 --- /dev/null +++ b/semantic_mapping/semantic_mapping/mapping_node.py @@ -0,0 +1,861 @@ +"""ROS2 mapping node for semantic 3D object mapping.""" + +from __future__ import annotations + +from collections import deque +from pathlib import Path +import time + +import numpy as np +import yaml +from cv_bridge import CvBridge +import cv2 +import rclpy +from rclpy.node import Node +from rclpy.qos import QoSProfile, ReliabilityPolicy +from sensor_msgs.msg import CompressedImage, PointCloud2 +from sensor_msgs_py import point_cloud2 +from nav_msgs.msg import Odometry +from std_msgs.msg import Header +from visualization_msgs.msg import MarkerArray +from scipy.spatial.transform import Rotation, Slerp + +from detect_anything.msg import DetectionResult +from semantic_mapping.msg import ObjectType, ObjectNode, TargetObjectInstruction, ViewpointRep + +from vector_perception_utils.pointcloud_utils import ( + project_pointcloud_to_image, + segment_pointclouds_from_projection, + transform_segmented_clouds_to_world, +) +from vector_perception_utils.transform_utils import discretize_angles + +from .map_object import Observation +from .utils.map_object_utils import ( + create_bbox_markers, + create_delete_markers, + create_label_markers, + create_object_pointcloud, + map_object_to_object_node, +) +from .mapper import ObjectMapper +from .utils.storage import ImageSaveQueue + + +class SemanticMappingNode(Node): + """ROS2 node for semantic object mapping. + + This node fuses 2D detections and LiDAR point clouds into persistent 3D objects: + - Buffers detections, clouds, and odometry with timestamps + - Interpolates odometry at detection time (position + orientation) + - Segments point clouds by detection masks and projects to world frame + - Updates the object mapper and publishes ROS messages + - Queues cropped images for VLM queries + """ + + def __init__(self) -> None: + """Initialize the mapping node, parameters, buffers, and ROS I/O.""" + super().__init__("semantic_mapping_node") + self._bridge = CvBridge() + + self.declare_parameter("detection_topic", "/detection_result") + self.declare_parameter("cloud_topic", "/registered_scan") + self.declare_parameter("odom_topic", "/state_estimation") + self.declare_parameter("viewpoint_topic", "/viewpoint_rep_header") + self.declare_parameter("object_type_answer_topic", "/object_type_answer") + self.declare_parameter("object_type_query_topic", "/object_type_query") + self.declare_parameter("target_object_instruction_topic", "/target_object_instruction") + self.declare_parameter("object_nodes_topic", "/object_nodes") + self.declare_parameter("obj_points_topic", "/obj_points") + self.declare_parameter("obj_boxes_topic", "/obj_boxes") + self.declare_parameter("obj_labels_topic", "/obj_labels") + self.declare_parameter("map_frame", "map") + + self.declare_parameter("confidence_threshold", 0.3) + self.declare_parameter("voxel_size", 0.03) + self.declare_parameter("regularize_voxel_size", 0.06) + self.declare_parameter("distance_threshold", 0.5) + self.declare_parameter("iou_threshold", 0.2) + self.declare_parameter("min_detections_for_permanent", 3) + self.declare_parameter("pending_ttl_s", 10.0) + self.declare_parameter("num_angle_bins", 15) + + 
self.declare_parameter("detection_linear_state_time_bias", 0.0) + self.declare_parameter("detection_angular_state_time_bias", 0.0) + self.declare_parameter("cloud_window_before", 0.5) + self.declare_parameter("cloud_window_after", 0.1) + + self.declare_parameter("projection", "equirect") + self.declare_parameter("axis_mode", "xz") + self.declare_parameter("image_width", 1920) + self.declare_parameter("image_height", 640) + self.declare_parameter("depth_filter", False) + self.declare_parameter("depth_filter_margin", 0.15) + self.declare_parameter("depth_jump_threshold", 0.3) + self.declare_parameter("sensor_to_camera_tf", "config/sensor_to_camera_tf.yaml") + self.declare_parameter("mask_erode_kernel", 3) + self.declare_parameter("mask_erode_iterations", 2) + self.declare_parameter("max_cloud_distance", 8.0) + self.declare_parameter("visualization_max_objects", 50) + self.declare_parameter("ground_filter", True) + self.declare_parameter("ground_radius", 2.0) + self.declare_parameter("ground_z_clearance", 0.0) + + self.declare_parameter("target_object", "") + self.declare_parameter("save_png", False) + self.declare_parameter("output_dir", "output/object_images") + + detection_topic = str(self.get_parameter("detection_topic").value) + cloud_topic = str(self.get_parameter("cloud_topic").value) + odom_topic = str(self.get_parameter("odom_topic").value) + viewpoint_topic = str(self.get_parameter("viewpoint_topic").value) + object_type_answer_topic = str(self.get_parameter("object_type_answer_topic").value) + object_type_query_topic = str(self.get_parameter("object_type_query_topic").value) + target_object_instruction_topic = str(self.get_parameter("target_object_instruction_topic").value) + object_nodes_topic = str(self.get_parameter("object_nodes_topic").value) + obj_points_topic = str(self.get_parameter("obj_points_topic").value) + obj_boxes_topic = str(self.get_parameter("obj_boxes_topic").value) + obj_labels_topic = str(self.get_parameter("obj_labels_topic").value) + self._map_frame = str(self.get_parameter("map_frame").value) + + self._confidence_threshold = float(self.get_parameter("confidence_threshold").value) + self._detection_linear_bias = float(self.get_parameter("detection_linear_state_time_bias").value) + self._detection_angular_bias = float(self.get_parameter("detection_angular_state_time_bias").value) + self._cloud_window_before = float(self.get_parameter("cloud_window_before").value) + self._cloud_window_after = float(self.get_parameter("cloud_window_after").value) + + self._projection = str(self.get_parameter("projection").value) + self._axis_mode = str(self.get_parameter("axis_mode").value) + self._image_width = int(self.get_parameter("image_width").value) + self._image_height = int(self.get_parameter("image_height").value) + self._depth_filter = bool(self.get_parameter("depth_filter").value) + self._depth_filter_margin = float(self.get_parameter("depth_filter_margin").value) + self._depth_jump_threshold = float(self.get_parameter("depth_jump_threshold").value) + self._sensor_to_camera = self._load_sensor_to_camera( + str(self.get_parameter("sensor_to_camera_tf").value) + ) + self._mask_erode_kernel_size = max(1, int(self.get_parameter("mask_erode_kernel").value)) + self._mask_erode_iterations = max(0, int(self.get_parameter("mask_erode_iterations").value)) + self._mask_erode_kernel = cv2.getStructuringElement( + cv2.MORPH_RECT, + (self._mask_erode_kernel_size, self._mask_erode_kernel_size), + ) + self._max_cloud_distance = float(self.get_parameter("max_cloud_distance").value) + 
self._visualization_max_objects = int( + self.get_parameter("visualization_max_objects").value + ) + self._ground_filter = bool(self.get_parameter("ground_filter").value) + self._ground_radius = float(self.get_parameter("ground_radius").value) + self._ground_z_clearance = float(self.get_parameter("ground_z_clearance").value) + self._ground_percentile = 0.05 + self._ground_min_points = 80 + + self._target_object = str(self.get_parameter("target_object").value) + self._save_png = bool(self.get_parameter("save_png").value) + self._output_dir = Path(str(self.get_parameter("output_dir").value)) + + self._mapper = ObjectMapper( + voxel_size=float(self.get_parameter("voxel_size").value), + regularize_voxel_size=float(self.get_parameter("regularize_voxel_size").value), + confidence_threshold=self._confidence_threshold, + distance_threshold=float(self.get_parameter("distance_threshold").value), + iou_threshold=float(self.get_parameter("iou_threshold").value), + min_detections_for_permanent=int(self.get_parameter("min_detections_for_permanent").value), + pending_ttl_s=float(self.get_parameter("pending_ttl_s").value), + num_angle_bins=int(self.get_parameter("num_angle_bins").value), + ) + + self._image_saver = ImageSaveQueue( + output_dir=self._output_dir, + max_queue_size=100, + save_png=self._save_png, + ) + self._image_saver.start() + + self._cloud_buffer: deque[tuple[float, np.ndarray]] = deque(maxlen=200) + self._odom_buffer: deque[tuple[float, dict]] = deque(maxlen=400) + self._detection_buffer: deque[DetectionResult] = deque(maxlen=100) + self._viewpoint_queue: deque[tuple[float, int]] = deque(maxlen=50) + self._processed_viewpoints: set[float] = set() + self._published_ids: set[int] = set() + self._mapping_busy = False + + qos_cloud = QoSProfile(depth=10, reliability=ReliabilityPolicy.BEST_EFFORT) + qos_default = QoSProfile(depth=10, reliability=ReliabilityPolicy.RELIABLE) + + self.create_subscription(DetectionResult, detection_topic, self._detection_callback, qos_default) + self.create_subscription(PointCloud2, cloud_topic, self._cloud_callback, qos_cloud) + self.create_subscription(Odometry, odom_topic, self._odom_callback, qos_cloud) + self.create_subscription(ViewpointRep, viewpoint_topic, self._viewpoint_callback, qos_default) + self.create_subscription(ObjectType, object_type_answer_topic, self._object_type_answer_callback, qos_default) + self.create_subscription( + TargetObjectInstruction, target_object_instruction_topic, self._target_object_callback, qos_default + ) + + self._object_type_query_pub = self.create_publisher(ObjectType, object_type_query_topic, qos_default) + self._object_nodes_pub = self.create_publisher(ObjectNode, object_nodes_topic, qos_default) + self._obj_points_pub = self.create_publisher(PointCloud2, obj_points_topic, qos_default) + self._obj_boxes_pub = self.create_publisher(MarkerArray, obj_boxes_topic, qos_default) + self._obj_labels_pub = self.create_publisher(MarkerArray, obj_labels_topic, qos_default) + + self.create_timer(0.5, self._mapping_callback) + + def destroy_node(self) -> bool: + """Stop background workers and destroy the ROS node.""" + self._image_saver.stop() + return super().destroy_node() + + def _detection_callback(self, msg: DetectionResult) -> None: + """Buffer detection results for later synchronization. + + Args: + msg: DetectionResult message with bboxes, labels, and masks. + """ + self._detection_buffer.append(msg) + + def _cloud_callback(self, msg: PointCloud2) -> None: + """Buffer incoming point clouds with timestamps. 
+ + Args: + msg: PointCloud2 message in the sensor frame. + """ + stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 + points = point_cloud2.read_points_numpy(msg, field_names=("x", "y", "z")) + self._cloud_buffer.append((stamp, points)) + + def _odom_callback(self, msg: Odometry) -> None: + """Buffer odometry data for interpolation. + + Args: + msg: Odometry message containing pose and twist. + """ + stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 + odom = { + "position": np.array( + [msg.pose.pose.position.x, msg.pose.pose.position.y, msg.pose.pose.position.z] + ), + "orientation": np.array( + [ + msg.pose.pose.orientation.x, + msg.pose.pose.orientation.y, + msg.pose.pose.orientation.z, + msg.pose.pose.orientation.w, + ] + ), + "linear_velocity": np.array( + [msg.twist.twist.linear.x, msg.twist.twist.linear.y, msg.twist.twist.linear.z] + ), + "angular_velocity": np.array( + [msg.twist.twist.angular.x, msg.twist.twist.angular.y, msg.twist.twist.angular.z] + ), + } + self._odom_buffer.append((stamp, odom)) + + def _viewpoint_callback(self, msg: ViewpointRep) -> None: + """Queue viewpoint trigger requests. + + Args: + msg: ViewpointRep message with timestamp and viewpoint id. + """ + stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 + if stamp in self._processed_viewpoints: + return + self._viewpoint_queue.append((stamp, msg.viewpoint_id)) + + def _target_object_callback(self, msg: TargetObjectInstruction) -> None: + """Update target object label used for VLM query decisions. + + Args: + msg: TargetObjectInstruction message. + """ + self._target_object = msg.target_object + + def _object_type_answer_callback(self, msg: ObjectType) -> None: + """Apply VLM relabeling updates to tracked objects. + + Args: + msg: ObjectType message with final label for a track id. 
+ """ + for obj in self._mapper.get_objects(include_pending=True): + if msg.object_id in obj.track_ids: + obj.class_votes[msg.final_label] = obj.class_votes.get(msg.final_label, 0) + 50 + obj.confidence_scores[msg.final_label] = max( + obj.confidence_scores.get(msg.final_label, 0.0), 1.0 + ) + obj.points_count[msg.final_label] = obj.points_count.get(msg.final_label, 0) + obj.updated_by_vlm = True + obj._invalidate_cache() + break + + def _mapping_callback(self) -> None: + """Main mapping loop: synchronize inputs, update map, and publish outputs.""" + if self._mapping_busy: + return + if len(self._detection_buffer) < 1 or len(self._cloud_buffer) < 1 or len(self._odom_buffer) < 2: + return + + self._mapping_busy = True + start_time = time.perf_counter() + try: + detection_msg, viewpoint_stamp, viewpoint_id = self._select_detection() + if detection_msg is None: + return + + detection_stamp = detection_msg.header.stamp.sec + detection_msg.header.stamp.nanosec / 1e9 + image = self._bridge.compressed_imgmsg_to_cv2(detection_msg.image, desired_encoding="bgr8") + + odom = self._interpolate_odom(detection_stamp) + if odom is None: + return + + neighbor_cloud = self._collect_cloud_window(detection_stamp) + if neighbor_cloud is None: + return + + observations = self._build_observations( + detection_msg=detection_msg, + image=image, + odom=odom, + cloud=neighbor_cloud, + ) + if len(observations) == 0: + return + + stats = self._mapper.update(observations, target_object=self._target_object) + self.get_logger().info( + "Mapping stats:\n" + f" total: {stats.total}\n" + f" deleted: {stats.deleted}" + ) + self._update_observation_angles(observations) + self._update_best_images(observations, image) + self._publish_map(detection_stamp, viewpoint_id) + self._publish_object_type_queries() + + if viewpoint_stamp is not None: + self._processed_viewpoints.add(viewpoint_stamp) + finally: + elapsed_ms = (time.perf_counter() - start_time) * 1000.0 + self.get_logger().info(f"Mapping loop time: {elapsed_ms:.2f} ms") + self._mapping_busy = False + + def _select_detection(self) -> tuple[DetectionResult | None, float | None, int]: + """Pick the detection to process, optionally aligned to a viewpoint. + + Returns: + Tuple of (detection_msg, viewpoint_stamp, viewpoint_id). + """ + viewpoint_stamp = None + viewpoint_id = -1 + + if len(self._viewpoint_queue) > 0: + stamp, vid = self._viewpoint_queue[0] + detections = list(self._detection_buffer) + stamps = [msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 for msg in detections] + diffs = np.abs(np.array(stamps) - stamp) + idx = int(np.argmin(diffs)) + if diffs[idx] <= 1.0: + detection_msg = detections[idx] + viewpoint_stamp = stamp + viewpoint_id = vid + self._viewpoint_queue.popleft() + return detection_msg, viewpoint_stamp, viewpoint_id + self._viewpoint_queue.popleft() + + idx = -2 if len(self._detection_buffer) > 1 else -1 + return self._detection_buffer[idx], viewpoint_stamp, viewpoint_id + + def _interpolate_odom(self, detection_stamp: float) -> dict | None: + """Interpolate odometry at the detection timestamp. + + Args: + detection_stamp: Timestamp in seconds. + + Returns: + Interpolated odometry dict or None if interpolation is not possible. 
+ """ + target_stamp = detection_stamp + self._detection_linear_bias + angular_stamp = detection_stamp + self._detection_angular_bias + stamps = [s for s, _ in self._odom_buffer] + if len(stamps) < 2: + return None + + if target_stamp < stamps[0] or target_stamp > stamps[-1]: + return None + if angular_stamp < stamps[0] or angular_stamp > stamps[-1]: + return None + + right_idx = next(i for i, s in enumerate(stamps) if s >= target_stamp) + left_idx = max(right_idx - 1, 0) + angular_right_idx = next(i for i, s in enumerate(stamps) if s >= angular_stamp) + angular_left_idx = max(angular_right_idx - 1, 0) + + left_stamp, left_odom = self._odom_buffer[left_idx] + right_stamp, right_odom = self._odom_buffer[right_idx] + angular_left_stamp, angular_left_odom = self._odom_buffer[angular_left_idx] + angular_right_stamp, angular_right_odom = self._odom_buffer[angular_right_idx] + denom = right_stamp - left_stamp + ratio = (target_stamp - left_stamp) / denom if denom > 0 else 0.5 + angular_denom = angular_right_stamp - angular_left_stamp + angular_ratio = (angular_stamp - angular_left_stamp) / angular_denom if angular_denom > 0 else 0.5 + + camera_odom = {} + camera_odom["position"] = right_odom["position"] * ratio + left_odom["position"] * (1 - ratio) + camera_odom["linear_velocity"] = ( + right_odom["linear_velocity"] * ratio + left_odom["linear_velocity"] * (1 - ratio) + ) + rotations = Rotation.from_quat( + [angular_left_odom["orientation"], angular_right_odom["orientation"]] + ) + slerp = Slerp([0, 1], rotations) + camera_odom["orientation"] = slerp(angular_ratio).as_quat() + camera_odom["angular_velocity"] = ( + angular_right_odom["angular_velocity"] * angular_ratio + + angular_left_odom["angular_velocity"] * (1 - angular_ratio) + ) + return camera_odom + + def _collect_cloud_window(self, detection_stamp: float) -> np.ndarray | None: + """Aggregate point clouds around a detection timestamp. + + Args: + detection_stamp: Timestamp in seconds. + + Returns: + Concatenated point cloud or None if no data in window. + """ + start = detection_stamp - self._cloud_window_before + end = detection_stamp + self._cloud_window_after + clouds = [pts for stamp, pts in self._cloud_buffer if start <= stamp <= end] + if len(clouds) == 0: + return None + return np.concatenate(clouds, axis=0) + + def _compute_ground_z_threshold( + self, + points: np.ndarray, + robot_xy: np.ndarray, + ) -> float | None: + if not self._ground_filter: + return None + if points.shape[0] < self._ground_min_points: + return None + if self._ground_radius <= 0.0: + return None + if not 0.0 < self._ground_percentile < 1.0: + return None + + diffs = points[:, :2] - robot_xy + dist2 = diffs[:, 0] ** 2 + diffs[:, 1] ** 2 + radius2 = self._ground_radius ** 2 + mask = dist2 <= radius2 + if np.count_nonzero(mask) < self._ground_min_points: + return None + + z_vals = points[mask, 2] + z_thresh = float(np.percentile(z_vals, self._ground_percentile * 100.0)) + return z_thresh + self._ground_z_clearance + + def _build_observations( + self, + detection_msg: DetectionResult, + image: np.ndarray, + odom: dict, + cloud: np.ndarray, + ) -> list[Observation]: + """Build Observation objects from detections and segmented point clouds. + + Args: + detection_msg: DetectionResult with bboxes, labels, and masks. + image: RGB image used for masks and cropping. + odom: Interpolated odometry dict (position + orientation). + cloud: Aggregated point cloud in sensor frame. + + Returns: + List of Observation instances for object updates. 
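+
+        Internally this thresholds detections by confidence, decodes and
+        optionally erodes the cropped masks, projects the aggregated cloud
+        into the image, segments it per mask, and transforms the segmented
+        clouds to the world frame before distance and ground-height filtering.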
+ """ + det_stamp = detection_msg.header.stamp.sec + detection_msg.header.stamp.nanosec / 1e9 + image_shape = image.shape[:2] + + masks = [] + labels = [] + confidences = [] + track_ids = [] + + for i in range(len(detection_msg.track_id)): + conf = float(detection_msg.confidence[i]) + if conf < self._confidence_threshold: + continue + x1 = int(detection_msg.x1[i]) + y1 = int(detection_msg.y1[i]) + x2 = int(detection_msg.x2[i]) + y2 = int(detection_msg.y2[i]) + x1 = max(0, min(x1, image_shape[1] - 1)) + x2 = max(0, min(x2, image_shape[1])) + y1 = max(0, min(y1, image_shape[0] - 1)) + y2 = max(0, min(y2, image_shape[0])) + if x2 <= x1 or y2 <= y1: + continue + + full_mask = np.zeros(image_shape, dtype=bool) + if i < len(detection_msg.masks): + mask_msg: CompressedImage = detection_msg.masks[i] + mask_crop = self._bridge.compressed_imgmsg_to_cv2(mask_msg) + if mask_crop.ndim == 3: + mask_crop = cv2.cvtColor(mask_crop, cv2.COLOR_BGR2GRAY) + mask_bool = mask_crop.astype(bool) + if self._mask_erode_iterations > 0: + mask_eroded = cv2.erode( + mask_bool.astype(np.uint8), + self._mask_erode_kernel, + iterations=self._mask_erode_iterations, + ) + mask_bool = mask_eroded.astype(bool) + crop_h = min(mask_bool.shape[0], y2 - y1) + crop_w = min(mask_bool.shape[1], x2 - x1) + full_mask[y1 : y1 + crop_h, x1 : x1 + crop_w] = mask_bool[:crop_h, :crop_w] + + masks.append(full_mask) + labels.append(detection_msg.label[i]) + confidences.append(conf) + track_ids.append(int(detection_msg.track_id[i])) + + if len(masks) == 0: + return [] + + R_b2w = Rotation.from_quat(odom["orientation"]).as_matrix() + t_b2w = np.array(odom["position"]) + R_w2b = R_b2w.T + t_w2b = -R_w2b @ t_b2w + cloud_body = cloud @ R_w2b.T + t_w2b + + _, pixel_idx = project_pointcloud_to_image( + points=cloud_body, + projection=self._projection, + image_width=self._image_width, + image_height=self._image_height, + axis_mode=self._axis_mode, + intrinsics=None, + depth_filter=self._depth_filter, + depth_filter_margin=self._depth_filter_margin, + transform=self._sensor_to_camera, + ) + seg_clouds, seg_depths = segment_pointclouds_from_projection( + points=cloud_body, + masks=masks, + pixel_idx=pixel_idx, + ) + if self._depth_filter: + filtered_clouds = [] + filtered_depths = [] + for cloud, depth in zip(seg_clouds, seg_depths): + if depth.shape[0] == 0: + filtered_clouds.append(cloud) + filtered_depths.append(depth) + continue + min_depth = float(np.min(depth)) + keep = depth[:, 0] <= (min_depth + self._depth_filter_margin) + filtered_clouds.append(cloud[keep]) + filtered_depths.append(depth[keep]) + seg_clouds = filtered_clouds + seg_depths = filtered_depths + world_clouds = transform_segmented_clouds_to_world( + clouds=seg_clouds, + depths=seg_depths, + R_b2w=R_b2w, + t_b2w=t_b2w, + depth_jump_threshold=self._depth_jump_threshold, + ) + cloud_world = cloud_body @ R_b2w.T + t_b2w + ground_z_threshold = self._compute_ground_z_threshold(cloud_world, t_b2w[:2]) + + observations = [] + for idx, world_cloud in enumerate(world_clouds): + if self._max_cloud_distance > 0: + dist = np.linalg.norm(world_cloud - t_b2w, axis=1) + world_cloud = world_cloud[dist < self._max_cloud_distance] + if ground_z_threshold is not None: + world_cloud = world_cloud[world_cloud[:, 2] > ground_z_threshold] + if world_cloud.shape[0] == 0: + continue + observations.append( + Observation( + track_id=track_ids[idx], + label=labels[idx], + confidence=confidences[idx], + points=world_cloud, + robot_R=R_b2w, + robot_t=t_b2w, + mask=masks[idx], + image=image, + 
timestamp=det_stamp, + ) + ) + + return observations + + def _update_observation_angles(self, observations: list[Observation]) -> None: + """Reproject object voxels onto the current mask to update angle votes.""" + if len(observations) == 0: + return + + objects = self._mapper.get_objects(include_pending=True) + track_to_obj = {} + for obj in objects: + for track_id in obj.track_ids: + track_to_obj[track_id] = obj + + for obs in observations: + obj = track_to_obj.get(obs.track_id) + if obj is None or obs.mask is None: + continue + voxels = obj.voxel_manager.voxels + if voxels.shape[0] == 0: + continue + + R_b2w = obs.robot_R + t_b2w = obs.robot_t + R_w2b = R_b2w.T + t_w2b = -R_w2b @ t_b2w + voxels_body = voxels @ R_w2b.T + t_w2b + _, pixel_idx = project_pointcloud_to_image( + points=voxels_body, + projection=self._projection, + image_width=self._image_width, + image_height=self._image_height, + axis_mode=self._axis_mode, + intrinsics=None, + depth_filter=self._depth_filter, + depth_filter_margin=self._depth_filter_margin, + transform=self._sensor_to_camera, + ) + pix = pixel_idx.astype(int) + valid = ( + (pix[:, 0] >= 0) + & (pix[:, 0] < obs.mask.shape[1]) + & (pix[:, 1] >= 0) + & (pix[:, 1] < obs.mask.shape[0]) + ) + if not np.any(valid): + continue + pix = pix[valid] + voxel_indices = np.nonzero(valid)[0] + on_mask = obs.mask[pix[:, 1], pix[:, 0]].astype(bool) + if not np.any(on_mask): + continue + voxel_indices = voxel_indices[on_mask] + vectors = voxels[voxel_indices] - t_b2w + obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0]) + obs_bins = discretize_angles(obs_angles, obj.voxel_manager.num_angle_bins) + obj.voxel_manager.update_observation_angles(voxel_indices, obs_bins) + obj._invalidate_cache() + + def _update_best_images(self, observations: list[Observation], image: np.ndarray) -> None: + """Update best image crops for objects based on mask area. + + Args: + observations: Observations created for the current frame. + image: Full RGB image associated with the detections. + """ + for obs in observations: + obj = None + for candidate in self._mapper.get_objects(include_pending=True): + if obs.track_id in candidate.track_ids: + obj = candidate + break + if obj is None or obs.mask is None: + continue + + coords = np.argwhere(obs.mask) + if coords.size == 0: + continue + mask_area = coords.shape[0] + if mask_area <= obj.best_image_score: + continue + + y0, x0 = coords.min(axis=0) + y1, x1 = coords.max(axis=0) + 1 + padding = max(int(y1 - y0), int(x1 - x0), 80) + y0 = max(0, y0 - padding) + x0 = max(0, x0 - padding) + y1 = min(obs.mask.shape[0], y1 + padding) + x1 = min(obs.mask.shape[1], x1 + padding) + + if obj.best_image_path: + image_path = Path(obj.best_image_path) + mask_path = image_path.with_name(image_path.stem + "_mask.npy") + if image_path.exists(): + image_path.unlink() + if mask_path.exists(): + mask_path.unlink() + if self._save_png: + png_path = image_path.with_suffix(".png") + mask_png_path = image_path.with_name(image_path.stem + "_mask.png") + if png_path.exists(): + png_path.unlink() + if mask_png_path.exists(): + mask_png_path.unlink() + + cropped_image = image[y0:y1, x0:x1] + cropped_mask = obs.mask[y0:y1, x0:x1] + + obj.best_image_path = self._image_saver.queue_save( + object_id=str(obs.track_id), + image=cropped_image, + mask=cropped_mask, + ) + obj.best_image_score = float(mask_area) + obj.is_asked_vlm = False + + def _publish_map(self, stamp: float, viewpoint_id: int) -> None: + """Publish object nodes, markers, and aggregated point clouds. 
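+
+        Each object is serialized as an ObjectNode with a lifecycle status
+        ("new", "updated", or "unchanged"); bbox/label markers, an aggregated
+        colored point cloud, and DELETE markers for ids that left the
+        visualized set are published alongside.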
+ + Args: + stamp: Timestamp (seconds) for message headers. + viewpoint_id: Viewpoint id for ObjectNode messages. + """ + seconds = int(stamp) + nanoseconds = int((stamp - seconds) * 1e9) + header = Header() + header.stamp.sec = seconds + header.stamp.nanosec = nanoseconds + header.frame_id = self._map_frame + + objects = self._mapper.get_objects(include_pending=True) + for obj in objects: + if obj.detection_count == 1: + status = "new" + elif obj.inactive_frames == 0: + status = "updated" + else: + status = "unchanged" + msg = map_object_to_object_node( + obj, + header, + status=status, + viewpoint_id=viewpoint_id, + target_object=self._target_object, + ) + if msg is not None: + self._object_nodes_pub.publish(msg) + + viz_objects = objects + if self._visualization_max_objects > 0 and len(objects) > self._visualization_max_objects: + viz_objects = sorted( + objects, key=lambda obj: obj.last_update_time, reverse=True + )[: self._visualization_max_objects] + + bbox_markers = create_bbox_markers(viz_objects, header) + label_markers = create_label_markers(viz_objects, header) + self._obj_boxes_pub.publish(bbox_markers) + self._obj_labels_pub.publish(label_markers) + + obj_cloud = create_object_pointcloud(viz_objects, header) + if obj_cloud is not None: + self._obj_points_pub.publish(obj_cloud) + + current_ids = {int(obj.track_ids[0]) if obj.track_ids else 0 for obj in viz_objects} + deleted_ids = list(self._published_ids - current_ids) + if len(deleted_ids) > 0: + delete_markers = create_delete_markers(deleted_ids, header, namespace="bbox") + self._obj_boxes_pub.publish(delete_markers) + delete_labels = create_delete_markers(deleted_ids, header, namespace="labels") + self._obj_labels_pub.publish(delete_labels) + self._published_ids = current_ids + + def _publish_object_type_queries(self) -> None: + """Publish VLM queries for target objects with good image crops.""" + for obj in self._mapper.get_objects(include_pending=False): + if obj.label != self._target_object: + continue + if obj.is_asked_vlm: + continue + if obj.best_image_score <= 500: + continue + msg = ObjectType() + msg.object_id = int(obj.track_ids[0]) if obj.track_ids else -1 + msg.img_path = obj.best_image_path or "" + msg.labels = list(obj.class_votes.keys()) + self._object_type_query_pub.publish(msg) + obj.is_asked_vlm = True + + def _load_sensor_to_camera(self, path: str) -> np.ndarray | None: + """Load sensor->camera transform from YAML. + + Args: + path: Path to YAML config with translation and rpy. + + Returns: + 4x4 transform matrix or None if config not found. 
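+
+        Expected YAML layout (values shown are placeholders):
+
+            sensor_to_camera:
+              translation: {x: 0.0, y: 0.0, z: 0.05}
+              rotation_rpy: {roll: 0.0, pitch: 0.0, yaw: 1.5708}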
+ """ + config_path = Path(path) + if not config_path.is_absolute(): + config_path = Path(__file__).resolve().parent.parent / config_path + if not config_path.exists(): + return None + + data = yaml.safe_load(config_path.read_text()) or {} + tf_data = data.get("sensor_to_camera", {}) + trans = tf_data.get("translation", {}) + rpy = tf_data.get("rotation_rpy", {}) + cam_offset = np.array( + [ + float(trans.get("x", 0.0)), + float(trans.get("y", 0.0)), + float(trans.get("z", 0.0)), + ] + ) + roll = float(rpy.get("roll", 0.0)) + pitch = float(rpy.get("pitch", 0.0)) + yaw = float(rpy.get("yaw", 0.0)) + cos_roll = np.cos(roll) + sin_roll = np.sin(roll) + cos_pitch = np.cos(pitch) + sin_pitch = np.sin(pitch) + cos_yaw = np.cos(yaw) + sin_yaw = np.sin(yaw) + r_z = np.array( + [ + [cos_yaw, -sin_yaw, 0.0], + [sin_yaw, cos_yaw, 0.0], + [0.0, 0.0, 1.0], + ] + ) + r_y = np.array( + [ + [cos_pitch, 0.0, sin_pitch], + [0.0, 1.0, 0.0], + [-sin_pitch, 0.0, cos_pitch], + ] + ) + r_x = np.array( + [ + [1.0, 0.0, 0.0], + [0.0, cos_roll, -sin_roll], + [0.0, sin_roll, cos_roll], + ] + ) + cam_rotation = r_z @ r_y @ r_x + rotation = cam_rotation.T + translation = -rotation @ cam_offset + transform = np.eye(4) + transform[:3, :3] = rotation + transform[:3, 3] = translation + return transform + + +def main(args: list[str] | None = None) -> None: + """Entry point for the semantic mapping node.""" + rclpy.init(args=args) + node = SemanticMappingNode() + try: + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/semantic_mapping/semantic_mapping/utils/__init__.py b/semantic_mapping/semantic_mapping/utils/__init__.py new file mode 100644 index 0000000..d949aeb --- /dev/null +++ b/semantic_mapping/semantic_mapping/utils/__init__.py @@ -0,0 +1 @@ +"""Utility modules for semantic mapping.""" diff --git a/semantic_mapping/semantic_mapping/utils/map_object_utils.py b/semantic_mapping/semantic_mapping/utils/map_object_utils.py new file mode 100644 index 0000000..5ad4163 --- /dev/null +++ b/semantic_mapping/semantic_mapping/utils/map_object_utils.py @@ -0,0 +1,264 @@ +"""ROS message helpers for MapObject publishers.""" + +from __future__ import annotations + +import zlib +from typing import Optional + +import numpy as np +from geometry_msgs.msg import Point +from sensor_msgs.msg import PointCloud2 +from std_msgs.msg import Header +from visualization_msgs.msg import Marker, MarkerArray + +from semantic_mapping.msg import ObjectNode +from vector_perception_utils.pointcloud_utils import create_pointcloud2_msg + +from ..map_object import MapObject + + +def _axis_aligned_corners(points: np.ndarray) -> Optional[np.ndarray]: + if points.size == 0: + return None + min_bound = np.min(points, axis=0) + max_bound = np.max(points, axis=0) + return np.array( + [ + [min_bound[0], min_bound[1], min_bound[2]], + [max_bound[0], min_bound[1], min_bound[2]], + [max_bound[0], max_bound[1], min_bound[2]], + [min_bound[0], max_bound[1], min_bound[2]], + [min_bound[0], min_bound[1], max_bound[2]], + [max_bound[0], min_bound[1], max_bound[2]], + [max_bound[0], max_bound[1], max_bound[2]], + [min_bound[0], max_bound[1], max_bound[2]], + ], + dtype=float, + ) + + +def get_object_seed(obj: MapObject) -> int: + """Generate a deterministic seed from a MapObject for color generation. + + Args: + obj: MapObject instance. + + Returns: + CRC32 hash of the object's identifier. 
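+
+    Example (illustrative; ``obj`` is any MapObject):
+        seed = get_object_seed(obj)
+        r = (seed & 0xFF) / 255.0           # same byte slicing used by the marker helpers below
+        g = ((seed >> 8) & 0xFF) / 255.0
+        b = ((seed >> 16) & 0xFF) / 255.0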
+ """ + seed_key = obj.object_id or (str(obj.track_ids[0]) if obj.track_ids else obj.label) + return zlib.crc32(seed_key.encode("utf-8")) + + +def map_object_to_object_node( + obj: MapObject, + header: Header, + status: str = "new", + viewpoint_id: int = -1, + target_object: str | None = None, +) -> Optional[ObjectNode]: + """Convert a MapObject into an ObjectNode message. + + Args: + obj: MapObject instance. + header: ROS header to attach to the message. + status: Lifecycle status string ("new", "updated", "unchanged"). + viewpoint_id: Viewpoint identifier for image saving. + target_object: Target object label for potential labeling. + + Returns: + ObjectNode message or None if required geometry is missing. + """ + if obj.centroid is None or obj.bbox3d_corners is None: + return None + + msg = ObjectNode() + msg.header = header + msg.object_id = obj.track_ids + if target_object and obj.label == target_object and not obj.is_asked_vlm: + msg.label = "Potential Target" + else: + msg.label = obj.label + msg.status = status + msg.position.x = float(obj.centroid[0]) + msg.position.y = float(obj.centroid[1]) + msg.position.z = float(obj.centroid[2]) + + for i in range(8): + msg.bbox3d[i].x = float(obj.bbox3d_corners[i, 0]) + msg.bbox3d[i].y = float(obj.bbox3d_corners[i, 1]) + msg.bbox3d[i].z = float(obj.bbox3d_corners[i, 2]) + + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=True, + ) + if len(voxels) == 0: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=False, + ) + if len(voxels) > 0: + seed = get_object_seed(obj) + color = np.array( + [ + (seed & 0xFF) / 255.0, + ((seed >> 8) & 0xFF) / 255.0, + ((seed >> 16) & 0xFF) / 255.0, + ], + dtype=np.float32, + ) + colors = np.tile(color, (voxels.shape[0], 1)) + msg.cloud = create_pointcloud2_msg(points=voxels, colors=colors, header=header) + + msg.img_path = obj.best_image_path or "" + msg.is_asked_vlm = obj.is_asked_vlm + msg.viewpoint_id = -1 if obj.updated_by_vlm else viewpoint_id + return msg + + +def create_bbox_markers( + objects: list[MapObject], header: Header, namespace: str = "bbox" +) -> MarkerArray: + """Create wireframe bounding box markers for MapObjects.""" + markers = MarkerArray() + edges = [ + (0, 1), + (1, 2), + (2, 3), + (3, 0), + (4, 5), + (5, 6), + (6, 7), + (7, 4), + (0, 4), + (1, 5), + (2, 6), + (3, 7), + ] + + for obj in objects: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=True, + ) + if len(voxels) == 0: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=False, + ) + corners = _axis_aligned_corners(voxels) if len(voxels) > 0 else None + if corners is None: + continue + marker = Marker() + marker.header = header + marker.ns = namespace + marker.id = int(obj.track_ids[0]) if obj.track_ids else 0 + marker.type = Marker.LINE_LIST + marker.action = Marker.ADD + marker.scale.x = 0.02 + + seed = get_object_seed(obj) + marker.color.r = (seed & 0xFF) / 255.0 + marker.color.g = ((seed >> 8) & 0xFF) / 255.0 + marker.color.b = ((seed >> 16) & 0xFF) / 255.0 + marker.color.a = 1.0 + + for i, j in edges: + marker.points.append(Point(x=float(corners[i, 0]), y=float(corners[i, 1]), z=float(corners[i, 2]))) + marker.points.append(Point(x=float(corners[j, 0]), y=float(corners[j, 1]), z=float(corners[j, 2]))) + + markers.markers.append(marker) + + return markers + + +def 
create_label_markers( + objects: list[MapObject], + header: Header, + namespace: str = "labels", + text_height: float = 0.4, +) -> MarkerArray: + """Create text label markers for MapObjects.""" + markers = MarkerArray() + + for obj in objects: + if obj.centroid is None: + continue + marker = Marker() + marker.header = header + marker.ns = namespace + marker.id = int(obj.track_ids[0]) if obj.track_ids else 0 + marker.type = Marker.TEXT_VIEW_FACING + marker.action = Marker.ADD + marker.scale.z = text_height + marker.text = obj.label + marker.pose.position.x = float(obj.centroid[0]) + marker.pose.position.y = float(obj.centroid[1]) + marker.pose.position.z = float(obj.centroid[2]) + 0.3 + + seed = get_object_seed(obj) + marker.color.r = (seed & 0xFF) / 255.0 + marker.color.g = ((seed >> 8) & 0xFF) / 255.0 + marker.color.b = ((seed >> 16) & 0xFF) / 255.0 + marker.color.a = 1.0 + + markers.markers.append(marker) + + return markers + + +def create_object_pointcloud(objects: list[MapObject], header: Header) -> Optional[PointCloud2]: + """Create a colored point cloud for a list of MapObjects.""" + points_list = [] + colors_list = [] + + for obj in objects: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=True, + ) + if len(voxels) == 0: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=False, + ) + if len(voxels) == 0: + continue + + seed = get_object_seed(obj) + color = np.array( + [ + (seed & 0xFF) / 255.0, + ((seed >> 8) & 0xFF) / 255.0, + ((seed >> 16) & 0xFF) / 255.0, + ], + dtype=np.float32, + ) + colors = np.tile(color, (voxels.shape[0], 1)) + points_list.append(voxels) + colors_list.append(colors) + + if len(points_list) == 0: + return None + + points = np.concatenate(points_list, axis=0) + colors = np.concatenate(colors_list, axis=0) + return create_pointcloud2_msg(points=points, colors=colors, header=header) + + +def create_delete_markers( + deleted_ids: list[int], + header: Header, + namespace: str = "bbox", +) -> MarkerArray: + """Create deletion markers for a list of marker IDs.""" + markers = MarkerArray() + for marker_id in deleted_ids: + marker = Marker() + marker.header = header + marker.ns = namespace + marker.id = int(marker_id) + marker.action = Marker.DELETE + markers.markers.append(marker) + return markers diff --git a/semantic_mapping/semantic_mapping/utils/matching.py b/semantic_mapping/semantic_mapping/utils/matching.py new file mode 100644 index 0000000..e5aaa19 --- /dev/null +++ b/semantic_mapping/semantic_mapping/utils/matching.py @@ -0,0 +1,256 @@ +"""Object matching utilities for semantic mapping.""" + +from __future__ import annotations + +from typing import Optional + +import cv2 +import numpy as np + +from vector_perception_utils.pointcloud_utils import compute_centroid + +from ..map_object import MapObject, Observation + + +def _polygon_area(poly: np.ndarray) -> float: + """Compute polygon area using the shoelace formula. + + Args: + poly: Polygon vertices in order (N, 2). + + Returns: + Polygon area. 
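+
+    Example:
+        area = _polygon_area(np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]]))
+        # -> 1.0 for the unit square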
+ """ + if poly.shape[0] < 3: + return 0.0 + x = poly[:, 0] + y = poly[:, 1] + return 0.5 * abs(float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1)))) + + +def _ensure_ccw(poly: np.ndarray) -> np.ndarray: + """Ensure polygon vertices are ordered counter-clockwise.""" + if poly.shape[0] < 3: + return poly + x = poly[:, 0] + y = poly[:, 1] + signed = float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1))) + if signed < 0: + return poly[::-1] + return poly + + +def compute_bbox(points: np.ndarray) -> Optional[tuple[np.ndarray, np.ndarray]]: + """Compute yaw-only oriented bbox corners + extents.""" + if points.shape[0] < 3: + return None + + xy = points[:, :2].astype(np.float32) + unique_xy = np.unique(xy, axis=0) + if unique_xy.shape[0] < 3: + return None + + rect = cv2.minAreaRect(unique_xy) + (width, height) = rect[1] + if width == 0.0 or height == 0.0: + return None + + box = cv2.boxPoints(rect).astype(np.float64) + rect_xy = _ensure_ccw(box) + + zmin = float(np.min(points[:, 2])) + zmax = float(np.max(points[:, 2])) + extent = np.array([width, height, zmax - zmin], dtype=float) + z_center = zmax - extent[2] / 2.0 + half_z = extent[2] / 2.0 + + bottom = np.hstack([rect_xy, np.full((4, 1), z_center - half_z)]) + top = np.hstack([rect_xy, np.full((4, 1), z_center + half_z)]) + corners = np.vstack([bottom, top]) + return corners, extent + + +def _line_intersection(p1: np.ndarray, p2: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> np.ndarray: + """Intersection of line p1->p2 with line cp1->cp2. + + Args: + p1: Line segment start (2,). + p2: Line segment end (2,). + cp1: Clip edge start (2,). + cp2: Clip edge end (2,). + + Returns: + Intersection point (2,). + """ + dc = cp1 - cp2 + dp = p1 - p2 + denom = dc[0] * dp[1] - dc[1] * dp[0] + if denom == 0: + return p2 + n1 = cp1[0] * cp2[1] - cp1[1] * cp2[0] + n2 = p1[0] * p2[1] - p1[1] * p2[0] + x = (n1 * dp[0] - n2 * dc[0]) / denom + y = (n1 * dp[1] - n2 * dc[1]) / denom + return np.array([x, y], dtype=np.float64) + + +def _polygon_clip(subject: np.ndarray, clip: np.ndarray) -> np.ndarray: + """Clip a polygon with a convex clip polygon (Sutherland–Hodgman). + + Args: + subject: Subject polygon vertices (N, 2). + clip: Clip polygon vertices (M, 2). + + Returns: + Clipped polygon vertices (K, 2). + """ + if subject.shape[0] == 0: + return subject + + def inside(p: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> bool: + return (cp2[0] - cp1[0]) * (p[1] - cp1[1]) - (cp2[1] - cp1[1]) * ( + p[0] - cp1[0] + ) >= 0 + + output = subject + for i in range(len(clip)): + cp1 = clip[i] + cp2 = clip[(i + 1) % len(clip)] + input_list = output + output = [] + if len(input_list) == 0: + return np.empty((0, 2), dtype=np.float64) + s = input_list[-1] + for e in input_list: + if inside(e, cp1, cp2): + if not inside(s, cp1, cp2): + output.append(_line_intersection(s, e, cp1, cp2)) + output.append(e) + elif inside(s, cp1, cp2): + output.append(_line_intersection(s, e, cp1, cp2)) + s = e + output = np.array(output, dtype=np.float64) + return np.array(output, dtype=np.float64) + + +def compute_iou_3d_with_ratios( + bbox1: np.ndarray, bbox2: np.ndarray +) -> tuple[float, float, float]: + """Compute IoU and intersection ratios using yaw-only bbox overlap. + + Args: + bbox1: Corner points for bbox1 (N, 3). + bbox2: Corner points for bbox2 (N, 3). + + Returns: + Tuple of (iou, ratio_bbox1, ratio_bbox2). 
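+
+    The footprints (first four corners of each box) are clipped against each
+    other with Sutherland-Hodgman clipping, and the intersection volume is
+    that area times the overlap of the two z ranges; two identical
+    yaw-aligned boxes therefore yield (1.0, 1.0, 1.0).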
+ """ + if bbox1.shape[0] < 4 or bbox2.shape[0] < 4: + return 0.0, 0.0, 0.0 + + rect1 = _ensure_ccw(bbox1[:4, :2]) + rect2 = _ensure_ccw(bbox2[:4, :2]) + if rect1.shape[0] < 3 or rect2.shape[0] < 3: + return 0.0, 0.0, 0.0 + + poly_inter = _polygon_clip(rect1, rect2) + inter_area = _polygon_area(poly_inter) if poly_inter.shape[0] >= 3 else 0.0 + + zmin1, zmax1 = float(np.min(bbox1[:, 2])), float(np.max(bbox1[:, 2])) + zmin2, zmax2 = float(np.min(bbox2[:, 2])), float(np.max(bbox2[:, 2])) + height = max(0.0, min(zmax1, zmax2) - max(zmin1, zmin2)) + inter_vol = inter_area * height + + area1 = _polygon_area(rect1) + area2 = _polygon_area(rect2) + vol1 = area1 * (zmax1 - zmin1) + vol2 = area2 * (zmax2 - zmin2) + denom = vol1 + vol2 - inter_vol + + iou = inter_vol / denom if denom > 0 else 0.0 + ratio1 = inter_vol / vol1 if vol1 > 0 else 0.0 + ratio2 = inter_vol / vol2 if vol2 > 0 else 0.0 + return iou, ratio1, ratio2 + + +def find_best_match( + obs: Observation, + candidates: list[MapObject], + distance_threshold: float, + iou_threshold: float, + ratio_threshold: float = 0.4, + min_distance_merge: float = 0.25, +) -> Optional[MapObject]: + """Find best matching object by distance/IoU heuristics. + + Args: + obs: Observation to match. + candidates: Candidate map objects. + distance_threshold: Max centroid distance to consider. + iou_threshold: Min IoU to accept a merge. + ratio_threshold: Min overlap ratio to accept a merge. + min_distance_merge: Absolute distance threshold for merge. + + Returns: + Best matching object or None if no match passes thresholds. + """ + if obs.points.shape[0] == 0: + return None + + obs_center = compute_centroid(obs.points) + obs_bbox = compute_bbox(obs.points) + obs_corners = obs_bbox[0] if obs_bbox is not None else None + obs_extent = obs_bbox[1] if obs_bbox is not None else None + + best_candidate: Optional[MapObject] = None + best_dist = float("inf") + + for candidate in candidates: + if obs.track_id in candidate.track_ids: + return candidate + + cand_center = candidate.centroid + if cand_center is None: + continue + + dist = float(np.linalg.norm(obs_center - cand_center)) + if dist > distance_threshold: + continue + + cand_points = candidate.voxel_manager.get_valid_voxels( + diversity_percentile=candidate._diversity_percentile, + use_regularized=True, + ) + if len(cand_points) == 0: + cand_points = candidate.voxel_manager.get_valid_voxels( + diversity_percentile=candidate._diversity_percentile, + use_regularized=False, + ) + cand_bbox = compute_bbox(cand_points) if len(cand_points) >= 3 else None + cand_corners = cand_bbox[0] if cand_bbox is not None else None + cand_extent = cand_bbox[1] if cand_bbox is not None else None + + dist_merge = dist < min_distance_merge + if obs_extent is not None and cand_extent is not None: + dist_thresh = np.linalg.norm((obs_extent / 2 + cand_extent / 2) / 2) * 0.5 + dist_merge = dist_merge or dist < dist_thresh + + if dist_merge: + if dist < best_dist: + best_candidate = candidate + best_dist = dist + continue + + if obs_corners is None or cand_corners is None: + continue + + iou, ratio_obs, ratio_cand = compute_iou_3d_with_ratios( + obs_corners, cand_corners + ) + + if iou > iou_threshold or ratio_obs > ratio_threshold or ratio_cand > ratio_threshold: + if dist < best_dist: + best_candidate = candidate + best_dist = dist + + return best_candidate diff --git a/semantic_mapping/semantic_mapping/utils/storage.py b/semantic_mapping/semantic_mapping/utils/storage.py new file mode 100644 index 0000000..d2be3cc --- /dev/null +++ 
b/semantic_mapping/semantic_mapping/utils/storage.py @@ -0,0 +1,90 @@ +"""Async image saving queue for semantic mapping.""" + +from __future__ import annotations + +from pathlib import Path +import shutil +import queue +import threading +import time + +import cv2 +import numpy as np + + +class ImageSaveQueue: + """Background worker for saving object images to disk.""" + + def __init__(self, output_dir: Path, max_queue_size: int = 100, save_png: bool = False) -> None: + """Initialize the image save queue. + + Args: + output_dir: Directory where images are stored. + max_queue_size: Maximum number of queued save jobs. + save_png: If True, also save a .png copy of the image. + """ + self._output_dir = Path(output_dir) + if self._output_dir.exists() and any(self._output_dir.iterdir()): + shutil.rmtree(self._output_dir) + self._output_dir.mkdir(parents=True, exist_ok=True) + self._save_png = save_png + self._queue: queue.Queue = queue.Queue(maxsize=max_queue_size) + self._stop_event = threading.Event() + self._thread: threading.Thread | None = None + + def start(self) -> None: + """Start the background worker.""" + if self._thread is not None and self._thread.is_alive(): + return + self._stop_event.clear() + self._thread = threading.Thread(target=self._run, daemon=True) + self._thread.start() + + def stop(self) -> None: + """Stop the background worker.""" + self._stop_event.set() + try: + self._queue.put_nowait(None) + except queue.Full: + pass + if self._thread is not None: + self._thread.join(timeout=2.0) + self._thread = None + + def queue_save( + self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None + ) -> str: + """Queue an image (and optional mask) for saving. + + Args: + object_id: Object identifier to embed in filename. + image: Image array to save. + mask: Optional mask array to save alongside the image. + + Returns: + Path to the saved image (.npy). 
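+
+        Example (illustrative sketch; ``image`` and ``mask`` are assumed numpy arrays):
+            saver = ImageSaveQueue(Path("/tmp/object_images"))
+            saver.start()
+            path = saver.queue_save("42", image, mask)  # e.g. ".../42_<timestamp_ms>.npy"
+            saver.stop()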
+ """ + timestamp_ms = int(time.time() * 1000) + image_path = self._output_dir / f"{object_id}_{timestamp_ms}.npy" + mask_path = self._output_dir / f"{object_id}_{timestamp_ms}_mask.npy" + self._queue.put(("save", str(image_path), str(mask_path), image, mask)) + return str(image_path) + + def _run(self) -> None: + while not self._stop_event.is_set(): + item = self._queue.get() + if item is None: + self._queue.task_done() + break + op, image_path, mask_path, image, mask = item + if op == "save": + try: + np.save(image_path, image) + if mask is not None: + np.save(mask_path, mask) + if self._save_png: + png_path = Path(image_path).with_suffix(".png") + cv2.imwrite(str(png_path), image) + except Exception: + pass + self._queue.task_done() diff --git a/vector_perception_utils/setup.cfg b/vector_perception_utils/setup.cfg deleted file mode 100644 index 1ce0c22..0000000 --- a/vector_perception_utils/setup.cfg +++ /dev/null @@ -1,5 +0,0 @@ -[develop] -script_dir=$base/lib/vector_perception_utils -[install] -install_scripts=$base/lib/vector_perception_utils - diff --git a/vector_perception_utils/vector_perception_utils/pointcloud_utils.py b/vector_perception_utils/vector_perception_utils/pointcloud_utils.py index 835db76..e86678d 100644 --- a/vector_perception_utils/vector_perception_utils/pointcloud_utils.py +++ b/vector_perception_utils/vector_perception_utils/pointcloud_utils.py @@ -148,6 +148,75 @@ def remove_radius_outliers( return points_filtered, colors_filtered, inlier_indices +def fit_plane( + points: np.ndarray, + distance_threshold: float = 0.05, + ransac_n: int = 3, + num_iterations: int = 50, + max_points: int = 8000, + min_inlier_ratio: float = 0.0, + min_normal_z: float = 0.0, + min_above_ratio: float = 0.0, + prefer_up: bool = True, +) -> Optional[np.ndarray]: + """Fit a plane to a point cloud using RANSAC. + + Args: + points: Point coordinates (N, 3). + distance_threshold: Max distance for inliers during RANSAC. + ransac_n: Number of points to sample per RANSAC iteration. + num_iterations: RANSAC iterations. + max_points: Max points to sample for fitting (0 keeps all). + min_inlier_ratio: Minimum inlier ratio to accept the plane. + min_normal_z: Minimum absolute Z component of the plane normal. + min_above_ratio: Minimum ratio of points above the plane. + prefer_up: If True, flip normal to have positive Z. + + Returns: + Plane coefficients [a, b, c, d] for ax + by + cz + d = 0, or None. 
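+
+    Example (illustrative sketch; ``points`` is an assumed (N, 3) array):
+        plane = fit_plane(points, distance_threshold=0.05, min_normal_z=0.8, prefer_up=True)
+        if plane is not None:
+            a, b, c, d = plane
+            signed_dist = points @ plane[:3] + d  # positive above the upward-facing plane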
+ """ + if points.shape[0] < 3: + return None + + sample_points = points + if max_points > 0 and points.shape[0] > max_points: + indices = np.random.choice(points.shape[0], max_points, replace=False) + sample_points = points[indices] + + pcd = create_pointcloud(sample_points) + plane_model, inliers = pcd.segment_plane( + distance_threshold=distance_threshold, + ransac_n=ransac_n, + num_iterations=num_iterations, + ) + if len(inliers) == 0: + return None + if min_inlier_ratio > 0.0 and len(inliers) / sample_points.shape[0] < min_inlier_ratio: + return None + + normal = np.array(plane_model[:3], dtype=float) + norm = float(np.linalg.norm(normal)) + if norm == 0.0: + return None + normal /= norm + d = float(plane_model[3]) / norm + + if min_normal_z > 0.0 and abs(normal[2]) < min_normal_z: + return None + if prefer_up and normal[2] < 0: + normal = -normal + d = -d + + if min_above_ratio > 0.0: + check_points = sample_points + distances = check_points @ normal + d + above_ratio = float(np.mean(distances > 0.0)) + if above_ratio < min_above_ratio: + return None + + return np.array([normal[0], normal[1], normal[2], d], dtype=float) + + def transform_pointcloud( points: np.ndarray, transform: np.ndarray @@ -199,42 +268,126 @@ def crop_pointcloud_bbox( def compute_centroid(points: np.ndarray) -> np.ndarray: """ Compute centroid of point cloud. - + Args: points: Point coordinates (N, 3) - + Returns: Centroid as (3,) array """ return np.mean(points, axis=0) +def get_bbox_center(bbox) -> np.ndarray: + """Return bounding box center for Open3D AABB/OBB variants.""" + if hasattr(bbox, "center"): + return np.asarray(bbox.center) + if hasattr(bbox, "get_center"): + return np.asarray(bbox.get_center()) + if hasattr(bbox, "get_min_bound") and hasattr(bbox, "get_max_bound"): + min_bound = np.asarray(bbox.get_min_bound()) + max_bound = np.asarray(bbox.get_max_bound()) + return (min_bound + max_bound) * 0.5 + return np.zeros(3, dtype=float) + + +def get_bbox_extent(bbox) -> np.ndarray: + """Return bounding box extent for Open3D AABB/OBB variants.""" + if hasattr(bbox, "extent"): + return np.asarray(bbox.extent) + if hasattr(bbox, "get_extent"): + return np.asarray(bbox.get_extent()) + if hasattr(bbox, "get_min_bound") and hasattr(bbox, "get_max_bound"): + min_bound = np.asarray(bbox.get_min_bound()) + max_bound = np.asarray(bbox.get_max_bound()) + return max_bound - min_bound + return np.zeros(3, dtype=float) + + +def get_bbox_corners(bbox) -> np.ndarray: + """Return 8 bbox corners for Open3D AABB/OBB variants.""" + if hasattr(bbox, "get_box_points"): + return np.asarray(bbox.get_box_points()) + if hasattr(bbox, "get_min_bound") and hasattr(bbox, "get_max_bound"): + min_bound = np.asarray(bbox.get_min_bound()) + max_bound = np.asarray(bbox.get_max_bound()) + return np.array( + [ + [min_bound[0], min_bound[1], min_bound[2]], + [max_bound[0], min_bound[1], min_bound[2]], + [max_bound[0], max_bound[1], min_bound[2]], + [min_bound[0], max_bound[1], min_bound[2]], + [min_bound[0], min_bound[1], max_bound[2]], + [max_bound[0], min_bound[1], max_bound[2]], + [max_bound[0], max_bound[1], max_bound[2]], + [min_bound[0], max_bound[1], max_bound[2]], + ], + dtype=float, + ) + return np.zeros((0, 3), dtype=float) + + +def get_oriented_bbox( + points: np.ndarray, + axis_aligned: bool = True, +) -> Optional[Tuple[np.ndarray, np.ndarray]]: + """ + Compute bounding box from point cloud. + + Args: + points: Point coordinates (N, 3) + axis_aligned: If True, compute axis-aligned bounding box. 
+ If False, compute minimal oriented bounding box. + + Returns: + Tuple of (center, corners) where center is (3,) and corners is (8, 3), + or None if insufficient points. + """ + if len(points) < 4: + return None + + pcd = create_pointcloud(points) + + try: + if axis_aligned: + bbox = pcd.get_axis_aligned_bounding_box() + else: + bbox = pcd.get_minimal_oriented_bounding_box() + except Exception: + return None + + center = get_bbox_center(bbox) + corners = get_bbox_corners(bbox) + return center, corners + def pointcloud_to_bbox3d(points: np.ndarray, header=None): """ Convert point cloud to minimal oriented 3D bounding box (ROS2 message). - + Args: points: Point coordinates (N, 3) header: Optional ROS2 header for the message - + Returns: vision_msgs/BoundingBox3D message or None if insufficient points """ - + if len(points) < 4: return None pcd = create_pointcloud(points) - + try: obb = pcd.get_minimal_oriented_bounding_box() except: return None - - center = obb.center - extent = obb.extent - quat = Rotation.from_matrix(obb.R).as_quat() - + center = get_bbox_center(obb) + extent = get_bbox_extent(obb) + if hasattr(obb, "R"): + quat = Rotation.from_matrix(obb.R).as_quat() + else: + quat = np.array([0.0, 0.0, 0.0, 1.0], dtype=float) + bbox = BoundingBox3D() bbox.center.position = Point( x=float(center[0]), y=float(center[1]), z=float(center[2]) @@ -243,7 +396,7 @@ def pointcloud_to_bbox3d(points: np.ndarray, header=None): x=float(quat[0]), y=float(quat[1]), z=float(quat[2]), w=float(quat[3]) ) bbox.size = Vector3(x=float(extent[0]), y=float(extent[1]), z=float(extent[2])) - + return bbox @@ -328,6 +481,192 @@ def pointcloud_to_depth( return depth_image +def project_pointcloud_to_image( + points: np.ndarray, + projection: str = "equirect", + image_width: int = 1920, + image_height: int = 640, + axis_mode: str = "xz", + intrinsics: Optional[np.ndarray] = None, + depth_filter: bool = False, + depth_filter_margin: float = 0.15, + transform: Optional[np.ndarray] = None, +) -> Tuple[np.ndarray, np.ndarray]: + """ + Project a point cloud into image pixel coordinates. + + Args: + points: Point coordinates (N, 3). + projection: Projection model ("equirect" or "pinhole"). + image_width: Output image width in pixels. + image_height: Output image height in pixels. + axis_mode: Equirect axis convention ("xz" or "xy"). + intrinsics: Pinhole intrinsics (3, 3) if using "pinhole". + depth_filter: Enable depth-based occlusion filtering. + depth_filter_margin: Depth margin for occlusion filtering (meters). + transform: Optional 4x4 transform applied before projection. + + Returns: + Tuple of (camera_points, pixel_idx) where pixel_idx is (N, 3) with (u, v, depth). 
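+
+    For the default ``equirect`` projection with ``axis_mode="xz"`` the implemented
+    mapping is u = W / (2*pi) * atan2(x, z) + W / 2 + 1 and
+    v = W / (2*pi) * atan(y / sqrt(x**2 + z**2)) + H / 2 + 1, with W = image_width
+    and H = image_height, after which u and v are clipped to the image bounds.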
+ """ + camera_points = transform_pointcloud(points, transform) if transform is not None else points + + if projection == "pinhole": + if intrinsics is None: + raise ValueError("intrinsics required for pinhole projection") + u = camera_points[:, 0] * intrinsics[0, 0] / (camera_points[:, 2] + 1e-6) + intrinsics[0, 2] + v = camera_points[:, 1] * intrinsics[1, 1] / (camera_points[:, 2] + 1e-6) + intrinsics[1, 2] + return camera_points, np.stack([u, v, camera_points[:, 2]], axis=-1) + + if axis_mode == "xz": + hori_dis = np.sqrt(camera_points[:, 0] ** 2 + camera_points[:, 2] ** 2) + u = ( + image_width / (2 * np.pi) * np.arctan2(camera_points[:, 0], camera_points[:, 2]) + + image_width / 2 + + 1 + ) + v = ( + image_width / (2 * np.pi) * np.arctan(camera_points[:, 1] / hori_dis) + + image_height / 2 + + 1 + ) + else: + hori_dis = np.sqrt(camera_points[:, 0] ** 2 + camera_points[:, 1] ** 2) + u = ( + -image_width / (2 * np.pi) * np.arctan2(camera_points[:, 1], camera_points[:, 0]) + + image_width / 2 + + 1 + ) + v = ( + -image_width / (2 * np.pi) * np.arctan2(camera_points[:, 2], hori_dis) + + image_height / 2 + + 1 + ) + + u = np.clip(u, 0, image_width - 1) + v = np.clip(v, 0, image_height - 1) + + if depth_filter: + depth_map = np.full((image_height, image_width), np.inf) + idx = v.astype(int) * image_width + u.astype(int) + np.minimum.at(depth_map.ravel(), idx, hori_dis) + remove_mask = hori_dis >= depth_map[v.astype(int), u.astype(int)] + depth_filter_margin + u = u.copy() + v = v.copy() + u[remove_mask] = -1 + v[remove_mask] = -1 + + return camera_points, np.stack([u, v, hori_dis], axis=-1) + + +def segment_pointcloud_by_mask( + points: np.ndarray, + mask: np.ndarray, + pixel_idx: np.ndarray, +) -> np.ndarray: + """ + Segment a point cloud using a binary mask and projected pixels. + + Args: + points: Point coordinates (N, 3). + mask: Binary mask (H, W). + pixel_idx: Pixel coordinates (N, 3) as (u, v, depth). + + Returns: + Masked point cloud (M, 3). + """ + h, w = mask.shape[:2] + pixel_idx = pixel_idx.astype(int) + valid = ( + (pixel_idx[:, 0] >= 0) + & (pixel_idx[:, 0] < w) + & (pixel_idx[:, 1] >= 0) + & (pixel_idx[:, 1] < h) + ) + pixel_idx = pixel_idx[valid] + points = points[valid] + keep = mask[pixel_idx[:, 1], pixel_idx[:, 0]].astype(bool) + return points[keep] + + +def segment_pointclouds_from_projection( + points: np.ndarray, + masks: list[np.ndarray], + pixel_idx: np.ndarray, +) -> Tuple[list[np.ndarray], list[np.ndarray]]: + """ + Segment point clouds from projected pixels. + + Args: + points: Point coordinates (N, 3). + masks: List of binary masks (H, W). + pixel_idx: Pixel coordinates (N, 3) as (u, v, depth). + + Returns: + Tuple of (clouds, depths) where each entry corresponds to a mask. 
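+
+    Example (illustrative sketch; ``points`` and ``masks`` are assumed inputs in the sensor frame):
+        _, pixel_idx = project_pointcloud_to_image(points, image_width=1920, image_height=640)
+        clouds, depths = segment_pointclouds_from_projection(points, masks, pixel_idx)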
+ """ + if masks is None or len(masks) == 0: + return [], [] + + image_shape = masks[0].shape + valid = ( + (pixel_idx[:, 0] >= 0) + & (pixel_idx[:, 0] < image_shape[1]) + & (pixel_idx[:, 1] >= 0) + & (pixel_idx[:, 1] < image_shape[0]) + ) + pixel_idx = pixel_idx[valid] + points = points[valid] + + depths = pixel_idx[:, 2] + pixel_idx = pixel_idx.astype(int) + + obj_clouds = [] + obj_depths = [] + for obj_mask in masks: + cloud_mask = obj_mask[pixel_idx[:, 1], pixel_idx[:, 0]].astype(bool) + obj_clouds.append(points[cloud_mask]) + obj_depths.append(depths[cloud_mask].reshape(-1, 1)) + + return obj_clouds, obj_depths + + +def transform_segmented_clouds_to_world( + clouds: list[np.ndarray], + depths: list[np.ndarray], + R_b2w: np.ndarray, + t_b2w: np.ndarray, + depth_jump_threshold: float = 0.3, +) -> list[np.ndarray]: + """ + Prune depth jumps and transform segmented clouds to world frame. + + Args: + clouds: List of segmented point clouds (sensor frame). + depths: List of depth arrays aligned with clouds. + R_b2w: Body-to-world rotation matrix (3, 3). + t_b2w: Body-to-world translation vector (3,). + depth_jump_threshold: Depth jump threshold for pruning (meters). + + Returns: + List of segmented object clouds in world frame. + """ + obj_cloud_world_list = [] + for obj_cloud, obj_depth in zip(clouds, depths): + if obj_depth.shape[0] > 1: + obj_depth_diff = (obj_depth[1:] - obj_depth[:-1]).squeeze() + if obj_depth_diff.size > 0 and np.max(obj_depth_diff) > depth_jump_threshold: + idx_tmp = np.ones(len(obj_depth), dtype=bool) + jump_idx = int(np.argmax(obj_depth_diff)) + idx_tmp[jump_idx + 1 :] = False + obj_cloud = obj_cloud[idx_tmp] + + obj_cloud_world = obj_cloud[:, :3] @ R_b2w.T + t_b2w + obj_cloud_world_list.append(obj_cloud_world) + + return obj_cloud_world_list + + def create_pointcloud2_msg(points: np.ndarray, colors: np.ndarray, header): """ Create PointCloud2 ROS message from points and colors. diff --git a/vector_perception_utils/vector_perception_utils/transform_utils.py b/vector_perception_utils/vector_perception_utils/transform_utils.py index 534d8d2..ef3238e 100644 --- a/vector_perception_utils/vector_perception_utils/transform_utils.py +++ b/vector_perception_utils/vector_perception_utils/transform_utils.py @@ -8,7 +8,7 @@ from geometry_msgs.msg import Pose, Quaternion -def normalize_angle(angle: float) -> float: +def normalize_angle(angle: float | np.ndarray) -> float | np.ndarray: """ Normalize angle to [-pi, pi] range. @@ -21,6 +21,26 @@ def normalize_angle(angle: float) -> float: return np.arctan2(np.sin(angle), np.cos(angle)) +def discretize_angles(angles: np.ndarray, num_bins: int) -> np.ndarray: + """ + Discretize angles into bin indices. + + Normalizes angles to [-pi, pi] then maps to bin indices [0, num_bins-1]. + + Args: + angles: Angles in radians (any range). + num_bins: Number of angle bins. + + Returns: + Array of bin indices (0 to num_bins-1), same shape as input. + """ + angles = normalize_angle(angles) + # Map [-pi, pi] to [0, 2*pi] then to [0, num_bins) + normalized = (angles + np.pi) / (2 * np.pi) + bins = (normalized * num_bins).astype(int) + return np.clip(bins, 0, num_bins - 1) + + def pose_to_matrix(pose: Pose) -> np.ndarray: """ Convert pose to 4x4 homogeneous transform matrix.