From 7ffe106bb10e8fd0f05917e1f7f71ada1a3b551e Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Wed, 21 Jan 2026 18:34:23 -0500 Subject: [PATCH 1/6] initial working detection node refactor --- README.md | 3 + detect_anything/CMakeLists.txt | 47 ++++ detect_anything/README.md | 28 ++ detect_anything/config/objects.yaml | 46 ++++ detect_anything/detect_anything/__init__.py | 1 + detect_anything/detect_anything/detection.py | 256 ++++++++++++++++++ .../detect_anything/detection_node.py | 169 ++++++++++++ detect_anything/detect_anything/yoloe.py | 139 ++++++++++ .../env-hooks/venv_pythonpath.sh.in | 1 + detect_anything/msg/DetectionResult.msg | 10 + detect_anything/package.xml | 26 ++ detect_anything/resource/detect_anything | 1 + detect_anything/scripts/detection_node | 5 + requirements.txt | 7 +- semantic_mapping/package.xml | 27 ++ semantic_mapping/resource/semantic_mapping | 1 + semantic_mapping/semantic_mapping/__init__.py | 1 + semantic_mapping/setup.cfg | 4 + semantic_mapping/setup.py | 27 ++ vector_perception_utils/setup.cfg | 5 - 20 files changed, 796 insertions(+), 8 deletions(-) create mode 100644 detect_anything/CMakeLists.txt create mode 100644 detect_anything/README.md create mode 100644 detect_anything/config/objects.yaml create mode 100644 detect_anything/detect_anything/__init__.py create mode 100644 detect_anything/detect_anything/detection.py create mode 100644 detect_anything/detect_anything/detection_node.py create mode 100644 detect_anything/detect_anything/yoloe.py create mode 100644 detect_anything/env-hooks/venv_pythonpath.sh.in create mode 100644 detect_anything/msg/DetectionResult.msg create mode 100644 detect_anything/package.xml create mode 100644 detect_anything/resource/detect_anything create mode 100755 detect_anything/scripts/detection_node create mode 100644 semantic_mapping/package.xml create mode 100644 semantic_mapping/resource/semantic_mapping create mode 100644 semantic_mapping/semantic_mapping/__init__.py create mode 100644 semantic_mapping/setup.cfg create mode 100644 semantic_mapping/setup.py delete mode 100644 vector_perception_utils/setup.cfg diff --git a/README.md b/README.md index 49392b8..21352a8 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,8 @@ ROS2 perception stack for generalist robotics. 
## Packages - **track_anything** - EdgeTAM tracking + 3D segmentation with RGBD +- **detect_anything** - YOLO-E detection node and utilities +- **semantic_mapping** - placeholder for future mapping components (currently empty) - **vector_perception_utils** - Image and point cloud utilities ## Installation @@ -21,6 +23,7 @@ pip install -r requirements.txt # Build source /opt/ros/jazzy/setup.bash colcon build +``` diff --git a/detect_anything/CMakeLists.txt b/detect_anything/CMakeLists.txt new file mode 100644 index 0000000..c8e9188 --- /dev/null +++ b/detect_anything/CMakeLists.txt @@ -0,0 +1,47 @@ +cmake_minimum_required(VERSION 3.10) +project(detect_anything) + +find_package(ament_cmake REQUIRED) +find_package(rclpy REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(std_msgs REQUIRED) +find_package(vision_msgs REQUIRED) +find_package(cv_bridge REQUIRED) +find_package(rosidl_default_generators REQUIRED) +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +rosidl_generate_interfaces(${PROJECT_NAME} + "msg/DetectionResult.msg" + DEPENDENCIES std_msgs sensor_msgs +) + +set(PYTHON_INSTALL_DIR "lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages") + +install( + DIRECTORY detect_anything + DESTINATION ${PYTHON_INSTALL_DIR} +) + +install( + PROGRAMS + scripts/detection_node + DESTINATION lib/${PROJECT_NAME} +) + +install( + FILES resource/detect_anything + DESTINATION share/${PROJECT_NAME}/resource +) + +install( + DIRECTORY config + DESTINATION share/${PROJECT_NAME} +) + +set(ENV_HOOK "${CMAKE_CURRENT_SOURCE_DIR}/env-hooks/venv_pythonpath.sh.in") +set(ENV_HOOK_OUT "${CMAKE_CURRENT_BINARY_DIR}/ament_cmake_environment_hooks/venv_pythonpath.sh") +configure_file(${ENV_HOOK} ${ENV_HOOK_OUT} @ONLY) +ament_environment_hooks(${ENV_HOOK_OUT}) + +ament_export_dependencies(rosidl_default_runtime) +ament_package() diff --git a/detect_anything/README.md b/detect_anything/README.md new file mode 100644 index 0000000..1e94a06 --- /dev/null +++ b/detect_anything/README.md @@ -0,0 +1,28 @@ +# detect_anything + +YOLO-E detection node that publishes `DetectionResult` with cropped masks and an annotated overlay topic. + +## What’s inside +- `detect_anything/detection.py`: detection results container and Ultralytics parsing helpers. +- `detect_anything/yoloe.py`: YOLO-E wrapper with prompt support and basic filtering. +- `detect_anything/detection_node.py`: ROS2 node wiring the detector to `DetectionResult`. +- `msg/DetectionResult.msg`: compressed image + cropped mask array. + +## Quick start +```bash +source ~/vector_venv/bin/activate +source /opt/ros/jazzy/setup.bash +colcon build --packages-select detect_anything +source install/setup.bash + +ros2 run detect_anything detection_node \ --ros-args -p model_path:=/path/to/yoloe/models \ -p model_name:=yoloe-11s-seg.pt \ -p conf:=0.6 \ -p max_area_ratio:=0.3 \ -p use_compressed:=false \ -p image_topic:=/camera/image +``` + +Topics: +- Publishes `/detection_result` (`detect_anything/DetectionResult`) and `/annotated_image_detection` (`sensor_msgs/Image`). +- Subscribes to `/camera/image/compressed` by default (`use_compressed` defaults to `true`); set `use_compressed:=false` to consume raw `/camera/image` instead.
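+
+## Consuming detections
+For reference, a minimal subscriber sketch (hypothetical `detection_consumer` node; assumes the workspace above is built and sourced) that decodes the message back into numpy form via `DetectionResults.from_msg`:
+
+```python
+import rclpy
+from rclpy.node import Node
+
+from detect_anything.detection import DetectionResults
+from detect_anything.msg import DetectionResult
+
+
+class DetectionConsumer(Node):
+    """Log each detection decoded from /detection_result."""
+
+    def __init__(self) -> None:
+        super().__init__("detection_consumer")
+        self.create_subscription(DetectionResult, "/detection_result", self._callback, 10)
+
+    def _callback(self, msg: DetectionResult) -> None:
+        # from_msg re-expands the cropped masks back to full image size.
+        results = DetectionResults.from_msg(msg)
+        for det in results.detections:
+            self.get_logger().info(f"{det.label} ({det.confidence:.2f}) bbox={det.bbox}")
+
+
+def main() -> None:
+    rclpy.init()
+    node = DetectionConsumer()
+    try:
+        rclpy.spin(node)
+    finally:
+        node.destroy_node()
+        rclpy.shutdown()
+```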
diff --git a/detect_anything/config/objects.yaml b/detect_anything/config/objects.yaml new file mode 100644 index 0000000..371f186 --- /dev/null +++ b/detect_anything/config/objects.yaml @@ -0,0 +1,46 @@ +# Simple list of object names for prompting YOLO-E +- chair +- desk +- tv_monitor +- sofa +- unknown +- printer +- coffee machine +- refrigerator +- trash can +- shoe +- sink +- table +- oven +- bed +- painting +- bulletin board +- plant +- vase +- cabinet +- shelf +- book +- cup +- sculpture +- keyboard +- mouse +- clock +- phone +- toilet +- bathtub +- microwave oven +- pan +- suitcase +- light +- curtain +- whiteboard +- shower knob +- bottle +- water dispenser +- vending machine +- laptop +- bag +- locker +- picture +- cardboard +- extinguisher diff --git a/detect_anything/detect_anything/__init__.py b/detect_anything/detect_anything/__init__.py new file mode 100644 index 0000000..9c61aab --- /dev/null +++ b/detect_anything/detect_anything/__init__.py @@ -0,0 +1 @@ +"""detect_anything package.""" diff --git a/detect_anything/detect_anything/detection.py b/detect_anything/detect_anything/detection.py new file mode 100644 index 0000000..1d64d76 --- /dev/null +++ b/detect_anything/detect_anything/detection.py @@ -0,0 +1,256 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import List, Optional, Sequence + +from cv_bridge import CvBridge +import cv2 +import numpy as np + +from detect_anything.msg import DetectionResult + +_BRIDGE = CvBridge() + + +@dataclass +class SingleDetectionResult: + """Simple container for a YOLO/Ultralytics detection with a segmentation mask.""" + + bbox: tuple[float, float, float, float] + track_id: int + class_id: int + label: str + confidence: float + mask: np.ndarray + + def area(self) -> float: + x1, y1, x2, y2 = self.bbox + return max(0.0, (x2 - x1)) * max(0.0, (y2 - y1)) + + +@dataclass +class DetectionResults: + """Container for detections plus the source image.""" + + image: np.ndarray + detections: List[SingleDetectionResult] + + def overlay(self, alpha: float = 0.4, thickness: int = 2) -> np.ndarray: + """Draw bboxes and masks on the image.""" + output = self.image.copy() + for det in self.detections: + rng = np.random.default_rng(det.track_id) + color = tuple(int(c) for c in rng.integers(50, 255, size=3)) + + if det.mask.size: + mask_bool = det.mask.astype(bool) + if mask_bool.shape[:2] == output.shape[:2]: + colored_mask = np.zeros_like(output) + colored_mask[mask_bool] = color + output[mask_bool] = cv2.addWeighted( + output[mask_bool], 1 - alpha, colored_mask[mask_bool], alpha, 0 + ) + contours, _ = cv2.findContours(det.mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(output, contours, -1, color, thickness) + + x1, y1, x2, y2 = map(int, det.bbox) + cv2.rectangle(output, (x1, y1), (x2, y2), color, thickness) + + label_text = f"{det.label} {det.confidence:.2f}" + label_text = f"{label_text} | id {det.track_id}" + + (text_w, text_h), baseline = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) + cv2.rectangle( + output, + (x1, max(0, y1 - text_h - baseline - 2)), + (x1 + text_w, max(0, y1)), + color, + -1, + ) + cv2.putText( + output, + label_text, + (x1, max(0, y1 - baseline - 2)), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + cv2.LINE_AA, + ) + return output + + def to_msg(self, header) -> DetectionResult: + """ + Convert detections into a DetectionResult ROS message. 
+ """ + msg = DetectionResult() + msg.header = header + msg.image = _BRIDGE.cv2_to_compressed_imgmsg(self.image, dst_format="jpeg") + msg.image.header = header + + for det in self.detections: + msg.track_id.append(int(det.track_id)) + msg.x1.append(float(det.bbox[0])) + msg.y1.append(float(det.bbox[1])) + msg.x2.append(float(det.bbox[2])) + msg.y2.append(float(det.bbox[3])) + msg.label.append(str(det.label)) + msg.confidence.append(float(det.confidence)) + + mask = det.mask + # Crop mask to bbox and align origin to bbox top-left for transport. + x1, y1, x2, y2 = [int(v) for v in det.bbox] + h, w = self.image.shape[:2] + x1 = max(0, min(x1, w)) + x2 = max(0, min(x2, w)) + y1 = max(0, min(y1, h)) + y2 = max(0, min(y2, h)) + crop_h = max(1, y2 - y1) + crop_w = max(1, x2 - x1) + if mask.size == 0: + mask = np.zeros((crop_h, crop_w), dtype=np.uint8) + else: + mask = mask[y1:y2, x1:x2] + if mask.shape != (crop_h, crop_w): + mask = cv2.resize( + mask.astype(np.uint8), + (crop_w, crop_h), + interpolation=cv2.INTER_NEAREST, + ) + + mask_msg = _BRIDGE.cv2_to_compressed_imgmsg(mask, dst_format="png") + mask_msg.header = header + msg.masks.append(mask_msg) + + return msg + + @classmethod + def from_msg(cls, msg: DetectionResult) -> DetectionResults: + """ + Construct DetectionResults from a DetectionResult ROS message. + Masks are converted back to full-size overlays at bbox locations. + """ + image = _BRIDGE.compressed_imgmsg_to_cv2(msg.image) + detections: List[SingleDetectionResult] = [] + + for i in range(len(msg.track_id)): + bbox = ( + float(msg.x1[i]), + float(msg.y1[i]), + float(msg.x2[i]), + float(msg.y2[i]), + ) + mask_img = _BRIDGE.compressed_imgmsg_to_cv2(msg.masks[i], desired_encoding="mono8") + + full_mask = np.zeros(image.shape[:2], dtype=np.uint8) + x1, y1, x2, y2 = [int(v) for v in bbox] + h, w = image.shape[:2] + x1 = max(0, min(x1, w)) + x2 = max(0, min(x2, w)) + y1 = max(0, min(y1, h)) + y2 = max(0, min(y2, h)) + crop_h = max(1, y2 - y1) + crop_w = max(1, x2 - x1) + if mask_img.shape[:2] != (crop_h, crop_w): + mask_img = cv2.resize(mask_img, (crop_w, crop_h), interpolation=cv2.INTER_NEAREST) + full_mask[y1:y2, x1:x2] = mask_img + + detections.append( + SingleDetectionResult( + bbox=bbox, + track_id=int(msg.track_id[i]), + class_id=0, # class_id is not transmitted in DetectionResult + label=str(msg.label[i]), + confidence=float(msg.confidence[i]), + mask=full_mask, + ) + ) + + return cls(image=image, detections=detections) + + +def _from_ultralytics_result( + result, + idx: int, + image_shape: Optional[tuple[int, int]] = None, + names: Optional[Sequence[str]] = None, +) -> SingleDetectionResult: + """Convert a single Ultralytics Results entry into a SingleDetectionResult.""" + from ultralytics.engine.results import Results # local import + + if not isinstance(result, Results): + raise TypeError("result must be an ultralytics Results object") + if result.boxes is None: + raise ValueError("Result has no boxes") + + bbox_array = result.boxes.xyxy[idx].cpu().numpy() + bbox = ( + float(bbox_array[0]), + float(bbox_array[1]), + float(bbox_array[2]), + float(bbox_array[3]), + ) + + confidence = float(result.boxes.conf[idx].cpu()) + class_id = int(result.boxes.cls[idx].cpu()) + + resolved_names = names + if resolved_names is None and hasattr(result, "names"): + resolved_names = result.names # type: ignore[assignment] + + label = f"class_{class_id}" + if resolved_names is not None: + if isinstance(resolved_names, dict): + label = resolved_names.get(class_id, label) + elif 
isinstance(resolved_names, (list, tuple)) and class_id < len(resolved_names): + label = resolved_names[class_id] + + track_id = idx + if hasattr(result.boxes, "id") and result.boxes.id is not None: + track_id = int(result.boxes.id[idx].cpu()) + + target_h, target_w = (image_shape or (None, None)) + if target_h is None or target_w is None: + if hasattr(result, "orig_shape") and result.orig_shape: + target_h, target_w = result.orig_shape[:2] + + mask = np.zeros((target_h, target_w), dtype=np.uint8) if target_h and target_w else np.zeros((0, 0), dtype=np.uint8) + if result.masks is not None and idx < len(result.masks.data): + mask_tensor = result.masks.data[idx] + mask_np = mask_tensor.cpu().numpy() + + if mask_np.ndim == 3: + mask_np = mask_np.squeeze() + + if target_h and target_w and mask_np.shape != (target_h, target_w): + mask_np = cv2.resize(mask_np.astype(np.float32), (target_w, target_h), interpolation=cv2.INTER_LINEAR) + + mask = (mask_np > 0.5).astype(np.uint8) * 255 # type: ignore[assignment] + + return SingleDetectionResult( + bbox=bbox, + track_id=track_id, + class_id=class_id, + label=label, + confidence=confidence, + mask=mask, + ) + + +def from_ultralytics_results( + results, + image_shape: Optional[tuple[int, int]] = None, + names: Optional[Sequence[str]] = None, +) -> List[SingleDetectionResult]: + """Convert a list of Ultralytics Results objects into detections with masks.""" + detections: List[SingleDetectionResult] = [] + for result in results: + if result.boxes is None: + continue + num_detections = len(result.boxes.xyxy) + for i in range(num_detections): + detections.append(_from_ultralytics_result(result, i, image_shape=image_shape, names=names)) + return detections + + +__all__ = ["DetectionResults", "SingleDetectionResult", "from_ultralytics_results"] diff --git a/detect_anything/detect_anything/detection_node.py b/detect_anything/detect_anything/detection_node.py new file mode 100644 index 0000000..4d9f339 --- /dev/null +++ b/detect_anything/detect_anything/detection_node.py @@ -0,0 +1,169 @@ +from __future__ import annotations + +from pathlib import Path +from typing import List, Optional + +import yaml +import rclpy +from cv_bridge import CvBridge +from sensor_msgs.msg import CompressedImage, Image +from rclpy.node import Node +from rclpy.qos import DurabilityPolicy, QoSProfile, ReliabilityPolicy + +from detect_anything.detection import DetectionResults +from detect_anything.yoloe import Yoloe2DDetector, YoloePromptMode +from detect_anything.msg import DetectionResult + + +class DetectionNode(Node): + """ROS2 node that runs YOLO-E detection and publishes DetectionResult.""" + + def __init__(self) -> None: + super().__init__("detect_anything_detection_node") + self.bridge = CvBridge() + + # Parameters + defaults = self._default_params() + self.declare_parameter("image_topic", defaults["image_topic"]) + self.declare_parameter("compressed_image_topic", defaults["compressed_image_topic"]) + self.declare_parameter("use_compressed", defaults["use_compressed"]) + self.declare_parameter("model_path", defaults["model_path"]) + self.declare_parameter("model_name", defaults["model_name"]) + self.declare_parameter("device", defaults["device"]) + self.declare_parameter("prompt_mode", defaults["prompt_mode"]) + self.declare_parameter("objects_file", defaults["objects_file"]) + self.declare_parameter("conf", defaults["conf"]) + self.declare_parameter("max_area_ratio", defaults["max_area_ratio"]) + + params = self._default_params() + image_topic = 
str(self.get_parameter("image_topic").value) + compressed_image_topic = str(self.get_parameter("compressed_image_topic").value) + self.use_compressed = bool(self.get_parameter("use_compressed").value) + + model_path_val = self.get_parameter("model_path").value + model_path = str(model_path_val) if model_path_val not in (None, "") else params["model_path"] + if str(model_path).lower() == "none": + model_path = params["model_path"] + + model_name = str(self.get_parameter("model_name").value or params["model_name"]) + device = str(self.get_parameter("device").value or params["device"]) + prompt_mode_str = str(self.get_parameter("prompt_mode").value or params["prompt_mode"]).lower() + objects_file = Path(str(self.get_parameter("objects_file").value or params["objects_file"])) + conf = float(self.get_parameter("conf").value) + max_area_ratio = float(self.get_parameter("max_area_ratio").value) + + prompt_mode = YoloePromptMode(prompt_mode_str) + + # Auto-pick the right default weight if user didn't override + if prompt_mode == YoloePromptMode.LRPC and ("seg.pt" in model_name and "seg-pf.pt" not in model_name): + model_name = "yoloe-11l-seg-pf.pt" + self.get_logger().info(f"Prompt-free mode: switching model_name to {model_name}") + if prompt_mode == YoloePromptMode.PROMPT and "seg-pf.pt" in model_name: + model_name = "yoloe-11l-seg.pt" + self.get_logger().info(f"Prompt mode: switching model_name to {model_name}") + + self.detector = Yoloe2DDetector( + model_path=model_path, + model_name=model_name, + device=device or None, + prompt_mode=prompt_mode, + max_area_ratio=max_area_ratio, + conf=conf, + ) + + prompts = self._load_prompts(objects_file) + if prompts and prompt_mode == YoloePromptMode.PROMPT: + self.detector.set_prompts(text=prompts) + self.get_logger().info(f"Loaded {len(prompts)} prompts from {objects_file}") + + qos = QoSProfile( + depth=10, + reliability=ReliabilityPolicy.RELIABLE, + durability=DurabilityPolicy.VOLATILE, + ) + if self.use_compressed: + self.create_subscription( + CompressedImage, + compressed_image_topic, + self._compressed_image_callback, + qos, + ) + else: + self.create_subscription( + Image, + image_topic, + self._image_callback, + qos, + ) + + self.detection_pub = self.create_publisher(DetectionResult, "/detection_result", qos) + self.overlay_pub = self.create_publisher(Image, "/annotated_image_detection", qos) + + self.get_logger().info("detect_anything detection node started") + + def _default_params(self) -> dict: + base_share = Path(__file__).resolve().parent.parent + return { + "image_topic": "/camera/image", + "compressed_image_topic": "/camera/image/compressed", + "use_compressed": True, + "model_path": str(base_share / "models"), + "model_name": "yoloe-11l-seg.pt", + "device": "cuda", + "prompt_mode": YoloePromptMode.PROMPT.value, + "objects_file": str(base_share / "config" / "objects.yaml"), + "conf": 0.5, + "max_area_ratio": 0.3, + } + + def _load_prompts(self, objects_file: Path) -> List[str]: + if not objects_file.exists(): + return [] + try: + with open(objects_file, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) or [] + if isinstance(data, list): + return [str(p) for p in data] + except Exception as exc: + self.get_logger().warn(f"Failed to read prompts from {objects_file}: {exc}") + return [] + + def _image_callback(self, msg: Image) -> None: + cv_image = self.bridge.imgmsg_to_cv2(msg, desired_encoding="bgr8") + self._process_image(cv_image, msg.header) + + def _compressed_image_callback(self, msg: CompressedImage) -> None: + cv_image = 
self.bridge.compressed_imgmsg_to_cv2(msg, desired_encoding="bgr8") + self._process_image(cv_image, msg.header) + + def _process_image(self, cv_image, header) -> None: + detections: DetectionResults = self.detector.process_image(cv_image) + + detection_msg = detections.to_msg(header) + self.detection_pub.publish(detection_msg) + + overlay = detections.overlay() + overlay_msg = self.bridge.cv2_to_imgmsg(overlay, encoding="bgr8") + overlay_msg.header = header + self.overlay_pub.publish(overlay_msg) + + def destroy_node(self) -> bool: + if hasattr(self, "detector") and self.detector: + self.detector.stop() + return super().destroy_node() + + +def main(args: Optional[list[str]] = None) -> None: + rclpy.init(args=args) + node = DetectionNode() + try: + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/detect_anything/detect_anything/yoloe.py b/detect_anything/detect_anything/yoloe.py new file mode 100644 index 0000000..c0695c3 --- /dev/null +++ b/detect_anything/detect_anything/yoloe.py @@ -0,0 +1,139 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +import threading +from typing import Any, List, Optional, Sequence + +import numpy as np +import torch +from ultralytics import YOLOE # type: ignore[attr-defined, import-not-found] + +from detect_anything.detection import ( + DetectionResults, + SingleDetectionResult, + from_ultralytics_results, +) + + +class YoloePromptMode(Enum): + """YOLO-E prompt modes.""" + + LRPC = "lrpc" + PROMPT = "prompt" + + +class Yoloe2DDetector: + """Thin wrapper around YOLO-E for 2D detection + tracking with masks.""" + + def __init__( + self, + model_path: str | None = None, + model_name: str | None = None, + device: str | None = None, + prompt_mode: YoloePromptMode = YoloePromptMode.LRPC, + max_area_ratio: Optional[float] = 0.3, + conf: float = 0.6, + iou: float = 0.6, + ) -> None: + if model_name is None: + model_name = "yoloe-11s-seg-pf.pt" if prompt_mode == YoloePromptMode.LRPC else "yoloe-11s-seg.pt" + + model_dir = Path(model_path) if model_path else Path("models/yoloe") + self.model = YOLOE(model_dir / model_name) + self.prompt_mode = prompt_mode + self._visual_prompts: dict[str, np.ndarray] | None = None + self.max_area_ratio = max_area_ratio + self._lock = threading.Lock() + self.conf = conf + self.iou = iou + self._names_override: Sequence[str] | None = None + + if prompt_mode == YoloePromptMode.PROMPT: + self.set_prompts(text=["nothing"]) + + if self.max_area_ratio is not None and not (0.0 < self.max_area_ratio <= 1.0): + raise ValueError("max_area_ratio must be in the range (0, 1].") + + if device: + self.device = device + elif torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + + def set_prompts( + self, + text: Optional[List[str]] = None, + bboxes: Optional[np.ndarray] = None, + ) -> None: + """Set prompts for detection. Provide either text or bboxes, not both.""" + if self.prompt_mode == YoloePromptMode.LRPC: + # Prompt-free model cannot accept text/visual prompts. 
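+            # (Prompt-free LRPC weights ship with a built-in vocabulary, so
+            # incoming prompts are ignored here rather than raising.)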
+ return + if text is not None and bboxes is not None: + raise ValueError("Provide either text or bboxes, not both.") + if text is None and bboxes is None: + raise ValueError("Must provide either text or bboxes.") + + with self._lock: + self.model.predictor = None + if text is not None: + self.model.set_classes(text, self.model.get_text_pe(text)) # type: ignore[no-untyped-call] + self._visual_prompts = None + self._names_override = list(text) + else: + cls = np.arange(len(bboxes), dtype=np.int16) # type: ignore[arg-type] + self._visual_prompts = {"bboxes": bboxes, "cls": cls} # type: ignore[dict-item] + self._names_override = None + + def process_image(self, image: np.ndarray) -> DetectionResults: + """Process an image and return detection results.""" + track_kwargs: dict[str, Any] = { + "source": image, + "device": self.device, + "conf": self.conf, + "iou": self.iou, + "persist": True, + "verbose": False, + } + + with self._lock: + if self._visual_prompts is not None: + track_kwargs["visual_prompts"] = self._visual_prompts + + results = self.model.track(**track_kwargs) # type: ignore[arg-type] + + detections = from_ultralytics_results(results, image_shape=image.shape[:2], names=self._names_override) + filtered = self._apply_filters(image, detections) + return DetectionResults(image=image, detections=filtered) + + def _apply_filters( + self, + image: np.ndarray, + detections: List[SingleDetectionResult], + ) -> List[SingleDetectionResult]: + if self.max_area_ratio is None: + return detections + + image_area = float(image.shape[0] * image.shape[1]) + if image_area <= 0: + return detections + + return [det for det in detections if (det.area() / image_area) <= self.max_area_ratio] + + def stop(self) -> None: + """Clean up resources used by the detector.""" + if hasattr(self.model, "predictor") and self.model.predictor is not None: + predictor = self.model.predictor + if hasattr(predictor, "trackers") and predictor.trackers: + for tracker in predictor.trackers: + if hasattr(tracker, "tracker") and hasattr(tracker.tracker, "gmc"): + gmc = tracker.tracker.gmc + if hasattr(gmc, "executor") and gmc.executor is not None: + gmc.executor.shutdown(wait=True) + self.model.predictor = None + + +__all__ = ["Yoloe2DDetector", "YoloePromptMode"] diff --git a/detect_anything/env-hooks/venv_pythonpath.sh.in b/detect_anything/env-hooks/venv_pythonpath.sh.in new file mode 100644 index 0000000..9212bac --- /dev/null +++ b/detect_anything/env-hooks/venv_pythonpath.sh.in @@ -0,0 +1 @@ +ament_prepend_unique_value PYTHONPATH "$AMENT_CURRENT_PREFIX/@PYTHON_INSTALL_DIR@" diff --git a/detect_anything/msg/DetectionResult.msg b/detect_anything/msg/DetectionResult.msg new file mode 100644 index 0000000..766d5f1 --- /dev/null +++ b/detect_anything/msg/DetectionResult.msg @@ -0,0 +1,10 @@ +std_msgs/Header header +int32[] track_id +float32[] x1 +float32[] y1 +float32[] x2 +float32[] y2 +string[] label +float32[] confidence +sensor_msgs/CompressedImage image +sensor_msgs/CompressedImage[] masks # cropped to bbox; aligned to bbox top-left; encoding: mono8 diff --git a/detect_anything/package.xml b/detect_anything/package.xml new file mode 100644 index 0000000..a7699ae --- /dev/null +++ b/detect_anything/package.xml @@ -0,0 +1,26 @@ + + + + detect_anything + 0.0.1 + YOLO-E detection node and utilities.
+ Alex Lin + MIT + + ament_cmake + + rosidl_default_generators + rosidl_default_runtime + rosidl_interface_packages + + rclpy + std_msgs + sensor_msgs + vision_msgs + cv_bridge + vector_perception_utils + + + ament_cmake + + diff --git a/detect_anything/resource/detect_anything b/detect_anything/resource/detect_anything new file mode 100644 index 0000000..eac574f --- /dev/null +++ b/detect_anything/resource/detect_anything @@ -0,0 +1 @@ +detect_anything diff --git a/detect_anything/scripts/detection_node b/detect_anything/scripts/detection_node new file mode 100755 index 0000000..822cacf --- /dev/null +++ b/detect_anything/scripts/detection_node @@ -0,0 +1,5 @@ +#!/usr/bin/env python3 +from detect_anything.detection_node import main + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index 7b6c110..67894e1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,13 +4,14 @@ opencv-python scipy pillow tqdm +pyyaml +lark # 3D processing open3d -# deep learning -torch<=2.4.0 -torchvision<=1.19.0 +# vision +ultralytics # SAM2 and EdgeTAM dependencies sam2 diff --git a/semantic_mapping/package.xml b/semantic_mapping/package.xml new file mode 100644 index 0000000..ca18f0e --- /dev/null +++ b/semantic_mapping/package.xml @@ -0,0 +1,27 @@ + + + + semantic_mapping + 0.0.1 + YOLO-E based detection utilities and ROS2 node for semantic mapping. + Alex Lin + MIT + + ament_cmake + + rclpy + std_msgs + sensor_msgs + vision_msgs + cv_bridge + vector_perception_utils + + ament_copyright + ament_flake8 + ament_pep257 + python3-pytest + + + ament_python + + diff --git a/semantic_mapping/resource/semantic_mapping b/semantic_mapping/resource/semantic_mapping new file mode 100644 index 0000000..8f63025 --- /dev/null +++ b/semantic_mapping/resource/semantic_mapping @@ -0,0 +1 @@ +semantic_mapping diff --git a/semantic_mapping/semantic_mapping/__init__.py b/semantic_mapping/semantic_mapping/__init__.py new file mode 100644 index 0000000..6ac0fcc --- /dev/null +++ b/semantic_mapping/semantic_mapping/__init__.py @@ -0,0 +1 @@ +"""Semantic mapping ROS2 package.""" diff --git a/semantic_mapping/setup.cfg b/semantic_mapping/setup.cfg new file mode 100644 index 0000000..31239a6 --- /dev/null +++ b/semantic_mapping/setup.cfg @@ -0,0 +1,4 @@ +[develop] +script_dir=$base/lib/semantic_mapping +[install] +install_scripts=$base/lib/semantic_mapping diff --git a/semantic_mapping/setup.py b/semantic_mapping/setup.py new file mode 100644 index 0000000..fc5260b --- /dev/null +++ b/semantic_mapping/setup.py @@ -0,0 +1,27 @@ +from glob import glob +import os + +from setuptools import find_packages, setup + +package_name = "semantic_mapping" + +setup( + name=package_name, + version="0.0.1", + packages=find_packages(exclude=["test"]), + package_data={package_name: []}, + data_files=[ + ("share/ament_index/resource_index/packages", ["resource/" + package_name]), + ("share/" + package_name, ["package.xml"]), + (os.path.join("share", package_name, "launch"), glob(os.path.join("launch", "*launch.[pxy][yma]*"))), + (os.path.join("share", package_name, "config"), glob(os.path.join("config", "*.yaml"))), + ], + install_requires=["setuptools"], + zip_safe=True, + maintainer="Alex Lin", + maintainer_email="alex.lin416@outlook.com", + description="Semantic mapping utilities with a YOLO-E detector node.", + license="MIT", + tests_require=["pytest"], + entry_points={"console_scripts": []}, +) diff --git a/vector_perception_utils/setup.cfg b/vector_perception_utils/setup.cfg deleted file mode 100644 index 
1ce0c22..0000000 --- a/vector_perception_utils/setup.cfg +++ /dev/null @@ -1,5 +0,0 @@ -[develop] -script_dir=$base/lib/vector_perception_utils -[install] -install_scripts=$base/lib/vector_perception_utils - From 45c180478c1bacfa0a2ac8b8e7ca6db218038dc0 Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Thu, 22 Jan 2026 23:32:11 -0500 Subject: [PATCH 2/6] initial migration --- .gitignore | 7 + .../detect_anything/detection_node.py | 2 +- semantic_mapping/CMakeLists.txt | 52 ++ semantic_mapping/README.md | 117 +++ .../config/sensor_to_camera_tf.yaml | 11 + .../docs/SEMANTIC_MAPPING_MIGRATION.md | 452 ++++++++++++ semantic_mapping/msg/ObjectNode.msg | 35 + semantic_mapping/msg/ObjectType.msg | 17 + .../msg/TargetObjectInstruction.msg | 22 + semantic_mapping/msg/ViewpointRep.msg | 8 + semantic_mapping/package.xml | 18 +- semantic_mapping/resource/semantic_mapping | 1 - .../scripts/semantic_mapping_node | 7 + semantic_mapping/semantic_mapping/__init__.py | 2 +- .../semantic_mapping/map_object.py | 683 ++++++++++++++++++ .../semantic_mapping/map_object_utils.py | 217 ++++++ semantic_mapping/semantic_mapping/mapper.py | 257 +++++++ semantic_mapping/semantic_mapping/matching.py | 246 +++++++ semantic_mapping/semantic_mapping/storage.py | 91 +++ semantic_mapping/setup.cfg | 4 - semantic_mapping/setup.py | 27 - .../pointcloud_utils.py | 230 +++++- .../transform_utils.py | 22 +- 23 files changed, 2476 insertions(+), 52 deletions(-) create mode 100644 semantic_mapping/CMakeLists.txt create mode 100644 semantic_mapping/README.md create mode 100644 semantic_mapping/config/sensor_to_camera_tf.yaml create mode 100644 semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md create mode 100644 semantic_mapping/msg/ObjectNode.msg create mode 100644 semantic_mapping/msg/ObjectType.msg create mode 100644 semantic_mapping/msg/TargetObjectInstruction.msg create mode 100644 semantic_mapping/msg/ViewpointRep.msg create mode 100644 semantic_mapping/scripts/semantic_mapping_node create mode 100644 semantic_mapping/semantic_mapping/map_object.py create mode 100644 semantic_mapping/semantic_mapping/map_object_utils.py create mode 100644 semantic_mapping/semantic_mapping/mapper.py create mode 100644 semantic_mapping/semantic_mapping/matching.py create mode 100644 semantic_mapping/semantic_mapping/storage.py delete mode 100644 semantic_mapping/setup.cfg delete mode 100644 semantic_mapping/setup.py diff --git a/.gitignore b/.gitignore index ccd784f..36a793e 100644 --- a/.gitignore +++ b/.gitignore @@ -209,3 +209,10 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + +# model files +*.pt +*.pth +*.onnx +*.engine +*.ts diff --git a/detect_anything/detect_anything/detection_node.py b/detect_anything/detect_anything/detection_node.py index 4d9f339..aebd0e8 100644 --- a/detect_anything/detect_anything/detection_node.py +++ b/detect_anything/detect_anything/detection_node.py @@ -112,7 +112,7 @@ def _default_params(self) -> dict: "device": "cuda", "prompt_mode": YoloePromptMode.PROMPT.value, "objects_file": str(base_share / "config" / "objects.yaml"), - "conf": 0.5, + "conf": 0.25, "max_area_ratio": 0.3, } diff --git a/semantic_mapping/CMakeLists.txt b/semantic_mapping/CMakeLists.txt new file mode 100644 index 0000000..7266468 --- /dev/null +++ b/semantic_mapping/CMakeLists.txt @@ -0,0 +1,52 @@ +cmake_minimum_required(VERSION 3.10) +project(semantic_mapping) + +find_package(ament_cmake REQUIRED) +find_package(rclpy REQUIRED) +find_package(std_msgs REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(geometry_msgs 
REQUIRED) +find_package(nav_msgs REQUIRED) +find_package(visualization_msgs REQUIRED) +find_package(cv_bridge REQUIRED) +find_package(rosidl_default_generators REQUIRED) +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +rosidl_generate_interfaces(${PROJECT_NAME} + "msg/ObjectNode.msg" + "msg/ObjectType.msg" + "msg/TargetObjectInstruction.msg" + "msg/ViewpointRep.msg" + DEPENDENCIES std_msgs sensor_msgs geometry_msgs +) + +set(PYTHON_INSTALL_DIR "lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages") + +install( + DIRECTORY semantic_mapping + DESTINATION ${PYTHON_INSTALL_DIR} +) + +install( + PROGRAMS + scripts/semantic_mapping_node + DESTINATION lib/${PROJECT_NAME} +) + +install( + FILES resource/semantic_mapping + DESTINATION share/${PROJECT_NAME}/resource +) + +install( + DIRECTORY config + DESTINATION share/${PROJECT_NAME} +) + +install( + DIRECTORY launch + DESTINATION share/${PROJECT_NAME} +) + +ament_export_dependencies(rosidl_default_runtime) +ament_package() diff --git a/semantic_mapping/README.md b/semantic_mapping/README.md new file mode 100644 index 0000000..2dfd1dc --- /dev/null +++ b/semantic_mapping/README.md @@ -0,0 +1,117 @@ +# Semantic Mapping + +ROS2 package for semantic 3D object mapping with VLM integration. + +## Overview + +This package builds a persistent semantic map of objects detected by `detect_anything`, fusing 2D detections with LiDAR point clouds to create 3D object representations. It publishes `ObjectNode` messages compatible with the tare_planner exploration system. + +## Status: Work in Progress + +| Component | File | Status | +|-----------|------|--------| +| Messages | `msg/*.msg` | Complete | +| Data Models | `map_object.py` | Complete | +| Matching | `matching.py` | Pending | +| Object Mapper | `mapper.py` | Pending | +| Cloud Projection | `vector_perception_utils/pointcloud_utils.py` | Pending | +| Publishers | `map_object_utils.py` | Pending | +| Image Storage | `storage.py` | Pending | +| Mapping Node | `mapping_node.py` | Pending | +| Launch Files | `launch/` | Pending | + +## Messages + +### ObjectNode.msg +Main output message for planner integration: +- `object_id[]`: Track IDs (may contain multiple merged IDs) +- `label`: Dominant class label +- `position`: Object centroid in world frame +- `bbox3d[8]`: 3D bounding box corners +- `cloud`: Object point cloud +- `status`: Lifecycle status ("new", "updated", "unchanged") +- `img_path`: Path to best saved image crop +- `is_asked_vlm`: VLM query flag +- `viewpoint_id`: Associated viewpoint + +### ObjectType.msg +VLM query/answer for object relabeling. + +### TargetObjectInstruction.msg +User instruction for target object search. + +### ViewpointRep.msg +Viewpoint trigger for image saving. 
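+
+Reading these from Python might look like the following sketch (hypothetical `object_node_listener` node; assumes this package's messages are built and sourced):
+
+```python
+import rclpy
+from rclpy.node import Node
+
+from semantic_mapping.msg import ObjectNode
+
+
+class ObjectNodeListener(Node):
+    """Log each semantic object as it is published."""
+
+    def __init__(self) -> None:
+        super().__init__("object_node_listener")
+        self.create_subscription(ObjectNode, "/object_nodes", self._callback, 10)
+
+    def _callback(self, msg: ObjectNode) -> None:
+        p = msg.position
+        self.get_logger().info(
+            f"[{msg.status}] {msg.label} ids={list(msg.object_id)} "
+            f"at ({p.x:.2f}, {p.y:.2f}, {p.z:.2f})"
+        )
+```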
+ +## Architecture + +``` +detect_anything/DetectionResult + │ + ▼ +┌───────────────────────────────────────┐ +│ semantic_mapping_node │ +│ │ +│ ┌─────────────┐ ┌──────────────┐ │ +│ │ CloudImage │ │ ObjectMapper │ │ +│ │ Fusion │───▶│ (ObjectDB │ │ +│ │ │ │ pattern) │ │ +│ └─────────────┘ └──────────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌─────────────┐ ┌──────────────┐ │ +│ │ MapObject │ │ Publishers │ │ +│ │ (voxels, │───▶│ (ObjectNode, │ │ +│ │ votes) │ │ Markers) │ │ +│ └─────────────┘ └──────────────┘ │ +└───────────────────────────────────────┘ + │ + ▼ +/object_nodes, /obj_points, /obj_boxes, /obj_labels +``` + +## Dependencies + +- `detect_anything`: Detection results with masks +- `vector_perception_utils`: Point cloud and image utilities +- ROS2 messages: `sensor_msgs`, `geometry_msgs`, `nav_msgs`, `visualization_msgs` + +## Usage + +```bash +# Build +colcon build --packages-select semantic_mapping + +# Run (after implementation complete) +ros2 launch semantic_mapping semantic_mapping.launch.py +``` + +## Topics + +### Subscriptions +| Topic | Type | Description | +|-------|------|-------------| +| `/detection_result` | `detect_anything/DetectionResult` | 2D detections with masks | +| `/registered_scan` | `sensor_msgs/PointCloud2` | LiDAR point cloud | +| `/state_estimation` | `nav_msgs/Odometry` | Robot odometry | +| `/object_type_answer` | `ObjectType` | VLM relabeling response | +| `/target_object_instruction` | `TargetObjectInstruction` | Target object search | +| `/viewpoint_rep_header` | `ViewpointRep` | Viewpoint trigger | + +### Publications +| Topic | Type | Description | +|-------|------|-------------| +| `/object_nodes` | `ObjectNode` | Semantic object map (for planner) | +| `/obj_points` | `sensor_msgs/PointCloud2` | Colored object point cloud | +| `/obj_boxes` | `visualization_msgs/MarkerArray` | 3D bounding box wireframes | +| `/obj_labels` | `visualization_msgs/MarkerArray` | Object label text | +| `/object_type_query` | `ObjectType` | VLM query for relabeling | + +## References + +- Full migration plan: [`docs/SEMANTIC_MAPPING_MIGRATION.md`](docs/SEMANTIC_MAPPING_MIGRATION.md) +- Original implementation: `VLM_ROS/src/semantic_mapping/` (main branch) + - SAM2 for segmentation + - External Gemini API for VLM reasoning +- Detection: `detect_anything` package (decoupled) +- dimos patterns: `dimos/perception/detection/objectDB.py` diff --git a/semantic_mapping/config/sensor_to_camera_tf.yaml b/semantic_mapping/config/sensor_to_camera_tf.yaml new file mode 100644 index 0000000..baa6724 --- /dev/null +++ b/semantic_mapping/config/sensor_to_camera_tf.yaml @@ -0,0 +1,11 @@ +sensor_to_camera: + parent_frame: lidar + child_frame: camera + translation: + x: 0.0 + y: 0.0 + z: 0.265 + rotation_rpy: + roll: -1.5707963 + pitch: 0.0 + yaw: -1.5707963 diff --git a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md new file mode 100644 index 0000000..4a23f74 --- /dev/null +++ b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md @@ -0,0 +1,452 @@ +# Semantic Mapping Migration Plan + +## Overview + +This document outlines the comprehensive plan for porting the `semantic_mapping` package from `VLM_ROS` into `vector_perception_ros/semantic_mapping`, following the coding standards of `vector_perception_ros` while preserving functional compatibility with downstream consumers (particularly the tare_planner). + +> **Note**: Detection is decoupled into `detect_anything` package. VLM reasoning uses external Gemini API. 
+ +## Design Principles + +1. **Logic cleanup, not restructure** - Preserve the core algorithms (voxel voting, observation angles, shape regularization, IoU/distance matching) while cleaning up dead code and improving organization. +2. **Same I/O contract** - Maintain the same ROS topic names, message types, and semantics so existing planner integration continues to work. +3. **vector_perception_ros coding standards** - Full type hints, Google-style docstrings, dataclass containers, proper ROS2 patterns. +4. **ObjectDB-style patterns** - Adopt the two-tier (pending/permanent) architecture and dual matching (track_id + distance) from dimos where it improves clarity. + +--- + +## Implementation Progress + +| Step | Component | Status | +|------|-----------|--------| +| 1 | Package Setup & Messages | **COMPLETED** | +| 2 | Data Models (`map_object.py`) | **COMPLETED** | +| 2.1 | Refactor to use shared utilities | **COMPLETED** | +| 3 | Matching Utilities (`matching.py`) | Pending | +| 4 | Object Mapper (`mapper.py`) | Pending | +| 5 | Cloud-Image Projection (`pointcloud_utils.py`) | Pending | +| 6 | Message Publishers (`map_object_utils.py`) | Pending | +| 7 | Image Storage (`storage.py`) | Pending | +| 8 | Mapping Node (`mapping_node.py`) | Pending | +| 9 | Launch Files & Config | Pending | +| 10 | Integration Testing | Pending | + +### Step 2.1: Refactored map_object.py to use shared utilities + +Added to `vector_perception_utils/transform_utils.py`: +- `normalize_angle()` - now supports both scalar and array input +- `discretize_angles()` - discretize angles into bin indices + +Added to `vector_perception_utils/pointcloud_utils.py`: +- `get_oriented_bbox()` - returns (center, corners) tuple from minimal OBB + +Refactored in `map_object.py`: +- Removed duplicate `_normalize_angles_to_pi` and `_discretize_angles` functions +- Simplified `_compute_geometry()` from 43 lines to 15 using `get_oriented_bbox()` +- Use `create_pointcloud()` in `regularize_shape()` instead of manual o3d creation +- Use `compute_centroid()` instead of `np.mean(..., axis=0)` + +--- + +## VLM_ROS Reference (Source Implementation) + +Key files in `VLM_ROS/src/semantic_mapping/`: +- `semantic_map_new.py`: `ObjMapper` manages the persistent map +- `single_object_new.py`: `SingleObject` holds per-object state +- `semantic_mapping_node.py`: `MappingNode` orchestrates SAM2 masks, LiDAR projection, odom transforms + +### ObjMapper Core Flow +- Input: dict with `bboxes`, `labels`, `confidences`, `ids`, `masks` (from SAM2), plus LiDAR cloud and odom +- Memory fix: Uses `.copy()` on detection data extraction to prevent memory leaks +- VLM query threshold: Objects need `best_image_score > 500` before querying VLM +- `update_map()` steps: + 1. Transform incoming cloud into body/world frame using odom + 2. Filter detections by confidence threshold + 3. For each detection: project masked cloud via `CloudImageFusion`, build per-object voxels, then either merge or create a `SingleObject` + 4. Merge heuristics: match by tracker `obj_id`, 3D IoU, distance thresholds, and optional primitive groups (e.g., chair/sofa) + 5. Maintain delete lists, save queues (best image/mask per object) + 6. 
Publish ROS messages: `ObjectNode`, bbox markers, label markers, colored point cloud + +### SingleObject Highlights +- Keeps class_id→counts/confidence, track_id list, voxel grid (`VoxelFeatureManager`), robot poses, masks, best image saving +- Computes 3D bbox (axis-aligned + oriented), centroids, clustering flags, and validity caches +- Manages persistence (`life`, `inactive_frame`, `publish_status`) and VLM interaction state + +### dimos Patterns to Adopt +- `ObjMapper` ⇄ `ObjectDB`: explicit pending/permanent tiers, track_id + spatial matching, promotion thresholds +- `SingleObject` ⇄ `Object`: cleaner update patterns, separation of core geometry vs VLM metadata + +--- + +## Phase 1: Package Structure & Messages (COMPLETED) + +### 1.1 Package Structure + +``` +semantic_mapping/ +├── CMakeLists.txt # Hybrid CMake for message generation + Python install +├── package.xml # Dependencies: detect_anything, vector_perception_utils, ROS msgs +├── msg/ +│ ├── ObjectNode.msg +│ ├── ObjectType.msg +│ ├── TargetObjectInstruction.msg +│ └── ViewpointRep.msg +├── config/ +│ ├── objects.yaml # Object taxonomy (to be copied from VLM_ROS) +│ └── params.yaml # Default node parameters (to be created) +├── launch/ +│ └── semantic_mapping.launch.py +├── semantic_mapping/ +│ ├── __init__.py +│ ├── mapping_node.py # ROS2 node (orchestration) +│ ├── mapper.py # ObjectMapper class (core logic) +│ ├── map_object.py # Data classes: MapObject, Observation, VoxelManager +│ ├── matching.py # Object matching utilities (IoU, distance, track_id) +│ ├── (moved to utils) # Cloud-image projection helpers in vector_perception_utils +│ ├── map_object_utils.py # ROS message conversion helpers +│ └── storage.py # Async image saving queue +├── docs/ +│ └── SEMANTIC_MAPPING_MIGRATION.md +└── resource/ + └── semantic_mapping # ament resource marker +``` + +### 1.2 Message Definitions + +**ObjectNode.msg** (preserves exact schema for planner compatibility): +``` +std_msgs/Header header +int32[] object_id +string label +geometry_msgs/Point position +geometry_msgs/Point[8] bbox3d # 8 corner points in world frame +sensor_msgs/PointCloud2 cloud +string status # "new", "updated", "unchanged" +string img_path +bool is_asked_vlm +int32 viewpoint_id +``` + +**ObjectType.msg**: VLM query/answer for object relabeling +**TargetObjectInstruction.msg**: User-set target/anchor objects +**ViewpointRep.msg**: Viewpoint trigger message + +--- + +## Phase 2: Data Models (COMPLETED) + +### 2.1 ObjectStatus Enum + +```python +class ObjectStatus(Enum): + """Object lifecycle status (ObjectDB-style tiering).""" + PENDING = "pending" # < min_detections threshold + PERMANENT = "permanent" # >= threshold, confirmed object + DELETED = "deleted" # Marked for removal +``` + +### 2.2 Observation + +```python +@dataclass +class Observation: + """A single detection observation to update a MapObject.""" + track_id: int + label: str + confidence: float + points: np.ndarray # (N, 3) world-frame points + robot_pose: dict # {R: (3,3), t: (3,)} + mask: np.ndarray | None = None # 2D segmentation mask + image: np.ndarray | None = None # RGB crop for best-image selection + timestamp: float = 0.0 +``` + +### 2.3 VoxelFeatureManager + +```python +@dataclass +class VoxelFeatureManager: + """Manages voxelized point cloud with observation angle voting.""" + voxels: np.ndarray # (N, 3) world coords + votes: np.ndarray # (N,) detection count per voxel + observation_angles: np.ndarray # (N, num_bins) angle histogram + regularized_mask: np.ndarray | None # (N,) valid voxel mask 
after clustering + voxel_size: float = 0.03 + num_angle_bins: int = 20 + + def update(self, new_points: np.ndarray, robot_position: np.ndarray) -> None: + """Add new points with KDTree-based merge and angle voting.""" + + def get_valid_voxels(self, min_votes: int = 1) -> np.ndarray: + """Return voxels passing vote threshold and regularization mask.""" +``` + +### 2.4 MapObject + +```python +@dataclass +class MapObject: + """Represents a tracked object in the semantic map.""" + # Identity + object_id: str # UUID for this object + track_ids: list[int] # Associated tracker IDs (can merge) + + # Classification (voted) + class_votes: dict[str, int] # label -> observation count + confidence_scores: dict[str, float] # label -> best confidence + + # Geometry + voxel_manager: VoxelFeatureManager + robot_poses: list[dict] # [{R, t}, ...] observation poses + + # Lifecycle (ObjectDB-style) + detection_count: int = 0 + inactive_frames: int = 0 + status: ObjectStatus = ObjectStatus.PENDING + + # VLM interaction + is_asked_vlm: bool = False + updated_by_vlm: bool = False + best_image_path: str | None = None + best_image_score: float = 0.0 + + @property + def label(self) -> str: ... + @property + def centroid(self) -> np.ndarray: ... + @property + def bbox3d(self) -> np.ndarray: ... + def update(self, observation: Observation) -> None: ... + def merge(self, other: MapObject) -> None: ... + def regularize_shape(self, percentile: float = 0.85) -> None: ... +``` + +--- + +## Phase 3: Matching Utilities (matching.py) + +```python +def compute_iou_3d(bbox1: np.ndarray, bbox2: np.ndarray) -> float: + """Compute 3D IoU between two axis-aligned bounding boxes.""" + +def compute_center_distance(obj1: MapObject, obj2: MapObject) -> float: + """Compute Euclidean distance between object centroids.""" + +def are_merge_compatible(label1: str, label2: str, merge_groups: list[set[str]]) -> bool: + """Check if two labels can be merged (same class or in same merge group).""" + +def find_best_match( + obs: Observation, + candidates: list[MapObject], + distance_threshold: float, + iou_threshold: float, +) -> MapObject | None: + """Find best matching object by distance and IoU criteria.""" +``` + +--- + +## Phase 4: Object Mapper (mapper.py) + +```python +class ObjectMapper: + """ + Manages the semantic object map with two-tier state management. + Pending objects are promoted to permanent after detection_count >= threshold. + """ + + def __init__( + self, + voxel_size: float = 0.03, + confidence_threshold: float = 0.30, + distance_threshold: float = 0.5, + iou_threshold: float = 0.2, + min_detections_for_permanent: int = 3, + pending_ttl_s: float = 10.0, + max_inactive_frames: int = 20, + ) -> None: + self._pending_objects: dict[str, MapObject] = {} + self._permanent_objects: dict[str, MapObject] = {} + self._track_id_map: dict[int, str] = {} + self._lock = threading.RLock() + self._merge_groups: list[set[str]] = [ + {"chair", "sofa", "couch", "armchair"}, + {"table", "desk"}, + ] + + def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats: ... + def _match_observation(self, obs: Observation) -> MapObject | None: ... + def _create_object(self, obs: Observation) -> MapObject: ... + def _check_promotion(self, obj: MapObject) -> None: ... + def _try_merge_objects(self) -> None: ... + def _prune_inactive(self, target_object: str | None) -> list[MapObject]: ... + def get_objects(self, include_pending: bool = False) -> list[MapObject]: ... 
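+
+
+# `UpdateStats` is referenced above but not yet defined in this plan; a
+# minimal sketch of what it could carry (an assumption, not a settled API):
+@dataclass
+class UpdateStats:
+    """Bookkeeping returned by ObjectMapper.update()."""
+
+    matched: int = 0   # observations merged into existing objects
+    created: int = 0   # new pending objects this cycle
+    promoted: int = 0  # pending -> permanent transitions
+    deleted: int = 0   # objects pruned this cycle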
+``` + +--- + +## Phase 5: Mapping Node (mapping_node.py) + +### 5.1 Node Structure + +```python +class SemanticMappingNode(Node): + """ROS2 node for semantic 3D object mapping.""" + + def __init__(self) -> None: + super().__init__("semantic_mapping_node") + self._declare_parameters() + self.mapper = ObjectMapper(...) + self.projection = pointcloud_utils + self.image_saver = ImageSaveQueue(...) + + # Data buffers (with locks) + self._cloud_buffer: deque[tuple[float, np.ndarray]] = deque(maxlen=50) + self._odom_buffer: deque[tuple[float, Pose]] = deque(maxlen=100) + self._lock = threading.Lock() + + self._create_subscriptions() + self._create_publishers() + self.create_timer(0.5, self._mapping_callback) +``` + +### 5.2 Subscriptions + +| Topic | Type | QoS | +|-------|------|-----| +| `/detection_result` | `detect_anything/DetectionResult` | Reliable | +| `/registered_scan` | `PointCloud2` | Best Effort | +| `/state_estimation` | `Odometry` | Best Effort | +| `/object_type_answer` | `ObjectType` | Reliable | +| `/target_object_instruction` | `TargetObjectInstruction` | Reliable | +| `/viewpoint_rep_header` | `ViewpointRep` | Reliable | + +### 5.3 Publishers + +| Topic | Type | Description | +|-------|------|-------------| +| `/object_nodes` | `ObjectNode` | Main output for planner | +| `/obj_points` | `PointCloud2` | Colored object cloud | +| `/obj_boxes` | `MarkerArray` | Bbox wireframes | +| `/obj_labels` | `MarkerArray` | Text labels | +| `/annotated_image` | `Image` | Debug visualization | +| `/object_type_query` | `ObjectType` | VLM query | + +--- + +## Phase 6: Cloud-Image Projection (pointcloud_utils.py) + +```python +def project_pointcloud_to_image(...): ... +def segment_pointcloud_by_mask(...): ... +def segment_pointclouds_from_projection(...): ... +def transform_segmented_clouds_to_world(...): ... +``` + +--- + +## Phase 7: Message Publishers (map_object_utils.py) + +```python +def map_object_to_object_node(obj: MapObject, header: Header, status: str) -> ObjectNode | None: ... +def create_bbox_markers(objects: list[MapObject], header: Header) -> MarkerArray: ... +def create_label_markers(objects: list[MapObject], header: Header) -> MarkerArray: ... +def create_object_pointcloud(objects: list[MapObject], header: Header) -> PointCloud2 | None: ... +def create_delete_markers(deleted_ids: list[int], header: Header, namespace: str) -> MarkerArray: ... +``` + +--- + +## Phase 8: Image Storage (storage.py) + +```python +class ImageSaveQueue: + """Background worker for saving object images to disk.""" + + def __init__(self, output_dir: Path, max_queue_size: int = 100) -> None: ... + def start(self) -> None: ... + def stop(self) -> None: ... + def queue_save(self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None) -> str: ... 
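+
+# Intended call pattern (sketch; assumes queue_save returns the destination
+# path so the caller can store it on the object as `img_path`):
+#
+#   saver = ImageSaveQueue(Path("/tmp/semantic_mapping"))  # hypothetical dir
+#   saver.start()
+#   obj.best_image_path = saver.queue_save(obj.object_id, image_crop)
+#   saver.stop()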
+``` + +--- + +## Phase 9: Launch Files + +```python +# semantic_mapping.launch.py +DeclareLaunchArgument('platform', default_value='mecanum_sim') +DeclareLaunchArgument('detection_topic', default_value='/detection_result') +DeclareLaunchArgument('cloud_topic', default_value='/registered_scan') +DeclareLaunchArgument('odom_topic', default_value='/state_estimation') +DeclareLaunchArgument('voxel_size', default_value='0.03') +DeclareLaunchArgument('confidence_threshold', default_value='0.30') +DeclareLaunchArgument('target_object', default_value='') +``` + +--- + +## Implementation Checklist + +### Completed +- [x] Create `CMakeLists.txt` for message generation +- [x] Port message definitions from tare_planner +- [x] Set up `package.xml` with dependencies +- [x] Implement `map_object.py` with VoxelFeatureManager, MapObject, Observation +- [x] Port lazy computation patterns (centroid, bbox3d, regularization) +- [x] Refactor `map_object.py` to use shared utilities from `vector_perception_utils` +- [x] Add `normalize_angle`, `discretize_angles` to `transform_utils.py` +- [x] Add `get_oriented_bbox` to `pointcloud_utils.py` +- [x] Simplify `_compute_geometry()` using Open3D oriented bbox + +### Remaining +- [ ] Implement `matching.py` with IoU, distance, merge-compatibility functions +- [ ] Implement `mapper.py` with ObjectDB-style tiering +- [ ] Implement cloud-image projection helpers in `vector_perception_utils/pointcloud_utils.py` +- [ ] Implement `map_object_utils.py` message conversion functions +- [ ] Implement `storage.py` with async queue +- [ ] Implement `mapping_node.py` following detect_anything patterns +- [ ] Create launch files and params.yaml +- [ ] Integration testing with bag files + +--- + +## Code Cleanup Items + +Items from VLM_ROS to preserve: +- Memory fixes with `.copy()` on detection data extraction +- VLM query threshold (`best_image_score > 500`) + +Items to remove: +1. Unused publishers: `/cloud_image` +2. Unused methods: `serialize_objs_to_bag()`, `is_merge_allowed()`, `handle_object_query()` +3. Review constants: `BACKGROUND_OBJECTS`, `VERTICAL_OBJECTS`, `BIG_OBJECTS`, `SMALL_OBJECTS` +4. Don't port: `byte_track`, `Grounded-SAM-2`, `mmconfig` directories + +--- + +## Compatibility Checklist + +- [ ] `/object_nodes` publishes ObjectNode with exact schema +- [ ] `/obj_points` publishes colored PointCloud2 +- [ ] `/obj_boxes` publishes MarkerArray wireframes +- [ ] `/obj_labels` publishes MarkerArray text +- [ ] `/object_type_query` publishes ObjectType for VLM +- [ ] Subscribes to `/detection_result` from detect_anything +- [ ] Subscribes to `/registered_scan` PointCloud2 +- [ ] Subscribes to `/state_estimation` Odometry +- [ ] Subscribes to `/target_object_instruction` +- [ ] Subscribes to `/viewpoint_rep_header` +- [ ] Subscribes to `/object_type_answer` +- [ ] Object positions, bbox corners, status flags match expected semantics +- [ ] Delete markers properly remove objects from RViz + +--- + +## Planner Dependencies + +The exploration planner (`sensor_coverage_planner_ground.cpp` in tare_planner) is tightly coupled to `ObjectNode`: +- Subscribes to `/object_nodes` and stores per-object reps +- Uses object positions, bbox corners, clouds, status, `is_asked_vlm`, and `img_path` for navigation decisions + +**Implication**: Keep the `ObjectNode` schema stable. Any new fields from the `MapObject` model can be mapped into extensions, but existing fields and semantics must remain for planner compatibility. 
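+
+As a concrete reference for that constraint, a non-authoritative sketch of the
+Phase 7 conversion, using only fields defined earlier in this plan (`cloud` and
+`viewpoint_id` wiring is left to the mapping node):
+
+```python
+from typing import Optional
+
+import numpy as np
+from geometry_msgs.msg import Point
+from std_msgs.msg import Header
+
+from semantic_mapping.map_object import MapObject
+from semantic_mapping.msg import ObjectNode
+
+
+def map_object_to_object_node(obj: MapObject, header: Header, status: str) -> Optional[ObjectNode]:
+    """Sketch: project a MapObject onto the planner-facing ObjectNode schema."""
+    voxels = obj.voxel_manager.get_valid_voxels()
+    if voxels.shape[0] == 0:
+        return None  # nothing publishable yet
+
+    msg = ObjectNode()
+    msg.header = header
+    msg.object_id = [int(t) for t in obj.track_ids]
+    msg.label = obj.label
+    c = obj.centroid
+    msg.position = Point(x=float(c[0]), y=float(c[1]), z=float(c[2]))
+
+    # Axis-aligned corners in the order documented in ObjectNode.msg:
+    # four min/max combinations on the bottom face, then the top face.
+    lo, hi = voxels.min(axis=0), voxels.max(axis=0)
+    bounds = np.stack([lo, hi])
+    order = [(0, 0, 0), (1, 0, 0), (1, 1, 0), (0, 1, 0),
+             (0, 0, 1), (1, 0, 1), (1, 1, 1), (0, 1, 1)]
+    msg.bbox3d = [
+        Point(x=float(bounds[i, 0]), y=float(bounds[j, 1]), z=float(bounds[k, 2]))
+        for i, j, k in order
+    ]
+
+    msg.status = status
+    msg.img_path = obj.best_image_path or ""
+    msg.is_asked_vlm = obj.is_asked_vlm
+    return msg
+```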
diff --git a/semantic_mapping/msg/ObjectNode.msg b/semantic_mapping/msg/ObjectNode.msg new file mode 100644 index 0000000..b2d7686 --- /dev/null +++ b/semantic_mapping/msg/ObjectNode.msg @@ -0,0 +1,35 @@ +# Semantic object representation for planner integration. +# +# Each message represents a single object in the semantic map with its +# 3D bounding box, point cloud, and metadata for VLM interaction. + +std_msgs/Header header + +# Object identity - may contain multiple merged track IDs +int32[] object_id + +# Dominant class label (by weighted vote) +string label + +# Object centroid in world frame +geometry_msgs/Point position + +# 3D bounding box corners (8 points in world frame) +# Order: [min_x,min_y,min_z], [max_x,min_y,min_z], [max_x,max_y,min_z], [min_x,max_y,min_z], +# [min_x,min_y,max_z], [max_x,min_y,max_z], [max_x,max_y,max_z], [min_x,max_y,max_z] +geometry_msgs/Point[8] bbox3d + +# Object point cloud in world frame +sensor_msgs/PointCloud2 cloud + +# Lifecycle status: "new", "updated", "unchanged" +string status + +# Path to best saved image crop of this object +string img_path + +# VLM interaction flags +bool is_asked_vlm + +# Associated viewpoint ID (for viewpoint-triggered saves) +int32 viewpoint_id diff --git a/semantic_mapping/msg/ObjectType.msg b/semantic_mapping/msg/ObjectType.msg new file mode 100644 index 0000000..ab02ea6 --- /dev/null +++ b/semantic_mapping/msg/ObjectType.msg @@ -0,0 +1,17 @@ +# VLM query/answer message for object relabeling. +# +# Used bidirectionally: +# - Query (mapping -> VLM): object_id, img_path, labels[] populated +# - Answer (VLM -> mapping): final_label populated with VLM decision + +# Object ID to query/update +int32 object_id + +# Path to object image for VLM analysis +string img_path + +# VLM-determined final label (populated in answer) +string final_label + +# Candidate labels from detection (populated in query) +string[] labels diff --git a/semantic_mapping/msg/TargetObjectInstruction.msg b/semantic_mapping/msg/TargetObjectInstruction.msg new file mode 100644 index 0000000..1a88005 --- /dev/null +++ b/semantic_mapping/msg/TargetObjectInstruction.msg @@ -0,0 +1,22 @@ +# User instruction for target object search. +# +# Specifies the target object to find and optional spatial/attribute +# conditions relative to an anchor object. + +# Primary target object label to search for +string target_object + +# Room condition (e.g., "in the kitchen") +string room_condition + +# Spatial condition relative to anchor (e.g., "next to", "on top of") +string spatial_condition + +# Attribute condition for target (e.g., "red", "large") +string attribute_condition + +# Anchor object for spatial reference +string anchor_object + +# Attribute condition for anchor object +string attribute_condition_anchor diff --git a/semantic_mapping/msg/ViewpointRep.msg b/semantic_mapping/msg/ViewpointRep.msg new file mode 100644 index 0000000..5bb1bfe --- /dev/null +++ b/semantic_mapping/msg/ViewpointRep.msg @@ -0,0 +1,8 @@ +# Viewpoint trigger message for image saving. +# +# Published by planner to request saving images at specific viewpoints. + +std_msgs/Header header + +# Unique viewpoint identifier +int32 viewpoint_id diff --git a/semantic_mapping/package.xml b/semantic_mapping/package.xml index ca18f0e..4804d1a 100644 --- a/semantic_mapping/package.xml +++ b/semantic_mapping/package.xml @@ -3,25 +3,27 @@ semantic_mapping 0.0.1 - YOLO-E based detection utilities and ROS2 node for semantic mapping. + Semantic 3D object mapping node with VLM integration. 
Alex Lin MIT ament_cmake + rosidl_default_generators + rosidl_default_runtime + rosidl_interface_packages + rclpy std_msgs sensor_msgs - vision_msgs + geometry_msgs + nav_msgs + visualization_msgs cv_bridge + detect_anything vector_perception_utils - ament_copyright - ament_flake8 - ament_pep257 - python3-pytest - - ament_python + ament_cmake diff --git a/semantic_mapping/resource/semantic_mapping b/semantic_mapping/resource/semantic_mapping index 8f63025..e69de29 100644 --- a/semantic_mapping/resource/semantic_mapping +++ b/semantic_mapping/resource/semantic_mapping @@ -1 +0,0 @@ -semantic_mapping diff --git a/semantic_mapping/scripts/semantic_mapping_node b/semantic_mapping/scripts/semantic_mapping_node new file mode 100644 index 0000000..0312e88 --- /dev/null +++ b/semantic_mapping/scripts/semantic_mapping_node @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +"""Entry point for semantic_mapping_node.""" + +from semantic_mapping.mapping_node import main + +if __name__ == "__main__": + main() diff --git a/semantic_mapping/semantic_mapping/__init__.py b/semantic_mapping/semantic_mapping/__init__.py index 6ac0fcc..8df6f20 100644 --- a/semantic_mapping/semantic_mapping/__init__.py +++ b/semantic_mapping/semantic_mapping/__init__.py @@ -1 +1 @@ -"""Semantic mapping ROS2 package.""" +"""Semantic 3D object mapping for vector_perception_ros.""" diff --git a/semantic_mapping/semantic_mapping/map_object.py b/semantic_mapping/semantic_mapping/map_object.py new file mode 100644 index 0000000..8140835 --- /dev/null +++ b/semantic_mapping/semantic_mapping/map_object.py @@ -0,0 +1,683 @@ +"""Data models for semantic mapping. + +This module defines the core data structures for the semantic mapping system: +- VoxelFeatureManager: Manages voxelized point clouds with observation angle voting +- MapObject: Represents a tracked object in the semantic map +- Observation: A single detection observation +- ObjectStatus: Enum for object lifecycle status +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum +from typing import Optional + +import numpy as np +from scipy.spatial import cKDTree + +from vector_perception_utils.transform_utils import discretize_angles, normalize_angle +from vector_perception_utils.pointcloud_utils import ( + compute_centroid, + create_pointcloud, +) + + +class ObjectStatus(Enum): + """Object lifecycle status (ObjectDB-style tiering).""" + + PENDING = "pending" # < min_detections threshold + PERMANENT = "permanent" # >= threshold, confirmed object + DELETED = "deleted" # Marked for removal + + +@dataclass +class Observation: + """A single detection observation to update a MapObject. + + Attributes: + track_id: Tracker ID from detection. + label: Class label string. + confidence: Detection confidence score. + points: World-frame point cloud (N, 3). + robot_R: Robot rotation matrix (3, 3) at observation time. + robot_t: Robot translation vector (3,) at observation time. + mask: Optional 2D segmentation mask. + image: Optional RGB image crop for best-image selection. + timestamp: Observation timestamp in seconds. + """ + + track_id: int + label: str + confidence: float + points: np.ndarray + robot_R: np.ndarray + robot_t: np.ndarray + mask: Optional[np.ndarray] = None + image: Optional[np.ndarray] = None + timestamp: float = 0.0 + + +class VoxelFeatureManager: + """Manages voxelized point cloud with observation angle voting. 
+ + This class tracks 3D voxels representing an object, along with: + - Vote counts per voxel (how many times each voxel was observed) + - Observation angles (which directions each voxel was seen from) + - Regularization mask (which voxels pass shape filtering) + + The observation angle voting helps distinguish between reliable points + (seen from multiple angles) and spurious points (seen from only one angle). + + Attributes: + voxels: Point coordinates (N, 3) in world frame. + votes: Detection count per voxel (N,). + observation_angles: Binary angle histogram per voxel (N, num_bins). + regularized_mask: Boolean mask for valid voxels after clustering (N,). + voxel_size: Size of voxel grid cells in meters. + num_angle_bins: Number of discretized angle bins. + """ + + def __init__( + self, + points: np.ndarray, + voxel_size: float, + robot_R: np.ndarray, + robot_t: np.ndarray, + num_angle_bins: int = 15, + ) -> None: + """Initialize VoxelFeatureManager with initial points. + + Args: + points: Initial point cloud (N, 3) in world frame. + voxel_size: Voxel grid cell size in meters. + robot_R: Robot rotation matrix (3, 3) at observation time. + robot_t: Robot translation vector (3,) at observation time. + num_angle_bins: Number of angle bins for observation tracking. + """ + num_points = points.shape[0] + + # Compute observation angles from robot position + vectors = points - robot_t + obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0]) + obs_bins = discretize_angles(obs_angles, num_angle_bins) + + self.voxels = points.copy() + self.voxel_size = voxel_size + self.num_angle_bins = num_angle_bins + self._tree = cKDTree(points) + + # Initialize vote counts + self.votes = np.ones(num_points, dtype=np.float32) + + # Initialize observation angle histogram (binary: seen/not seen from each angle) + self.observation_angles = np.zeros((num_points, num_angle_bins), dtype=np.uint8) + self.observation_angles[np.arange(num_points), obs_bins] = 1 + + # Regularization mask (all False until regularize_shape is called) + self.regularized_mask = np.zeros(num_points, dtype=bool) + + # Remove vote for dynamic object clearing (not commonly used) + self._remove_votes = np.zeros(num_points, dtype=np.int32) + + def update( + self, + new_points: np.ndarray, + robot_R: np.ndarray, + robot_t: np.ndarray, + ) -> None: + """Add new points with KDTree-based merge and angle voting. + + Points within voxel_size of existing points are merged (votes incremented). + Points farther away are added as new voxels. + + Args: + new_points: New point cloud (M, 3) in world frame. + robot_R: Robot rotation matrix (3, 3) at observation time. + robot_t: Robot translation vector (3,) at observation time. 
+ """ + if new_points.shape[0] == 0: + return + + # Compute observation angles for new points + vectors = new_points - robot_t + obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0]) + obs_bins = discretize_angles(obs_angles, self.num_angle_bins) + + # Find nearest existing voxels + distances, indices = self._tree.query(new_points) + + # Merge points within voxel_size + merge_mask = distances < self.voxel_size + merge_indices = indices[merge_mask] + merge_bins = obs_bins[merge_mask] + + self.votes[merge_indices] += 1 + self.observation_angles[merge_indices, merge_bins] = 1 + + # Add new points that don't merge + new_mask = ~merge_mask + if np.any(new_mask): + new_voxels = new_points[new_mask] + new_bins = obs_bins[new_mask] + num_new = new_voxels.shape[0] + + # Extend arrays + self.voxels = np.vstack([self.voxels, new_voxels]) + self.votes = np.concatenate([self.votes, np.ones(num_new, dtype=np.float32)]) + + new_obs = np.zeros((num_new, self.num_angle_bins), dtype=np.uint8) + new_obs[np.arange(num_new), new_bins] = 1 + self.observation_angles = np.vstack([self.observation_angles, new_obs]) + + self.regularized_mask = np.concatenate( + [self.regularized_mask, np.zeros(num_new, dtype=bool)] + ) + self._remove_votes = np.concatenate( + [self._remove_votes, np.zeros(num_new, dtype=np.int32)] + ) + + # Rebuild KDTree + self._tree = cKDTree(self.voxels) + + def merge_from(self, other: VoxelFeatureManager) -> None: + """Merge voxels from another VoxelFeatureManager. + + Args: + other: Another VoxelFeatureManager to merge from. + """ + if other.voxels.shape[0] == 0: + return + + distances, indices = self._tree.query(other.voxels) + + # Merge overlapping voxels + merge_mask = distances < self.voxel_size + merge_indices = indices[merge_mask] + + self.votes[merge_indices] += other.votes[merge_mask] + self.observation_angles[merge_indices] = np.logical_or( + self.observation_angles[merge_indices], + other.observation_angles[merge_mask], + ).astype(np.uint8) + + # Add non-overlapping voxels + new_mask = ~merge_mask + if np.any(new_mask): + self.voxels = np.vstack([self.voxels, other.voxels[new_mask]]) + self.votes = np.concatenate([self.votes, other.votes[new_mask]]) + self.observation_angles = np.vstack( + [self.observation_angles, other.observation_angles[new_mask]] + ) + self.regularized_mask = np.concatenate( + [self.regularized_mask, other.regularized_mask[new_mask]] + ) + self._remove_votes = np.concatenate( + [self._remove_votes, np.zeros(np.sum(new_mask), dtype=np.int32)] + ) + self._tree = cKDTree(self.voxels) + + def get_valid_indices( + self, + diversity_percentile: float = 0.3, + use_regularized: bool = True, + ) -> np.ndarray: + """Get indices of valid voxels based on observation angle diversity. + + Voxels are ranked by their observation angle diversity (number of angles + from which they were observed), weighted by vote count. The top percentile + of voxels are returned. + + Args: + diversity_percentile: Fraction of total weight to include (0-1). + use_regularized: If True, only consider voxels in regularized_mask. + + Returns: + Indices of valid voxels. 
+ """ + if use_regularized: + mask = self.regularized_mask + if not np.any(mask): + return np.array([], dtype=int) + obs_angles = self.observation_angles[mask] + votes = self.votes[mask] + else: + obs_angles = self.observation_angles + votes = self.votes + + if len(obs_angles) == 0: + return np.array([], dtype=int) + + # Compute diversity score (number of angles observed) + angle_diversity = np.sum(obs_angles, axis=1) + + # Sort by diversity (ascending) + sorted_indices = np.argsort(angle_diversity) + sorted_diversity = angle_diversity[sorted_indices] + sorted_weights = votes[sorted_indices] + + # Weight by diversity * votes + weighted = sorted_diversity * sorted_weights + total_weight = np.sum(weighted) + + if total_weight == 0: + return np.array([], dtype=int) + + # Find cutoff index for percentile + target_weight = (1 - diversity_percentile) * total_weight + cumsum = np.cumsum(weighted) + cutoff_idx = np.searchsorted(cumsum, target_weight) + + # Return indices above cutoff (higher diversity) + return sorted_indices[cutoff_idx:] + + def get_valid_voxels( + self, + diversity_percentile: float = 0.3, + use_regularized: bool = True, + ) -> np.ndarray: + """Get valid voxel coordinates. + + Args: + diversity_percentile: Fraction of total weight to include (0-1). + use_regularized: If True, only consider voxels in regularized_mask. + + Returns: + Valid voxel coordinates (M, 3). + """ + indices = self.get_valid_indices(diversity_percentile, use_regularized) + if use_regularized and np.any(self.regularized_mask): + voxels = self.voxels[self.regularized_mask] + else: + voxels = self.voxels + + if len(indices) == 0: + return np.empty((0, 3), dtype=np.float32) + return voxels[indices] + + @property + def num_voxels(self) -> int: + """Total number of voxels.""" + return self.voxels.shape[0] + + @property + def num_regularized(self) -> int: + """Number of voxels passing regularization.""" + return int(np.sum(self.regularized_mask)) + + +@dataclass +class MapObject: + """Represents a tracked object in the semantic map. + + This class combines patterns from VLM_ROS SingleObject and dimos Object, + with ObjectDB-style lifecycle management. + + Attributes: + object_id: Unique identifier for this object (UUID string). + track_ids: List of associated tracker IDs (can merge multiple). + voxel_manager: Manages the object's voxelized point cloud. + class_votes: Label -> observation count mapping. + confidence_scores: Label -> best confidence score mapping. + points_count: Label -> total points count mapping. + robot_poses: List of robot poses when object was observed. + detection_count: Total number of detections. + inactive_frames: Frames since last update. + last_update_time: Timestamp of last update. + status: Object lifecycle status. + is_asked_vlm: Whether VLM has been queried for this object. + updated_by_vlm: Whether object was relabeled by VLM. + best_image_path: Path to best saved image crop. + best_image_score: Score of best saved image. + centroid: Cached centroid position (None if needs recompute). + bbox3d_corners: Cached 3D bbox corners (None if needs recompute). 
+ """ + + object_id: str + track_ids: list[int] + voxel_manager: VoxelFeatureManager + + # Classification (voted) + class_votes: dict[str, int] = field(default_factory=dict) + confidence_scores: dict[str, float] = field(default_factory=dict) + points_count: dict[str, int] = field(default_factory=dict) + + # Robot poses at observation times + robot_poses: list[dict] = field(default_factory=list) + + # Lifecycle (ObjectDB-style) + detection_count: int = 1 + inactive_frames: int = 0 + last_update_time: float = 0.0 + status: ObjectStatus = ObjectStatus.PENDING + + # VLM interaction + is_asked_vlm: bool = False + updated_by_vlm: bool = False + + # Image storage + best_image_path: Optional[str] = None + best_image_score: float = 0.0 + + # Cached geometry (None means needs recompute) + _centroid: Optional[np.ndarray] = field(default=None, repr=False) + _bbox3d_corners: Optional[np.ndarray] = field(default=None, repr=False) + _needs_recompute: bool = field(default=True, repr=False) + + # Configuration + _diversity_percentile: float = field(default=0.85, repr=False) + _angle_threshold: float = field(default=np.deg2rad(5), repr=False) + _distance_threshold: float = field(default=0.3, repr=False) + + @classmethod + def from_observation( + cls, + object_id: str, + obs: Observation, + voxel_size: float = 0.03, + num_angle_bins: int = 15, + ) -> MapObject: + """Create a new MapObject from an initial observation. + + Args: + object_id: Unique identifier for this object. + obs: Initial observation. + voxel_size: Voxel grid cell size in meters. + num_angle_bins: Number of angle bins for observation tracking. + + Returns: + New MapObject instance. + """ + voxel_manager = VoxelFeatureManager( + points=obs.points, + voxel_size=voxel_size, + robot_R=obs.robot_R, + robot_t=obs.robot_t, + num_angle_bins=num_angle_bins, + ) + + obj = cls( + object_id=object_id, + track_ids=[obs.track_id], + voxel_manager=voxel_manager, + class_votes={obs.label: 1}, + confidence_scores={obs.label: obs.confidence}, + points_count={obs.label: len(obs.points)}, + robot_poses=[{"R": obs.robot_R.copy(), "t": obs.robot_t.copy()}], + last_update_time=obs.timestamp, + ) + return obj + + def _invalidate_cache(self) -> None: + """Mark cached geometry as needing recompute.""" + self._needs_recompute = True + self._centroid = None + self._bbox3d_corners = None + + def _is_similar_pose(self, new_R: np.ndarray, new_t: np.ndarray) -> bool: + """Check if a new pose is similar to existing poses. + + Used to avoid redundant updates from the same viewpoint. + Compares observation angle and distance to object center. + + Args: + new_R: New robot rotation matrix (3, 3). Not used but kept for API. + new_t: New robot translation vector (3,). + + Returns: + True if pose is similar to an existing pose. 
+ """ + _ = new_R # Unused but kept for API consistency + obj_center = self.centroid + if obj_center is None: + obj_center = compute_centroid(self.voxel_manager.voxels) + + for pose in self.robot_poses: + old_t = pose["t"] # R not used for similarity check + + old_to_obj = obj_center - old_t + new_to_obj = obj_center - new_t + + old_angle = np.arctan2(old_to_obj[1], old_to_obj[0]) + new_angle = np.arctan2(new_to_obj[1], new_to_obj[0]) + angle_diff = abs(normalize_angle(np.array([new_angle - old_angle]))[0]) + + old_dist = np.linalg.norm(old_to_obj) + new_dist = np.linalg.norm(new_to_obj) + dist_diff = abs(new_dist - old_dist) + + if angle_diff < self._angle_threshold and dist_diff < self._distance_threshold: + return True + + return False + + def update(self, obs: Observation) -> None: + """Update object with a new observation. + + Args: + obs: New observation to incorporate. + """ + # Update voxel manager + self.voxel_manager.update(obs.points, obs.robot_R, obs.robot_t) + self.detection_count += 1 + self.inactive_frames = 0 + self.last_update_time = obs.timestamp + + # Check if pose is new (not similar to existing) + is_similar = self._is_similar_pose(obs.robot_R, obs.robot_t) + + if not is_similar: + # Update class statistics + self.class_votes[obs.label] = self.class_votes.get(obs.label, 0) + 1 + self.confidence_scores[obs.label] = max( + self.confidence_scores.get(obs.label, 0), obs.confidence + ) + self.points_count[obs.label] = ( + self.points_count.get(obs.label, 0) + len(obs.points) + ) + + # Add pose + self.robot_poses.append({"R": obs.robot_R.copy(), "t": obs.robot_t.copy()}) + + # Add track ID if new + if obs.track_id not in self.track_ids: + self.track_ids.append(obs.track_id) + + self._invalidate_cache() + + def merge(self, other: MapObject) -> None: + """Merge another MapObject into this one. + + Args: + other: Object to merge from. + """ + # Merge track IDs + for tid in other.track_ids: + if tid not in self.track_ids: + self.track_ids.append(tid) + + # Merge voxel managers + self.voxel_manager.merge_from(other.voxel_manager) + + # Merge class statistics + for label, count in other.class_votes.items(): + self.class_votes[label] = self.class_votes.get(label, 0) + count + self.confidence_scores[label] = max( + self.confidence_scores.get(label, 0), + other.confidence_scores.get(label, 0), + ) + self.points_count[label] = ( + self.points_count.get(label, 0) + other.points_count.get(label, 0) + ) + + # Merge poses (only unique ones) + for pose in other.robot_poses: + if not self._is_similar_pose(pose["R"], pose["t"]): + self.robot_poses.append(pose) + + # Update lifecycle + self.detection_count += other.detection_count + self.last_update_time = max(self.last_update_time, other.last_update_time) + self.is_asked_vlm = self.is_asked_vlm or other.is_asked_vlm + + self._invalidate_cache() + + @property + def label(self) -> str: + """Get dominant class label by weighted vote. + + Returns: + Most likely class label based on points_count * confidence. 
+ """ + if not self.class_votes: + return "unknown" + + # Compute weighted scores + weighted_scores = {} + for lbl in self.class_votes: + pts = self.points_count.get(lbl, 0) + conf = self.confidence_scores.get(lbl, 0) + weighted_scores[lbl] = pts * conf + + return max(weighted_scores, key=weighted_scores.get) + + @property + def confidence(self) -> float: + """Get confidence of the dominant label.""" + return self.confidence_scores.get(self.label, 0.0) + + @property + def centroid(self) -> Optional[np.ndarray]: + """Compute centroid weighted by observation angles. + + Returns: + Centroid position (3,) or None if no valid voxels. + """ + if self._needs_recompute or self._centroid is None: + self._compute_geometry() + return self._centroid + + @property + def bbox3d_corners(self) -> Optional[np.ndarray]: + """Compute axis-aligned 3D bounding box corners. + + Returns: + 8 corner points (8, 3) or None if no valid voxels. + """ + if self._needs_recompute or self._bbox3d_corners is None: + self._compute_geometry() + return self._bbox3d_corners + + def _compute_geometry(self) -> None: + """Compute cached centroid and bbox from valid voxels using Open3D. + + Returns: + None. Updates cached centroid and bbox in-place. + """ + # Get valid voxels (regularized first, then fallback to all) + valid_voxels = self.voxel_manager.get_valid_voxels( + diversity_percentile=self._diversity_percentile, + use_regularized=True, + ) + if len(valid_voxels) == 0: + valid_voxels = self.voxel_manager.get_valid_voxels( + diversity_percentile=self._diversity_percentile, + use_regularized=False, + ) + + if len(valid_voxels) < 4: + self._centroid = compute_centroid(valid_voxels) if len(valid_voxels) > 0 else None + self._bbox3d_corners = None + else: + pcd = create_pointcloud(valid_voxels) + obb = pcd.get_minimal_oriented_bounding_box() + self._centroid = np.asarray(obb.center) + self._bbox3d_corners = np.asarray(obb.get_box_points()) + + self._needs_recompute = False + + def regularize_shape(self, percentile: float = 0.85) -> None: + """Apply DBSCAN clustering to filter outlier voxels. + + This method clusters the voxels using DBSCAN, then selects clusters + by descending weight (observation angle diversity) until the target + percentile of total weight is reached. Clusters far from the main + cluster are down-weighted. + + Args: + percentile: Target fraction of total weight to include (0-1). 
+ """ + voxels = self.voxel_manager.voxels + if len(voxels) < 3: + self.voxel_manager.regularized_mask = np.ones(len(voxels), dtype=bool) + self._invalidate_cache() + return + + # DBSCAN clustering + pcd = create_pointcloud(voxels) + + eps = self.voxel_manager.voxel_size * 2.5 + min_points = 10 if self.detection_count >= 3 else 5 + + labels = np.array(pcd.cluster_dbscan(eps=eps, min_points=min_points)) + unique_labels = np.unique(labels) + + # Collect cluster info + cluster_masks = [] + cluster_weights = [] + cluster_centroids = [] + + for label in unique_labels: + if label == -1: # Noise + continue + mask = labels == label + weight = float(np.sum(self.voxel_manager.observation_angles[mask])) + if weight < 5: # Skip tiny clusters + continue + cluster_masks.append(mask) + cluster_weights.append(weight) + cluster_centroids.append(compute_centroid(voxels[mask])) + + if len(cluster_masks) == 0: + self.voxel_manager.regularized_mask = np.zeros(len(voxels), dtype=bool) + elif len(cluster_masks) == 1: + self.voxel_manager.regularized_mask = cluster_masks[0] + else: + weights = np.array(cluster_weights) + centroids = np.array(cluster_centroids) + + # Find main cluster + main_idx = int(np.argmax(weights)) + main_centroid = centroids[main_idx] + + # Distance-based weight adjustment + dists = np.linalg.norm(centroids - main_centroid, axis=1) + dist_base = np.max(dists) if np.max(dists) > 0 else 1.0 + + alpha = 3.0 # Decay rate + scale = np.exp(-alpha * dists / dist_base) + adjusted_weights = weights * scale + + # Select clusters up to percentile + target = percentile * np.sum(adjusted_weights) + order = np.argsort(adjusted_weights)[::-1] + + chosen = np.zeros(len(cluster_masks), dtype=bool) + acc = 0.0 + + for i in order: + if acc + adjusted_weights[i] <= target + 1e-8: + chosen[i] = True + acc += adjusted_weights[i] + if acc >= target: + break + + # Merge chosen clusters + valid_mask = np.zeros(len(voxels), dtype=bool) + for i, cmask in enumerate(cluster_masks): + if chosen[i]: + valid_mask |= cmask + + self.voxel_manager.regularized_mask = valid_mask + + self._invalidate_cache() diff --git a/semantic_mapping/semantic_mapping/map_object_utils.py b/semantic_mapping/semantic_mapping/map_object_utils.py new file mode 100644 index 0000000..6bc3d8d --- /dev/null +++ b/semantic_mapping/semantic_mapping/map_object_utils.py @@ -0,0 +1,217 @@ +"""ROS message helpers for MapObject publishers.""" + +from __future__ import annotations + +import zlib +from typing import Optional + +import numpy as np +from geometry_msgs.msg import Point +from sensor_msgs.msg import PointCloud2 +from std_msgs.msg import Header +from visualization_msgs.msg import Marker, MarkerArray + +from semantic_mapping.msg import ObjectNode +from vector_perception_utils.pointcloud_utils import create_pointcloud2_msg + +from .map_object import MapObject + + +def map_object_to_object_node( + obj: MapObject, + header: Header, + status: str = "new", + viewpoint_id: int = -1, +) -> Optional[ObjectNode]: + """Convert a MapObject into an ObjectNode message. + + Args: + obj: MapObject instance. + header: ROS header to attach to the message. + status: Lifecycle status string ("new", "updated", "unchanged"). + viewpoint_id: Viewpoint identifier for image saving. + + Returns: + ObjectNode message or None if required geometry is missing. 
+ """ + if obj.centroid is None or obj.bbox3d_corners is None: + return None + + msg = ObjectNode() + msg.header = header + msg.object_id = obj.track_ids + msg.label = obj.label + msg.status = status + msg.position.x = float(obj.centroid[0]) + msg.position.y = float(obj.centroid[1]) + msg.position.z = float(obj.centroid[2]) + + for i in range(8): + msg.bbox3d[i].x = float(obj.bbox3d_corners[i, 0]) + msg.bbox3d[i].y = float(obj.bbox3d_corners[i, 1]) + msg.bbox3d[i].z = float(obj.bbox3d_corners[i, 2]) + + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=True, + ) + if len(voxels) == 0: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=False, + ) + if len(voxels) > 0: + seed = zlib.crc32(obj.label.encode("utf-8")) + color = np.array( + [ + (seed & 0xFF) / 255.0, + ((seed >> 8) & 0xFF) / 255.0, + ((seed >> 16) & 0xFF) / 255.0, + ], + dtype=np.float32, + ) + colors = np.tile(color, (voxels.shape[0], 1)) + msg.cloud = create_pointcloud2_msg(points=voxels, colors=colors, header=header) + + msg.img_path = obj.best_image_path or "" + msg.is_asked_vlm = obj.is_asked_vlm + msg.viewpoint_id = -1 if obj.updated_by_vlm else viewpoint_id + return msg + + +def create_bbox_markers( + objects: list[MapObject], header: Header, namespace: str = "bbox" +) -> MarkerArray: + """Create wireframe bounding box markers for MapObjects.""" + markers = MarkerArray() + edges = [ + (0, 1), + (1, 2), + (2, 3), + (3, 0), + (4, 5), + (5, 6), + (6, 7), + (7, 4), + (0, 4), + (1, 5), + (2, 6), + (3, 7), + ] + + for obj in objects: + if obj.bbox3d_corners is None: + continue + marker = Marker() + marker.header = header + marker.ns = namespace + marker.id = int(obj.track_ids[0]) if obj.track_ids else 0 + marker.type = Marker.LINE_LIST + marker.action = Marker.ADD + marker.scale.x = 0.02 + + seed = zlib.crc32(obj.label.encode("utf-8")) + marker.color.r = (seed & 0xFF) / 255.0 + marker.color.g = ((seed >> 8) & 0xFF) / 255.0 + marker.color.b = ((seed >> 16) & 0xFF) / 255.0 + marker.color.a = 1.0 + + corners = obj.bbox3d_corners + for i, j in edges: + marker.points.append(Point(x=float(corners[i, 0]), y=float(corners[i, 1]), z=float(corners[i, 2]))) + marker.points.append(Point(x=float(corners[j, 0]), y=float(corners[j, 1]), z=float(corners[j, 2]))) + + markers.markers.append(marker) + + return markers + + +def create_label_markers( + objects: list[MapObject], + header: Header, + namespace: str = "labels", + text_height: float = 0.4, +) -> MarkerArray: + """Create text label markers for MapObjects.""" + markers = MarkerArray() + + for obj in objects: + if obj.centroid is None: + continue + marker = Marker() + marker.header = header + marker.ns = namespace + marker.id = int(obj.track_ids[0]) if obj.track_ids else 0 + marker.type = Marker.TEXT_VIEW_FACING + marker.action = Marker.ADD + marker.scale.z = text_height + marker.text = obj.label + marker.pose.position.x = float(obj.centroid[0]) + marker.pose.position.y = float(obj.centroid[1]) + marker.pose.position.z = float(obj.centroid[2]) + 0.3 + + seed = zlib.crc32(obj.label.encode("utf-8")) + marker.color.r = (seed & 0xFF) / 255.0 + marker.color.g = ((seed >> 8) & 0xFF) / 255.0 + marker.color.b = ((seed >> 16) & 0xFF) / 255.0 + marker.color.a = 1.0 + + markers.markers.append(marker) + + return markers + + +def create_object_pointcloud(objects: list[MapObject], header: Header) -> Optional[PointCloud2]: + """Create a colored point cloud for a list of 
MapObjects.""" + points_list = [] + colors_list = [] + + for obj in objects: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=True, + ) + if len(voxels) == 0: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=False, + ) + if len(voxels) == 0: + continue + + seed = zlib.crc32(obj.label.encode("utf-8")) + color = np.array( + [ + (seed & 0xFF) / 255.0, + ((seed >> 8) & 0xFF) / 255.0, + ((seed >> 16) & 0xFF) / 255.0, + ], + dtype=np.float32, + ) + colors = np.tile(color, (voxels.shape[0], 1)) + points_list.append(voxels) + colors_list.append(colors) + + if len(points_list) == 0: + return None + + points = np.concatenate(points_list, axis=0) + colors = np.concatenate(colors_list, axis=0) + return create_pointcloud2_msg(points=points, colors=colors, header=header) + + +def create_delete_markers( + deleted_ids: list[int], + header: Header, + namespace: str = "bbox", +) -> MarkerArray: + """Create deletion markers for a list of marker IDs.""" + markers = MarkerArray() + for marker_id in deleted_ids: + marker = Marker() + marker.header = header + marker.ns = namespace + marker.id = int(marker_id) + marker.action = Marker.DELETE + markers.markers.append(marker) + return markers diff --git a/semantic_mapping/semantic_mapping/mapper.py b/semantic_mapping/semantic_mapping/mapper.py new file mode 100644 index 0000000..f8a10ac --- /dev/null +++ b/semantic_mapping/semantic_mapping/mapper.py @@ -0,0 +1,257 @@ +"""Object map manager for semantic mapping.""" + +from __future__ import annotations + +from dataclasses import dataclass +import threading +import time +import uuid + +from .matching import compute_iou_3d_with_ratios, find_best_match +from .map_object import MapObject, ObjectStatus, Observation + + +@dataclass +class UpdateStats: + """Summary of updates applied in a mapper step.""" + + created: int = 0 + updated: int = 0 + merged: int = 0 + deleted: int = 0 + promoted: int = 0 + total: int = 0 + + +class ObjectMapper: + """Manages the semantic object map with two-tier state management.""" + + def __init__( + self, + voxel_size: float = 0.03, + confidence_threshold: float = 0.30, + distance_threshold: float = 0.5, + iou_threshold: float = 0.2, + min_detections_for_permanent: int = 3, + pending_ttl_s: float = 10.0, + max_inactive_frames: int = 20, + num_angle_bins: int = 15, + ) -> None: + """Initialize the object mapper and its thresholds. + + Args: + voxel_size: Voxel grid cell size in meters. + confidence_threshold: Minimum detection confidence to accept. + distance_threshold: Max centroid distance for spatial matching. + iou_threshold: IoU threshold for merge checks. + min_detections_for_permanent: Detections before promotion. + pending_ttl_s: Time-to-live for pending objects. + max_inactive_frames: Frames before pruning inactive objects. + num_angle_bins: Observation angle bins for voxel manager. 
+ """ + self._pending_objects: dict[str, MapObject] = {} + self._permanent_objects: dict[str, MapObject] = {} + self._track_id_map: dict[int, str] = {} + self._lock = threading.RLock() + + self._voxel_size = voxel_size + self._confidence_threshold = confidence_threshold + self._distance_threshold = distance_threshold + self._iou_threshold = iou_threshold + self._min_detections_for_permanent = min_detections_for_permanent + self._pending_ttl_s = pending_ttl_s + self._max_inactive_frames = max_inactive_frames + self._num_angle_bins = num_angle_bins + + def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats: + """Update the map with new observations. + + Args: + observations: List of new observations to integrate. + target_object: Optional label to protect from pruning. + + Returns: + UpdateStats summarizing changes applied. + """ + stats = UpdateStats() + now = max((obs.timestamp for obs in observations), default=time.time()) + updated_ids: set[str] = set() + + with self._lock: + for obs in observations: + if obs.confidence < self._confidence_threshold: + continue + + obj = self._match_observation(obs) + if obj is None: + obj = self._create_object(obs) + self._pending_objects[obj.object_id] = obj + stats.created += 1 + else: + obj.update(obs) + stats.updated += 1 + + updated_ids.add(obj.object_id) + for track_id in obj.track_ids: + self._track_id_map[track_id] = obj.object_id + + if obj.status == ObjectStatus.PENDING: + self._check_promotion(obj) + if obj.status == ObjectStatus.PERMANENT: + stats.promoted += 1 + + for obj in list(self._pending_objects.values()) + list(self._permanent_objects.values()): + if obj.object_id not in updated_ids: + obj.inactive_frames += 1 + + stats.deleted += len(self._prune_inactive(target_object, now)) + stats.merged += self.merge_objects() + stats.total = len(self._pending_objects) + len(self._permanent_objects) + + return stats + + def _match_observation(self, obs: Observation) -> MapObject | None: + """Match an observation to an existing object.""" + if obs.track_id in self._track_id_map: + obj_id = self._track_id_map[obs.track_id] + if obj_id in self._permanent_objects: + return self._permanent_objects[obj_id] + if obj_id in self._pending_objects: + return self._pending_objects[obj_id] + + candidates = list(self._permanent_objects.values()) + list(self._pending_objects.values()) + return find_best_match( + obs=obs, + candidates=candidates, + distance_threshold=self._distance_threshold, + iou_threshold=self._iou_threshold, + ) + + def _create_object(self, obs: Observation) -> MapObject: + """Create a new MapObject from an observation.""" + obj = MapObject.from_observation( + object_id=str(uuid.uuid4()), + obs=obs, + voxel_size=self._voxel_size, + num_angle_bins=self._num_angle_bins, + ) + self._track_id_map[obs.track_id] = obj.object_id + return obj + + def _check_promotion(self, obj: MapObject) -> None: + """Promote object from pending to permanent if threshold met.""" + if obj.status != ObjectStatus.PENDING: + return + if obj.detection_count < self._min_detections_for_permanent: + return + if obj.object_id in self._pending_objects: + self._pending_objects.pop(obj.object_id) + obj.status = ObjectStatus.PERMANENT + self._permanent_objects[obj.object_id] = obj + + def merge_objects(self) -> int: + """Merge overlapping permanent objects. + + Returns: + Number of merges performed. 
+ """ + merged = 0 + objs = list(self._permanent_objects.values()) + i = 0 + while i < len(objs): + obj = objs[i] + j = i + 1 + while j < len(objs): + other = objs[j] + should_merge = False + + if set(obj.track_ids) & set(other.track_ids): + should_merge = True + else: + if obj.centroid is None or other.centroid is None: + j += 1 + continue + dist = float(((obj.centroid - other.centroid) ** 2).sum() ** 0.5) + if dist > self._distance_threshold: + j += 1 + continue + + obj_corners = obj.bbox3d_corners + other_corners = other.bbox3d_corners + obj_extent = ( + obj_corners.max(axis=0) - obj_corners.min(axis=0) + if obj_corners is not None and obj_corners.size > 0 + else None + ) + other_extent = ( + other_corners.max(axis=0) - other_corners.min(axis=0) + if other_corners is not None and other_corners.size > 0 + else None + ) + + dist_merge = dist < 0.25 + if obj_extent is not None and other_extent is not None: + dist_thresh = (((obj_extent / 2 + other_extent / 2) / 2) ** 2).sum() ** 0.5 + dist_merge = dist_merge or dist < dist_thresh * 0.5 + + if dist_merge: + should_merge = True + elif obj_corners is not None and other_corners is not None: + iou, ratio_obj, ratio_other = compute_iou_3d_with_ratios( + obj_corners, other_corners + ) + if ( + iou > self._iou_threshold + or ratio_obj > 0.4 + or ratio_other > 0.4 + ): + should_merge = True + + if should_merge: + obj.merge(other) + for track_id in other.track_ids: + self._track_id_map[track_id] = obj.object_id + self._permanent_objects.pop(other.object_id, None) + objs.pop(j) + merged += 1 + else: + j += 1 + i += 1 + + return merged + + def _prune_inactive(self, target_object: str | None, now: float) -> list[MapObject]: + """Remove inactive pending/permanent objects.""" + deleted: list[MapObject] = [] + + for obj_id, obj in list(self._pending_objects.items()): + if (now - obj.last_update_time) > self._pending_ttl_s or ( + obj.inactive_frames > self._max_inactive_frames + ): + deleted.append(self._pending_objects.pop(obj_id)) + + for obj_id, obj in list(self._permanent_objects.items()): + if target_object and obj.label == target_object: + continue + if obj.inactive_frames > self._max_inactive_frames: + deleted.append(self._permanent_objects.pop(obj_id)) + + for obj in deleted: + for track_id in obj.track_ids: + if self._track_id_map.get(track_id) == obj.object_id: + self._track_id_map.pop(track_id, None) + + return deleted + + def get_objects(self, include_pending: bool = False) -> list[MapObject]: + """Return current objects. + + Args: + include_pending: If True, include pending objects. + + Returns: + List of MapObject instances. + """ + if include_pending: + return list(self._permanent_objects.values()) + list(self._pending_objects.values()) + return list(self._permanent_objects.values()) diff --git a/semantic_mapping/semantic_mapping/matching.py b/semantic_mapping/semantic_mapping/matching.py new file mode 100644 index 0000000..418521e --- /dev/null +++ b/semantic_mapping/semantic_mapping/matching.py @@ -0,0 +1,246 @@ +"""Object matching utilities for semantic mapping.""" + +from __future__ import annotations + +from typing import Optional + +import numpy as np + +from vector_perception_utils.pointcloud_utils import compute_centroid, get_oriented_bbox + +from .map_object import MapObject, Observation + + +def _convex_hull_2d(points: np.ndarray) -> np.ndarray: + """Compute 2D convex hull in CCW order (Monotonic chain). + + Args: + points: Input points (N, 2). + + Returns: + Hull vertices in CCW order (M, 2). 
+ """ + if points.shape[0] < 3: + return points + + pts = np.unique(points, axis=0) + if pts.shape[0] < 3: + return pts + + pts = pts[np.lexsort((pts[:, 1], pts[:, 0]))] + + def cross(o: np.ndarray, a: np.ndarray, b: np.ndarray) -> float: + return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0]) + + lower: list[np.ndarray] = [] + for p in pts: + while len(lower) >= 2 and cross(lower[-2], lower[-1], p) <= 0: + lower.pop() + lower.append(p) + + upper: list[np.ndarray] = [] + for p in reversed(pts): + while len(upper) >= 2 and cross(upper[-2], upper[-1], p) <= 0: + upper.pop() + upper.append(p) + + hull = np.array(lower[:-1] + upper[:-1]) + return hull + + +def _polygon_area(poly: np.ndarray) -> float: + """Compute polygon area using the shoelace formula. + + Args: + poly: Polygon vertices in order (N, 2). + + Returns: + Polygon area. + """ + if poly.shape[0] < 3: + return 0.0 + x = poly[:, 0] + y = poly[:, 1] + return 0.5 * abs(float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1)))) + + +def _line_intersection(p1: np.ndarray, p2: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> np.ndarray: + """Intersection of line p1->p2 with line cp1->cp2. + + Args: + p1: Line segment start (2,). + p2: Line segment end (2,). + cp1: Clip edge start (2,). + cp2: Clip edge end (2,). + + Returns: + Intersection point (2,). + """ + dc = cp1 - cp2 + dp = p1 - p2 + denom = dc[0] * dp[1] - dc[1] * dp[0] + if denom == 0: + return p2 + n1 = cp1[0] * cp2[1] - cp1[1] * cp2[0] + n2 = p1[0] * p2[1] - p1[1] * p2[0] + x = (n1 * dp[0] - n2 * dc[0]) / denom + y = (n1 * dp[1] - n2 * dc[1]) / denom + return np.array([x, y], dtype=np.float64) + + +def _polygon_clip(subject: np.ndarray, clip: np.ndarray) -> np.ndarray: + """Clip a polygon with a convex clip polygon (Sutherland–Hodgman). + + Args: + subject: Subject polygon vertices (N, 2). + clip: Clip polygon vertices (M, 2). + + Returns: + Clipped polygon vertices (K, 2). + """ + if subject.shape[0] == 0: + return subject + + def inside(p: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> bool: + return (cp2[0] - cp1[0]) * (p[1] - cp1[1]) - (cp2[1] - cp1[1]) * ( + p[0] - cp1[0] + ) >= 0 + + output = subject + for i in range(len(clip)): + cp1 = clip[i] + cp2 = clip[(i + 1) % len(clip)] + input_list = output + output = [] + if len(input_list) == 0: + break + s = input_list[-1] + for e in input_list: + if inside(e, cp1, cp2): + if not inside(s, cp1, cp2): + output.append(_line_intersection(s, e, cp1, cp2)) + output.append(e) + elif inside(s, cp1, cp2): + output.append(_line_intersection(s, e, cp1, cp2)) + s = e + output = np.array(output, dtype=np.float64) + return output + + +def compute_iou_3d_with_ratios( + bbox1: np.ndarray, bbox2: np.ndarray +) -> tuple[float, float, float]: + """Compute IoU and intersection ratios using 2D polygon overlap + Z overlap. + + Args: + bbox1: Corner points for bbox1 (N, 3). + bbox2: Corner points for bbox2 (N, 3). + + Returns: + Tuple of (iou, ratio_bbox1, ratio_bbox2). 
+ """ + if bbox1.shape[0] < 4 or bbox2.shape[0] < 4: + return 0.0, 0.0, 0.0 + + hull1 = _convex_hull_2d(bbox1[:, :2]) + hull2 = _convex_hull_2d(bbox2[:, :2]) + if hull1.shape[0] < 3 or hull2.shape[0] < 3: + return 0.0, 0.0, 0.0 + + poly_inter = _polygon_clip(hull1, hull2) + inter_area = _polygon_area(poly_inter) if poly_inter.shape[0] >= 3 else 0.0 + + zmin1, zmax1 = float(np.min(bbox1[:, 2])), float(np.max(bbox1[:, 2])) + zmin2, zmax2 = float(np.min(bbox2[:, 2])), float(np.max(bbox2[:, 2])) + height = max(0.0, min(zmax1, zmax2) - max(zmin1, zmin2)) + inter_vol = inter_area * height + + area1 = _polygon_area(hull1) + area2 = _polygon_area(hull2) + vol1 = area1 * (zmax1 - zmin1) + vol2 = area2 * (zmax2 - zmin2) + denom = vol1 + vol2 - inter_vol + + iou = inter_vol / denom if denom > 0 else 0.0 + ratio1 = inter_vol / vol1 if vol1 > 0 else 0.0 + ratio2 = inter_vol / vol2 if vol2 > 0 else 0.0 + return iou, ratio1, ratio2 + + +def find_best_match( + obs: Observation, + candidates: list[MapObject], + distance_threshold: float, + iou_threshold: float, + ratio_threshold: float = 0.4, + min_distance_merge: float = 0.25, +) -> Optional[MapObject]: + """Find best matching object by distance/IoU heuristics. + + Args: + obs: Observation to match. + candidates: Candidate map objects. + distance_threshold: Max centroid distance to consider. + iou_threshold: Min IoU to accept a merge. + ratio_threshold: Min overlap ratio to accept a merge. + min_distance_merge: Absolute distance threshold for merge. + + Returns: + Best matching object or None if no match passes thresholds. + """ + if obs.points.shape[0] == 0: + return None + + obs_center = compute_centroid(obs.points) + obs_bbox = get_oriented_bbox(obs.points) + obs_corners = obs_bbox[1] if obs_bbox is not None else None + obs_extent = ( + np.max(obs_corners, axis=0) - np.min(obs_corners, axis=0) + if obs_corners is not None and obs_corners.size > 0 + else None + ) + + best_candidate: Optional[MapObject] = None + best_dist = float("inf") + + for candidate in candidates: + if obs.track_id in candidate.track_ids: + return candidate + + cand_center = candidate.centroid + + dist = float(np.linalg.norm(obs_center - cand_center)) + if dist > distance_threshold: + continue + + cand_corners = candidate.bbox3d_corners + cand_extent = ( + np.max(cand_corners, axis=0) - np.min(cand_corners, axis=0) + if cand_corners is not None and cand_corners.size > 0 + else None + ) + + dist_merge = dist < min_distance_merge + if obs_extent is not None and cand_extent is not None: + dist_thresh = np.linalg.norm((obs_extent / 2 + cand_extent / 2) / 2) * 0.5 + dist_merge = dist_merge or dist < dist_thresh + + if dist_merge: + if dist < best_dist: + best_candidate = candidate + best_dist = dist + continue + + if obs_corners is None or cand_corners is None: + continue + + iou, ratio_obs, ratio_cand = compute_iou_3d_with_ratios( + obs_corners, cand_corners + ) + + if iou > iou_threshold or ratio_obs > ratio_threshold or ratio_cand > ratio_threshold: + if dist < best_dist: + best_candidate = candidate + best_dist = dist + + return best_candidate diff --git a/semantic_mapping/semantic_mapping/storage.py b/semantic_mapping/semantic_mapping/storage.py new file mode 100644 index 0000000..6699373 --- /dev/null +++ b/semantic_mapping/semantic_mapping/storage.py @@ -0,0 +1,91 @@ +"""Async image saving queue for semantic mapping.""" + +from __future__ import annotations + +from pathlib import Path +import queue +import threading +import time + +import cv2 +import numpy as np + + +class 
ImageSaveQueue: + """Background worker for saving object images to disk.""" + + def __init__(self, output_dir: Path, max_queue_size: int = 100, save_png: bool = False) -> None: + """Initialize the image save queue. + + Args: + output_dir: Directory where images are stored. + max_queue_size: Maximum number of queued save jobs. + save_png: If True, also save a .png copy of the image. + """ + self._output_dir = Path(output_dir) + self._output_dir.mkdir(parents=True, exist_ok=True) + self._save_png = save_png + self._queue: queue.Queue = queue.Queue(maxsize=max_queue_size) + self._stop_event = threading.Event() + self._thread: threading.Thread | None = None + + def start(self) -> None: + """Start the background worker.""" + if self._thread is not None and self._thread.is_alive(): + return + self._stop_event.clear() + self._thread = threading.Thread(target=self._run, daemon=True) + self._thread.start() + + def stop(self) -> None: + """Stop the background worker.""" + self._stop_event.set() + try: + self._queue.put_nowait(None) + except queue.Full: + pass + if self._thread is not None: + self._thread.join(timeout=2.0) + self._thread = None + + def queue_save( + self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None + ) -> str: + """Queue an image (and optional mask) for saving. + + Args: + object_id: Object identifier to embed in filename. + image: Image array to save. + mask: Optional mask array to save alongside the image. + + Returns: + Path to the saved image (.npy). + """ + timestamp_ms = int(time.time() * 1000) + image_path = self._output_dir / f"{object_id}_{timestamp_ms}.npy" + mask_path = self._output_dir / f"{object_id}_{timestamp_ms}_mask.npy" + self._queue.put(("save", str(image_path), str(mask_path), image, mask)) + return str(image_path) + + def _run(self) -> None: + while not self._stop_event.is_set(): + item = self._queue.get() + if item is None: + self._queue.task_done() + break + op, image_path, mask_path, image, mask = item + if op == "save": + try: + np.save(image_path, image) + if mask is not None: + np.save(mask_path, mask) + if self._save_png: + png_path = Path(image_path).with_suffix(".png") + if image.ndim == 3 and image.shape[2] == 3: + bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + cv2.imwrite(str(png_path), bgr) + else: + cv2.imwrite(str(png_path), image) + except Exception: + pass + self._queue.task_done() diff --git a/semantic_mapping/setup.cfg b/semantic_mapping/setup.cfg deleted file mode 100644 index 31239a6..0000000 --- a/semantic_mapping/setup.cfg +++ /dev/null @@ -1,4 +0,0 @@ -[develop] -script_dir=$base/lib/semantic_mapping -[install] -install_scripts=$base/lib/semantic_mapping diff --git a/semantic_mapping/setup.py b/semantic_mapping/setup.py deleted file mode 100644 index fc5260b..0000000 --- a/semantic_mapping/setup.py +++ /dev/null @@ -1,27 +0,0 @@ -from glob import glob -import os - -from setuptools import find_packages, setup - -package_name = "semantic_mapping" - -setup( - name=package_name, - version="0.0.1", - packages=find_packages(exclude=["test"]), - package_data={package_name: []}, - data_files=[ - ("share/ament_index/resource_index/packages", ["resource/" + package_name]), - ("share/" + package_name, ["package.xml"]), - (os.path.join("share", package_name, "launch"), glob(os.path.join("launch", "*launch.[pxy][yma]*"))), - (os.path.join("share", package_name, "config"), glob(os.path.join("config", "*.yaml"))), - ], - install_requires=["setuptools"], - zip_safe=True, - maintainer="Alex Lin", - 
maintainer_email="alex.lin416@outlook.com", - description="Semantic mapping utilities with a YOLO-E detector node.", - license="MIT", - tests_require=["pytest"], - entry_points={"console_scripts": []}, -) diff --git a/vector_perception_utils/vector_perception_utils/pointcloud_utils.py b/vector_perception_utils/vector_perception_utils/pointcloud_utils.py index 835db76..c27a23b 100644 --- a/vector_perception_utils/vector_perception_utils/pointcloud_utils.py +++ b/vector_perception_utils/vector_perception_utils/pointcloud_utils.py @@ -199,42 +199,68 @@ def crop_pointcloud_bbox( def compute_centroid(points: np.ndarray) -> np.ndarray: """ Compute centroid of point cloud. - + Args: points: Point coordinates (N, 3) - + Returns: Centroid as (3,) array """ return np.mean(points, axis=0) +def get_oriented_bbox( + points: np.ndarray, +) -> Optional[Tuple[np.ndarray, np.ndarray]]: + """ + Compute minimal oriented bounding box from point cloud. + + Args: + points: Point coordinates (N, 3) + + Returns: + Tuple of (center, corners) where center is (3,) and corners is (8, 3), + or None if insufficient points. + """ + if len(points) < 4: + return None + + pcd = create_pointcloud(points) + + try: + obb = pcd.get_minimal_oriented_bounding_box() + except Exception: + return None + + center = np.asarray(obb.center) + corners = np.asarray(obb.get_box_points()) + return center, corners + def pointcloud_to_bbox3d(points: np.ndarray, header=None): """ Convert point cloud to minimal oriented 3D bounding box (ROS2 message). - + Args: points: Point coordinates (N, 3) header: Optional ROS2 header for the message - + Returns: vision_msgs/BoundingBox3D message or None if insufficient points """ - + if len(points) < 4: return None pcd = create_pointcloud(points) - + try: obb = pcd.get_minimal_oriented_bounding_box() except: return None - center = obb.center extent = obb.extent quat = Rotation.from_matrix(obb.R).as_quat() - + bbox = BoundingBox3D() bbox.center.position = Point( x=float(center[0]), y=float(center[1]), z=float(center[2]) @@ -243,7 +269,7 @@ def pointcloud_to_bbox3d(points: np.ndarray, header=None): x=float(quat[0]), y=float(quat[1]), z=float(quat[2]), w=float(quat[3]) ) bbox.size = Vector3(x=float(extent[0]), y=float(extent[1]), z=float(extent[2])) - + return bbox @@ -328,6 +354,192 @@ def pointcloud_to_depth( return depth_image +def project_pointcloud_to_image( + points: np.ndarray, + projection: str = "equirect", + image_width: int = 1920, + image_height: int = 640, + axis_mode: str = "xz", + intrinsics: Optional[np.ndarray] = None, + depth_filter: bool = False, + depth_filter_margin: float = 0.15, + transform: Optional[np.ndarray] = None, +) -> Tuple[np.ndarray, np.ndarray]: + """ + Project a point cloud into image pixel coordinates. + + Args: + points: Point coordinates (N, 3). + projection: Projection model ("equirect" or "pinhole"). + image_width: Output image width in pixels. + image_height: Output image height in pixels. + axis_mode: Equirect axis convention ("xz" or "xy"). + intrinsics: Pinhole intrinsics (3, 3) if using "pinhole". + depth_filter: Enable depth-based occlusion filtering. + depth_filter_margin: Depth margin for occlusion filtering (meters). + transform: Optional 4x4 transform applied before projection. + + Returns: + Tuple of (camera_points, pixel_idx) where pixel_idx is (N, 3) with (u, v, depth). 
+ """ + camera_points = transform_pointcloud(points, transform) if transform is not None else points + + if projection == "pinhole": + if intrinsics is None: + raise ValueError("intrinsics required for pinhole projection") + u = camera_points[:, 0] * intrinsics[0, 0] / (camera_points[:, 2] + 1e-6) + intrinsics[0, 2] + v = camera_points[:, 1] * intrinsics[1, 1] / (camera_points[:, 2] + 1e-6) + intrinsics[1, 2] + return camera_points, np.stack([u, v, camera_points[:, 2]], axis=-1) + + if axis_mode == "xz": + hori_dis = np.sqrt(camera_points[:, 0] ** 2 + camera_points[:, 2] ** 2) + u = ( + image_width / (2 * np.pi) * np.arctan2(camera_points[:, 0], camera_points[:, 2]) + + image_width / 2 + + 1 + ) + v = ( + image_width / (2 * np.pi) * np.arctan(camera_points[:, 1] / hori_dis) + + image_height / 2 + + 1 + ) + else: + hori_dis = np.sqrt(camera_points[:, 0] ** 2 + camera_points[:, 1] ** 2) + u = ( + -image_width / (2 * np.pi) * np.arctan2(camera_points[:, 1], camera_points[:, 0]) + + image_width / 2 + + 1 + ) + v = ( + -image_width / (2 * np.pi) * np.arctan2(camera_points[:, 2], hori_dis) + + image_height / 2 + + 1 + ) + + u = np.clip(u, 0, image_width - 1) + v = np.clip(v, 0, image_height - 1) + + if depth_filter: + depth_map = np.full((image_height, image_width), np.inf) + idx = v.astype(int) * image_width + u.astype(int) + np.minimum.at(depth_map.ravel(), idx, hori_dis) + remove_mask = hori_dis >= depth_map[v.astype(int), u.astype(int)] + depth_filter_margin + u = u.copy() + v = v.copy() + u[remove_mask] = -1 + v[remove_mask] = -1 + + return camera_points, np.stack([u, v, hori_dis], axis=-1) + + +def segment_pointcloud_by_mask( + points: np.ndarray, + mask: np.ndarray, + pixel_idx: np.ndarray, +) -> np.ndarray: + """ + Segment a point cloud using a binary mask and projected pixels. + + Args: + points: Point coordinates (N, 3). + mask: Binary mask (H, W). + pixel_idx: Pixel coordinates (N, 3) as (u, v, depth). + + Returns: + Masked point cloud (M, 3). + """ + h, w = mask.shape[:2] + pixel_idx = pixel_idx.astype(int) + valid = ( + (pixel_idx[:, 0] >= 0) + & (pixel_idx[:, 0] < w) + & (pixel_idx[:, 1] >= 0) + & (pixel_idx[:, 1] < h) + ) + pixel_idx = pixel_idx[valid] + points = points[valid] + keep = mask[pixel_idx[:, 1], pixel_idx[:, 0]].astype(bool) + return points[keep] + + +def segment_pointclouds_from_projection( + points: np.ndarray, + masks: list[np.ndarray], + pixel_idx: np.ndarray, +) -> Tuple[list[np.ndarray], list[np.ndarray]]: + """ + Segment point clouds from projected pixels. + + Args: + points: Point coordinates (N, 3). + masks: List of binary masks (H, W). + pixel_idx: Pixel coordinates (N, 3) as (u, v, depth). + + Returns: + Tuple of (clouds, depths) where each entry corresponds to a mask. 
+ """ + if masks is None or len(masks) == 0: + return [], [] + + image_shape = masks[0].shape + valid = ( + (pixel_idx[:, 0] >= 0) + & (pixel_idx[:, 0] < image_shape[1]) + & (pixel_idx[:, 1] >= 0) + & (pixel_idx[:, 1] < image_shape[0]) + ) + pixel_idx = pixel_idx[valid] + points = points[valid] + + depths = pixel_idx[:, 2] + pixel_idx = pixel_idx.astype(int) + + obj_clouds = [] + obj_depths = [] + for obj_mask in masks: + cloud_mask = obj_mask[pixel_idx[:, 1], pixel_idx[:, 0]].astype(bool) + obj_clouds.append(points[cloud_mask]) + obj_depths.append(depths[cloud_mask].reshape(-1, 1)) + + return obj_clouds, obj_depths + + +def transform_segmented_clouds_to_world( + clouds: list[np.ndarray], + depths: list[np.ndarray], + R_b2w: np.ndarray, + t_b2w: np.ndarray, + depth_jump_threshold: float = 0.3, +) -> list[np.ndarray]: + """ + Prune depth jumps and transform segmented clouds to world frame. + + Args: + clouds: List of segmented point clouds (sensor frame). + depths: List of depth arrays aligned with clouds. + R_b2w: Body-to-world rotation matrix (3, 3). + t_b2w: Body-to-world translation vector (3,). + depth_jump_threshold: Depth jump threshold for pruning (meters). + + Returns: + List of segmented object clouds in world frame. + """ + obj_cloud_world_list = [] + for obj_cloud, obj_depth in zip(clouds, depths): + if obj_depth.shape[0] > 1: + obj_depth_diff = (obj_depth[1:] - obj_depth[:-1]).squeeze() + if obj_depth_diff.size > 0 and np.max(obj_depth_diff) > depth_jump_threshold: + idx_tmp = np.ones(len(obj_depth), dtype=bool) + jump_idx = int(np.argmax(obj_depth_diff)) + idx_tmp[jump_idx + 1 :] = False + obj_cloud = obj_cloud[idx_tmp] + + obj_cloud_world = obj_cloud[:, :3] @ R_b2w.T + t_b2w + obj_cloud_world_list.append(obj_cloud_world) + + return obj_cloud_world_list + + def create_pointcloud2_msg(points: np.ndarray, colors: np.ndarray, header): """ Create PointCloud2 ROS message from points and colors. diff --git a/vector_perception_utils/vector_perception_utils/transform_utils.py b/vector_perception_utils/vector_perception_utils/transform_utils.py index 534d8d2..ef3238e 100644 --- a/vector_perception_utils/vector_perception_utils/transform_utils.py +++ b/vector_perception_utils/vector_perception_utils/transform_utils.py @@ -8,7 +8,7 @@ from geometry_msgs.msg import Pose, Quaternion -def normalize_angle(angle: float) -> float: +def normalize_angle(angle: float | np.ndarray) -> float | np.ndarray: """ Normalize angle to [-pi, pi] range. @@ -21,6 +21,26 @@ def normalize_angle(angle: float) -> float: return np.arctan2(np.sin(angle), np.cos(angle)) +def discretize_angles(angles: np.ndarray, num_bins: int) -> np.ndarray: + """ + Discretize angles into bin indices. + + Normalizes angles to [-pi, pi] then maps to bin indices [0, num_bins-1]. + + Args: + angles: Angles in radians (any range). + num_bins: Number of angle bins. + + Returns: + Array of bin indices (0 to num_bins-1), same shape as input. + """ + angles = normalize_angle(angles) + # Map [-pi, pi] to [0, 2*pi] then to [0, num_bins) + normalized = (angles + np.pi) / (2 * np.pi) + bins = (normalized * num_bins).astype(int) + return np.clip(bins, 0, num_bins - 1) + + def pose_to_matrix(pose: Pose) -> np.ndarray: """ Convert pose to 4x4 homogeneous transform matrix. 
From 88b2be0e227e0e52264346a1403945a0cadacd7a Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Thu, 22 Jan 2026 23:47:38 -0500 Subject: [PATCH 3/6] updated README --- semantic_mapping/README.md | 58 ++++++++++++------- .../docs/SEMANTIC_MAPPING_MIGRATION.md | 29 ++++++---- 2 files changed, 56 insertions(+), 31 deletions(-) diff --git a/semantic_mapping/README.md b/semantic_mapping/README.md index 2dfd1dc..4729659 100644 --- a/semantic_mapping/README.md +++ b/semantic_mapping/README.md @@ -5,6 +5,7 @@ ROS2 package for semantic 3D object mapping with VLM integration. ## Overview This package builds a persistent semantic map of objects detected by `detect_anything`, fusing 2D detections with LiDAR point clouds to create 3D object representations. It publishes `ObjectNode` messages compatible with the tare_planner exploration system. +The pipeline stores voxelized geometry, merges observations, and exports planner‑friendly ROS messages. ## Status: Work in Progress @@ -12,11 +13,11 @@ This package builds a persistent semantic map of objects detected by `detect_any |-----------|------|--------| | Messages | `msg/*.msg` | Complete | | Data Models | `map_object.py` | Complete | -| Matching | `matching.py` | Pending | -| Object Mapper | `mapper.py` | Pending | -| Cloud Projection | `vector_perception_utils/pointcloud_utils.py` | Pending | -| Publishers | `map_object_utils.py` | Pending | -| Image Storage | `storage.py` | Pending | +| Matching | `matching.py` | Complete | +| Object Mapper | `mapper.py` | Complete | +| Cloud Projection | `vector_perception_utils/pointcloud_utils.py` | Complete | +| Publishers | `map_object_utils.py` | Complete | +| Image Storage | `storage.py` | Complete | | Mapping Node | `mapping_node.py` | Pending | | Launch Files | `launch/` | Pending | @@ -49,27 +50,37 @@ Viewpoint trigger for image saving. detect_anything/DetectionResult │ ▼ -┌───────────────────────────────────────┐ -│ semantic_mapping_node │ -│ │ -│ ┌─────────────┐ ┌──────────────┐ │ -│ │ CloudImage │ │ ObjectMapper │ │ -│ │ Fusion │───▶│ (ObjectDB │ │ -│ │ │ │ pattern) │ │ -│ └─────────────┘ └──────────────┘ │ -│ │ │ │ -│ ▼ ▼ │ -│ ┌─────────────┐ ┌──────────────┐ │ -│ │ MapObject │ │ Publishers │ │ -│ │ (voxels, │───▶│ (ObjectNode, │ │ -│ │ votes) │ │ Markers) │ │ -│ └─────────────┘ └──────────────┘ │ -└───────────────────────────────────────┘ +┌───────────────────────────────────────────────┐ +│ semantic_mapping_node │ +│ │ +│ ┌────────────────────┐ ┌──────────────┐ │ +│ │ pointcloud_utils │ │ ObjectMapper │ │ +│ │ (projection + mask)├──▶│ (pending/ │ │ +│ │ │ │ permanent) │ │ +│ └────────────────────┘ └──────────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌────────────────┐ ┌──────────────┐ │ +│ │ MapObject │ │ map_object_ │ │ +│ │ (voxels/votes) ├─▶│ utils │ │ +│ └────────────────┘ └──────────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ storage.py mapping_node.py │ +└───────────────────────────────────────────────┘ │ ▼ /object_nodes, /obj_points, /obj_boxes, /obj_labels ``` +Key components: +- `map_object.py`: Object state, voxel voting, pose de‑duplication, cached geometry. +- `matching.py`: Track‑id and spatial matching with IoU/ratio heuristics. +- `mapper.py`: Pending/permanent tiers, merge rules, and pruning. +- `vector_perception_utils/pointcloud_utils.py`: Projection + mask segmentation utilities. +- `map_object_utils.py`: ROS message construction helpers. +- `storage.py`: Async image save queue. 
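+
+A minimal sketch of the Python API behind these components (synthetic inputs
+for illustration; the real node builds `Observation`s from ROS callbacks):
+
+```python
+import time
+
+import numpy as np
+
+from semantic_mapping.map_object import Observation
+from semantic_mapping.mapper import ObjectMapper
+
+mapper = ObjectMapper(voxel_size=0.03, confidence_threshold=0.30)
+
+# Synthetic world-frame points standing in for one fused detection.
+obs = Observation(
+    track_id=1,
+    label="chair",
+    confidence=0.9,
+    points=np.random.rand(200, 3).astype(np.float32),
+    robot_R=np.eye(3),
+    robot_t=np.zeros(3),
+    timestamp=time.time(),
+)
+
+stats = mapper.update([obs])  # UpdateStats(created=1, ...)
+for obj in mapper.get_objects(include_pending=True):
+    print(obj.label, obj.detection_count, obj.centroid)
+```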
+ ## Dependencies - `detect_anything`: Detection results with masks @@ -86,6 +97,11 @@ colcon build --packages-select semantic_mapping ros2 launch semantic_mapping semantic_mapping.launch.py ``` +## Configuration + +- `config/sensor_to_camera_tf.yaml`: Static sensor→camera transform for projection. +- `config/objects.yaml`: Object taxonomy for detection labels. + ## Topics ### Subscriptions diff --git a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md index 4a23f74..53b4560 100644 --- a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md +++ b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md @@ -22,11 +22,11 @@ This document outlines the comprehensive plan for porting the `semantic_mapping` | 1 | Package Setup & Messages | **COMPLETED** | | 2 | Data Models (`map_object.py`) | **COMPLETED** | | 2.1 | Refactor to use shared utilities | **COMPLETED** | -| 3 | Matching Utilities (`matching.py`) | Pending | -| 4 | Object Mapper (`mapper.py`) | Pending | -| 5 | Cloud-Image Projection (`pointcloud_utils.py`) | Pending | -| 6 | Message Publishers (`map_object_utils.py`) | Pending | -| 7 | Image Storage (`storage.py`) | Pending | +| 3 | Matching Utilities (`matching.py`) | **COMPLETED** | +| 4 | Object Mapper (`mapper.py`) | **COMPLETED** | +| 5 | Cloud-Image Projection (`pointcloud_utils.py`) | **COMPLETED** | +| 6 | Message Publishers (`map_object_utils.py`) | **COMPLETED** | +| 7 | Image Storage (`storage.py`) | **COMPLETED** | | 8 | Mapping Node (`mapping_node.py`) | Pending | | 9 | Launch Files & Config | Pending | | 10 | Integration Testing | Pending | @@ -355,6 +355,11 @@ def create_object_pointcloud(objects: list[MapObject], header: Header) -> PointC def create_delete_markers(deleted_ids: list[int], header: Header, namespace: str) -> MarkerArray: ... ``` +Publisher helpers: +- `map_object_to_object_node` converts a `MapObject` into an `ObjectNode` with centroid, bbox corners, and point cloud. +- Marker helpers emit bbox wireframes and text labels for RViz. +- `create_object_pointcloud` builds a colored aggregate cloud from object voxels. + --- ## Phase 8: Image Storage (storage.py) @@ -369,6 +374,10 @@ class ImageSaveQueue: def queue_save(self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None) -> str: ... ``` +Image storage: +- Async worker writes `.npy` (and optional `.png`) crops to disk. +- Queue is bounded to avoid blocking the main mapping loop. 
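+
+A usage sketch consistent with the signature above (`start()`/`stop()` follow
+the mapping node's usage of the class; the module later moves under `utils/`):
+
+```python
+from pathlib import Path
+
+import numpy as np
+
+from semantic_mapping.storage import ImageSaveQueue
+
+saver = ImageSaveQueue(output_dir=Path("output/object_images"),
+                       max_queue_size=100, save_png=False)
+saver.start()                                 # spawn the background worker
+
+crop = np.zeros((64, 64, 3), dtype=np.uint8)  # synthetic BGR crop
+mask = np.ones((64, 64), dtype=bool)
+path = saver.queue_save(object_id="42", image=crop, mask=mask)
+print(path)                                   # .npy path, returned immediately
+
+saver.stop()                                  # drain the queue and join
+```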
+ --- ## Phase 9: Launch Files @@ -398,13 +407,13 @@ DeclareLaunchArgument('target_object', default_value='') - [x] Add `normalize_angle`, `discretize_angles` to `transform_utils.py` - [x] Add `get_oriented_bbox` to `pointcloud_utils.py` - [x] Simplify `_compute_geometry()` using Open3D oriented bbox +- [x] Implement `matching.py` with IoU, distance, merge-compatibility functions +- [x] Implement `mapper.py` with ObjectDB-style tiering +- [x] Implement cloud-image projection helpers in `vector_perception_utils/pointcloud_utils.py` +- [x] Implement `map_object_utils.py` message conversion functions +- [x] Implement `storage.py` with async queue ### Remaining -- [ ] Implement `matching.py` with IoU, distance, merge-compatibility functions -- [ ] Implement `mapper.py` with ObjectDB-style tiering -- [ ] Implement cloud-image projection helpers in `vector_perception_utils/pointcloud_utils.py` -- [ ] Implement `map_object_utils.py` message conversion functions -- [ ] Implement `storage.py` with async queue - [ ] Implement `mapping_node.py` following detect_anything patterns - [ ] Create launch files and params.yaml - [ ] Integration testing with bag files From e08f117686fedeb77ca1f00d351f574889688a53 Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Fri, 23 Jan 2026 14:05:07 -0500 Subject: [PATCH 4/6] completed full migration of all core logic --- semantic_mapping/README.md | 21 +- .../docs/SEMANTIC_MAPPING_MIGRATION.md | 35 +- semantic_mapping/semantic_mapping/mapper.py | 2 +- .../semantic_mapping/mapping_node.py | 686 ++++++++++++++++++ .../semantic_mapping/utils/__init__.py | 1 + .../{ => utils}/map_object_utils.py | 9 +- .../semantic_mapping/{ => utils}/matching.py | 4 +- .../semantic_mapping/{ => utils}/storage.py | 0 8 files changed, 729 insertions(+), 29 deletions(-) create mode 100644 semantic_mapping/semantic_mapping/mapping_node.py create mode 100644 semantic_mapping/semantic_mapping/utils/__init__.py rename semantic_mapping/semantic_mapping/{ => utils}/map_object_utils.py (95%) rename semantic_mapping/semantic_mapping/{ => utils}/matching.py (98%) rename semantic_mapping/semantic_mapping/{ => utils}/storage.py (100%) diff --git a/semantic_mapping/README.md b/semantic_mapping/README.md index 4729659..3428c0b 100644 --- a/semantic_mapping/README.md +++ b/semantic_mapping/README.md @@ -13,12 +13,12 @@ The pipeline stores voxelized geometry, merges observations, and exports planner |-----------|------|--------| | Messages | `msg/*.msg` | Complete | | Data Models | `map_object.py` | Complete | -| Matching | `matching.py` | Complete | +| Matching | `utils/matching.py` | Complete | | Object Mapper | `mapper.py` | Complete | | Cloud Projection | `vector_perception_utils/pointcloud_utils.py` | Complete | -| Publishers | `map_object_utils.py` | Complete | -| Image Storage | `storage.py` | Complete | -| Mapping Node | `mapping_node.py` | Pending | +| Publishers | `utils/map_object_utils.py` | Complete | +| Image Storage | `utils/storage.py` | Complete | +| Mapping Node | `mapping_node.py` | Complete | | Launch Files | `launch/` | Pending | ## Messages @@ -44,6 +44,10 @@ User instruction for target object search. ### ViewpointRep.msg Viewpoint trigger for image saving. +Note: `detect_anything/DetectionResult` masks are cropped to the bbox and aligned to +the bbox top-left (as documented in `detect_anything/msg/DetectionResult.msg`). +The mapping node reconstructs full-image masks before projection. 
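+
+A minimal sketch of that reconstruction with made-up bbox values:
+
+```python
+import numpy as np
+
+img_h, img_w = 640, 1920                           # full image size
+x1, y1, x2, y2 = 800, 200, 1100, 400               # detection bbox
+mask_crop = np.ones((y2 - y1, x2 - x1), np.uint8)  # bbox-aligned crop (mono8)
+
+full_mask = np.zeros((img_h, img_w), dtype=bool)
+h = min(mask_crop.shape[0], y2 - y1)               # guard against rounding mismatches
+w = min(mask_crop.shape[1], x2 - x1)
+full_mask[y1:y1 + h, x1:x1 + w] = mask_crop[:h, :w].astype(bool)
+```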
+ ## Architecture ``` @@ -66,7 +70,7 @@ detect_anything/DetectionResult │ └────────────────┘ └──────────────┘ │ │ │ │ │ │ ▼ ▼ │ -│ storage.py mapping_node.py │ +│ utils/storage.py mapping_node.py │ └───────────────────────────────────────────────┘ │ ▼ @@ -75,11 +79,12 @@ detect_anything/DetectionResult Key components: - `map_object.py`: Object state, voxel voting, pose de‑duplication, cached geometry. -- `matching.py`: Track‑id and spatial matching with IoU/ratio heuristics. +- `utils/matching.py`: Track‑id and spatial matching with IoU/ratio heuristics. - `mapper.py`: Pending/permanent tiers, merge rules, and pruning. - `vector_perception_utils/pointcloud_utils.py`: Projection + mask segmentation utilities. -- `map_object_utils.py`: ROS message construction helpers. -- `storage.py`: Async image save queue. +- `utils/map_object_utils.py`: ROS message construction helpers. +- `utils/storage.py`: Async image save queue. +- `mapping_node.py`: ROS2 orchestration (subscriptions, sync, publish). ## Dependencies diff --git a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md index 53b4560..e02e8d0 100644 --- a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md +++ b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md @@ -22,12 +22,12 @@ This document outlines the comprehensive plan for porting the `semantic_mapping` | 1 | Package Setup & Messages | **COMPLETED** | | 2 | Data Models (`map_object.py`) | **COMPLETED** | | 2.1 | Refactor to use shared utilities | **COMPLETED** | -| 3 | Matching Utilities (`matching.py`) | **COMPLETED** | +| 3 | Matching Utilities (`utils/matching.py`) | **COMPLETED** | | 4 | Object Mapper (`mapper.py`) | **COMPLETED** | | 5 | Cloud-Image Projection (`pointcloud_utils.py`) | **COMPLETED** | -| 6 | Message Publishers (`map_object_utils.py`) | **COMPLETED** | -| 7 | Image Storage (`storage.py`) | **COMPLETED** | -| 8 | Mapping Node (`mapping_node.py`) | Pending | +| 6 | Message Publishers (`utils/map_object_utils.py`) | **COMPLETED** | +| 7 | Image Storage (`utils/storage.py`) | **COMPLETED** | +| 8 | Mapping Node (`mapping_node.py`) | **COMPLETED** | | 9 | Launch Files & Config | Pending | | 10 | Integration Testing | Pending | @@ -62,7 +62,7 @@ Key files in `VLM_ROS/src/semantic_mapping/`: - `update_map()` steps: 1. Transform incoming cloud into body/world frame using odom 2. Filter detections by confidence threshold - 3. For each detection: project masked cloud via `CloudImageFusion`, build per-object voxels, then either merge or create a `SingleObject` + 3. For each detection: reconstruct full-image masks (bbox-cropped in `detect_anything/msg/DetectionResult.msg`), project masked cloud, build per-object voxels, then either merge or create a `SingleObject` 4. Merge heuristics: match by tracker `obj_id`, 3D IoU, distance thresholds, and optional primitive groups (e.g., chair/sofa) 5. Maintain delete lists, save queues (best image/mask per object) 6. 
Publish ROS messages: `ObjectNode`, bbox markers, label markers, colored point cloud @@ -101,10 +101,11 @@ semantic_mapping/ │ ├── mapping_node.py # ROS2 node (orchestration) │ ├── mapper.py # ObjectMapper class (core logic) │ ├── map_object.py # Data classes: MapObject, Observation, VoxelManager -│ ├── matching.py # Object matching utilities (IoU, distance, track_id) -│ ├── (moved to utils) # Cloud-image projection helpers in vector_perception_utils -│ ├── map_object_utils.py # ROS message conversion helpers -│ └── storage.py # Async image saving queue +│ ├── utils/ +│ │ ├── matching.py # Object matching utilities (IoU, distance, track_id) +│ │ ├── map_object_utils.py # ROS message conversion helpers +│ │ └── storage.py # Async image saving queue +│ ├── (moved to vector_perception_utils) # Cloud-image projection helpers ├── docs/ │ └── SEMANTIC_MAPPING_MIGRATION.md └── resource/ @@ -223,7 +224,7 @@ class MapObject: --- -## Phase 3: Matching Utilities (matching.py) +## Phase 3: Matching Utilities (utils/matching.py) ```python def compute_iou_3d(bbox1: np.ndarray, bbox2: np.ndarray) -> float: @@ -345,10 +346,10 @@ def transform_segmented_clouds_to_world(...): ... --- -## Phase 7: Message Publishers (map_object_utils.py) +## Phase 7: Message Publishers (utils/map_object_utils.py) ```python -def map_object_to_object_node(obj: MapObject, header: Header, status: str) -> ObjectNode | None: ... +def map_object_to_object_node(obj: MapObject, header: Header, status: str, target_object: str | None = None) -> ObjectNode | None: ... def create_bbox_markers(objects: list[MapObject], header: Header) -> MarkerArray: ... def create_label_markers(objects: list[MapObject], header: Header) -> MarkerArray: ... def create_object_pointcloud(objects: list[MapObject], header: Header) -> PointCloud2 | None: ... 
@@ -362,7 +363,7 @@ Publisher helpers: --- -## Phase 8: Image Storage (storage.py) +## Phase 8: Image Storage (utils/storage.py) ```python class ImageSaveQueue: @@ -407,14 +408,14 @@ DeclareLaunchArgument('target_object', default_value='') - [x] Add `normalize_angle`, `discretize_angles` to `transform_utils.py` - [x] Add `get_oriented_bbox` to `pointcloud_utils.py` - [x] Simplify `_compute_geometry()` using Open3D oriented bbox -- [x] Implement `matching.py` with IoU, distance, merge-compatibility functions +- [x] Implement `utils/matching.py` with IoU, distance, merge-compatibility functions - [x] Implement `mapper.py` with ObjectDB-style tiering - [x] Implement cloud-image projection helpers in `vector_perception_utils/pointcloud_utils.py` -- [x] Implement `map_object_utils.py` message conversion functions -- [x] Implement `storage.py` with async queue +- [x] Implement `utils/map_object_utils.py` message conversion functions +- [x] Implement `utils/storage.py` with async queue +- [x] Implement `mapping_node.py` following detect_anything patterns ### Remaining -- [ ] Implement `mapping_node.py` following detect_anything patterns - [ ] Create launch files and params.yaml - [ ] Integration testing with bag files diff --git a/semantic_mapping/semantic_mapping/mapper.py b/semantic_mapping/semantic_mapping/mapper.py index f8a10ac..bcdfa90 100644 --- a/semantic_mapping/semantic_mapping/mapper.py +++ b/semantic_mapping/semantic_mapping/mapper.py @@ -7,7 +7,7 @@ import time import uuid -from .matching import compute_iou_3d_with_ratios, find_best_match +from .utils.matching import compute_iou_3d_with_ratios, find_best_match from .map_object import MapObject, ObjectStatus, Observation diff --git a/semantic_mapping/semantic_mapping/mapping_node.py b/semantic_mapping/semantic_mapping/mapping_node.py new file mode 100644 index 0000000..c33bcc0 --- /dev/null +++ b/semantic_mapping/semantic_mapping/mapping_node.py @@ -0,0 +1,686 @@ +"""ROS2 mapping node for semantic 3D object mapping.""" + +from __future__ import annotations + +from collections import deque +from pathlib import Path +import time + +import numpy as np +import yaml +from cv_bridge import CvBridge +import rclpy +from rclpy.node import Node +from rclpy.qos import QoSProfile, ReliabilityPolicy +from sensor_msgs.msg import CompressedImage, PointCloud2 +from sensor_msgs_py import point_cloud2 +from nav_msgs.msg import Odometry +from std_msgs.msg import Header +from visualization_msgs.msg import MarkerArray +from scipy.spatial.transform import Rotation, Slerp + +from detect_anything.msg import DetectionResult +from semantic_mapping.msg import ObjectType, ObjectNode, TargetObjectInstruction, ViewpointRep + +from vector_perception_utils.pointcloud_utils import ( + project_pointcloud_to_image, + segment_pointclouds_from_projection, + transform_segmented_clouds_to_world, +) + +from .map_object import Observation +from .utils.map_object_utils import ( + create_bbox_markers, + create_delete_markers, + create_label_markers, + create_object_pointcloud, + map_object_to_object_node, +) +from .mapper import ObjectMapper +from .utils.storage import ImageSaveQueue + + +class SemanticMappingNode(Node): + """ROS2 node for semantic object mapping. 
+ + This node fuses 2D detections and LiDAR point clouds into persistent 3D objects: + - Buffers detections, clouds, and odometry with timestamps + - Interpolates odometry at detection time (position + orientation) + - Segments point clouds by detection masks and projects to world frame + - Updates the object mapper and publishes ROS messages + - Queues cropped images for VLM queries + """ + + def __init__(self) -> None: + """Initialize the mapping node, parameters, buffers, and ROS I/O.""" + super().__init__("semantic_mapping_node") + self._bridge = CvBridge() + + self.declare_parameter("detection_topic", "/detection_result") + self.declare_parameter("cloud_topic", "/registered_scan") + self.declare_parameter("odom_topic", "/state_estimation") + self.declare_parameter("viewpoint_topic", "/viewpoint_rep_header") + self.declare_parameter("object_type_answer_topic", "/object_type_answer") + self.declare_parameter("object_type_query_topic", "/object_type_query") + self.declare_parameter("target_object_instruction_topic", "/target_object_instruction") + self.declare_parameter("object_nodes_topic", "/object_nodes") + self.declare_parameter("obj_points_topic", "/obj_points") + self.declare_parameter("obj_boxes_topic", "/obj_boxes") + self.declare_parameter("obj_labels_topic", "/obj_labels") + self.declare_parameter("map_frame", "map") + + self.declare_parameter("confidence_threshold", 0.3) + self.declare_parameter("voxel_size", 0.03) + self.declare_parameter("distance_threshold", 0.5) + self.declare_parameter("iou_threshold", 0.2) + self.declare_parameter("min_detections_for_permanent", 3) + self.declare_parameter("pending_ttl_s", 10.0) + self.declare_parameter("max_inactive_frames", 20) + self.declare_parameter("num_angle_bins", 15) + + self.declare_parameter("detection_linear_state_time_bias", 0.0) + self.declare_parameter("detection_angular_state_time_bias", 0.0) + self.declare_parameter("cloud_window_before", 0.5) + self.declare_parameter("cloud_window_after", 0.1) + + self.declare_parameter("projection", "equirect") + self.declare_parameter("axis_mode", "xz") + self.declare_parameter("image_width", 1920) + self.declare_parameter("image_height", 640) + self.declare_parameter("depth_filter", False) + self.declare_parameter("depth_filter_margin", 0.15) + self.declare_parameter("depth_jump_threshold", 0.3) + self.declare_parameter("sensor_to_camera_tf", "config/sensor_to_camera_tf.yaml") + + self.declare_parameter("target_object", "") + self.declare_parameter("save_png", False) + self.declare_parameter("output_dir", "output/object_images") + + detection_topic = str(self.get_parameter("detection_topic").value) + cloud_topic = str(self.get_parameter("cloud_topic").value) + odom_topic = str(self.get_parameter("odom_topic").value) + viewpoint_topic = str(self.get_parameter("viewpoint_topic").value) + object_type_answer_topic = str(self.get_parameter("object_type_answer_topic").value) + object_type_query_topic = str(self.get_parameter("object_type_query_topic").value) + target_object_instruction_topic = str(self.get_parameter("target_object_instruction_topic").value) + object_nodes_topic = str(self.get_parameter("object_nodes_topic").value) + obj_points_topic = str(self.get_parameter("obj_points_topic").value) + obj_boxes_topic = str(self.get_parameter("obj_boxes_topic").value) + obj_labels_topic = str(self.get_parameter("obj_labels_topic").value) + self._map_frame = str(self.get_parameter("map_frame").value) + + self._confidence_threshold = float(self.get_parameter("confidence_threshold").value) + 
self._detection_linear_bias = float(self.get_parameter("detection_linear_state_time_bias").value) + self._detection_angular_bias = float(self.get_parameter("detection_angular_state_time_bias").value) + self._cloud_window_before = float(self.get_parameter("cloud_window_before").value) + self._cloud_window_after = float(self.get_parameter("cloud_window_after").value) + + self._projection = str(self.get_parameter("projection").value) + self._axis_mode = str(self.get_parameter("axis_mode").value) + self._image_width = int(self.get_parameter("image_width").value) + self._image_height = int(self.get_parameter("image_height").value) + self._depth_filter = bool(self.get_parameter("depth_filter").value) + self._depth_filter_margin = float(self.get_parameter("depth_filter_margin").value) + self._depth_jump_threshold = float(self.get_parameter("depth_jump_threshold").value) + self._sensor_to_camera = self._load_sensor_to_camera( + str(self.get_parameter("sensor_to_camera_tf").value) + ) + + self._target_object = str(self.get_parameter("target_object").value) + self._save_png = bool(self.get_parameter("save_png").value) + self._output_dir = Path(str(self.get_parameter("output_dir").value)) + + self._mapper = ObjectMapper( + voxel_size=float(self.get_parameter("voxel_size").value), + confidence_threshold=self._confidence_threshold, + distance_threshold=float(self.get_parameter("distance_threshold").value), + iou_threshold=float(self.get_parameter("iou_threshold").value), + min_detections_for_permanent=int(self.get_parameter("min_detections_for_permanent").value), + pending_ttl_s=float(self.get_parameter("pending_ttl_s").value), + max_inactive_frames=int(self.get_parameter("max_inactive_frames").value), + num_angle_bins=int(self.get_parameter("num_angle_bins").value), + ) + + self._image_saver = ImageSaveQueue( + output_dir=self._output_dir, + max_queue_size=100, + save_png=self._save_png, + ) + self._image_saver.start() + + self._cloud_buffer: deque[tuple[float, np.ndarray]] = deque(maxlen=200) + self._odom_buffer: deque[tuple[float, dict]] = deque(maxlen=400) + self._detection_buffer: deque[DetectionResult] = deque(maxlen=100) + self._viewpoint_queue: deque[tuple[float, int]] = deque(maxlen=50) + self._processed_viewpoints: set[float] = set() + self._published_ids: set[int] = set() + self._mapping_busy = False + + qos_cloud = QoSProfile(depth=10, reliability=ReliabilityPolicy.BEST_EFFORT) + qos_default = QoSProfile(depth=10, reliability=ReliabilityPolicy.RELIABLE) + + self.create_subscription(DetectionResult, detection_topic, self._detection_callback, qos_default) + self.create_subscription(PointCloud2, cloud_topic, self._cloud_callback, qos_cloud) + self.create_subscription(Odometry, odom_topic, self._odom_callback, qos_cloud) + self.create_subscription(ViewpointRep, viewpoint_topic, self._viewpoint_callback, qos_default) + self.create_subscription(ObjectType, object_type_answer_topic, self._object_type_answer_callback, qos_default) + self.create_subscription( + TargetObjectInstruction, target_object_instruction_topic, self._target_object_callback, qos_default + ) + + self._object_type_query_pub = self.create_publisher(ObjectType, object_type_query_topic, qos_default) + self._object_nodes_pub = self.create_publisher(ObjectNode, object_nodes_topic, qos_default) + self._obj_points_pub = self.create_publisher(PointCloud2, obj_points_topic, qos_default) + self._obj_boxes_pub = self.create_publisher(MarkerArray, obj_boxes_topic, qos_default) + self._obj_labels_pub = self.create_publisher(MarkerArray, 
obj_labels_topic, qos_default) + + self.create_timer(0.5, self._mapping_callback) + + def destroy_node(self) -> bool: + """Stop background workers and destroy the ROS node.""" + self._image_saver.stop() + return super().destroy_node() + + def _detection_callback(self, msg: DetectionResult) -> None: + """Buffer detection results for later synchronization. + + Args: + msg: DetectionResult message with bboxes, labels, and masks. + """ + self._detection_buffer.append(msg) + + def _cloud_callback(self, msg: PointCloud2) -> None: + """Buffer incoming point clouds with timestamps. + + Args: + msg: PointCloud2 message in the sensor frame. + """ + stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 + points = point_cloud2.read_points_numpy(msg, field_names=("x", "y", "z")) + self._cloud_buffer.append((stamp, points)) + + def _odom_callback(self, msg: Odometry) -> None: + """Buffer odometry data for interpolation. + + Args: + msg: Odometry message containing pose and twist. + """ + stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 + odom = { + "position": np.array( + [msg.pose.pose.position.x, msg.pose.pose.position.y, msg.pose.pose.position.z] + ), + "orientation": np.array( + [ + msg.pose.pose.orientation.x, + msg.pose.pose.orientation.y, + msg.pose.pose.orientation.z, + msg.pose.pose.orientation.w, + ] + ), + "linear_velocity": np.array( + [msg.twist.twist.linear.x, msg.twist.twist.linear.y, msg.twist.twist.linear.z] + ), + "angular_velocity": np.array( + [msg.twist.twist.angular.x, msg.twist.twist.angular.y, msg.twist.twist.angular.z] + ), + } + self._odom_buffer.append((stamp, odom)) + + def _viewpoint_callback(self, msg: ViewpointRep) -> None: + """Queue viewpoint trigger requests. + + Args: + msg: ViewpointRep message with timestamp and viewpoint id. + """ + stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 + if stamp in self._processed_viewpoints: + return + self._viewpoint_queue.append((stamp, msg.viewpoint_id)) + + def _target_object_callback(self, msg: TargetObjectInstruction) -> None: + """Update target object label used for VLM query decisions. + + Args: + msg: TargetObjectInstruction message. + """ + self._target_object = msg.target_object + + def _object_type_answer_callback(self, msg: ObjectType) -> None: + """Apply VLM relabeling updates to tracked objects. + + Args: + msg: ObjectType message with final label for a track id. 
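+
+        Notes:
+            The final label receives a large vote and confidence boost so it
+            outranks detector votes, and the object is flagged as VLM-updated.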
+ """ + for obj in self._mapper.get_objects(include_pending=True): + if msg.object_id in obj.track_ids: + obj.class_votes[msg.final_label] = obj.class_votes.get(msg.final_label, 0) + 50 + obj.confidence_scores[msg.final_label] = max( + obj.confidence_scores.get(msg.final_label, 0.0), 1.0 + ) + obj.points_count[msg.final_label] = obj.points_count.get(msg.final_label, 0) + obj.updated_by_vlm = True + obj._invalidate_cache() + break + + def _mapping_callback(self) -> None: + """Main mapping loop: synchronize inputs, update map, and publish outputs.""" + if self._mapping_busy: + return + if len(self._detection_buffer) < 1 or len(self._cloud_buffer) < 1 or len(self._odom_buffer) < 2: + return + + self._mapping_busy = True + try: + detection_msg, viewpoint_stamp, viewpoint_id = self._select_detection() + if detection_msg is None: + return + + detection_stamp = detection_msg.header.stamp.sec + detection_msg.header.stamp.nanosec / 1e9 + image = self._bridge.compressed_imgmsg_to_cv2(detection_msg.image, desired_encoding="bgr8") + + odom = self._interpolate_odom(detection_stamp) + if odom is None: + return + + neighbor_cloud = self._collect_cloud_window(detection_stamp) + if neighbor_cloud is None: + return + + observations = self._build_observations( + detection_msg=detection_msg, + image=image, + odom=odom, + cloud=neighbor_cloud, + ) + if len(observations) == 0: + return + + self._mapper.update(observations, target_object=self._target_object) + self._update_best_images(observations, image) + self._publish_map(detection_stamp, viewpoint_id) + self._publish_object_type_queries() + + if viewpoint_stamp is not None: + self._processed_viewpoints.add(viewpoint_stamp) + finally: + self._mapping_busy = False + + def _select_detection(self) -> tuple[DetectionResult | None, float | None, int]: + """Pick the detection to process, optionally aligned to a viewpoint. + + Returns: + Tuple of (detection_msg, viewpoint_stamp, viewpoint_id). + """ + viewpoint_stamp = None + viewpoint_id = -1 + + if len(self._viewpoint_queue) > 0: + stamp, vid = self._viewpoint_queue[0] + detections = list(self._detection_buffer) + stamps = [msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 for msg in detections] + diffs = np.abs(np.array(stamps) - stamp) + idx = int(np.argmin(diffs)) + if diffs[idx] <= 1.0: + detection_msg = detections[idx] + viewpoint_stamp = stamp + viewpoint_id = vid + self._viewpoint_queue.popleft() + return detection_msg, viewpoint_stamp, viewpoint_id + self._viewpoint_queue.popleft() + + idx = -2 if len(self._detection_buffer) > 1 else -1 + return self._detection_buffer[idx], viewpoint_stamp, viewpoint_id + + def _interpolate_odom(self, detection_stamp: float) -> dict | None: + """Interpolate odometry at the detection timestamp. + + Args: + detection_stamp: Timestamp in seconds. + + Returns: + Interpolated odometry dict or None if interpolation is not possible. 
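+
+        Notes:
+            Position and velocities are linearly interpolated between the
+            bracketing samples; orientation uses slerp. Linear and angular
+            states honor separate time-bias parameters.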
+ """ + target_stamp = detection_stamp + self._detection_linear_bias + angular_stamp = detection_stamp + self._detection_angular_bias + stamps = [s for s, _ in self._odom_buffer] + if len(stamps) < 2: + return None + + if target_stamp < stamps[0] or target_stamp > stamps[-1]: + return None + if angular_stamp < stamps[0] or angular_stamp > stamps[-1]: + return None + + right_idx = next(i for i, s in enumerate(stamps) if s >= target_stamp) + left_idx = max(right_idx - 1, 0) + angular_right_idx = next(i for i, s in enumerate(stamps) if s >= angular_stamp) + angular_left_idx = max(angular_right_idx - 1, 0) + + left_stamp, left_odom = self._odom_buffer[left_idx] + right_stamp, right_odom = self._odom_buffer[right_idx] + angular_left_stamp, angular_left_odom = self._odom_buffer[angular_left_idx] + angular_right_stamp, angular_right_odom = self._odom_buffer[angular_right_idx] + denom = right_stamp - left_stamp + ratio = (target_stamp - left_stamp) / denom if denom > 0 else 0.5 + angular_denom = angular_right_stamp - angular_left_stamp + angular_ratio = (angular_stamp - angular_left_stamp) / angular_denom if angular_denom > 0 else 0.5 + + camera_odom = {} + camera_odom["position"] = right_odom["position"] * ratio + left_odom["position"] * (1 - ratio) + camera_odom["linear_velocity"] = ( + right_odom["linear_velocity"] * ratio + left_odom["linear_velocity"] * (1 - ratio) + ) + rotations = Rotation.from_quat( + [angular_left_odom["orientation"], angular_right_odom["orientation"]] + ) + slerp = Slerp([0, 1], rotations) + camera_odom["orientation"] = slerp(angular_ratio).as_quat() + camera_odom["angular_velocity"] = ( + angular_right_odom["angular_velocity"] * angular_ratio + + angular_left_odom["angular_velocity"] * (1 - angular_ratio) + ) + return camera_odom + + def _collect_cloud_window(self, detection_stamp: float) -> np.ndarray | None: + """Aggregate point clouds around a detection timestamp. + + Args: + detection_stamp: Timestamp in seconds. + + Returns: + Concatenated point cloud or None if no data in window. + """ + start = detection_stamp - self._cloud_window_before + end = detection_stamp + self._cloud_window_after + clouds = [pts for stamp, pts in self._cloud_buffer if start <= stamp <= end] + if len(clouds) == 0: + return None + return np.concatenate(clouds, axis=0) + + def _build_observations( + self, + detection_msg: DetectionResult, + image: np.ndarray, + odom: dict, + cloud: np.ndarray, + ) -> list[Observation]: + """Build Observation objects from detections and segmented point clouds. + + Args: + detection_msg: DetectionResult with bboxes, labels, and masks. + image: RGB image used for masks and cropping. + odom: Interpolated odometry dict (position + orientation). + cloud: Aggregated point cloud in sensor frame. + + Returns: + List of Observation instances for object updates. 
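+
+        Notes:
+            Bbox-cropped masks are pasted back into full-image masks, and
+            the world-frame cloud is transformed into the body frame before
+            projection and segmentation.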
+ """ + det_stamp = detection_msg.header.stamp.sec + detection_msg.header.stamp.nanosec / 1e9 + image_shape = image.shape[:2] + + masks = [] + labels = [] + confidences = [] + track_ids = [] + + for i in range(len(detection_msg.track_id)): + conf = float(detection_msg.confidence[i]) + if conf < self._confidence_threshold: + continue + x1 = int(detection_msg.x1[i]) + y1 = int(detection_msg.y1[i]) + x2 = int(detection_msg.x2[i]) + y2 = int(detection_msg.y2[i]) + x1 = max(0, min(x1, image_shape[1] - 1)) + x2 = max(0, min(x2, image_shape[1])) + y1 = max(0, min(y1, image_shape[0] - 1)) + y2 = max(0, min(y2, image_shape[0])) + if x2 <= x1 or y2 <= y1: + continue + + full_mask = np.zeros(image_shape, dtype=bool) + if i < len(detection_msg.masks): + mask_msg: CompressedImage = detection_msg.masks[i] + mask_crop = self._bridge.compressed_imgmsg_to_cv2(mask_msg, desired_encoding="mono8") + mask_bool = mask_crop.astype(bool) + crop_h = min(mask_bool.shape[0], y2 - y1) + crop_w = min(mask_bool.shape[1], x2 - x1) + full_mask[y1 : y1 + crop_h, x1 : x1 + crop_w] = mask_bool[:crop_h, :crop_w] + + masks.append(full_mask) + labels.append(detection_msg.label[i]) + confidences.append(conf) + track_ids.append(int(detection_msg.track_id[i])) + + if len(masks) == 0: + return [] + + R_b2w = Rotation.from_quat(odom["orientation"]).as_matrix() + t_b2w = np.array(odom["position"]) + R_w2b = R_b2w.T + t_w2b = -R_w2b @ t_b2w + cloud_body = cloud @ R_w2b.T + t_w2b + + _, pixel_idx = project_pointcloud_to_image( + points=cloud_body, + projection=self._projection, + image_width=self._image_width, + image_height=self._image_height, + axis_mode=self._axis_mode, + intrinsics=None, + depth_filter=self._depth_filter, + depth_filter_margin=self._depth_filter_margin, + transform=self._sensor_to_camera, + ) + seg_clouds, seg_depths = segment_pointclouds_from_projection( + points=cloud_body, + masks=masks, + pixel_idx=pixel_idx, + ) + world_clouds = transform_segmented_clouds_to_world( + clouds=seg_clouds, + depths=seg_depths, + R_b2w=R_b2w, + t_b2w=t_b2w, + depth_jump_threshold=self._depth_jump_threshold, + ) + + observations = [] + for idx, world_cloud in enumerate(world_clouds): + if world_cloud.shape[0] == 0: + continue + observations.append( + Observation( + track_id=track_ids[idx], + label=labels[idx], + confidence=confidences[idx], + points=world_cloud, + robot_R=R_b2w, + robot_t=t_b2w, + mask=masks[idx], + image=image, + timestamp=det_stamp, + ) + ) + + return observations + + def _update_best_images(self, observations: list[Observation], image: np.ndarray) -> None: + """Update best image crops for objects based on mask area. + + Args: + observations: Observations created for the current frame. + image: Full RGB image associated with the detections. 
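+
+        Notes:
+            A padded crop replaces the stored best image only when its mask
+            area beats the previous score; superseded files are deleted.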
+ """ + for obs in observations: + obj = None + for candidate in self._mapper.get_objects(include_pending=True): + if obs.track_id in candidate.track_ids: + obj = candidate + break + if obj is None or obs.mask is None: + continue + + coords = np.argwhere(obs.mask) + if coords.size == 0: + continue + mask_area = coords.shape[0] + if mask_area <= obj.best_image_score: + continue + + y0, x0 = coords.min(axis=0) + y1, x1 = coords.max(axis=0) + 1 + padding = max(int(y1 - y0), int(x1 - x0), 80) + y0 = max(0, y0 - padding) + x0 = max(0, x0 - padding) + y1 = min(obs.mask.shape[0], y1 + padding) + x1 = min(obs.mask.shape[1], x1 + padding) + + if obj.best_image_path: + image_path = Path(obj.best_image_path) + mask_path = image_path.with_name(image_path.stem + "_mask.npy") + if image_path.exists(): + image_path.unlink() + if mask_path.exists(): + mask_path.unlink() + if self._save_png: + png_path = image_path.with_suffix(".png") + mask_png_path = image_path.with_name(image_path.stem + "_mask.png") + if png_path.exists(): + png_path.unlink() + if mask_png_path.exists(): + mask_png_path.unlink() + + cropped_image = image[y0:y1, x0:x1] + cropped_mask = obs.mask[y0:y1, x0:x1] + + obj.best_image_path = self._image_saver.queue_save( + object_id=str(obs.track_id), + image=cropped_image, + mask=cropped_mask, + ) + obj.best_image_score = float(mask_area) + obj.is_asked_vlm = False + + def _publish_map(self, stamp: float, viewpoint_id: int) -> None: + """Publish object nodes, markers, and aggregated point clouds. + + Args: + stamp: Timestamp (seconds) for message headers. + viewpoint_id: Viewpoint id for ObjectNode messages. + """ + seconds = int(stamp) + nanoseconds = int((stamp - seconds) * 1e9) + header = Header() + header.stamp.sec = seconds + header.stamp.nanosec = nanoseconds + header.frame_id = self._map_frame + + objects = self._mapper.get_objects(include_pending=True) + for obj in objects: + if obj.detection_count == 1: + status = "new" + elif obj.inactive_frames == 0: + status = "updated" + else: + status = "unchanged" + msg = map_object_to_object_node( + obj, + header, + status=status, + viewpoint_id=viewpoint_id, + target_object=self._target_object, + ) + if msg is not None: + self._object_nodes_pub.publish(msg) + + bbox_markers = create_bbox_markers(objects, header) + label_markers = create_label_markers(objects, header) + self._obj_boxes_pub.publish(bbox_markers) + self._obj_labels_pub.publish(label_markers) + + obj_cloud = create_object_pointcloud(objects, header) + if obj_cloud is not None: + self._obj_points_pub.publish(obj_cloud) + + current_ids = {tid for obj in objects for tid in obj.track_ids} + deleted_ids = list(self._published_ids - current_ids) + if len(deleted_ids) > 0: + delete_markers = create_delete_markers(deleted_ids, header, namespace="bbox") + self._obj_boxes_pub.publish(delete_markers) + delete_labels = create_delete_markers(deleted_ids, header, namespace="labels") + self._obj_labels_pub.publish(delete_labels) + self._published_ids = current_ids + + def _publish_object_type_queries(self) -> None: + """Publish VLM queries for target objects with good image crops.""" + for obj in self._mapper.get_objects(include_pending=False): + if obj.label != self._target_object: + continue + if obj.is_asked_vlm: + continue + if obj.best_image_score <= 500: + continue + msg = ObjectType() + msg.object_id = int(obj.track_ids[0]) if obj.track_ids else -1 + msg.img_path = obj.best_image_path or "" + msg.labels = list(obj.class_votes.keys()) + self._object_type_query_pub.publish(msg) + 
obj.is_asked_vlm = True + + def _load_sensor_to_camera(self, path: str) -> np.ndarray | None: + """Load sensor->camera transform from YAML. + + Args: + path: Path to YAML config with translation and rpy. + + Returns: + 4x4 transform matrix or None if config not found. + """ + config_path = Path(path) + if not config_path.is_absolute(): + config_path = Path(__file__).resolve().parent.parent / config_path + if not config_path.exists(): + return None + + data = yaml.safe_load(config_path.read_text()) or {} + tf_data = data.get("sensor_to_camera", {}) + trans = tf_data.get("translation", {}) + rpy = tf_data.get("rotation_rpy", {}) + translation = np.array( + [ + float(trans.get("x", 0.0)), + float(trans.get("y", 0.0)), + float(trans.get("z", 0.0)), + ] + ) + rotation = Rotation.from_euler( + "xyz", + [ + float(rpy.get("roll", 0.0)), + float(rpy.get("pitch", 0.0)), + float(rpy.get("yaw", 0.0)), + ], + ).as_matrix() + transform = np.eye(4) + transform[:3, :3] = rotation + transform[:3, 3] = translation + return transform + + +def main(args: list[str] | None = None) -> None: + """Entry point for the semantic mapping node.""" + rclpy.init(args=args) + node = SemanticMappingNode() + try: + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/semantic_mapping/semantic_mapping/utils/__init__.py b/semantic_mapping/semantic_mapping/utils/__init__.py new file mode 100644 index 0000000..d949aeb --- /dev/null +++ b/semantic_mapping/semantic_mapping/utils/__init__.py @@ -0,0 +1 @@ +"""Utility modules for semantic mapping.""" diff --git a/semantic_mapping/semantic_mapping/map_object_utils.py b/semantic_mapping/semantic_mapping/utils/map_object_utils.py similarity index 95% rename from semantic_mapping/semantic_mapping/map_object_utils.py rename to semantic_mapping/semantic_mapping/utils/map_object_utils.py index 6bc3d8d..b098c8c 100644 --- a/semantic_mapping/semantic_mapping/map_object_utils.py +++ b/semantic_mapping/semantic_mapping/utils/map_object_utils.py @@ -14,7 +14,7 @@ from semantic_mapping.msg import ObjectNode from vector_perception_utils.pointcloud_utils import create_pointcloud2_msg -from .map_object import MapObject +from ..map_object import MapObject def map_object_to_object_node( @@ -22,6 +22,7 @@ def map_object_to_object_node( header: Header, status: str = "new", viewpoint_id: int = -1, + target_object: str | None = None, ) -> Optional[ObjectNode]: """Convert a MapObject into an ObjectNode message. @@ -30,6 +31,7 @@ def map_object_to_object_node( header: ROS header to attach to the message. status: Lifecycle status string ("new", "updated", "unchanged"). viewpoint_id: Viewpoint identifier for image saving. + target_object: Target object label for potential labeling. Returns: ObjectNode message or None if required geometry is missing. 
@@ -40,7 +42,10 @@ def map_object_to_object_node( msg = ObjectNode() msg.header = header msg.object_id = obj.track_ids - msg.label = obj.label + if target_object and obj.label == target_object and not obj.is_asked_vlm: + msg.label = "Potential Target" + else: + msg.label = obj.label msg.status = status msg.position.x = float(obj.centroid[0]) msg.position.y = float(obj.centroid[1]) diff --git a/semantic_mapping/semantic_mapping/matching.py b/semantic_mapping/semantic_mapping/utils/matching.py similarity index 98% rename from semantic_mapping/semantic_mapping/matching.py rename to semantic_mapping/semantic_mapping/utils/matching.py index 418521e..fcfb7cd 100644 --- a/semantic_mapping/semantic_mapping/matching.py +++ b/semantic_mapping/semantic_mapping/utils/matching.py @@ -8,7 +8,7 @@ from vector_perception_utils.pointcloud_utils import compute_centroid, get_oriented_bbox -from .map_object import MapObject, Observation +from ..map_object import MapObject, Observation def _convex_hull_2d(points: np.ndarray) -> np.ndarray: @@ -208,6 +208,8 @@ def find_best_match( return candidate cand_center = candidate.centroid + if cand_center is None: + continue dist = float(np.linalg.norm(obs_center - cand_center)) if dist > distance_threshold: diff --git a/semantic_mapping/semantic_mapping/storage.py b/semantic_mapping/semantic_mapping/utils/storage.py similarity index 100% rename from semantic_mapping/semantic_mapping/storage.py rename to semantic_mapping/semantic_mapping/utils/storage.py From 1cf8b9ec2f506401cd44ecbc4ff522185f83892f Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Sat, 24 Jan 2026 14:55:40 -0500 Subject: [PATCH 5/6] Working detections, tuning in progress --- detect_anything/detect_anything/yoloe.py | 3 + rviz/semantic_mapping.rviz | 267 ++++++++++++++++++ semantic_mapping/README.md | 3 +- semantic_mapping/config/params.yaml | 55 ++++ .../config/sensor_to_camera_tf.yaml | 4 +- .../docs/SEMANTIC_MAPPING_MIGRATION.md | 55 ++-- .../launch/semantic_mapping.launch.py | 137 +++++++++ .../scripts/semantic_mapping_node | 0 .../semantic_mapping/map_object.py | 180 ++++++++---- semantic_mapping/semantic_mapping/mapper.py | 64 ++++- .../semantic_mapping/mapping_node.py | 197 ++++++++++++- .../utils/map_object_utils.py | 54 +++- .../semantic_mapping/utils/matching.py | 126 +++++---- .../semantic_mapping/utils/storage.py | 9 +- .../pointcloud_utils.py | 141 ++++++++- 15 files changed, 1120 insertions(+), 175 deletions(-) create mode 100644 rviz/semantic_mapping.rviz create mode 100644 semantic_mapping/config/params.yaml create mode 100644 semantic_mapping/launch/semantic_mapping.launch.py mode change 100644 => 100755 semantic_mapping/scripts/semantic_mapping_node diff --git a/detect_anything/detect_anything/yoloe.py b/detect_anything/detect_anything/yoloe.py index c0695c3..303afb9 100644 --- a/detect_anything/detect_anything/yoloe.py +++ b/detect_anything/detect_anything/yoloe.py @@ -9,6 +9,7 @@ import numpy as np import torch from ultralytics import YOLOE # type: ignore[attr-defined, import-not-found] +from ultralytics.utils import SETTINGS from detect_anything.detection import ( DetectionResults, @@ -41,6 +42,8 @@ def __init__( model_name = "yoloe-11s-seg-pf.pt" if prompt_mode == YoloePromptMode.LRPC else "yoloe-11s-seg.pt" model_dir = Path(model_path) if model_path else Path("models/yoloe") + model_dir = model_dir.expanduser().resolve() + SETTINGS["weights_dir"] = str(model_dir) self.model = YOLOE(model_dir / model_name) self.prompt_mode = prompt_mode self._visual_prompts: dict[str, np.ndarray] | 
None = None diff --git a/rviz/semantic_mapping.rviz b/rviz/semantic_mapping.rviz new file mode 100644 index 0000000..d56223b --- /dev/null +++ b/rviz/semantic_mapping.rviz @@ -0,0 +1,267 @@ +Panels: + - Class: rviz_common/Displays + Help Height: 78 + Name: Displays + Property Tree Widget: + Expanded: + - /Global Options1 + - /Status1 + - /Odometry1 + Splitter Ratio: 0.5 + Tree Height: 413 + - Class: rviz_common/Selection + Name: Selection + - Class: rviz_common/Tool Properties + Expanded: + - /2D Goal Pose1 + - /Publish Point1 + Name: Tool Properties + Splitter Ratio: 0.5886790156364441 + - Class: rviz_common/Views + Expanded: + - /Current View1 + Name: Views + Splitter Ratio: 0.5 + - Class: rviz_common/Time + Experimental: false + Name: Time + SyncMode: 0 + SyncSource: PointCloud2 +Visualization Manager: + Class: "" + Displays: + - Alpha: 0.5 + Cell Size: 1 + Class: rviz_default_plugins/Grid + Color: 160; 160; 164 + Enabled: true + Line Style: + Line Width: 0.029999999329447746 + Value: Lines + Name: Grid + Normal Cell Count: 0 + Offset: + X: 0 + Y: 0 + Z: 0 + Plane: XY + Plane Cell Count: 10 + Reference Frame: + Value: true + - Class: rviz_default_plugins/Image + Enabled: true + Max Value: 1 + Median window: 5 + Min Value: 0 + Name: Image + Normalize Range: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /annotated_image_detection + Value: true + - Angle Tolerance: 0.10000000149011612 + Class: rviz_default_plugins/Odometry + Covariance: + Orientation: + Alpha: 0.5 + Color: 255; 255; 127 + Color Style: Unique + Frame: Local + Offset: 1 + Scale: 1 + Value: true + Position: + Alpha: 0.30000001192092896 + Color: 204; 51; 204 + Scale: 1 + Value: true + Value: false + Enabled: true + Keep: 5 + Name: Odometry + Position Tolerance: 0.10000000149011612 + Shape: + Alpha: 1 + Axes Length: 1 + Axes Radius: 0.10000000149011612 + Color: 255; 25; 0 + Head Length: 0.30000001192092896 + Head Radius: 0.10000000149011612 + Shaft Length: 1 + Shaft Radius: 0.05000000074505806 + Value: Arrow + Topic: + Depth: 5 + Durability Policy: Volatile + Filter size: 10 + History Policy: Keep Last + Reliability Policy: Reliable + Value: /state_estimation + Value: true + - Alpha: 0.10000000149011612 + Autocompute Intensity Bounds: true + Autocompute Value Bounds: + Max Value: 10 + Min Value: -10 + Value: true + Axis: Z + Channel Name: intensity + Class: rviz_default_plugins/PointCloud2 + Color: 255; 255; 255 + Color Transformer: FlatColor + Decay Time: 3 + Enabled: true + Invert Rainbow: false + Max Color: 255; 255; 255 + Max Intensity: 165 + Min Color: 0; 0; 0 + Min Intensity: 0 + Name: PointCloud2 + Position Transformer: XYZ + Selectable: true + Size (Pixels): 3 + Size (m): 0.004999999888241291 + Style: Flat Squares + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /registered_scan + Use Fixed Frame: true + Use rainbow: true + Value: true + - Class: rviz_default_plugins/MarkerArray + Enabled: true + Name: MarkerArray + Namespaces: + bbox: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /obj_boxes + Value: true + - Alpha: 1 + Autocompute Intensity Bounds: true + Autocompute Value Bounds: + Max Value: 10 + Min Value: -10 + Value: true + Axis: Z + Channel Name: intensity + Class: rviz_default_plugins/PointCloud2 + Color: 255; 255; 255 + Color Transformer: RGB8 + Decay Time: 0 + Enabled: true + Invert Rainbow: 
false + Max Color: 255; 255; 255 + Max Intensity: 4096 + Min Color: 0; 0; 0 + Min Intensity: 0 + Name: PointCloud2 + Position Transformer: XYZ + Selectable: true + Size (Pixels): 3 + Size (m): 0.009999999776482582 + Style: Flat Squares + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /obj_points + Use Fixed Frame: true + Use rainbow: true + Value: true + Enabled: true + Global Options: + Background Color: 48; 48; 48 + Fixed Frame: map + Frame Rate: 30 + Name: root + Tools: + - Class: rviz_default_plugins/Interact + Hide Inactive Objects: true + - Class: rviz_default_plugins/MoveCamera + - Class: rviz_default_plugins/Select + - Class: rviz_default_plugins/FocusCamera + - Class: rviz_default_plugins/Measure + Line color: 128; 128; 0 + - Class: rviz_default_plugins/SetInitialPose + Covariance x: 0.25 + Covariance y: 0.25 + Covariance yaw: 0.06853891909122467 + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /initialpose + - Class: rviz_default_plugins/SetGoal + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /goal_pose + - Class: rviz_default_plugins/PublishPoint + Single click: true + Topic: + Depth: 5 + Durability Policy: Volatile + History Policy: Keep Last + Reliability Policy: Reliable + Value: /clicked_point + Transformation: + Current: + Class: rviz_default_plugins/TF + Value: true + Views: + Current: + Class: rviz_default_plugins/Orbit + Distance: 21.720943450927734 + Enable Stereo Rendering: + Stereo Eye Separation: 0.05999999865889549 + Stereo Focal Distance: 1 + Swap Stereo Eyes: false + Value: false + Focal Point: + X: 3.1760172843933105 + Y: 11.240063667297363 + Z: 2.561012029647827 + Focal Shape Fixed Size: false + Focal Shape Size: 0.05000000074505806 + Invert Z Axis: false + Name: Current View + Near Clip Distance: 0.009999999776482582 + Pitch: 1.1497966051101685 + Target Frame: + Value: Orbit (rviz_default_plugins) + Yaw: 3.4885518550872803 + Saved: ~ +Window Geometry: + Displays: + collapsed: false + Height: 996 + Hide Left Dock: false + Hide Right Dock: false + Image: + collapsed: false + QMainWindow State: 
000000ff00000000fd00000004000000000000027b00000342fc0200000009fb0000001200530065006c0065006300740069006f006e00000001e10000009b0000005d00fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000a3fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c006100790073010000003f0000022b000000cc00fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261fb0000000a0049006d0061006700650100000270000001110000001700ffffff000000010000010f00000342fc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073010000003f00000342000000a900fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e10000019700000003000006b90000003efc0100000002fb0000000800540069006d00650100000000000006b90000026f00fffffffb0000000800540069006d00650100000000000004500000000000000000000003230000034200000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000 + Selection: + collapsed: false + Time: + collapsed: false + Tool Properties: + collapsed: false + Views: + collapsed: false + Width: 1721 + X: 3248 + Y: 331 diff --git a/semantic_mapping/README.md b/semantic_mapping/README.md index 3428c0b..d7f6c88 100644 --- a/semantic_mapping/README.md +++ b/semantic_mapping/README.md @@ -19,7 +19,7 @@ The pipeline stores voxelized geometry, merges observations, and exports planner | Publishers | `utils/map_object_utils.py` | Complete | | Image Storage | `utils/storage.py` | Complete | | Mapping Node | `mapping_node.py` | Complete | -| Launch Files | `launch/` | Pending | +| Launch Files | `launch/` | Complete | ## Messages @@ -104,6 +104,7 @@ ros2 launch semantic_mapping semantic_mapping.launch.py ## Configuration +- `config/params.yaml`: Default node parameters for `semantic_mapping_node`. - `config/sensor_to_camera_tf.yaml`: Static sensor→camera transform for projection. - `config/objects.yaml`: Object taxonomy for detection labels. 
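+
+Example override at launch (arguments are declared in
+`launch/semantic_mapping.launch.py`):
+
+```bash
+ros2 launch semantic_mapping semantic_mapping.launch.py \
+    mode:=bagfile \
+    target_object:=refrigerator \
+    params_file:=/path/to/params.yaml
+```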
diff --git a/semantic_mapping/config/params.yaml b/semantic_mapping/config/params.yaml new file mode 100644 index 0000000..370d9c9 --- /dev/null +++ b/semantic_mapping/config/params.yaml @@ -0,0 +1,55 @@ +semantic_mapping_node: + ros__parameters: + detection_topic: /detection_result + cloud_topic: /registered_scan + odom_topic: /state_estimation + viewpoint_topic: /viewpoint_rep_header + object_type_answer_topic: /object_type_answer + object_type_query_topic: /object_type_query + target_object_instruction_topic: /target_object_instruction + object_nodes_topic: /object_nodes + obj_points_topic: /obj_points + obj_boxes_topic: /obj_boxes + obj_labels_topic: /obj_labels + map_frame: map + + confidence_threshold: 0.35 + voxel_size: 0.03 + distance_threshold: 0.3 + iou_threshold: 0.5 + min_detections_for_permanent: 3 + pending_ttl_s: 10.0 + max_inactive_frames: 20 + num_angle_bins: 15 + dbscan_eps_scale: 2.0 + dbscan_min_points_initial: 6 + dbscan_min_points_mature: 12 + + detection_linear_state_time_bias: 0.0 + detection_angular_state_time_bias: 0.0 + cloud_window_before: 0.5 + cloud_window_after: 0.1 + + projection: equirect + axis_mode: xz + image_width: 1920 + image_height: 640 + depth_filter: true + depth_filter_margin: 0.15 + depth_jump_threshold: 0.3 + sensor_to_camera_tf: config/sensor_to_camera_tf.yaml + mask_erode_kernel: 4 + mask_erode_iterations: 3 + max_cloud_distance: 8.0 + outlier_filter: true + outlier_nb_neighbors: 12 + outlier_std_ratio: 1.6 + outlier_min_points: 20 + visualization_max_objects: 50 + ground_filter: true + ground_radius: 2.0 + ground_z_clearance: 0.05 + + target_object: refrigerator + save_png: True + output_dir: output/object_images diff --git a/semantic_mapping/config/sensor_to_camera_tf.yaml b/semantic_mapping/config/sensor_to_camera_tf.yaml index baa6724..e48c06f 100644 --- a/semantic_mapping/config/sensor_to_camera_tf.yaml +++ b/semantic_mapping/config/sensor_to_camera_tf.yaml @@ -2,8 +2,8 @@ sensor_to_camera: parent_frame: lidar child_frame: camera translation: - x: 0.0 - y: 0.0 + x: -0.12 + y: -0.075 z: 0.265 rotation_rpy: roll: -1.5707963 diff --git a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md index e02e8d0..07ef2c5 100644 --- a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md +++ b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md @@ -28,8 +28,8 @@ This document outlines the comprehensive plan for porting the `semantic_mapping` | 6 | Message Publishers (`utils/map_object_utils.py`) | **COMPLETED** | | 7 | Image Storage (`utils/storage.py`) | **COMPLETED** | | 8 | Mapping Node (`mapping_node.py`) | **COMPLETED** | -| 9 | Launch Files & Config | Pending | -| 10 | Integration Testing | Pending | +| 9 | Launch Files & Config | **COMPLETED** | +| 10 | Integration Testing | **COMPLETED** | ### Step 2.1: Refactored map_object.py to use shared utilities @@ -93,7 +93,7 @@ semantic_mapping/ │ └── ViewpointRep.msg ├── config/ │ ├── objects.yaml # Object taxonomy (to be copied from VLM_ROS) -│ └── params.yaml # Default node parameters (to be created) +│ └── params.yaml # Default node parameters ├── launch/ │ └── semantic_mapping.launch.py ├── semantic_mapping/ @@ -416,8 +416,8 @@ DeclareLaunchArgument('target_object', default_value='') - [x] Implement `mapping_node.py` following detect_anything patterns ### Remaining -- [ ] Create launch files and params.yaml -- [ ] Integration testing with bag files +- [x] Create launch files and params.yaml +- [x] Integration testing with bag files --- @@ -437,19 
+437,38 @@ Items to remove: ## Compatibility Checklist -- [ ] `/object_nodes` publishes ObjectNode with exact schema -- [ ] `/obj_points` publishes colored PointCloud2 -- [ ] `/obj_boxes` publishes MarkerArray wireframes -- [ ] `/obj_labels` publishes MarkerArray text -- [ ] `/object_type_query` publishes ObjectType for VLM -- [ ] Subscribes to `/detection_result` from detect_anything -- [ ] Subscribes to `/registered_scan` PointCloud2 -- [ ] Subscribes to `/state_estimation` Odometry -- [ ] Subscribes to `/target_object_instruction` -- [ ] Subscribes to `/viewpoint_rep_header` -- [ ] Subscribes to `/object_type_answer` -- [ ] Object positions, bbox corners, status flags match expected semantics -- [ ] Delete markers properly remove objects from RViz +- [x] `/object_nodes` publishes ObjectNode with exact schema +- [x] `/obj_points` publishes colored PointCloud2 +- [x] `/obj_boxes` publishes MarkerArray wireframes +- [x] `/obj_labels` publishes MarkerArray text +- [x] `/object_type_query` publishes ObjectType for VLM +- [x] Subscribes to `/detection_result` from detect_anything +- [x] Subscribes to `/registered_scan` PointCloud2 +- [x] Subscribes to `/state_estimation` Odometry +- [x] Subscribes to `/target_object_instruction` +- [x] Subscribes to `/viewpoint_rep_header` +- [x] Subscribes to `/object_type_answer` +- [x] Object positions, bbox corners, status flags match expected semantics +- [x] Delete markers properly remove objects from RViz + +--- + +## Recent Updates (Jan 2026) + +- Unified launch to `semantic_mapping.launch.py` with mode handling, detect_anything integration, and bagfile defaults. +- Fixed cloud/image projection transform to match legacy `CloudImageFusion` math. +- Added mask erosion, depth-jump pruning, distance gating, and light outlier filtering for cleaner object clouds. +- Added reprojected observation-angle voting for better multi-view aggregation. +- Fixed bbox ordering and switched `/obj_boxes` to axis-aligned markers (via `get_oriented_bbox(axis_aligned=True)`). +- Updated DBSCAN regularization to v2 logic and exposed DBSCAN tuning params. +- Fixed image saving color order and ensured output dir is cleared on start. +- Set MobileCLIP weights cache to `detect_anything/models` and removed corrupted caches. + +## Temporary Handoff Notes + +- Add ground-height clamp to further suppress ground/background leakage. +- If bagfile sync issues persist, consider switching mapping inputs to `/sensor_scan` + `/state_estimation_at_scan`. +- Verify VLM objects.yaml path in VLM_ROS (`vlm_reasoning_node` still expects old path). 
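+
+A minimal sketch of the reprojected observation-angle voting noted above,
+using `discretize_angles` from `vector_perception_utils.transform_utils`
+(`voxels` and `robot_t` are illustrative, not the mapper's exact code):
+
+```python
+import numpy as np
+from vector_perception_utils.transform_utils import discretize_angles
+
+voxels = np.random.rand(100, 3) * 4.0       # voxel centers (world frame)
+robot_t = np.array([1.0, 2.0, 0.5])         # robot position at observation time
+
+rel = robot_t[:2] - voxels[:, :2]           # voxel -> robot offsets in XY
+yaw = np.arctan2(rel[:, 1], rel[:, 0])      # viewing angle per voxel
+bins = discretize_angles(yaw, num_bins=15)  # matches num_angle_bins
+
+# The bins feed the per-voxel histogram kept by VoxelFeatureManager:
+# voxel_manager.update_observation_angles(np.arange(len(voxels)), bins)
+```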
--- diff --git a/semantic_mapping/launch/semantic_mapping.launch.py b/semantic_mapping/launch/semantic_mapping.launch.py new file mode 100644 index 0000000..0b7d8ab --- /dev/null +++ b/semantic_mapping/launch/semantic_mapping.launch.py @@ -0,0 +1,137 @@ +"""Launch semantic mapping and detect_anything with a mode selector.""" + +import os + +from ament_index_python.packages import get_package_share_directory +from launch import LaunchDescription +from launch.actions import DeclareLaunchArgument, OpaqueFunction +from launch.substitutions import LaunchConfiguration +from launch_ros.actions import Node + + +def _parse_bool(value: str, default: bool) -> bool: + if value == "": + return default + return value.lower() in ("true", "1", "yes") + + +def _launch_nodes(context): + mode = LaunchConfiguration("mode").perform(context).lower() + use_sim_time_arg = LaunchConfiguration("use_sim_time").perform(context) + use_compressed_arg = LaunchConfiguration("use_compressed").perform(context) + odom_topic_arg = LaunchConfiguration("odom_topic").perform(context) + + use_sim_time = _parse_bool(use_sim_time_arg, default=mode == "bagfile") + use_compressed = _parse_bool(use_compressed_arg, default=mode in ("bagfile", "sim")) + + if odom_topic_arg: + odom_topic = odom_topic_arg + else: + odom_topic = "/aft_mapped_to_init_incremental" if mode == "bagfile" else "/state_estimation" + + params_file = LaunchConfiguration("params_file").perform(context) + sensor_to_camera_tf = LaunchConfiguration("sensor_to_camera_tf").perform(context) + detection_topic = LaunchConfiguration("detection_topic").perform(context) + cloud_topic = LaunchConfiguration("cloud_topic").perform(context) + viewpoint_topic = LaunchConfiguration("viewpoint_topic").perform(context) + object_type_answer_topic = LaunchConfiguration("object_type_answer_topic").perform(context) + object_type_query_topic = LaunchConfiguration("object_type_query_topic").perform(context) + target_object_instruction_topic = LaunchConfiguration("target_object_instruction_topic").perform( + context + ) + object_nodes_topic = LaunchConfiguration("object_nodes_topic").perform(context) + obj_points_topic = LaunchConfiguration("obj_points_topic").perform(context) + obj_boxes_topic = LaunchConfiguration("obj_boxes_topic").perform(context) + obj_labels_topic = LaunchConfiguration("obj_labels_topic").perform(context) + map_frame = LaunchConfiguration("map_frame").perform(context) + target_object = LaunchConfiguration("target_object").perform(context) + save_png = _parse_bool(LaunchConfiguration("save_png").perform(context), default=False) + output_dir = LaunchConfiguration("output_dir").perform(context) + image_topic = LaunchConfiguration("image_topic").perform(context) + compressed_image_topic = LaunchConfiguration("compressed_image_topic").perform(context) + conf = float(LaunchConfiguration("conf").perform(context)) + max_area_ratio = float(LaunchConfiguration("max_area_ratio").perform(context)) + + detect_anything_node = Node( + package="detect_anything", + executable="detection_node", + name="detect_anything_detection_node", + output="screen", + parameters=[ + { + "use_sim_time": use_sim_time, + "use_compressed": use_compressed, + "image_topic": image_topic, + "compressed_image_topic": compressed_image_topic, + "conf": conf, + "max_area_ratio": max_area_ratio, + } + ], + ) + + semantic_mapping_node = Node( + package="semantic_mapping", + executable="semantic_mapping_node", + name="semantic_mapping_node", + output="screen", + parameters=[ + params_file, + { + "use_sim_time": 
use_sim_time, + "sensor_to_camera_tf": sensor_to_camera_tf, + "detection_topic": detection_topic, + "cloud_topic": cloud_topic, + "odom_topic": odom_topic, + "viewpoint_topic": viewpoint_topic, + "object_type_answer_topic": object_type_answer_topic, + "object_type_query_topic": object_type_query_topic, + "target_object_instruction_topic": target_object_instruction_topic, + "object_nodes_topic": object_nodes_topic, + "obj_points_topic": obj_points_topic, + "obj_boxes_topic": obj_boxes_topic, + "obj_labels_topic": obj_labels_topic, + "map_frame": map_frame, + "target_object": target_object, + "save_png": save_png, + "output_dir": output_dir, + }, + ], + ) + + return [detect_anything_node, semantic_mapping_node] + + +def generate_launch_description() -> LaunchDescription: + pkg_share = get_package_share_directory("semantic_mapping") + default_params = os.path.join(pkg_share, "config", "params.yaml") + default_sensor_tf = os.path.join(pkg_share, "config", "sensor_to_camera_tf.yaml") + + return LaunchDescription( + [ + DeclareLaunchArgument("mode", default_value="sim"), + DeclareLaunchArgument("use_sim_time", default_value=""), + DeclareLaunchArgument("use_compressed", default_value=""), + DeclareLaunchArgument("params_file", default_value=default_params), + DeclareLaunchArgument("sensor_to_camera_tf", default_value=default_sensor_tf), + DeclareLaunchArgument("detection_topic", default_value="/detection_result"), + DeclareLaunchArgument("cloud_topic", default_value="/registered_scan"), + DeclareLaunchArgument("odom_topic", default_value=""), + DeclareLaunchArgument("viewpoint_topic", default_value="/viewpoint_rep_header"), + DeclareLaunchArgument("object_type_answer_topic", default_value="/object_type_answer"), + DeclareLaunchArgument("object_type_query_topic", default_value="/object_type_query"), + DeclareLaunchArgument("target_object_instruction_topic", default_value="/target_object_instruction"), + DeclareLaunchArgument("object_nodes_topic", default_value="/object_nodes"), + DeclareLaunchArgument("obj_points_topic", default_value="/obj_points"), + DeclareLaunchArgument("obj_boxes_topic", default_value="/obj_boxes"), + DeclareLaunchArgument("obj_labels_topic", default_value="/obj_labels"), + DeclareLaunchArgument("map_frame", default_value="map"), + DeclareLaunchArgument("target_object", default_value="refrigerator"), + DeclareLaunchArgument("save_png", default_value="false"), + DeclareLaunchArgument("output_dir", default_value="output/object_images"), + DeclareLaunchArgument("image_topic", default_value="/camera/image"), + DeclareLaunchArgument("compressed_image_topic", default_value="/camera/image/compressed"), + DeclareLaunchArgument("conf", default_value="0.35"), + DeclareLaunchArgument("max_area_ratio", default_value="0.3"), + OpaqueFunction(function=_launch_nodes), + ] + ) diff --git a/semantic_mapping/scripts/semantic_mapping_node b/semantic_mapping/scripts/semantic_mapping_node old mode 100644 new mode 100755 diff --git a/semantic_mapping/semantic_mapping/map_object.py b/semantic_mapping/semantic_mapping/map_object.py index 8140835..5875d9c 100644 --- a/semantic_mapping/semantic_mapping/map_object.py +++ b/semantic_mapping/semantic_mapping/map_object.py @@ -20,6 +20,7 @@ from vector_perception_utils.pointcloud_utils import ( compute_centroid, create_pointcloud, + remove_statistical_outliers, ) @@ -180,6 +181,12 @@ def update( # Rebuild KDTree self._tree = cKDTree(self.voxels) + def update_observation_angles(self, voxel_indices: np.ndarray, obs_bins: np.ndarray) -> None: + """Update 
observation angle histogram for selected voxels.""" + if voxel_indices.size == 0: + return + self.observation_angles[voxel_indices, obs_bins] = 1 + def merge_from(self, other: VoxelFeatureManager) -> None: """Merge voxels from another VoxelFeatureManager. @@ -368,6 +375,13 @@ class MapObject: _diversity_percentile: float = field(default=0.85, repr=False) _angle_threshold: float = field(default=np.deg2rad(5), repr=False) _distance_threshold: float = field(default=0.3, repr=False) + _dbscan_eps_scale: float = field(default=2.0, repr=False) + _dbscan_min_points_initial: int = field(default=6, repr=False) + _dbscan_min_points_mature: int = field(default=12, repr=False) + _outlier_filter: bool = field(default=True, repr=False) + _outlier_nb_neighbors: int = field(default=10, repr=False) + _outlier_std_ratio: float = field(default=2.0, repr=False) + _outlier_min_points: int = field(default=30, repr=False) @classmethod def from_observation( @@ -376,6 +390,13 @@ def from_observation( obs: Observation, voxel_size: float = 0.03, num_angle_bins: int = 15, + dbscan_eps_scale: float = 2.0, + dbscan_min_points_initial: int = 6, + dbscan_min_points_mature: int = 12, + outlier_filter: bool = True, + outlier_nb_neighbors: int = 10, + outlier_std_ratio: float = 2.0, + outlier_min_points: int = 30, ) -> MapObject: """Create a new MapObject from an initial observation. @@ -384,6 +405,13 @@ def from_observation( obs: Initial observation. voxel_size: Voxel grid cell size in meters. num_angle_bins: Number of angle bins for observation tracking. + dbscan_eps_scale: DBSCAN eps scale (multiplier on voxel size). + dbscan_min_points_initial: Min points for new objects. + dbscan_min_points_mature: Min points for mature objects. + outlier_filter: Whether to run statistical outlier filtering. + outlier_nb_neighbors: Neighbor count for outlier filtering. + outlier_std_ratio: Standard deviation ratio for outlier filtering. + outlier_min_points: Minimum points to apply outlier filtering. Returns: New MapObject instance. @@ -405,6 +433,13 @@ def from_observation( points_count={obs.label: len(obs.points)}, robot_poses=[{"R": obs.robot_R.copy(), "t": obs.robot_t.copy()}], last_update_time=obs.timestamp, + _dbscan_eps_scale=dbscan_eps_scale, + _dbscan_min_points_initial=dbscan_min_points_initial, + _dbscan_min_points_mature=dbscan_min_points_mature, + _outlier_filter=outlier_filter, + _outlier_nb_neighbors=outlier_nb_neighbors, + _outlier_std_ratio=outlier_std_ratio, + _outlier_min_points=outlier_min_points, ) return obj @@ -592,92 +627,131 @@ def _compute_geometry(self) -> None: pcd = create_pointcloud(valid_voxels) obb = pcd.get_minimal_oriented_bounding_box() self._centroid = np.asarray(obb.center) - self._bbox3d_corners = np.asarray(obb.get_box_points()) + half_extent = 0.5 * np.asarray(obb.extent) + local_corners = np.array( + [ + [-half_extent[0], -half_extent[1], -half_extent[2]], + [half_extent[0], -half_extent[1], -half_extent[2]], + [half_extent[0], half_extent[1], -half_extent[2]], + [-half_extent[0], half_extent[1], -half_extent[2]], + [-half_extent[0], -half_extent[1], half_extent[2]], + [half_extent[0], -half_extent[1], half_extent[2]], + [half_extent[0], half_extent[1], half_extent[2]], + [-half_extent[0], half_extent[1], half_extent[2]], + ], + dtype=float, + ) + self._bbox3d_corners = local_corners @ np.asarray(obb.R).T + np.asarray(obb.center) self._needs_recompute = False def regularize_shape(self, percentile: float = 0.85) -> None: - """Apply DBSCAN clustering to filter outlier voxels. 
- - This method clusters the voxels using DBSCAN, then selects clusters - by descending weight (observation angle diversity) until the target - percentile of total weight is reached. Clusters far from the main - cluster are down-weighted. - - Args: - percentile: Target fraction of total weight to include (0-1). - """ + """Apply v2 shape regularization (ported from legacy pipeline).""" voxels = self.voxel_manager.voxels if len(voxels) < 3: self.voxel_manager.regularized_mask = np.ones(len(voxels), dtype=bool) self._invalidate_cache() return - # DBSCAN clustering pcd = create_pointcloud(voxels) - - eps = self.voxel_manager.voxel_size * 2.5 - min_points = 10 if self.detection_count >= 3 else 5 - + eps = self.voxel_manager.voxel_size * self._dbscan_eps_scale + min_points = ( + self._dbscan_min_points_mature + if self.detection_count >= 3 + else self._dbscan_min_points_initial + ) labels = np.array(pcd.cluster_dbscan(eps=eps, min_points=min_points)) unique_labels = np.unique(labels) - # Collect cluster info cluster_masks = [] cluster_weights = [] cluster_centroids = [] - for label in unique_labels: - if label == -1: # Noise + if label == -1: continue mask = labels == label + if not np.any(mask): + continue weight = float(np.sum(self.voxel_manager.observation_angles[mask])) - if weight < 5: # Skip tiny clusters + if weight < 5: continue cluster_masks.append(mask) cluster_weights.append(weight) cluster_centroids.append(compute_centroid(voxels[mask])) if len(cluster_masks) == 0: - self.voxel_manager.regularized_mask = np.zeros(len(voxels), dtype=bool) + self.voxel_manager.regularized_mask = np.zeros( + self.voxel_manager.voxels.shape[0], dtype=bool + ) elif len(cluster_masks) == 1: self.voxel_manager.regularized_mask = cluster_masks[0] else: - weights = np.array(cluster_weights) - centroids = np.array(cluster_centroids) - - # Find main cluster - main_idx = int(np.argmax(weights)) - main_centroid = centroids[main_idx] - - # Distance-based weight adjustment - dists = np.linalg.norm(centroids - main_centroid, axis=1) - dist_base = np.max(dists) if np.max(dists) > 0 else 1.0 - - alpha = 3.0 # Decay rate - scale = np.exp(-alpha * dists / dist_base) - adjusted_weights = weights * scale - - # Select clusters up to percentile - target = percentile * np.sum(adjusted_weights) - order = np.argsort(adjusted_weights)[::-1] - + clustering_weight = np.array(cluster_weights, dtype=float, copy=False) + cluster_centroids = np.array(cluster_centroids, dtype=float, copy=False) + + main_idx = int(np.argmax(clustering_weight)) + main_centroid = cluster_centroids[main_idx] + + dists = np.linalg.norm(cluster_centroids - main_centroid, axis=1) + if len(voxels) >= 4: + try: + obb = pcd.get_minimal_oriented_bounding_box() + dist_base = float(np.linalg.norm(np.asarray(obb.extent))) + except Exception: + dist_base = float(np.max(dists)) + else: + dist_base = float(np.max(dists)) + if dist_base == 0: + adjusted_weight = clustering_weight.copy() + else: + alpha = 3.0 + scale = np.exp(-alpha * dists / dist_base) + adjusted_weight = clustering_weight * scale + + main_adjusted_idx = int(np.argmax(adjusted_weight)) chosen = np.zeros(len(cluster_masks), dtype=bool) - acc = 0.0 - - for i in order: - if acc + adjusted_weights[i] <= target + 1e-8: - chosen[i] = True - acc += adjusted_weights[i] - if acc >= target: - break - # Merge chosen clusters - valid_mask = np.zeros(len(voxels), dtype=bool) + if percentile is not None: + target = percentile * np.sum(adjusted_weight) + acc = adjusted_weight[main_adjusted_idx] + 
chosen[main_adjusted_idx] = True + order = np.argsort(adjusted_weight)[::-1] + for i in order: + if chosen[i]: + continue + if acc + adjusted_weight[i] <= target + 1e-8: + chosen[i] = True + acc += adjusted_weight[i] + if acc >= target: + break + else: + chosen[:] = True + + valid_cluster_mask = np.zeros(self.voxel_manager.voxels.shape[0], dtype=bool) for i, cmask in enumerate(cluster_masks): if chosen[i]: - valid_mask |= cmask - - self.voxel_manager.regularized_mask = valid_mask + valid_cluster_mask |= cmask + self.voxel_manager.regularized_mask = valid_cluster_mask + + if ( + self._outlier_filter + and np.any(self.voxel_manager.regularized_mask) + and np.sum(self.voxel_manager.regularized_mask) >= self._outlier_min_points + ): + keep_indices = np.nonzero(self.voxel_manager.regularized_mask)[0] + keep_points = voxels[keep_indices] + _, _, inlier_indices = remove_statistical_outliers( + keep_points, + nb_neighbors=self._outlier_nb_neighbors, + std_ratio=self._outlier_std_ratio, + ) + if inlier_indices.size > 0: + refined_mask = np.zeros_like(self.voxel_manager.regularized_mask) + refined_mask[keep_indices[inlier_indices]] = True + self.voxel_manager.regularized_mask = refined_mask + else: + self.voxel_manager.regularized_mask = np.zeros_like( + self.voxel_manager.regularized_mask + ) self._invalidate_cache() diff --git a/semantic_mapping/semantic_mapping/mapper.py b/semantic_mapping/semantic_mapping/mapper.py index bcdfa90..8b1b259 100644 --- a/semantic_mapping/semantic_mapping/mapper.py +++ b/semantic_mapping/semantic_mapping/mapper.py @@ -7,7 +7,7 @@ import time import uuid -from .utils.matching import compute_iou_3d_with_ratios, find_best_match +from .utils.matching import compute_bbox, compute_iou_3d_with_ratios, find_best_match from .map_object import MapObject, ObjectStatus, Observation @@ -36,6 +36,13 @@ def __init__( pending_ttl_s: float = 10.0, max_inactive_frames: int = 20, num_angle_bins: int = 15, + dbscan_eps_scale: float = 2.0, + dbscan_min_points_initial: int = 6, + dbscan_min_points_mature: int = 12, + outlier_filter: bool = True, + outlier_nb_neighbors: int = 10, + outlier_std_ratio: float = 2.0, + outlier_min_points: int = 30, ) -> None: """Initialize the object mapper and its thresholds. @@ -48,6 +55,13 @@ def __init__( pending_ttl_s: Time-to-live for pending objects. max_inactive_frames: Frames before pruning inactive objects. num_angle_bins: Observation angle bins for voxel manager. + dbscan_eps_scale: DBSCAN eps scale (multiplier on voxel size). + dbscan_min_points_initial: DBSCAN min points for new objects. + dbscan_min_points_mature: DBSCAN min points for mature objects. + outlier_filter: Whether to run statistical outlier filtering. + outlier_nb_neighbors: Neighbor count for outlier filtering. + outlier_std_ratio: Standard deviation ratio for outlier filtering. + outlier_min_points: Minimum points to apply outlier filtering. 
""" self._pending_objects: dict[str, MapObject] = {} self._permanent_objects: dict[str, MapObject] = {} @@ -62,6 +76,13 @@ def __init__( self._pending_ttl_s = pending_ttl_s self._max_inactive_frames = max_inactive_frames self._num_angle_bins = num_angle_bins + self._dbscan_eps_scale = dbscan_eps_scale + self._dbscan_min_points_initial = dbscan_min_points_initial + self._dbscan_min_points_mature = dbscan_min_points_mature + self._outlier_filter = outlier_filter + self._outlier_nb_neighbors = outlier_nb_neighbors + self._outlier_std_ratio = outlier_std_ratio + self._outlier_min_points = outlier_min_points def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats: """Update the map with new observations. @@ -103,6 +124,7 @@ def update(self, observations: list[Observation], target_object: str | None = No for obj in list(self._pending_objects.values()) + list(self._permanent_objects.values()): if obj.object_id not in updated_ids: obj.inactive_frames += 1 + obj.regularize_shape(percentile=obj._diversity_percentile) stats.deleted += len(self._prune_inactive(target_object, now)) stats.merged += self.merge_objects() @@ -134,6 +156,13 @@ def _create_object(self, obs: Observation) -> MapObject: obs=obs, voxel_size=self._voxel_size, num_angle_bins=self._num_angle_bins, + dbscan_eps_scale=self._dbscan_eps_scale, + dbscan_min_points_initial=self._dbscan_min_points_initial, + dbscan_min_points_mature=self._dbscan_min_points_mature, + outlier_filter=self._outlier_filter, + outlier_nb_neighbors=self._outlier_nb_neighbors, + outlier_std_ratio=self._outlier_std_ratio, + outlier_min_points=self._outlier_min_points, ) self._track_id_map[obs.track_id] = obj.object_id return obj @@ -176,18 +205,31 @@ def merge_objects(self) -> int: j += 1 continue - obj_corners = obj.bbox3d_corners - other_corners = other.bbox3d_corners - obj_extent = ( - obj_corners.max(axis=0) - obj_corners.min(axis=0) - if obj_corners is not None and obj_corners.size > 0 - else None + obj_points = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=True, ) - other_extent = ( - other_corners.max(axis=0) - other_corners.min(axis=0) - if other_corners is not None and other_corners.size > 0 - else None + if len(obj_points) == 0: + obj_points = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=False, + ) + other_points = other.voxel_manager.get_valid_voxels( + diversity_percentile=other._diversity_percentile, + use_regularized=True, ) + if len(other_points) == 0: + other_points = other.voxel_manager.get_valid_voxels( + diversity_percentile=other._diversity_percentile, + use_regularized=False, + ) + + obj_bbox = compute_bbox(obj_points) if len(obj_points) >= 3 else None + other_bbox = compute_bbox(other_points) if len(other_points) >= 3 else None + obj_corners = obj_bbox[0] if obj_bbox is not None else None + other_corners = other_bbox[0] if other_bbox is not None else None + obj_extent = obj_bbox[1] if obj_bbox is not None else None + other_extent = other_bbox[1] if other_bbox is not None else None dist_merge = dist < 0.25 if obj_extent is not None and other_extent is not None: diff --git a/semantic_mapping/semantic_mapping/mapping_node.py b/semantic_mapping/semantic_mapping/mapping_node.py index c33bcc0..0a4cf37 100644 --- a/semantic_mapping/semantic_mapping/mapping_node.py +++ b/semantic_mapping/semantic_mapping/mapping_node.py @@ -9,6 +9,7 @@ import numpy as np import yaml from cv_bridge import 
CvBridge +import cv2 import rclpy from rclpy.node import Node from rclpy.qos import QoSProfile, ReliabilityPolicy @@ -27,6 +28,7 @@ segment_pointclouds_from_projection, transform_segmented_clouds_to_world, ) +from vector_perception_utils.transform_utils import discretize_angles from .map_object import Observation from .utils.map_object_utils import ( @@ -77,6 +79,9 @@ def __init__(self) -> None: self.declare_parameter("pending_ttl_s", 10.0) self.declare_parameter("max_inactive_frames", 20) self.declare_parameter("num_angle_bins", 15) + self.declare_parameter("dbscan_eps_scale", 2.0) + self.declare_parameter("dbscan_min_points_initial", 6) + self.declare_parameter("dbscan_min_points_mature", 12) self.declare_parameter("detection_linear_state_time_bias", 0.0) self.declare_parameter("detection_angular_state_time_bias", 0.0) @@ -91,6 +96,17 @@ def __init__(self) -> None: self.declare_parameter("depth_filter_margin", 0.15) self.declare_parameter("depth_jump_threshold", 0.3) self.declare_parameter("sensor_to_camera_tf", "config/sensor_to_camera_tf.yaml") + self.declare_parameter("mask_erode_kernel", 3) + self.declare_parameter("mask_erode_iterations", 2) + self.declare_parameter("max_cloud_distance", 8.0) + self.declare_parameter("outlier_filter", True) + self.declare_parameter("outlier_nb_neighbors", 10) + self.declare_parameter("outlier_std_ratio", 2.0) + self.declare_parameter("outlier_min_points", 30) + self.declare_parameter("visualization_max_objects", 50) + self.declare_parameter("ground_filter", True) + self.declare_parameter("ground_radius", 2.0) + self.declare_parameter("ground_z_clearance", 0.0) self.declare_parameter("target_object", "") self.declare_parameter("save_png", False) @@ -125,6 +141,25 @@ def __init__(self) -> None: self._sensor_to_camera = self._load_sensor_to_camera( str(self.get_parameter("sensor_to_camera_tf").value) ) + self._mask_erode_kernel_size = max(1, int(self.get_parameter("mask_erode_kernel").value)) + self._mask_erode_iterations = max(0, int(self.get_parameter("mask_erode_iterations").value)) + self._mask_erode_kernel = cv2.getStructuringElement( + cv2.MORPH_RECT, + (self._mask_erode_kernel_size, self._mask_erode_kernel_size), + ) + self._max_cloud_distance = float(self.get_parameter("max_cloud_distance").value) + self._outlier_filter = bool(self.get_parameter("outlier_filter").value) + self._outlier_nb_neighbors = int(self.get_parameter("outlier_nb_neighbors").value) + self._outlier_std_ratio = float(self.get_parameter("outlier_std_ratio").value) + self._outlier_min_points = int(self.get_parameter("outlier_min_points").value) + self._visualization_max_objects = int( + self.get_parameter("visualization_max_objects").value + ) + self._ground_filter = bool(self.get_parameter("ground_filter").value) + self._ground_radius = float(self.get_parameter("ground_radius").value) + self._ground_z_clearance = float(self.get_parameter("ground_z_clearance").value) + self._ground_percentile = 0.05 + self._ground_min_points = 80 self._target_object = str(self.get_parameter("target_object").value) self._save_png = bool(self.get_parameter("save_png").value) @@ -139,6 +174,13 @@ def __init__(self) -> None: pending_ttl_s=float(self.get_parameter("pending_ttl_s").value), max_inactive_frames=int(self.get_parameter("max_inactive_frames").value), num_angle_bins=int(self.get_parameter("num_angle_bins").value), + dbscan_eps_scale=float(self.get_parameter("dbscan_eps_scale").value), + dbscan_min_points_initial=int(self.get_parameter("dbscan_min_points_initial").value), + 
dbscan_min_points_mature=int(self.get_parameter("dbscan_min_points_mature").value), + outlier_filter=self._outlier_filter, + outlier_nb_neighbors=self._outlier_nb_neighbors, + outlier_std_ratio=self._outlier_std_ratio, + outlier_min_points=self._outlier_min_points, ) self._image_saver = ImageSaveQueue( @@ -297,6 +339,7 @@ def _mapping_callback(self) -> None: return self._mapper.update(observations, target_object=self._target_object) + self._update_observation_angles(observations) self._update_best_images(observations, image) self._publish_map(detection_stamp, viewpoint_id) self._publish_object_type_queries() @@ -398,6 +441,31 @@ def _collect_cloud_window(self, detection_stamp: float) -> np.ndarray | None: return None return np.concatenate(clouds, axis=0) + def _compute_ground_z_threshold( + self, + points: np.ndarray, + robot_xy: np.ndarray, + ) -> float | None: + if not self._ground_filter: + return None + if points.shape[0] < self._ground_min_points: + return None + if self._ground_radius <= 0.0: + return None + if not 0.0 < self._ground_percentile < 1.0: + return None + + diffs = points[:, :2] - robot_xy + dist2 = diffs[:, 0] ** 2 + diffs[:, 1] ** 2 + radius2 = self._ground_radius ** 2 + mask = dist2 <= radius2 + if np.count_nonzero(mask) < self._ground_min_points: + return None + + z_vals = points[mask, 2] + z_thresh = float(np.percentile(z_vals, self._ground_percentile * 100.0)) + return z_thresh + self._ground_z_clearance + def _build_observations( self, detection_msg: DetectionResult, @@ -442,8 +510,17 @@ def _build_observations( full_mask = np.zeros(image_shape, dtype=bool) if i < len(detection_msg.masks): mask_msg: CompressedImage = detection_msg.masks[i] - mask_crop = self._bridge.compressed_imgmsg_to_cv2(mask_msg, desired_encoding="mono8") + mask_crop = self._bridge.compressed_imgmsg_to_cv2(mask_msg) + if mask_crop.ndim == 3: + mask_crop = cv2.cvtColor(mask_crop, cv2.COLOR_BGR2GRAY) mask_bool = mask_crop.astype(bool) + if self._mask_erode_iterations > 0: + mask_eroded = cv2.erode( + mask_bool.astype(np.uint8), + self._mask_erode_kernel, + iterations=self._mask_erode_iterations, + ) + mask_bool = mask_eroded.astype(bool) crop_h = min(mask_bool.shape[0], y2 - y1) crop_w = min(mask_bool.shape[1], x2 - x1) full_mask[y1 : y1 + crop_h, x1 : x1 + crop_w] = mask_bool[:crop_h, :crop_w] @@ -485,9 +562,16 @@ def _build_observations( t_b2w=t_b2w, depth_jump_threshold=self._depth_jump_threshold, ) + cloud_world = cloud_body @ R_b2w.T + t_b2w + ground_z_threshold = self._compute_ground_z_threshold(cloud_world, t_b2w[:2]) observations = [] for idx, world_cloud in enumerate(world_clouds): + if self._max_cloud_distance > 0: + dist = np.linalg.norm(world_cloud - t_b2w, axis=1) + world_cloud = world_cloud[dist < self._max_cloud_distance] + if ground_z_threshold is not None: + world_cloud = world_cloud[world_cloud[:, 2] > ground_z_threshold] if world_cloud.shape[0] == 0: continue observations.append( @@ -506,6 +590,62 @@ def _build_observations( return observations + def _update_observation_angles(self, observations: list[Observation]) -> None: + """Reproject object voxels onto the current mask to update angle votes.""" + if len(observations) == 0: + return + + objects = self._mapper.get_objects(include_pending=True) + track_to_obj = {} + for obj in objects: + for track_id in obj.track_ids: + track_to_obj[track_id] = obj + + for obs in observations: + obj = track_to_obj.get(obs.track_id) + if obj is None or obs.mask is None: + continue + voxels = obj.voxel_manager.voxels + if voxels.shape[0] == 0: 
+ continue + + R_b2w = obs.robot_R + t_b2w = obs.robot_t + R_w2b = R_b2w.T + t_w2b = -R_w2b @ t_b2w + voxels_body = voxels @ R_w2b.T + t_w2b + _, pixel_idx = project_pointcloud_to_image( + points=voxels_body, + projection=self._projection, + image_width=self._image_width, + image_height=self._image_height, + axis_mode=self._axis_mode, + intrinsics=None, + depth_filter=self._depth_filter, + depth_filter_margin=self._depth_filter_margin, + transform=self._sensor_to_camera, + ) + pix = pixel_idx.astype(int) + valid = ( + (pix[:, 0] >= 0) + & (pix[:, 0] < obs.mask.shape[1]) + & (pix[:, 1] >= 0) + & (pix[:, 1] < obs.mask.shape[0]) + ) + if not np.any(valid): + continue + pix = pix[valid] + voxel_indices = np.nonzero(valid)[0] + on_mask = obs.mask[pix[:, 1], pix[:, 0]].astype(bool) + if not np.any(on_mask): + continue + voxel_indices = voxel_indices[on_mask] + vectors = voxels[voxel_indices] - t_b2w + obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0]) + obs_bins = discretize_angles(obs_angles, obj.voxel_manager.num_angle_bins) + obj.voxel_manager.update_observation_angles(voxel_indices, obs_bins) + obj._invalidate_cache() + def _update_best_images(self, observations: list[Observation], image: np.ndarray) -> None: """Update best image crops for objects based on mask area. @@ -595,16 +735,22 @@ def _publish_map(self, stamp: float, viewpoint_id: int) -> None: if msg is not None: self._object_nodes_pub.publish(msg) - bbox_markers = create_bbox_markers(objects, header) - label_markers = create_label_markers(objects, header) + viz_objects = objects + if self._visualization_max_objects > 0 and len(objects) > self._visualization_max_objects: + viz_objects = sorted( + objects, key=lambda obj: obj.last_update_time, reverse=True + )[: self._visualization_max_objects] + + bbox_markers = create_bbox_markers(viz_objects, header) + label_markers = create_label_markers(viz_objects, header) self._obj_boxes_pub.publish(bbox_markers) self._obj_labels_pub.publish(label_markers) - obj_cloud = create_object_pointcloud(objects, header) + obj_cloud = create_object_pointcloud(viz_objects, header) if obj_cloud is not None: self._obj_points_pub.publish(obj_cloud) - current_ids = {tid for obj in objects for tid in obj.track_ids} + current_ids = {int(obj.track_ids[0]) if obj.track_ids else 0 for obj in viz_objects} deleted_ids = list(self._published_ids - current_ids) if len(deleted_ids) > 0: delete_markers = create_delete_markers(deleted_ids, header, namespace="bbox") @@ -648,21 +794,46 @@ def _load_sensor_to_camera(self, path: str) -> np.ndarray | None: tf_data = data.get("sensor_to_camera", {}) trans = tf_data.get("translation", {}) rpy = tf_data.get("rotation_rpy", {}) - translation = np.array( + cam_offset = np.array( [ float(trans.get("x", 0.0)), float(trans.get("y", 0.0)), float(trans.get("z", 0.0)), ] ) - rotation = Rotation.from_euler( - "xyz", + roll = float(rpy.get("roll", 0.0)) + pitch = float(rpy.get("pitch", 0.0)) + yaw = float(rpy.get("yaw", 0.0)) + cos_roll = np.cos(roll) + sin_roll = np.sin(roll) + cos_pitch = np.cos(pitch) + sin_pitch = np.sin(pitch) + cos_yaw = np.cos(yaw) + sin_yaw = np.sin(yaw) + r_z = np.array( [ - float(rpy.get("roll", 0.0)), - float(rpy.get("pitch", 0.0)), - float(rpy.get("yaw", 0.0)), - ], - ).as_matrix() + [cos_yaw, -sin_yaw, 0.0], + [sin_yaw, cos_yaw, 0.0], + [0.0, 0.0, 1.0], + ] + ) + r_y = np.array( + [ + [cos_pitch, 0.0, sin_pitch], + [0.0, 1.0, 0.0], + [-sin_pitch, 0.0, cos_pitch], + ] + ) + r_x = np.array( + [ + [1.0, 0.0, 0.0], + [0.0, cos_roll, -sin_roll], + [0.0, 
sin_roll, cos_roll], + ] + ) + cam_rotation = r_z @ r_y @ r_x + rotation = cam_rotation.T + translation = -rotation @ cam_offset transform = np.eye(4) transform[:3, :3] = rotation transform[:3, 3] = translation diff --git a/semantic_mapping/semantic_mapping/utils/map_object_utils.py b/semantic_mapping/semantic_mapping/utils/map_object_utils.py index b098c8c..5ad4163 100644 --- a/semantic_mapping/semantic_mapping/utils/map_object_utils.py +++ b/semantic_mapping/semantic_mapping/utils/map_object_utils.py @@ -17,6 +17,39 @@ from ..map_object import MapObject +def _axis_aligned_corners(points: np.ndarray) -> Optional[np.ndarray]: + if points.size == 0: + return None + min_bound = np.min(points, axis=0) + max_bound = np.max(points, axis=0) + return np.array( + [ + [min_bound[0], min_bound[1], min_bound[2]], + [max_bound[0], min_bound[1], min_bound[2]], + [max_bound[0], max_bound[1], min_bound[2]], + [min_bound[0], max_bound[1], min_bound[2]], + [min_bound[0], min_bound[1], max_bound[2]], + [max_bound[0], min_bound[1], max_bound[2]], + [max_bound[0], max_bound[1], max_bound[2]], + [min_bound[0], max_bound[1], max_bound[2]], + ], + dtype=float, + ) + + +def get_object_seed(obj: MapObject) -> int: + """Generate a deterministic seed from a MapObject for color generation. + + Args: + obj: MapObject instance. + + Returns: + CRC32 hash of the object's identifier. + """ + seed_key = obj.object_id or (str(obj.track_ids[0]) if obj.track_ids else obj.label) + return zlib.crc32(seed_key.encode("utf-8")) + + def map_object_to_object_node( obj: MapObject, header: Header, @@ -66,7 +99,7 @@ def map_object_to_object_node( use_regularized=False, ) if len(voxels) > 0: - seed = zlib.crc32(obj.label.encode("utf-8")) + seed = get_object_seed(obj) color = np.array( [ (seed & 0xFF) / 255.0, @@ -105,7 +138,17 @@ def create_bbox_markers( ] for obj in objects: - if obj.bbox3d_corners is None: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=True, + ) + if len(voxels) == 0: + voxels = obj.voxel_manager.get_valid_voxels( + diversity_percentile=obj._diversity_percentile, + use_regularized=False, + ) + corners = _axis_aligned_corners(voxels) if len(voxels) > 0 else None + if corners is None: continue marker = Marker() marker.header = header @@ -115,13 +158,12 @@ def create_bbox_markers( marker.action = Marker.ADD marker.scale.x = 0.02 - seed = zlib.crc32(obj.label.encode("utf-8")) + seed = get_object_seed(obj) marker.color.r = (seed & 0xFF) / 255.0 marker.color.g = ((seed >> 8) & 0xFF) / 255.0 marker.color.b = ((seed >> 16) & 0xFF) / 255.0 marker.color.a = 1.0 - corners = obj.bbox3d_corners for i, j in edges: marker.points.append(Point(x=float(corners[i, 0]), y=float(corners[i, 1]), z=float(corners[i, 2]))) marker.points.append(Point(x=float(corners[j, 0]), y=float(corners[j, 1]), z=float(corners[j, 2]))) @@ -155,7 +197,7 @@ def create_label_markers( marker.pose.position.y = float(obj.centroid[1]) marker.pose.position.z = float(obj.centroid[2]) + 0.3 - seed = zlib.crc32(obj.label.encode("utf-8")) + seed = get_object_seed(obj) marker.color.r = (seed & 0xFF) / 255.0 marker.color.g = ((seed >> 8) & 0xFF) / 255.0 marker.color.b = ((seed >> 16) & 0xFF) / 255.0 @@ -184,7 +226,7 @@ def create_object_pointcloud(objects: list[MapObject], header: Header) -> Option if len(voxels) == 0: continue - seed = zlib.crc32(obj.label.encode("utf-8")) + seed = get_object_seed(obj) color = np.array( [ (seed & 0xFF) / 255.0, diff --git 
a/semantic_mapping/semantic_mapping/utils/matching.py b/semantic_mapping/semantic_mapping/utils/matching.py index fcfb7cd..e5aaa19 100644 --- a/semantic_mapping/semantic_mapping/utils/matching.py +++ b/semantic_mapping/semantic_mapping/utils/matching.py @@ -4,50 +4,14 @@ from typing import Optional +import cv2 import numpy as np -from vector_perception_utils.pointcloud_utils import compute_centroid, get_oriented_bbox +from vector_perception_utils.pointcloud_utils import compute_centroid from ..map_object import MapObject, Observation -def _convex_hull_2d(points: np.ndarray) -> np.ndarray: - """Compute 2D convex hull in CCW order (Monotonic chain). - - Args: - points: Input points (N, 2). - - Returns: - Hull vertices in CCW order (M, 2). - """ - if points.shape[0] < 3: - return points - - pts = np.unique(points, axis=0) - if pts.shape[0] < 3: - return pts - - pts = pts[np.lexsort((pts[:, 1], pts[:, 0]))] - - def cross(o: np.ndarray, a: np.ndarray, b: np.ndarray) -> float: - return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0]) - - lower: list[np.ndarray] = [] - for p in pts: - while len(lower) >= 2 and cross(lower[-2], lower[-1], p) <= 0: - lower.pop() - lower.append(p) - - upper: list[np.ndarray] = [] - for p in reversed(pts): - while len(upper) >= 2 and cross(upper[-2], upper[-1], p) <= 0: - upper.pop() - upper.append(p) - - hull = np.array(lower[:-1] + upper[:-1]) - return hull - - def _polygon_area(poly: np.ndarray) -> float: """Compute polygon area using the shoelace formula. @@ -64,6 +28,48 @@ def _polygon_area(poly: np.ndarray) -> float: return 0.5 * abs(float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1)))) +def _ensure_ccw(poly: np.ndarray) -> np.ndarray: + """Ensure polygon vertices are ordered counter-clockwise.""" + if poly.shape[0] < 3: + return poly + x = poly[:, 0] + y = poly[:, 1] + signed = float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1))) + if signed < 0: + return poly[::-1] + return poly + + +def compute_bbox(points: np.ndarray) -> Optional[tuple[np.ndarray, np.ndarray]]: + """Compute yaw-only oriented bbox corners + extents.""" + if points.shape[0] < 3: + return None + + xy = points[:, :2].astype(np.float32) + unique_xy = np.unique(xy, axis=0) + if unique_xy.shape[0] < 3: + return None + + rect = cv2.minAreaRect(unique_xy) + (width, height) = rect[1] + if width == 0.0 or height == 0.0: + return None + + box = cv2.boxPoints(rect).astype(np.float64) + rect_xy = _ensure_ccw(box) + + zmin = float(np.min(points[:, 2])) + zmax = float(np.max(points[:, 2])) + extent = np.array([width, height, zmax - zmin], dtype=float) + z_center = zmax - extent[2] / 2.0 + half_z = extent[2] / 2.0 + + bottom = np.hstack([rect_xy, np.full((4, 1), z_center - half_z)]) + top = np.hstack([rect_xy, np.full((4, 1), z_center + half_z)]) + corners = np.vstack([bottom, top]) + return corners, extent + + def _line_intersection(p1: np.ndarray, p2: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> np.ndarray: """Intersection of line p1->p2 with line cp1->cp2. 
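
A quick sanity check of the `compute_bbox` helper added above, on synthetic points (not part of the patch; assumes `compute_bbox` is importable from `semantic_mapping.utils.matching`):

```python
import numpy as np

# synthetic point cloud with distinct x/y/z spreads
pts = np.random.default_rng(0).random((100, 3)) * np.array([2.0, 1.0, 0.5])
result = compute_bbox(pts)
assert result is not None
corners, extent = result  # corners: (8, 3), bottom face then top face; extent: (3,)
```
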
@@ -113,7 +119,7 @@ def inside(p: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> bool: input_list = output output = [] if len(input_list) == 0: - break + return np.empty((0, 2), dtype=np.float64) s = input_list[-1] for e in input_list: if inside(e, cp1, cp2): @@ -124,13 +130,13 @@ def inside(p: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> bool: output.append(_line_intersection(s, e, cp1, cp2)) s = e output = np.array(output, dtype=np.float64) - return output + return np.array(output, dtype=np.float64) def compute_iou_3d_with_ratios( bbox1: np.ndarray, bbox2: np.ndarray ) -> tuple[float, float, float]: - """Compute IoU and intersection ratios using 2D polygon overlap + Z overlap. + """Compute IoU and intersection ratios using yaw-only bbox overlap. Args: bbox1: Corner points for bbox1 (N, 3). @@ -142,12 +148,12 @@ def compute_iou_3d_with_ratios( if bbox1.shape[0] < 4 or bbox2.shape[0] < 4: return 0.0, 0.0, 0.0 - hull1 = _convex_hull_2d(bbox1[:, :2]) - hull2 = _convex_hull_2d(bbox2[:, :2]) - if hull1.shape[0] < 3 or hull2.shape[0] < 3: + rect1 = _ensure_ccw(bbox1[:4, :2]) + rect2 = _ensure_ccw(bbox2[:4, :2]) + if rect1.shape[0] < 3 or rect2.shape[0] < 3: return 0.0, 0.0, 0.0 - poly_inter = _polygon_clip(hull1, hull2) + poly_inter = _polygon_clip(rect1, rect2) inter_area = _polygon_area(poly_inter) if poly_inter.shape[0] >= 3 else 0.0 zmin1, zmax1 = float(np.min(bbox1[:, 2])), float(np.max(bbox1[:, 2])) @@ -155,8 +161,8 @@ def compute_iou_3d_with_ratios( height = max(0.0, min(zmax1, zmax2) - max(zmin1, zmin2)) inter_vol = inter_area * height - area1 = _polygon_area(hull1) - area2 = _polygon_area(hull2) + area1 = _polygon_area(rect1) + area2 = _polygon_area(rect2) vol1 = area1 * (zmax1 - zmin1) vol2 = area2 * (zmax2 - zmin2) denom = vol1 + vol2 - inter_vol @@ -192,13 +198,9 @@ def find_best_match( return None obs_center = compute_centroid(obs.points) - obs_bbox = get_oriented_bbox(obs.points) - obs_corners = obs_bbox[1] if obs_bbox is not None else None - obs_extent = ( - np.max(obs_corners, axis=0) - np.min(obs_corners, axis=0) - if obs_corners is not None and obs_corners.size > 0 - else None - ) + obs_bbox = compute_bbox(obs.points) + obs_corners = obs_bbox[0] if obs_bbox is not None else None + obs_extent = obs_bbox[1] if obs_bbox is not None else None best_candidate: Optional[MapObject] = None best_dist = float("inf") @@ -215,12 +217,18 @@ def find_best_match( if dist > distance_threshold: continue - cand_corners = candidate.bbox3d_corners - cand_extent = ( - np.max(cand_corners, axis=0) - np.min(cand_corners, axis=0) - if cand_corners is not None and cand_corners.size > 0 - else None + cand_points = candidate.voxel_manager.get_valid_voxels( + diversity_percentile=candidate._diversity_percentile, + use_regularized=True, ) + if len(cand_points) == 0: + cand_points = candidate.voxel_manager.get_valid_voxels( + diversity_percentile=candidate._diversity_percentile, + use_regularized=False, + ) + cand_bbox = compute_bbox(cand_points) if len(cand_points) >= 3 else None + cand_corners = cand_bbox[0] if cand_bbox is not None else None + cand_extent = cand_bbox[1] if cand_bbox is not None else None dist_merge = dist < min_distance_merge if obs_extent is not None and cand_extent is not None: diff --git a/semantic_mapping/semantic_mapping/utils/storage.py b/semantic_mapping/semantic_mapping/utils/storage.py index 6699373..d2be3cc 100644 --- a/semantic_mapping/semantic_mapping/utils/storage.py +++ b/semantic_mapping/semantic_mapping/utils/storage.py @@ -3,6 +3,7 @@ from __future__ import 
annotations from pathlib import Path +import shutil import queue import threading import time @@ -23,6 +24,8 @@ def __init__(self, output_dir: Path, max_queue_size: int = 100, save_png: bool = save_png: If True, also save a .png copy of the image. """ self._output_dir = Path(output_dir) + if self._output_dir.exists() and any(self._output_dir.iterdir()): + shutil.rmtree(self._output_dir) self._output_dir.mkdir(parents=True, exist_ok=True) self._save_png = save_png self._queue: queue.Queue = queue.Queue(maxsize=max_queue_size) @@ -81,11 +84,7 @@ def _run(self) -> None: np.save(mask_path, mask) if self._save_png: png_path = Path(image_path).with_suffix(".png") - if image.ndim == 3 and image.shape[2] == 3: - bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) - cv2.imwrite(str(png_path), bgr) - else: - cv2.imwrite(str(png_path), image) + cv2.imwrite(str(png_path), image) except Exception: pass self._queue.task_done() diff --git a/vector_perception_utils/vector_perception_utils/pointcloud_utils.py b/vector_perception_utils/vector_perception_utils/pointcloud_utils.py index c27a23b..e86678d 100644 --- a/vector_perception_utils/vector_perception_utils/pointcloud_utils.py +++ b/vector_perception_utils/vector_perception_utils/pointcloud_utils.py @@ -148,6 +148,75 @@ def remove_radius_outliers( return points_filtered, colors_filtered, inlier_indices +def fit_plane( + points: np.ndarray, + distance_threshold: float = 0.05, + ransac_n: int = 3, + num_iterations: int = 50, + max_points: int = 8000, + min_inlier_ratio: float = 0.0, + min_normal_z: float = 0.0, + min_above_ratio: float = 0.0, + prefer_up: bool = True, +) -> Optional[np.ndarray]: + """Fit a plane to a point cloud using RANSAC. + + Args: + points: Point coordinates (N, 3). + distance_threshold: Max distance for inliers during RANSAC. + ransac_n: Number of points to sample per RANSAC iteration. + num_iterations: RANSAC iterations. + max_points: Max points to sample for fitting (0 keeps all). + min_inlier_ratio: Minimum inlier ratio to accept the plane. + min_normal_z: Minimum absolute Z component of the plane normal. + min_above_ratio: Minimum ratio of points above the plane. + prefer_up: If True, flip normal to have positive Z. + + Returns: + Plane coefficients [a, b, c, d] for ax + by + cz + d = 0, or None. 
+ """ + if points.shape[0] < 3: + return None + + sample_points = points + if max_points > 0 and points.shape[0] > max_points: + indices = np.random.choice(points.shape[0], max_points, replace=False) + sample_points = points[indices] + + pcd = create_pointcloud(sample_points) + plane_model, inliers = pcd.segment_plane( + distance_threshold=distance_threshold, + ransac_n=ransac_n, + num_iterations=num_iterations, + ) + if len(inliers) == 0: + return None + if min_inlier_ratio > 0.0 and len(inliers) / sample_points.shape[0] < min_inlier_ratio: + return None + + normal = np.array(plane_model[:3], dtype=float) + norm = float(np.linalg.norm(normal)) + if norm == 0.0: + return None + normal /= norm + d = float(plane_model[3]) / norm + + if min_normal_z > 0.0 and abs(normal[2]) < min_normal_z: + return None + if prefer_up and normal[2] < 0: + normal = -normal + d = -d + + if min_above_ratio > 0.0: + check_points = sample_points + distances = check_points @ normal + d + above_ratio = float(np.mean(distances > 0.0)) + if above_ratio < min_above_ratio: + return None + + return np.array([normal[0], normal[1], normal[2], d], dtype=float) + + def transform_pointcloud( points: np.ndarray, transform: np.ndarray @@ -208,14 +277,66 @@ def compute_centroid(points: np.ndarray) -> np.ndarray: """ return np.mean(points, axis=0) +def get_bbox_center(bbox) -> np.ndarray: + """Return bounding box center for Open3D AABB/OBB variants.""" + if hasattr(bbox, "center"): + return np.asarray(bbox.center) + if hasattr(bbox, "get_center"): + return np.asarray(bbox.get_center()) + if hasattr(bbox, "get_min_bound") and hasattr(bbox, "get_max_bound"): + min_bound = np.asarray(bbox.get_min_bound()) + max_bound = np.asarray(bbox.get_max_bound()) + return (min_bound + max_bound) * 0.5 + return np.zeros(3, dtype=float) + + +def get_bbox_extent(bbox) -> np.ndarray: + """Return bounding box extent for Open3D AABB/OBB variants.""" + if hasattr(bbox, "extent"): + return np.asarray(bbox.extent) + if hasattr(bbox, "get_extent"): + return np.asarray(bbox.get_extent()) + if hasattr(bbox, "get_min_bound") and hasattr(bbox, "get_max_bound"): + min_bound = np.asarray(bbox.get_min_bound()) + max_bound = np.asarray(bbox.get_max_bound()) + return max_bound - min_bound + return np.zeros(3, dtype=float) + + +def get_bbox_corners(bbox) -> np.ndarray: + """Return 8 bbox corners for Open3D AABB/OBB variants.""" + if hasattr(bbox, "get_box_points"): + return np.asarray(bbox.get_box_points()) + if hasattr(bbox, "get_min_bound") and hasattr(bbox, "get_max_bound"): + min_bound = np.asarray(bbox.get_min_bound()) + max_bound = np.asarray(bbox.get_max_bound()) + return np.array( + [ + [min_bound[0], min_bound[1], min_bound[2]], + [max_bound[0], min_bound[1], min_bound[2]], + [max_bound[0], max_bound[1], min_bound[2]], + [min_bound[0], max_bound[1], min_bound[2]], + [min_bound[0], min_bound[1], max_bound[2]], + [max_bound[0], min_bound[1], max_bound[2]], + [max_bound[0], max_bound[1], max_bound[2]], + [min_bound[0], max_bound[1], max_bound[2]], + ], + dtype=float, + ) + return np.zeros((0, 3), dtype=float) + + def get_oriented_bbox( points: np.ndarray, + axis_aligned: bool = True, ) -> Optional[Tuple[np.ndarray, np.ndarray]]: """ - Compute minimal oriented bounding box from point cloud. + Compute bounding box from point cloud. Args: points: Point coordinates (N, 3) + axis_aligned: If True, compute axis-aligned bounding box. + If False, compute minimal oriented bounding box. 
Returns: Tuple of (center, corners) where center is (3,) and corners is (8, 3), @@ -227,12 +348,15 @@ def get_oriented_bbox( pcd = create_pointcloud(points) try: - obb = pcd.get_minimal_oriented_bounding_box() + if axis_aligned: + bbox = pcd.get_axis_aligned_bounding_box() + else: + bbox = pcd.get_minimal_oriented_bounding_box() except Exception: return None - center = np.asarray(obb.center) - corners = np.asarray(obb.get_box_points()) + center = get_bbox_center(bbox) + corners = get_bbox_corners(bbox) return center, corners @@ -257,9 +381,12 @@ def pointcloud_to_bbox3d(points: np.ndarray, header=None): obb = pcd.get_minimal_oriented_bounding_box() except: return None - center = obb.center - extent = obb.extent - quat = Rotation.from_matrix(obb.R).as_quat() + center = get_bbox_center(obb) + extent = get_bbox_extent(obb) + if hasattr(obb, "R"): + quat = Rotation.from_matrix(obb.R).as_quat() + else: + quat = np.array([0.0, 0.0, 0.0, 1.0], dtype=float) bbox = BoundingBox3D() bbox.center.position = Point( From 8c4b248be3f17f2b7ed3b86b49a28f61e7f7bcac Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Sun, 25 Jan 2026 19:00:04 -0500 Subject: [PATCH 6/6] Changed DBSCAN to voxel-map connected components for more computationally efficient deduplication and pointcloud filtering --- semantic_mapping/README.md | 13 ++ semantic_mapping/config/params.yaml | 25 +-- .../docs/SEMANTIC_MAPPING_MIGRATION.md | 10 +- .../semantic_mapping/map_object.py | 208 +++++++----------- semantic_mapping/semantic_mapping/mapper.py | 62 ++---- .../semantic_mapping/mapping_node.py | 46 ++-- 6 files changed, 152 insertions(+), 212 deletions(-) diff --git a/semantic_mapping/README.md b/semantic_mapping/README.md index d7f6c88..62b78e5 100644 --- a/semantic_mapping/README.md +++ b/semantic_mapping/README.md @@ -108,6 +108,11 @@ ros2 launch semantic_mapping semantic_mapping.launch.py - `config/sensor_to_camera_tf.yaml`: Static sensor→camera transform for projection. - `config/objects.yaml`: Object taxonomy for detection labels. +Key runtime params: +- `regularize_voxel_size`: Connectivity voxel size used in shape regularization. +- `ground_filter`, `ground_radius`, `ground_z_clearance`: Simple ground clamp around robot. +- `visualization_max_objects`: Limits RViz markers/pointcloud to most recent objects. + ## Topics ### Subscriptions @@ -137,3 +142,11 @@ ros2 launch semantic_mapping semantic_mapping.launch.py - External Gemini API for VLM reasoning - Detection: `detect_anything` package (decoupled) - dimos patterns: `dimos/perception/detection/objectDB.py` + +## Handoff Notes + +- Regularization uses vote-weighted filtering + voxel connected components; adjust + `regularize_voxel_size` and the regularization percentile (in `MapObject`) if + thin structures are dropped. +- Mapping node logs summary stats and a single mapping loop time per cycle. +- Permanent objects persist; only pending objects time out via `pending_ttl_s`.
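
The connectivity step behind `regularize_voxel_size` is a plain 26-connected flood fill over occupied voxel cells, as the `regularize_shape` rewrite below implements. A condensed sketch of that step (standalone helper for illustration):

```python
import numpy as np

def voxel_components(points: np.ndarray, voxel_size: float = 0.2) -> list[np.ndarray]:
    """Group point indices into 26-connected voxel components."""
    cells: dict[tuple[int, int, int], list[int]] = {}
    for i, key in enumerate(map(tuple, np.floor(points / voxel_size).astype(int))):
        cells.setdefault(key, []).append(i)
    offsets = [(dx, dy, dz) for dx in (-1, 0, 1) for dy in (-1, 0, 1)
               for dz in (-1, 0, 1) if (dx, dy, dz) != (0, 0, 0)]
    seen: set[tuple[int, int, int]] = set()
    components = []
    for start in cells:
        if start in seen:
            continue
        stack, members = [start], []
        seen.add(start)
        while stack:  # depth-first flood fill over neighboring cells
            cur = stack.pop()
            members.extend(cells[cur])
            for dx, dy, dz in offsets:
                nb = (cur[0] + dx, cur[1] + dy, cur[2] + dz)
                if nb in cells and nb not in seen:
                    seen.add(nb)
                    stack.append(nb)
        components.append(np.asarray(members))
    return components
```

A larger `regularize_voxel_size` merges sparse parts (e.g. chair legs) into the main component, at the cost of occasionally bridging to nearby clutter.
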
diff --git a/semantic_mapping/config/params.yaml b/semantic_mapping/config/params.yaml index 370d9c9..0749d09 100644 --- a/semantic_mapping/config/params.yaml +++ b/semantic_mapping/config/params.yaml @@ -14,16 +14,13 @@ semantic_mapping_node: map_frame: map confidence_threshold: 0.35 - voxel_size: 0.03 + voxel_size: 0.05 + regularize_voxel_size: 0.2 distance_threshold: 0.3 - iou_threshold: 0.5 - min_detections_for_permanent: 3 - pending_ttl_s: 10.0 - max_inactive_frames: 20 + iou_threshold: 0.3 + min_detections_for_permanent: 4 + pending_ttl_s: 5.0 num_angle_bins: 15 - dbscan_eps_scale: 2.0 - dbscan_min_points_initial: 6 - dbscan_min_points_mature: 12 detection_linear_state_time_bias: 0.0 detection_angular_state_time_bias: 0.0 @@ -38,17 +35,13 @@ semantic_mapping_node: depth_filter_margin: 0.15 depth_jump_threshold: 0.3 sensor_to_camera_tf: config/sensor_to_camera_tf.yaml - mask_erode_kernel: 4 - mask_erode_iterations: 3 + mask_erode_kernel: 5 + mask_erode_iterations: 2 max_cloud_distance: 8.0 - outlier_filter: true - outlier_nb_neighbors: 12 - outlier_std_ratio: 1.6 - outlier_min_points: 20 visualization_max_objects: 50 ground_filter: true - ground_radius: 2.0 - ground_z_clearance: 0.05 + ground_radius: 2.5 + ground_z_clearance: 0.0 target_object: refrigerator save_png: True diff --git a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md index 07ef2c5..3f22372 100644 --- a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md +++ b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md @@ -460,13 +460,19 @@ Items to remove: - Added mask erosion, depth-jump pruning, distance gating, and light outlier filtering for cleaner object clouds. - Added reprojected observation-angle voting for better multi-view aggregation. - Fixed bbox ordering and switched `/obj_boxes` to axis-aligned markers (via `get_oriented_bbox(axis_aligned=True)`). -- Updated DBSCAN regularization to v2 logic and exposed DBSCAN tuning params. +- Replaced DBSCAN regularization with vote-weighted filtering + voxel connected components. +- Added `regularize_voxel_size` to grow connectivity for thin structures. +- Simplified ground filtering to radius-based Z clamp around robot. +- Added `visualization_max_objects` to keep recent objects visible in RViz. +- Logging: mapping stats (total/total_deleted/total_permanent) + mapping loop time. - Fixed image saving color order and ensured output dir is cleared on start. - Set MobileCLIP weights cache to `detect_anything/models` and removed corrupted caches. ## Temporary Handoff Notes -- Add ground-height clamp to further suppress ground/background leakage. +- Regularization is now connected-components; tune `regularize_voxel_size` and keep ratio + (`MapObject.regularize_shape` percentile) if legs/sparse parts are dropped. +- Ground filter uses `ground_filter`, `ground_radius`, `ground_z_clearance`. - If bagfile sync issues persist, consider switching mapping inputs to `/sensor_scan` + `/state_estimation_at_scan`. - Verify VLM objects.yaml path in VLM_ROS (`vlm_reasoning_node` still expects old path). 
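
For reference, the vote-weighted filtering half of the new regularization reduces to a quantile cutoff over per-voxel weights (observation-angle diversity times votes). A sketch of the logic that `regularize_shape` applies below, including its keep-everything fallback:

```python
import numpy as np

def vote_weighted_keep(weights: np.ndarray, keep_ratio: float = 0.85) -> np.ndarray:
    """Boolean mask keeping roughly the top `keep_ratio` of voxels by weight."""
    if keep_ratio >= 1.0:
        return np.ones(weights.shape[0], dtype=bool)
    cutoff = float(np.quantile(weights, 1.0 - keep_ratio))
    mask = weights >= cutoff
    # never drop every voxel; fall back to keeping all
    return mask if mask.any() else np.ones(weights.shape[0], dtype=bool)
```
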
diff --git a/semantic_mapping/semantic_mapping/map_object.py b/semantic_mapping/semantic_mapping/map_object.py index 5875d9c..a9e3633 100644 --- a/semantic_mapping/semantic_mapping/map_object.py +++ b/semantic_mapping/semantic_mapping/map_object.py @@ -20,7 +20,6 @@ from vector_perception_utils.pointcloud_utils import ( compute_centroid, create_pointcloud, - remove_statistical_outliers, ) @@ -375,13 +374,8 @@ class MapObject: _diversity_percentile: float = field(default=0.85, repr=False) _angle_threshold: float = field(default=np.deg2rad(5), repr=False) _distance_threshold: float = field(default=0.3, repr=False) - _dbscan_eps_scale: float = field(default=2.0, repr=False) - _dbscan_min_points_initial: int = field(default=6, repr=False) - _dbscan_min_points_mature: int = field(default=12, repr=False) - _outlier_filter: bool = field(default=True, repr=False) - _outlier_nb_neighbors: int = field(default=10, repr=False) - _outlier_std_ratio: float = field(default=2.0, repr=False) - _outlier_min_points: int = field(default=30, repr=False) + _needs_regularization: bool = field(default=True, repr=False) + _regularize_voxel_size: float = field(default=0.06, repr=False) @classmethod def from_observation( @@ -389,14 +383,8 @@ def from_observation( object_id: str, obs: Observation, voxel_size: float = 0.03, + regularize_voxel_size: float = 0.06, num_angle_bins: int = 15, - dbscan_eps_scale: float = 2.0, - dbscan_min_points_initial: int = 6, - dbscan_min_points_mature: int = 12, - outlier_filter: bool = True, - outlier_nb_neighbors: int = 10, - outlier_std_ratio: float = 2.0, - outlier_min_points: int = 30, ) -> MapObject: """Create a new MapObject from an initial observation. @@ -404,14 +392,8 @@ def from_observation( object_id: Unique identifier for this object. obs: Initial observation. voxel_size: Voxel grid cell size in meters. + regularize_voxel_size: Voxel size for regularization clustering. num_angle_bins: Number of angle bins for observation tracking. - dbscan_eps_scale: DBSCAN eps scale (multiplier on voxel size). - dbscan_min_points_initial: Min points for new objects. - dbscan_min_points_mature: Min points for mature objects. - outlier_filter: Whether to run statistical outlier filtering. - outlier_nb_neighbors: Neighbor count for outlier filtering. - outlier_std_ratio: Standard deviation ratio for outlier filtering. - outlier_min_points: Minimum points to apply outlier filtering. Returns: New MapObject instance. 
@@ -433,13 +415,7 @@ def from_observation( points_count={obs.label: len(obs.points)}, robot_poses=[{"R": obs.robot_R.copy(), "t": obs.robot_t.copy()}], last_update_time=obs.timestamp, - _dbscan_eps_scale=dbscan_eps_scale, - _dbscan_min_points_initial=dbscan_min_points_initial, - _dbscan_min_points_mature=dbscan_min_points_mature, - _outlier_filter=outlier_filter, - _outlier_nb_neighbors=outlier_nb_neighbors, - _outlier_std_ratio=outlier_std_ratio, - _outlier_min_points=outlier_min_points, + _regularize_voxel_size=regularize_voxel_size, ) return obj @@ -497,6 +473,7 @@ def update(self, obs: Observation) -> None: self.detection_count += 1 self.inactive_frames = 0 self.last_update_time = obs.timestamp + self._needs_regularization = True # Check if pose is new (not similar to existing) is_similar = self._is_similar_pose(obs.robot_R, obs.robot_t) @@ -533,6 +510,7 @@ def merge(self, other: MapObject) -> None: # Merge voxel managers self.voxel_manager.merge_from(other.voxel_manager) + self._needs_regularization = True # Merge class statistics for label, count in other.class_votes.items(): @@ -646,112 +624,82 @@ def _compute_geometry(self) -> None: self._needs_recompute = False def regularize_shape(self, percentile: float = 0.85) -> None: - """Apply v2 shape regularization (ported from legacy pipeline).""" + """Apply vote-weighted regularization with light outlier removal.""" + if not self._needs_regularization: + return voxels = self.voxel_manager.voxels if len(voxels) < 3: self.voxel_manager.regularized_mask = np.ones(len(voxels), dtype=bool) + self._needs_regularization = False self._invalidate_cache() return - pcd = create_pointcloud(voxels) - eps = self.voxel_manager.voxel_size * self._dbscan_eps_scale - min_points = ( - self._dbscan_min_points_mature - if self.detection_count >= 3 - else self._dbscan_min_points_initial - ) - labels = np.array(pcd.cluster_dbscan(eps=eps, min_points=min_points)) - unique_labels = np.unique(labels) - - cluster_masks = [] - cluster_weights = [] - cluster_centroids = [] - for label in unique_labels: - if label == -1: - continue - mask = labels == label - if not np.any(mask): - continue - weight = float(np.sum(self.voxel_manager.observation_angles[mask])) - if weight < 5: - continue - cluster_masks.append(mask) - cluster_weights.append(weight) - cluster_centroids.append(compute_centroid(voxels[mask])) - - if len(cluster_masks) == 0: - self.voxel_manager.regularized_mask = np.zeros( - self.voxel_manager.voxels.shape[0], dtype=bool - ) - elif len(cluster_masks) == 1: - self.voxel_manager.regularized_mask = cluster_masks[0] + weights = np.sum(self.voxel_manager.observation_angles, axis=1) * self.voxel_manager.votes + keep_ratio = percentile if percentile is not None else 1.0 + if keep_ratio >= 1.0: + keep_mask = np.ones(voxels.shape[0], dtype=bool) else: - clustering_weight = np.array(cluster_weights, dtype=float, copy=False) - cluster_centroids = np.array(cluster_centroids, dtype=float, copy=False) - - main_idx = int(np.argmax(clustering_weight)) - main_centroid = cluster_centroids[main_idx] - - dists = np.linalg.norm(cluster_centroids - main_centroid, axis=1) - if len(voxels) >= 4: - try: - obb = pcd.get_minimal_oriented_bounding_box() - dist_base = float(np.linalg.norm(np.asarray(obb.extent))) - except Exception: - dist_base = float(np.max(dists)) - else: - dist_base = float(np.max(dists)) - if dist_base == 0: - adjusted_weight = clustering_weight.copy() - else: - alpha = 3.0 - scale = np.exp(-alpha * dists / dist_base) - adjusted_weight = clustering_weight * 
scale - - main_adjusted_idx = int(np.argmax(adjusted_weight)) - chosen = np.zeros(len(cluster_masks), dtype=bool) - - if percentile is not None: - target = percentile * np.sum(adjusted_weight) - acc = adjusted_weight[main_adjusted_idx] - chosen[main_adjusted_idx] = True - order = np.argsort(adjusted_weight)[::-1] - for i in order: - if chosen[i]: - continue - if acc + adjusted_weight[i] <= target + 1e-8: - chosen[i] = True - acc += adjusted_weight[i] - if acc >= target: - break - else: - chosen[:] = True - - valid_cluster_mask = np.zeros(self.voxel_manager.voxels.shape[0], dtype=bool) - for i, cmask in enumerate(cluster_masks): - if chosen[i]: - valid_cluster_mask |= cmask - self.voxel_manager.regularized_mask = valid_cluster_mask - - if ( - self._outlier_filter - and np.any(self.voxel_manager.regularized_mask) - and np.sum(self.voxel_manager.regularized_mask) >= self._outlier_min_points - ): - keep_indices = np.nonzero(self.voxel_manager.regularized_mask)[0] - keep_points = voxels[keep_indices] - _, _, inlier_indices = remove_statistical_outliers( - keep_points, - nb_neighbors=self._outlier_nb_neighbors, - std_ratio=self._outlier_std_ratio, - ) - if inlier_indices.size > 0: - refined_mask = np.zeros_like(self.voxel_manager.regularized_mask) - refined_mask[keep_indices[inlier_indices]] = True - self.voxel_manager.regularized_mask = refined_mask - else: - self.voxel_manager.regularized_mask = np.zeros_like( - self.voxel_manager.regularized_mask - ) - + cutoff = float(np.quantile(weights, 1.0 - keep_ratio)) + keep_mask = weights >= cutoff + + if not np.any(keep_mask): + keep_mask = np.ones(voxels.shape[0], dtype=bool) + + selected_indices = np.nonzero(keep_mask)[0] + points_sel = voxels[selected_indices] + weights_sel = weights[selected_indices] + if points_sel.shape[0] >= 3: + voxel_size = self._regularize_voxel_size + keys = np.floor(points_sel / voxel_size).astype(np.int32) + voxel_map: dict[tuple[int, int, int], list[int]] = {} + for idx, key in enumerate(map(tuple, keys)): + voxel_map.setdefault(key, []).append(idx) + + neighbor_offsets = [ + (dx, dy, dz) + for dx in (-1, 0, 1) + for dy in (-1, 0, 1) + for dz in (-1, 0, 1) + if not (dx == 0 and dy == 0 and dz == 0) + ] + visited: set[tuple[int, int, int]] = set() + components: list[tuple[np.ndarray, float, np.ndarray]] = [] + for key in voxel_map: + if key in visited: + continue + stack = [key] + visited.add(key) + comp_indices: list[int] = [] + while stack: + current = stack.pop() + comp_indices.extend(voxel_map[current]) + for dx, dy, dz in neighbor_offsets: + neighbor = (current[0] + dx, current[1] + dy, current[2] + dz) + if neighbor in voxel_map and neighbor not in visited: + visited.add(neighbor) + stack.append(neighbor) + + comp_idx = np.array(comp_indices, dtype=int) + comp_weight = float(np.sum(weights_sel[comp_idx])) + comp_centroid = compute_centroid(points_sel[comp_idx]) + components.append((comp_idx, comp_weight, comp_centroid)) + + if components: + main_idx = int(np.argmax([c[1] for c in components])) + main_points = points_sel[components[main_idx][0]] + main_centroid = components[main_idx][2] + extent = np.max(main_points, axis=0) - np.min(main_points, axis=0) + dist_thresh = max(0.3, 0.5 * float(np.max(extent))) + + keep_sel_mask = np.zeros(points_sel.shape[0], dtype=bool) + for comp_idx, _, comp_centroid in components: + if np.linalg.norm(comp_centroid - main_centroid) <= dist_thresh: + keep_sel_mask[comp_idx] = True + if np.any(keep_sel_mask): + keep_mask = np.zeros(voxels.shape[0], dtype=bool) + 
+
+        self.voxel_manager.regularized_mask = keep_mask
+
+        self._needs_regularization = False
         self._invalidate_cache()
diff --git a/semantic_mapping/semantic_mapping/mapper.py b/semantic_mapping/semantic_mapping/mapper.py
index 8b1b259..cc67cf1 100644
--- a/semantic_mapping/semantic_mapping/mapper.py
+++ b/semantic_mapping/semantic_mapping/mapper.py
@@ -29,39 +29,25 @@ class ObjectMapper:
     def __init__(
         self,
         voxel_size: float = 0.03,
+        regularize_voxel_size: float = 0.06,
         confidence_threshold: float = 0.30,
         distance_threshold: float = 0.5,
         iou_threshold: float = 0.2,
         min_detections_for_permanent: int = 3,
         pending_ttl_s: float = 10.0,
-        max_inactive_frames: int = 20,
         num_angle_bins: int = 15,
-        dbscan_eps_scale: float = 2.0,
-        dbscan_min_points_initial: int = 6,
-        dbscan_min_points_mature: int = 12,
-        outlier_filter: bool = True,
-        outlier_nb_neighbors: int = 10,
-        outlier_std_ratio: float = 2.0,
-        outlier_min_points: int = 30,
     ) -> None:
         """Initialize the object mapper and its thresholds.

         Args:
             voxel_size: Voxel grid cell size in meters.
+            regularize_voxel_size: Voxel grid cell size in meters for connected-component regularization.
             confidence_threshold: Minimum detection confidence to accept.
             distance_threshold: Max centroid distance for spatial matching.
             iou_threshold: IoU threshold for merge checks.
             min_detections_for_permanent: Detections before promotion.
             pending_ttl_s: Time-to-live for pending objects.
-            max_inactive_frames: Frames before pruning inactive objects.
             num_angle_bins: Observation angle bins for voxel manager.
-            dbscan_eps_scale: DBSCAN eps scale (multiplier on voxel size).
-            dbscan_min_points_initial: DBSCAN min points for new objects.
-            dbscan_min_points_mature: DBSCAN min points for mature objects.
-            outlier_filter: Whether to run statistical outlier filtering.
-            outlier_nb_neighbors: Neighbor count for outlier filtering.
-            outlier_std_ratio: Standard deviation ratio for outlier filtering.
-            outlier_min_points: Minimum points to apply outlier filtering.
         """
         self._pending_objects: dict[str, MapObject] = {}
         self._permanent_objects: dict[str, MapObject] = {}
@@ -69,20 +55,13 @@ def __init__(
         self._lock = threading.RLock()

         self._voxel_size = voxel_size
+        self._regularize_voxel_size = regularize_voxel_size
         self._confidence_threshold = confidence_threshold
         self._distance_threshold = distance_threshold
         self._iou_threshold = iou_threshold
         self._min_detections_for_permanent = min_detections_for_permanent
         self._pending_ttl_s = pending_ttl_s
-        self._max_inactive_frames = max_inactive_frames
         self._num_angle_bins = num_angle_bins
-        self._dbscan_eps_scale = dbscan_eps_scale
-        self._dbscan_min_points_initial = dbscan_min_points_initial
-        self._dbscan_min_points_mature = dbscan_min_points_mature
-        self._outlier_filter = outlier_filter
-        self._outlier_nb_neighbors = outlier_nb_neighbors
-        self._outlier_std_ratio = outlier_std_ratio
-        self._outlier_min_points = outlier_min_points

     def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats:
         """Update the map with new observations.
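Note on the `regularize_shape` rewrite above: it reduces to a quantile cut that keeps the heaviest `percentile` fraction of voxels by vote weight, followed by a 26-connected flood fill over a voxel hash that discards clumps far from the dominant component. A minimal standalone sketch of that logic under toy inputs; `weighted_keep_mask` and `connected_components` are illustrative names, not part of this package:

```python
import numpy as np


def weighted_keep_mask(weights: np.ndarray, percentile: float = 0.85) -> np.ndarray:
    """Keep the heaviest `percentile` fraction of points by weight."""
    cutoff = float(np.quantile(weights, 1.0 - percentile))
    mask = weights >= cutoff
    return mask if mask.any() else np.ones_like(mask, dtype=bool)


def connected_components(points: np.ndarray, voxel_size: float) -> list[np.ndarray]:
    """Flood fill over a voxel hash with 26-connectivity; returns index groups."""
    keys = np.floor(points / voxel_size).astype(np.int64)
    cells: dict[tuple, list[int]] = {}
    for i, key in enumerate(map(tuple, keys)):
        cells.setdefault(key, []).append(i)
    offsets = [
        (dx, dy, dz)
        for dx in (-1, 0, 1) for dy in (-1, 0, 1) for dz in (-1, 0, 1)
        if (dx, dy, dz) != (0, 0, 0)
    ]
    seen: set[tuple] = set()
    groups: list[np.ndarray] = []
    for start in cells:
        if start in seen:
            continue
        seen.add(start)
        stack, members = [start], []
        while stack:
            cur = stack.pop()
            members.extend(cells[cur])
            for dx, dy, dz in offsets:
                nb = (cur[0] + dx, cur[1] + dy, cur[2] + dz)
                if nb in cells and nb not in seen:
                    seen.add(nb)
                    stack.append(nb)
        groups.append(np.asarray(members))
    return groups


# Two tight clumps 2 m apart: the quantile cut trims low-weight points,
# then the component walk isolates the stray clump so a distance test
# like the one in regularize_shape() can reject it.
rng = np.random.default_rng(0)
pts = np.vstack([rng.normal(0.0, 0.02, (50, 3)), rng.normal(2.0, 0.02, (8, 3))])
w = rng.uniform(1.0, 5.0, len(pts))
kept = pts[weighted_keep_mask(w)]
print(sorted(len(g) for g in connected_components(kept, voxel_size=0.06)))
```

Hashing points into integer cells keeps the component walk roughly linear in the number of occupied voxels, which is the motivation for replacing the per-call DBSCAN of the legacy path.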
@@ -97,6 +76,7 @@ def update(self, observations: list[Observation], target_object: str | None = No stats = UpdateStats() now = max((obs.timestamp for obs in observations), default=time.time()) updated_ids: set[str] = set() + regularized_ids: set[str] = set() with self._lock: for obs in observations: @@ -117,18 +97,21 @@ def update(self, observations: list[Observation], target_object: str | None = No self._track_id_map[track_id] = obj.object_id if obj.status == ObjectStatus.PENDING: - self._check_promotion(obj) - if obj.status == ObjectStatus.PERMANENT: + if self._check_promotion(obj): stats.promoted += 1 + if obj.object_id not in regularized_ids and obj.inactive_frames == 0: + obj.regularize_shape(percentile=obj._diversity_percentile) + regularized_ids.add(obj.object_id) + for obj in list(self._pending_objects.values()) + list(self._permanent_objects.values()): if obj.object_id not in updated_ids: obj.inactive_frames += 1 - obj.regularize_shape(percentile=obj._diversity_percentile) - stats.deleted += len(self._prune_inactive(target_object, now)) + deleted = self._prune_inactive(target_object, now) + stats.deleted = len(deleted) stats.merged += self.merge_objects() - stats.total = len(self._pending_objects) + len(self._permanent_objects) + stats.total = len(self._permanent_objects) return stats @@ -155,28 +138,23 @@ def _create_object(self, obs: Observation) -> MapObject: object_id=str(uuid.uuid4()), obs=obs, voxel_size=self._voxel_size, + regularize_voxel_size=self._regularize_voxel_size, num_angle_bins=self._num_angle_bins, - dbscan_eps_scale=self._dbscan_eps_scale, - dbscan_min_points_initial=self._dbscan_min_points_initial, - dbscan_min_points_mature=self._dbscan_min_points_mature, - outlier_filter=self._outlier_filter, - outlier_nb_neighbors=self._outlier_nb_neighbors, - outlier_std_ratio=self._outlier_std_ratio, - outlier_min_points=self._outlier_min_points, ) self._track_id_map[obs.track_id] = obj.object_id return obj - def _check_promotion(self, obj: MapObject) -> None: + def _check_promotion(self, obj: MapObject) -> bool: """Promote object from pending to permanent if threshold met.""" if obj.status != ObjectStatus.PENDING: - return + return False if obj.detection_count < self._min_detections_for_permanent: - return + return False if obj.object_id in self._pending_objects: self._pending_objects.pop(obj.object_id) obj.status = ObjectStatus.PERMANENT self._permanent_objects[obj.object_id] = obj + return True def merge_objects(self) -> int: """Merge overlapping permanent objects. 
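`_check_promotion` now returns whether a promotion actually occurred, so `update` can increment `stats.promoted` directly instead of re-reading `obj.status` after the call. A minimal sketch of that pattern with toy stand-ins (`Obj` and `check_promotion` are illustrative, not the package API):

```python
from dataclasses import dataclass


@dataclass
class Obj:
    detections: int = 0
    permanent: bool = False


def check_promotion(obj: Obj, threshold: int = 3) -> bool:
    """Promote once the detection count reaches the threshold; report it."""
    if obj.permanent or obj.detections < threshold:
        return False
    obj.permanent = True
    return True


objs = [Obj(detections=n) for n in (1, 3, 5)]
promoted = sum(check_promotion(o) for o in objs)
print(promoted)  # 2: only the objects with >= 3 detections are promoted
```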
@@ -267,15 +245,13 @@ def _prune_inactive(self, target_object: str | None, now: float) -> list[MapObje deleted: list[MapObject] = [] for obj_id, obj in list(self._pending_objects.items()): - if (now - obj.last_update_time) > self._pending_ttl_s or ( - obj.inactive_frames > self._max_inactive_frames - ): + if (now - obj.last_update_time) > self._pending_ttl_s: deleted.append(self._pending_objects.pop(obj_id)) for obj_id, obj in list(self._permanent_objects.items()): if target_object and obj.label == target_object: continue - if obj.inactive_frames > self._max_inactive_frames: + if obj.status == ObjectStatus.DELETED: deleted.append(self._permanent_objects.pop(obj_id)) for obj in deleted: diff --git a/semantic_mapping/semantic_mapping/mapping_node.py b/semantic_mapping/semantic_mapping/mapping_node.py index 0a4cf37..16b5649 100644 --- a/semantic_mapping/semantic_mapping/mapping_node.py +++ b/semantic_mapping/semantic_mapping/mapping_node.py @@ -73,15 +73,12 @@ def __init__(self) -> None: self.declare_parameter("confidence_threshold", 0.3) self.declare_parameter("voxel_size", 0.03) + self.declare_parameter("regularize_voxel_size", 0.06) self.declare_parameter("distance_threshold", 0.5) self.declare_parameter("iou_threshold", 0.2) self.declare_parameter("min_detections_for_permanent", 3) self.declare_parameter("pending_ttl_s", 10.0) - self.declare_parameter("max_inactive_frames", 20) self.declare_parameter("num_angle_bins", 15) - self.declare_parameter("dbscan_eps_scale", 2.0) - self.declare_parameter("dbscan_min_points_initial", 6) - self.declare_parameter("dbscan_min_points_mature", 12) self.declare_parameter("detection_linear_state_time_bias", 0.0) self.declare_parameter("detection_angular_state_time_bias", 0.0) @@ -99,10 +96,6 @@ def __init__(self) -> None: self.declare_parameter("mask_erode_kernel", 3) self.declare_parameter("mask_erode_iterations", 2) self.declare_parameter("max_cloud_distance", 8.0) - self.declare_parameter("outlier_filter", True) - self.declare_parameter("outlier_nb_neighbors", 10) - self.declare_parameter("outlier_std_ratio", 2.0) - self.declare_parameter("outlier_min_points", 30) self.declare_parameter("visualization_max_objects", 50) self.declare_parameter("ground_filter", True) self.declare_parameter("ground_radius", 2.0) @@ -148,10 +141,6 @@ def __init__(self) -> None: (self._mask_erode_kernel_size, self._mask_erode_kernel_size), ) self._max_cloud_distance = float(self.get_parameter("max_cloud_distance").value) - self._outlier_filter = bool(self.get_parameter("outlier_filter").value) - self._outlier_nb_neighbors = int(self.get_parameter("outlier_nb_neighbors").value) - self._outlier_std_ratio = float(self.get_parameter("outlier_std_ratio").value) - self._outlier_min_points = int(self.get_parameter("outlier_min_points").value) self._visualization_max_objects = int( self.get_parameter("visualization_max_objects").value ) @@ -167,20 +156,13 @@ def __init__(self) -> None: self._mapper = ObjectMapper( voxel_size=float(self.get_parameter("voxel_size").value), + regularize_voxel_size=float(self.get_parameter("regularize_voxel_size").value), confidence_threshold=self._confidence_threshold, distance_threshold=float(self.get_parameter("distance_threshold").value), iou_threshold=float(self.get_parameter("iou_threshold").value), min_detections_for_permanent=int(self.get_parameter("min_detections_for_permanent").value), pending_ttl_s=float(self.get_parameter("pending_ttl_s").value), - max_inactive_frames=int(self.get_parameter("max_inactive_frames").value), 
num_angle_bins=int(self.get_parameter("num_angle_bins").value), - dbscan_eps_scale=float(self.get_parameter("dbscan_eps_scale").value), - dbscan_min_points_initial=int(self.get_parameter("dbscan_min_points_initial").value), - dbscan_min_points_mature=int(self.get_parameter("dbscan_min_points_mature").value), - outlier_filter=self._outlier_filter, - outlier_nb_neighbors=self._outlier_nb_neighbors, - outlier_std_ratio=self._outlier_std_ratio, - outlier_min_points=self._outlier_min_points, ) self._image_saver = ImageSaveQueue( @@ -313,6 +295,7 @@ def _mapping_callback(self) -> None: return self._mapping_busy = True + start_time = time.perf_counter() try: detection_msg, viewpoint_stamp, viewpoint_id = self._select_detection() if detection_msg is None: @@ -338,7 +321,12 @@ def _mapping_callback(self) -> None: if len(observations) == 0: return - self._mapper.update(observations, target_object=self._target_object) + stats = self._mapper.update(observations, target_object=self._target_object) + self.get_logger().info( + "Mapping stats:\n" + f" total: {stats.total}\n" + f" deleted: {stats.deleted}" + ) self._update_observation_angles(observations) self._update_best_images(observations, image) self._publish_map(detection_stamp, viewpoint_id) @@ -347,6 +335,8 @@ def _mapping_callback(self) -> None: if viewpoint_stamp is not None: self._processed_viewpoints.add(viewpoint_stamp) finally: + elapsed_ms = (time.perf_counter() - start_time) * 1000.0 + self.get_logger().info(f"Mapping loop time: {elapsed_ms:.2f} ms") self._mapping_busy = False def _select_detection(self) -> tuple[DetectionResult | None, float | None, int]: @@ -555,6 +545,20 @@ def _build_observations( masks=masks, pixel_idx=pixel_idx, ) + if self._depth_filter: + filtered_clouds = [] + filtered_depths = [] + for cloud, depth in zip(seg_clouds, seg_depths): + if depth.shape[0] == 0: + filtered_clouds.append(cloud) + filtered_depths.append(depth) + continue + min_depth = float(np.min(depth)) + keep = depth[:, 0] <= (min_depth + self._depth_filter_margin) + filtered_clouds.append(cloud[keep]) + filtered_depths.append(depth[keep]) + seg_clouds = filtered_clouds + seg_depths = filtered_depths world_clouds = transform_segmented_clouds_to_world( clouds=seg_clouds, depths=seg_depths,