From 156c63f7c441c95c2008a47eb383360796554044 Mon Sep 17 00:00:00 2001 From: RoyYang0714 Date: Tue, 5 Aug 2025 18:10:32 +0200 Subject: [PATCH 1/3] feat: Add MultiScaleDeformableAttention. --- vis4d/op/base/csp_darknet.py | 3 +- vis4d/op/base/vit.py | 4 +- vis4d/op/detect/faster_rcnn.py | 2 +- vis4d/op/detect/mask_rcnn.py | 2 +- vis4d/op/detect/rcnn.py | 2 +- vis4d/op/detect/rpn.py | 4 +- vis4d/op/{ => detect}/typing.py | 2 +- vis4d/op/detect/yolox.py | 2 +- vis4d/op/detect3d/qd_3dt.py | 2 +- vis4d/op/fpp/dla_up.py | 3 +- vis4d/op/fpp/yolox_pafpn.py | 3 +- vis4d/op/layer/__init__.py | 26 +---- vis4d/op/layer/attention.py | 8 +- vis4d/op/layer/ms_deform_attn.py | 190 ++++++++++++++++++++++++++++++- vis4d/op/track/qdtrack.py | 2 +- 15 files changed, 208 insertions(+), 47 deletions(-) rename vis4d/op/{ => detect}/typing.py (94%) diff --git a/vis4d/op/base/csp_darknet.py b/vis4d/op/base/csp_darknet.py index 0ad36e368..830d171f3 100644 --- a/vis4d/op/base/csp_darknet.py +++ b/vis4d/op/base/csp_darknet.py @@ -12,7 +12,8 @@ from torch import nn from torch.nn.modules.batchnorm import _BatchNorm -from vis4d.op.layer import Conv2d, CSPLayer +from vis4d.op.layer.conv2d import Conv2d +from vis4d.op.layer.csp_layer import CSPLayer class Focus(nn.Module): diff --git a/vis4d/op/base/vit.py b/vis4d/op/base/vit.py index 0948d2a05..186a36f4c 100644 --- a/vis4d/op/base/vit.py +++ b/vis4d/op/base/vit.py @@ -6,7 +6,9 @@ from timm.models import named_apply from torch import nn -from ..layer import PatchEmbed, TransformerBlock +from vis4d.op.layer.patch_embed import PatchEmbed +from vis4d.op.layer.transformer import TransformerBlock + from .base import BaseModel diff --git a/vis4d/op/detect/faster_rcnn.py b/vis4d/op/detect/faster_rcnn.py index f69a7dd5c..08062d72b 100644 --- a/vis4d/op/detect/faster_rcnn.py +++ b/vis4d/op/detect/faster_rcnn.py @@ -17,9 +17,9 @@ match_and_sample_proposals, ) -from ..typing import Proposals, Targets from .rcnn import RCNNHead, RCNNOut from .rpn import 
RPN2RoI, RPNHead, RPNOut +from .typing import Proposals, Targets class FRCNNOut(NamedTuple): diff --git a/vis4d/op/detect/mask_rcnn.py b/vis4d/op/detect/mask_rcnn.py index 924961a29..c4a6d37ee 100644 --- a/vis4d/op/detect/mask_rcnn.py +++ b/vis4d/op/detect/mask_rcnn.py @@ -13,7 +13,7 @@ from vis4d.op.box.poolers import MultiScaleRoIAlign from vis4d.op.mask.util import paste_masks_in_image, remove_overlap -from ..typing import Proposals, Targets +from .typing import Proposals, Targets class MaskRCNNHeadOut(NamedTuple): diff --git a/vis4d/op/detect/rcnn.py b/vis4d/op/detect/rcnn.py index ddac6ae9e..bb4fa5cbd 100644 --- a/vis4d/op/detect/rcnn.py +++ b/vis4d/op/detect/rcnn.py @@ -15,7 +15,7 @@ from vis4d.op.box.encoder import DeltaXYWHBBoxDecoder, DeltaXYWHBBoxEncoder from vis4d.op.box.poolers import MultiScaleRoIAlign from vis4d.op.detect.common import DetOut -from vis4d.op.layer import add_conv_branch +from vis4d.op.layer.conv2d import add_conv_branch from vis4d.op.layer.weight_init import kaiming_init, normal_init, xavier_init from vis4d.op.loss.common import l1_loss from vis4d.op.loss.reducer import SumWeightedLoss diff --git a/vis4d/op/detect/rpn.py b/vis4d/op/detect/rpn.py index ab81fb388..ae646d47b 100644 --- a/vis4d/op/detect/rpn.py +++ b/vis4d/op/detect/rpn.py @@ -16,11 +16,11 @@ from vis4d.op.box.encoder import DeltaXYWHBBoxDecoder, DeltaXYWHBBoxEncoder from vis4d.op.box.matchers import Matcher, MaxIoUMatcher from vis4d.op.box.samplers import RandomSampler, Sampler +from vis4d.op.layer.conv2d import Conv2d from vis4d.op.loss.common import l1_loss -from ..layer import Conv2d -from ..typing import Proposals from .dense_anchor import DenseAnchorHeadLoss, DenseAnchorHeadLosses +from .typing import Proposals class RPNOut(NamedTuple): diff --git a/vis4d/op/typing.py b/vis4d/op/detect/typing.py similarity index 94% rename from vis4d/op/typing.py rename to vis4d/op/detect/typing.py index cb1330561..5c9dc21d2 100644 --- a/vis4d/op/typing.py +++ 
b/vis4d/op/detect/typing.py @@ -1,4 +1,4 @@ -"""Vis4D op typing.""" +"""Detect op typing.""" from __future__ import annotations diff --git a/vis4d/op/detect/yolox.py b/vis4d/op/detect/yolox.py index e70ebcc84..7854aa972 100644 --- a/vis4d/op/detect/yolox.py +++ b/vis4d/op/detect/yolox.py @@ -20,7 +20,7 @@ from vis4d.op.box.encoder import YOLOXBBoxDecoder from vis4d.op.box.matchers import SimOTAMatcher from vis4d.op.box.samplers import PseudoSampler -from vis4d.op.layer import Conv2d +from vis4d.op.layer.conv2d import Conv2d from vis4d.op.layer.weight_init import bias_init_with_prob from vis4d.op.loss import IoULoss from vis4d.op.loss.reducer import SumWeightedLoss diff --git a/vis4d/op/detect3d/qd_3dt.py b/vis4d/op/detect3d/qd_3dt.py index 1891f4621..d92370b8e 100644 --- a/vis4d/op/detect3d/qd_3dt.py +++ b/vis4d/op/detect3d/qd_3dt.py @@ -18,7 +18,7 @@ match_and_sample_proposals, ) from vis4d.op.geometry.rotation import generate_rotation_output -from vis4d.op.layer import Conv2d, add_conv_branch +from vis4d.op.layer.conv2d import Conv2d, add_conv_branch from vis4d.op.layer.weight_init import kaiming_init, xavier_init from vis4d.op.loss.base import Loss from vis4d.op.loss.common import rotation_loss, smooth_l1_loss diff --git a/vis4d/op/fpp/dla_up.py b/vis4d/op/fpp/dla_up.py index bf66bfff5..966280d7c 100644 --- a/vis4d/op/fpp/dla_up.py +++ b/vis4d/op/fpp/dla_up.py @@ -12,8 +12,9 @@ from torch import nn from vis4d.common import NDArrayI64 +from vis4d.op.layer.conv2d import Conv2d +from vis4d.op.layer.deform_conv import DeformConv -from ..layer import Conv2d, DeformConv from .base import FeaturePyramidProcessing diff --git a/vis4d/op/fpp/yolox_pafpn.py b/vis4d/op/fpp/yolox_pafpn.py index eca8fb341..ce5794f35 100644 --- a/vis4d/op/fpp/yolox_pafpn.py +++ b/vis4d/op/fpp/yolox_pafpn.py @@ -10,7 +10,8 @@ import torch from torch import nn -from vis4d.op.layer import Conv2d, CSPLayer +from vis4d.op.layer.conv2d import Conv2d +from vis4d.op.layer.csp_layer import CSPLayer 
from .base import FeaturePyramidProcessing diff --git a/vis4d/op/layer/__init__.py b/vis4d/op/layer/__init__.py index d11ec9a19..7baaa368a 100644 --- a/vis4d/op/layer/__init__.py +++ b/vis4d/op/layer/__init__.py @@ -1,25 +1 @@ -"""Init layers module.""" - -from .attention import Attention -from .conv2d import Conv2d, UnetDownConv, UnetUpConv, add_conv_branch -from .csp_layer import CSPLayer -from .deform_conv import DeformConv -from .drop import DropPath -from .mlp import ResnetBlockFC, TransformerBlockMLP -from .patch_embed import PatchEmbed -from .transformer import TransformerBlock - -__all__ = [ - "Conv2d", - "add_conv_branch", - "CSPLayer", - "DeformConv", - "ResnetBlockFC", - "UnetDownConv", - "UnetUpConv", - "Attention", - "PatchEmbed", - "DropPath", - "TransformerBlockMLP", - "TransformerBlock", -] +"""layers op module.""" diff --git a/vis4d/op/layer/attention.py b/vis4d/op/layer/attention.py index f3ba881e5..f07212d0e 100644 --- a/vis4d/op/layer/attention.py +++ b/vis4d/op/layer/attention.py @@ -163,12 +163,12 @@ def forward( value (Tensor): The value tensor with same shape as `key`. Same in `nn.MultiheadAttention.forward`. Defaults to None. If None, the `key` will be used. - identity (Tensor): This tensor, with the same shape as x, + identity (Tensor): This tensor, with the same shape as query, will be used for the identity link. - If None, `x` will be used. Defaults to None. + If None, `query` will be used. Defaults to None. query_pos (Tensor): The positional encoding for query, with - the same shape as `x`. If not None, it will - be added to `x` before forward function. Defaults to None. + the same shape as `query`. If not None, it will + be added to `query` before forward function. Defaults to None. key_pos (Tensor): The positional encoding for `key`, with the same shape as `key`. Defaults to None. If not None, it will be added to `key` before forward function. 
If None, and diff --git a/vis4d/op/layer/ms_deform_attn.py b/vis4d/op/layer/ms_deform_attn.py index 1ee8a7532..cd91ec0e4 100644 --- a/vis4d/op/layer/ms_deform_attn.py +++ b/vis4d/op/layer/ms_deform_attn.py @@ -193,7 +193,10 @@ def is_power_of_2(number: int) -> None: class MSDeformAttention(nn.Module): - """Multi-Scale Deformable Attention Module.""" + """Multi-Scale Deformable Attention Module. + + This is the original implementation from Deformable DETR. + """ def __init__( self, @@ -228,10 +231,6 @@ def __init__( self.n_points = n_points self.im2col_step = im2col_step - # Aligned Attributes to MHA - self.embed_dims = d_model - self.num_heads = n_heads - self.sampling_offsets = nn.Linear( d_model, n_heads * n_levels * n_points * 2 ) @@ -381,3 +380,184 @@ def __call__( input_level_start_index, input_padding_mask, ) + + +class MultiScaleDeformableAttention(nn.Module): + """A wrapper for ``MSDeformAttention``. + + This module implements MSDeformAttention with identity connection, + and positional encoding is also passed as input. + """ + + def __init__( + self, + embed_dims: int = 256, + num_heads: int = 8, + num_levels: int = 4, + num_points: int = 4, + im2col_step: int = 64, + dropout: float = 0.0, + ) -> None: + """Init.""" + super().__init__() + if embed_dims % num_heads != 0: + raise ValueError( + "embed_dims must be divisible by num_heads, but got " + + f"{embed_dims} and {num_heads}." 
+ ) + + is_power_of_2(embed_dims // num_heads) + + self.embed_dims = embed_dims + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points = num_points + self.im2col_step = im2col_step + + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2 + ) + self.attention_weights = nn.Linear( + embed_dims, num_heads * num_levels * num_points + ) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + + self.dropout = nn.Dropout(dropout) + + self._init_weights() + + def _init_weights(self) -> None: + """Initialize weights.""" + constant_(self.sampling_offsets.weight.data, 0.0) + thetas = torch.mul( + torch.arange(self.num_heads, dtype=torch.float32), + (2.0 * math.pi / self.num_heads), + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(self.num_heads, 1, 1, 2) + .repeat(1, self.num_levels, self.num_points, 1) + ) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + constant_(self.attention_weights.weight.data, 0.0) + constant_(self.attention_weights.bias.data, 0.0) + xavier_uniform_(self.value_proj.weight.data) + constant_(self.value_proj.bias.data, 0.0) + xavier_uniform_(self.output_proj.weight.data) + constant_(self.output_proj.bias.data, 0.0) + + def forward( + self, + query: Tensor, + reference_points: Tensor, + input_flatten: Tensor, + input_spatial_shapes: Tensor, + input_level_start_index: Tensor, + query_pos: Tensor | None = None, + identity: Tensor | None = None, + input_padding_mask: Tensor | None = None, + ) -> Tensor: + r"""Forward function. + + Args: + query (Tensor): The input query with shape [bs, num_queries, + embed_dims]. 
+ reference_points (Tensor): (bs, num_queries, num_levels, 2), + range in [0, 1], top-left (0,0), bottom-right (1, 1), including + padding area or (bs, num_queries, num_levels, 4), add + additional (w, h) to form reference boxes. + input_flatten (Tensor): (bs, \sum_{l=0}^{L-1} H_l \cdot W_l, C). + input_spatial_shapes (Tensor): (num_levels, 2), [(H_0, W_0), + (H_1, W_1), ..., (H_{L-1}, W_{L-1})]. + input_level_start_index (Tensor): (num_levels, ), [0, H_0*W_0, + H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., + H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]. + query_pos (Tensor | None): The positional encoding for query, with + the same shape as `query`. If not None, it will + be added to `query` before forward function. Defaults to None. + identity (Tensor | None): With the same shape as query, it will be + used for the identity link. If None, `query` will be used. + Defaults to None. + input_padding_mask (Tensor): (bs, \sum_{l=0}^{L-1} H_l \cdot W_l), + True for padding elements, False for non-padding elements. + + Returns: + output (Tensor): (bs, num_queries, C). 
+ """ + if identity is None: + identity = query + + if query_pos is not None: + query = query + query_pos + + n, len_q, _ = query.shape + n, len_in, _ = input_flatten.shape + assert ( + input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1] + ).sum() == len_in + + value = self.value_proj(input_flatten) + if input_padding_mask is not None: + value = value.masked_fill(input_padding_mask[..., None], float(0)) + value = value.view( + n, len_in, self.num_heads, self.embed_dims // self.num_heads + ) + sampling_offsets = self.sampling_offsets(query).view( + n, len_q, self.num_heads, self.num_levels, self.num_points, 2 + ) + attention_weights = self.attention_weights(query).view( + n, len_q, self.num_heads, self.num_levels * self.num_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + n, len_q, self.num_heads, self.num_levels, self.num_points + ) + # n, len_q, num_heads, num_levels, num_points, 2 + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], + -1, + ) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets + / offset_normalizer[None, None, None, :, None, :] + ) + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets + / self.num_points + * reference_points[:, :, None, :, None, 2:] + * 0.5 + ) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, " + + f"but get {reference_points.shape[-1]} instead." 
+ ) + + if torch.cuda.is_available() and value.is_cuda: + output = MSDeformAttentionFunction.apply( + value, + input_spatial_shapes, + input_level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + else: + output = ms_deformable_attention_cpu( + value, + input_spatial_shapes, + sampling_locations, + attention_weights, + ) + + output = self.output_proj(output) + + return self.dropout(output) + identity diff --git a/vis4d/op/track/qdtrack.py b/vis4d/op/track/qdtrack.py index bbc46c2f2..45672b981 100644 --- a/vis4d/op/track/qdtrack.py +++ b/vis4d/op/track/qdtrack.py @@ -12,7 +12,7 @@ from vis4d.op.box.matchers.max_iou import MaxIoUMatcher from vis4d.op.box.poolers import MultiScaleRoIAlign, MultiScaleRoIPooler from vis4d.op.box.samplers import CombinedSampler, match_and_sample_proposals -from vis4d.op.layer import add_conv_branch +from vis4d.op.layer.conv2d import add_conv_branch from vis4d.op.loss import EmbeddingDistanceLoss, MultiPosCrossEntropyLoss from .assignment import TrackIDCounter, greedy_assign From d3864ef8f21d8bc817906f3303d0956b2dba6c39 Mon Sep 17 00:00:00 2001 From: RoyYang0714 Date: Tue, 5 Aug 2025 18:12:24 +0200 Subject: [PATCH 2/3] fix: Refactor common api. 
--- vis4d/common/__init__.py | 36 ----------------------- vis4d/common/ckpt.py | 2 +- vis4d/common/dict.py | 2 +- vis4d/common/distributed.py | 2 +- vis4d/data/datasets/base.py | 2 +- vis4d/data/datasets/coco.py | 2 +- vis4d/engine/callbacks/evaluator.py | 2 +- vis4d/engine/callbacks/logging.py | 2 +- vis4d/engine/callbacks/visualizer.py | 2 +- vis4d/engine/callbacks/yolox_callbacks.py | 2 +- vis4d/engine/run.py | 2 +- vis4d/engine/trainer.py | 2 +- vis4d/eval/common/seg.py | 8 +++-- vis4d/eval/shift/multitask_writer.py | 8 +++-- vis4d/model/cls/vit.py | 2 +- vis4d/model/detect/retinanet.py | 2 +- vis4d/op/box/poolers/roi_pooler.py | 2 +- vis4d/op/box/samplers/combined.py | 2 +- vis4d/op/box/samplers/random.py | 2 +- vis4d/op/detect/dense_anchor.py | 2 +- vis4d/op/detect/yolox.py | 2 +- vis4d/op/fpp/dla_up.py | 2 +- vis4d/state/track3d/motion/lstm_3d.py | 2 +- vis4d/zoo/run.py | 2 +- 24 files changed, 33 insertions(+), 61 deletions(-) diff --git a/vis4d/common/__init__.py b/vis4d/common/__init__.py index 72476b68b..07af59721 100644 --- a/vis4d/common/__init__.py +++ b/vis4d/common/__init__.py @@ -1,37 +1 @@ """Contains common functions and types that are used across modules.""" - -from .typing import ( - ArgsType, - DictStrAny, - DictStrArrNested, - GenericFunc, - ListAny, - LossesType, - MetricLogs, - ModelOutput, - NDArrayF32, - NDArrayF64, - NDArrayI64, - NDArrayNumber, - NDArrayUI8, - TorchCheckpoint, - TorchLossFunc, -) - -__all__ = [ - "DictStrAny", - "DictStrArrNested", - "ModelOutput", - "ArgsType", - "NDArrayF32", - "NDArrayF64", - "NDArrayI64", - "NDArrayUI8", - "NDArrayNumber", - "MetricLogs", - "TorchCheckpoint", - "LossesType", - "TorchLossFunc", - "GenericFunc", - "ListAny", -] diff --git a/vis4d/common/ckpt.py b/vis4d/common/ckpt.py index d5e36faa3..6c610ac53 100644 --- a/vis4d/common/ckpt.py +++ b/vis4d/common/ckpt.py @@ -15,7 +15,6 @@ from torch import nn from torch.hub import load_state_dict_from_url as load_url -from vis4d.common import 
TorchCheckpoint from vis4d.common.distributed import ( get_rank, get_world_size, @@ -23,6 +22,7 @@ synchronize, ) from vis4d.common.logging import rank_zero_info, rank_zero_warn +from vis4d.common.typing import TorchCheckpoint CheckpointLoadFunc = Callable[ [str, Union[str, torch.device, None]], TorchCheckpoint diff --git a/vis4d/common/dict.py b/vis4d/common/dict.py index 1ea0bc8a4..da2746926 100644 --- a/vis4d/common/dict.py +++ b/vis4d/common/dict.py @@ -4,7 +4,7 @@ from typing import Any -from vis4d.common import DictStrAny +from vis4d.common.typing import DictStrAny def flatten_dict(dictionary: DictStrAny, seperator: str) -> list[str]: diff --git a/vis4d/common/distributed.py b/vis4d/common/distributed.py index 6511dc444..34fdebc66 100644 --- a/vis4d/common/distributed.py +++ b/vis4d/common/distributed.py @@ -17,7 +17,7 @@ from torch.distributed import broadcast_object_list from torch.nn.parallel import DataParallel, DistributedDataParallel -from vis4d.common import ArgsType, DictStrAny, GenericFunc +from vis4d.common.typing import ArgsType, DictStrAny, GenericFunc # no coverage for these functions, since we don't unittest distributed setting diff --git a/vis4d/data/datasets/base.py b/vis4d/data/datasets/base.py index 208a99cad..a0647f262 100644 --- a/vis4d/data/datasets/base.py +++ b/vis4d/data/datasets/base.py @@ -12,7 +12,7 @@ from torch.utils.data import Dataset as TorchDataset -from vis4d.common import ArgsType +from vis4d.common.typing import ArgsType from vis4d.data.io.base import DataBackend from vis4d.data.io.file import FileBackend from vis4d.data.typing import DictData diff --git a/vis4d/data/datasets/coco.py b/vis4d/data/datasets/coco.py index 30f183002..44d60528d 100644 --- a/vis4d/data/datasets/coco.py +++ b/vis4d/data/datasets/coco.py @@ -11,7 +11,7 @@ import pycocotools.mask as maskUtils from pycocotools.coco import COCO as COCOAPI -from vis4d.common import ArgsType, DictStrAny +from vis4d.common.typing import ArgsType, DictStrAny from 
vis4d.data.const import CommonKeys as K from vis4d.data.typing import DictData diff --git a/vis4d/engine/callbacks/evaluator.py b/vis4d/engine/callbacks/evaluator.py index e79e746a0..2430edf10 100644 --- a/vis4d/engine/callbacks/evaluator.py +++ b/vis4d/engine/callbacks/evaluator.py @@ -7,7 +7,6 @@ import lightning.pytorch as pl -from vis4d.common import ArgsType, MetricLogs from vis4d.common.distributed import ( all_gather_object_cpu, broadcast, @@ -15,6 +14,7 @@ synchronize, ) from vis4d.common.logging import rank_zero_info +from vis4d.common.typing import ArgsType, MetricLogs from vis4d.data.typing import DictData from vis4d.eval.base import Evaluator diff --git a/vis4d/engine/callbacks/logging.py b/vis4d/engine/callbacks/logging.py index 01d7b70b9..86fcb6e58 100644 --- a/vis4d/engine/callbacks/logging.py +++ b/vis4d/engine/callbacks/logging.py @@ -7,10 +7,10 @@ import lightning.pytorch as pl -from vis4d.common import ArgsType, MetricLogs from vis4d.common.logging import rank_zero_info from vis4d.common.progress import compose_log_str from vis4d.common.time import Timer +from vis4d.common.typing import ArgsType, MetricLogs from .base import Callback diff --git a/vis4d/engine/callbacks/visualizer.py b/vis4d/engine/callbacks/visualizer.py index e6b24ef55..30179597f 100644 --- a/vis4d/engine/callbacks/visualizer.py +++ b/vis4d/engine/callbacks/visualizer.py @@ -7,8 +7,8 @@ import lightning.pytorch as pl -from vis4d.common import ArgsType from vis4d.common.distributed import broadcast, synchronize +from vis4d.common.typing import ArgsType from vis4d.vis.base import Visualizer from .base import Callback diff --git a/vis4d/engine/callbacks/yolox_callbacks.py b/vis4d/engine/callbacks/yolox_callbacks.py index dfd63ad3b..868e825dc 100644 --- a/vis4d/engine/callbacks/yolox_callbacks.py +++ b/vis4d/engine/callbacks/yolox_callbacks.py @@ -13,7 +13,6 @@ from torch.nn.modules.batchnorm import _NormBase from torch.utils.data import DataLoader -from vis4d.common import 
ArgsType, DictStrAny from vis4d.common.distributed import ( all_reduce_dict, broadcast, @@ -22,6 +21,7 @@ synchronize, ) from vis4d.common.logging import rank_zero_info, rank_zero_warn +from vis4d.common.typing import ArgsType, DictStrAny from vis4d.data.const import CommonKeys as K from vis4d.data.data_pipe import DataPipe from vis4d.op.detect.yolox import YOLOXHeadLoss diff --git a/vis4d/engine/run.py b/vis4d/engine/run.py index de8e731f8..d7de4c7a7 100644 --- a/vis4d/engine/run.py +++ b/vis4d/engine/run.py @@ -9,8 +9,8 @@ from absl import app # pylint: disable=no-name-in-module from torch.utils.collect_env import get_pretty_env_info -from vis4d.common import ArgsType from vis4d.common.logging import dump_config, rank_zero_info, setup_logger +from vis4d.common.typing import ArgsType from vis4d.common.util import set_tf32 from vis4d.config import instantiate_classes from vis4d.config.typing import ExperimentConfig diff --git a/vis4d/engine/trainer.py b/vis4d/engine/trainer.py index 5aadf210d..616e593e5 100644 --- a/vis4d/engine/trainer.py +++ b/vis4d/engine/trainer.py @@ -11,9 +11,9 @@ from lightning.pytorch.loggers.wandb import WandbLogger from lightning.pytorch.strategies.ddp import DDPStrategy -from vis4d.common import ArgsType from vis4d.common.imports import TENSORBOARD_AVAILABLE from vis4d.common.logging import rank_zero_info +from vis4d.common.typing import ArgsType class PLTrainer(Trainer): diff --git a/vis4d/eval/common/seg.py b/vis4d/eval/common/seg.py index 9d067acf4..28817a89c 100644 --- a/vis4d/eval/common/seg.py +++ b/vis4d/eval/common/seg.py @@ -5,9 +5,13 @@ import numpy as np from terminaltables import AsciiTable -from vis4d.common import MetricLogs from vis4d.common.array import array_to_numpy -from vis4d.common.typing import ArrayLike, NDArrayI64, NDArrayNumber +from vis4d.common.typing import ( + ArrayLike, + MetricLogs, + NDArrayI64, + NDArrayNumber, +) from vis4d.eval.base import Evaluator diff --git a/vis4d/eval/shift/multitask_writer.py 
b/vis4d/eval/shift/multitask_writer.py index f94a01643..e6d228bed 100644 --- a/vis4d/eval/shift/multitask_writer.py +++ b/vis4d/eval/shift/multitask_writer.py @@ -11,10 +11,14 @@ import numpy as np from PIL import Image -from vis4d.common import MetricLogs from vis4d.common.array import array_to_numpy from vis4d.common.imports import SCALABEL_AVAILABLE -from vis4d.common.typing import ArrayLike, GenericFunc, NDArrayNumber +from vis4d.common.typing import ( + ArrayLike, + GenericFunc, + MetricLogs, + NDArrayNumber, +) from vis4d.data.datasets.shift import shift_det_map from vis4d.data.io import DataBackend, ZipBackend from vis4d.eval.base import Evaluator diff --git a/vis4d/model/cls/vit.py b/vis4d/model/cls/vit.py index dea3cc4fc..ea4e6366c 100644 --- a/vis4d/model/cls/vit.py +++ b/vis4d/model/cls/vit.py @@ -6,8 +6,8 @@ import torch from torch import nn -from vis4d.common import ArgsType from vis4d.common.ckpt import load_model_checkpoint +from vis4d.common.typing import ArgsType from vis4d.op.base.vit import VisionTransformer, ViT_PRESET from .common import ClsOut diff --git a/vis4d/model/detect/retinanet.py b/vis4d/model/detect/retinanet.py index 4ae88643b..204bd8666 100644 --- a/vis4d/model/detect/retinanet.py +++ b/vis4d/model/detect/retinanet.py @@ -4,8 +4,8 @@ from torch import Tensor, nn -from vis4d.common import LossesType from vis4d.common.ckpt import load_model_checkpoint +from vis4d.common.typing import LossesType from vis4d.op.base.resnet import ResNet from vis4d.op.box.anchor import AnchorGenerator from vis4d.op.box.box2d import scale_and_clip_boxes diff --git a/vis4d/op/box/poolers/roi_pooler.py b/vis4d/op/box/poolers/roi_pooler.py index be459f13a..12a133a0e 100644 --- a/vis4d/op/box/poolers/roi_pooler.py +++ b/vis4d/op/box/poolers/roi_pooler.py @@ -8,7 +8,7 @@ import torch from torchvision.ops import roi_align, roi_pool -from vis4d.common import ArgsType +from vis4d.common.typing import ArgsType from .base import RoIPooler from .utils import 
assign_boxes_to_levels, boxes_to_tensor diff --git a/vis4d/op/box/samplers/combined.py b/vis4d/op/box/samplers/combined.py index e94ea0076..622f7591f 100644 --- a/vis4d/op/box/samplers/combined.py +++ b/vis4d/op/box/samplers/combined.py @@ -5,7 +5,7 @@ import torch from torch import Tensor -from vis4d.common import ArgsType +from vis4d.common.typing import ArgsType from ..box2d import non_intersection, random_choice from ..matchers.base import MatchResult diff --git a/vis4d/op/box/samplers/random.py b/vis4d/op/box/samplers/random.py index 6e181def5..d0bc6d202 100644 --- a/vis4d/op/box/samplers/random.py +++ b/vis4d/op/box/samplers/random.py @@ -4,7 +4,7 @@ import torch -from vis4d.common import ArgsType +from vis4d.common.typing import ArgsType from ..matchers.base import MatchResult from .base import Sampler, SamplingResult diff --git a/vis4d/op/detect/dense_anchor.py b/vis4d/op/detect/dense_anchor.py index 74816cbc8..7d8dd92ee 100644 --- a/vis4d/op/detect/dense_anchor.py +++ b/vis4d/op/detect/dense_anchor.py @@ -8,7 +8,7 @@ import torch.nn.functional as F from torch import Tensor, nn -from vis4d.common import TorchLossFunc +from vis4d.common.typing import TorchLossFunc from vis4d.op.box.anchor import AnchorGenerator, anchor_inside_image from vis4d.op.box.encoder import DeltaXYWHBBoxEncoder from vis4d.op.box.matchers import Matcher diff --git a/vis4d/op/detect/yolox.py b/vis4d/op/detect/yolox.py index 7854aa972..28ecbca80 100644 --- a/vis4d/op/detect/yolox.py +++ b/vis4d/op/detect/yolox.py @@ -14,8 +14,8 @@ from torch import Tensor, nn from torchvision.ops import batched_nms -from vis4d.common import TorchLossFunc from vis4d.common.distributed import reduce_mean +from vis4d.common.typing import TorchLossFunc from vis4d.op.box.anchor import MlvlPointGenerator from vis4d.op.box.encoder import YOLOXBBoxDecoder from vis4d.op.box.matchers import SimOTAMatcher diff --git a/vis4d/op/fpp/dla_up.py b/vis4d/op/fpp/dla_up.py index 966280d7c..7e37cacf8 100644 --- 
a/vis4d/op/fpp/dla_up.py +++ b/vis4d/op/fpp/dla_up.py @@ -11,7 +11,7 @@ import torch from torch import nn -from vis4d.common import NDArrayI64 +from vis4d.common.typing import NDArrayI64 from vis4d.op.layer.conv2d import Conv2d from vis4d.op.layer.deform_conv import DeformConv diff --git a/vis4d/state/track3d/motion/lstm_3d.py b/vis4d/state/track3d/motion/lstm_3d.py index 9c2f8e77b..d943e7c1c 100644 --- a/vis4d/state/track3d/motion/lstm_3d.py +++ b/vis4d/state/track3d/motion/lstm_3d.py @@ -5,7 +5,7 @@ import torch from torch import Tensor, nn -from vis4d.common import ArgsType +from vis4d.common.typing import ArgsType from vis4d.model.motion.velo_lstm import VeloLSTM from vis4d.op.geometry.rotation import acute_angle, normalize_angle diff --git a/vis4d/zoo/run.py b/vis4d/zoo/run.py index 6251740a0..a53b8d955 100644 --- a/vis4d/zoo/run.py +++ b/vis4d/zoo/run.py @@ -4,7 +4,7 @@ from absl import app # pylint: disable=no-name-in-module -from vis4d.common import ArgsType +from vis4d.common.typing import ArgsType from vis4d.zoo import AVAILABLE_MODELS From 028c11aafe5e548bc441a9c80a861e7a793aa6bf Mon Sep 17 00:00:00 2001 From: RoyYang0714 Date: Tue, 5 Aug 2025 18:19:37 +0200 Subject: [PATCH 3/3] feat: Change version. 
--- pyproject.toml | 2 +- vis4d/__init__.py | 2 +- vis4d/op/layer/mlp.py | 67 ------------------------------------------- 3 files changed, 2 insertions(+), 69 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6cc6bd5ab..7f4a02dff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -113,7 +113,7 @@ plugins = ["numpy.typing.mypy_plugin"] [project] name = "vis4d" -version = "0.1.3" +version = "0.1.4" authors = [{name = "Vis4D Team"}] description = "Vis4D Python package for Visual 4D scene understanding" readme = "README.md" diff --git a/vis4d/__init__.py b/vis4d/__init__.py index 39b70481b..04da26f57 100644 --- a/vis4d/__init__.py +++ b/vis4d/__init__.py @@ -7,7 +7,7 @@ import logging -__version__ = "0.1.3" +__version__ = "0.1.4" _root_logger = logging.getLogger() _logger = logging.getLogger(__name__) diff --git a/vis4d/op/layer/mlp.py b/vis4d/op/layer/mlp.py index 76fff1738..ea157511a 100644 --- a/vis4d/op/layer/mlp.py +++ b/vis4d/op/layer/mlp.py @@ -2,76 +2,9 @@ from __future__ import annotations -import torch from torch import Tensor, nn -class ResnetBlockFC(nn.Module): - """Fully connected ResNet Block consisting of two linear layers.""" - - def __init__( - self, - size_in: int, - size_out: int | None = None, - size_h: int | None = None, - ): - """Fully connected ResNet Block consisting of two linear layers. 
- - Args: - size_in: (int) input dimension - size_out: Optional(int) output dimension, - if not specified same as input - size_h: Optional(int) hidden dimension, - if not specfied same as min(in,out) - """ - super().__init__() - # Attributes - if size_out is None: - size_out = size_in - - if size_h is None: - size_h = min(size_in, size_out) - - self.size_in = size_in - self.size_h = size_h - self.size_out = size_out - # Submodules - self.fc_0 = nn.Linear(size_in, size_h) - self.fc_1 = nn.Linear(size_h, size_out) - self.actvn = nn.ReLU() - - if size_in == size_out: - self.shortcut = None - else: - self.shortcut = nn.Linear(size_in, size_out, bias=False) - # Initialization - nn.init.zeros_(self.fc_1.weight) - - def __call__(self, data: torch.Tensor) -> torch.Tensor: - """Applies the layer. - - Args: - data: (tensor) input shape [N, C] - """ - return self._call_impl(data) - - def forward(self, data: torch.Tensor) -> torch.Tensor: - """Applies the layer. - - Args: - data: (tensor) input shape [N, C] - """ - net = self.fc_0(self.actvn(data)) - dx = self.fc_1(self.actvn(net)) - - if self.shortcut is not None: - x_s = self.shortcut(data) - else: - x_s = data - - return x_s + dx - - class TransformerBlockMLP(nn.Module): """MLP as used in Vision Transformer, MLP-Mixer and related networks."""