From 2484e0f38ff2134e28a327d9a2c41fddfe9dea4f Mon Sep 17 00:00:00 2001
From: "d.savchenkov" <d.savchenkov@partner.samsung.com>
Date: Tue, 24 Feb 2026 18:14:58 +0300
Subject: [PATCH] [quantization] Introduce QuantConv3dDecomposed wrapper for
 Conv3d

This change introduces QuantConv3dDecomposed wrapper to support post-training quantization of Conv3d operation that uses Conv2d and Add operations internally.

TICO-DCO-1.0-Signed-off-by: d.savchenkov <d.savchenkov@partner.samsung.com>
---
 .../wrapq/wrappers/nn/test_quant_conv3d.py    |   3 +
 .../nn/test_quant_conv3d_decomposed.py        | 600 ++++++++++++++++++
 .../qwen_vl/test_quant_vision_patch_embed.py  |   6 +-
 .../wrapq/examples/nn/quantize_conv3d.py      | 129 ++++
 .../nn/quantize_conv3d_special_case.py        | 134 ++++
 .../wrappers/nn/quant_conv3d_decomposed.py    | 395 ++++++++++++
 tico/quantization/wrapq/wrappers/registry.py  |   2 +-
 7 files changed, 1265 insertions(+), 4 deletions(-)
 create mode 100644 test/quantization/wrapq/wrappers/nn/test_quant_conv3d_decomposed.py
 create mode 100644 tico/quantization/wrapq/examples/nn/quantize_conv3d.py
 create mode 100644 tico/quantization/wrapq/examples/nn/quantize_conv3d_special_case.py
 create mode 100644 tico/quantization/wrapq/wrappers/nn/quant_conv3d_decomposed.py

diff --git a/test/quantization/wrapq/wrappers/nn/test_quant_conv3d.py b/test/quantization/wrapq/wrappers/nn/test_quant_conv3d.py
index 718ff66a..cf61d12f 100644
--- a/test/quantization/wrapq/wrappers/nn/test_quant_conv3d.py
+++ b/test/quantization/wrapq/wrappers/nn/test_quant_conv3d.py
@@ -24,6 +24,9 @@
 from tico.quantization.wrapq.wrappers.nn.quant_conv3d import QuantConv3d
 
 
+@unittest.skip(
+    "This test is skipped becuse QuantConv3d is not currently used to wrap Conv3d"
+)
 class TestQuantConv3d(unittest.TestCase):
     def setUp(self):
         torch.manual_seed(0)
diff --git a/test/quantization/wrapq/wrappers/nn/test_quant_conv3d_decomposed.py b/test/quantization/wrapq/wrappers/nn/test_quant_conv3d_decomposed.py
new file mode 100644
index 00000000..dadc5f0f
--- /dev/null
+++ b/test/quantization/wrapq/wrappers/nn/test_quant_conv3d_decomposed.py
@@ -0,0 +1,600 @@
+# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from tico.quantization.config.ptq import PTQConfig
+from tico.quantization.wrapq.dtypes import DType
+from tico.quantization.wrapq.mode import Mode
+from tico.quantization.wrapq.wrappers.nn.quant_conv3d_decomposed import (
+    QuantConv3dDecomposed,
+)
+
+
+class TestQuantConv3dDecomposed(unittest.TestCase):
+    def setUp(self):
+        torch.manual_seed(42)
+
+        # Create a simple Conv3d module (matches Qwen3-VL patch embed structure)
+        self.fp32 = nn.Conv3d(
+            in_channels=3,
+            out_channels=16,
+            kernel_size=(2, 3, 3),
+            stride=(1, 1, 1),
+            padding=(0, 1, 1),
+            bias=True,
+        )
+
+        # Input tensor: (batch, in_channels, depth, height, width)
+        self.x = torch.randn(2, 3, 4, 8, 8)
+
+        # Create quantized wrapper
+        self.q_conv = QuantConv3dDecomposed(self.fp32)
+
+    def test_mode_transitions(self):
+        """Test quantization mode transitions: NO_QUANT → CALIB → QUANT"""
+        # Initially in NO_QUANT mode
+        self.assertIs(self.q_conv._mode, Mode.NO_QUANT)
+
+        # Enable calibration
+        self.q_conv.enable_calibration()
+        _ = self.q_conv(self.x)
+        self.assertIs(self.q_conv._mode, Mode.CALIB)
+
+        # Freeze quantization parameters
+        self.q_conv.freeze_qparams()
+        self.assertIs(self.q_conv._mode, Mode.QUANT)
+
+    def test_decomposition_correctness_no_quant(self):
+        """
+        In NO_QUANT mode, decomposition should match FP32 Conv3d exactly.
+        This verifies the slice+Conv2d+Add logic is correct.
+        """
+        # Create quantized wrapper (stays in NO_QUANT)
+        q_conv = QuantConv3dDecomposed(self.fp32)
+
+        # Run forward pass
+        q_out = q_conv(self.x)
+
+        # Run original Conv3d
+        fp_out = F.conv3d(
+            self.x,
+            self.fp32.weight,
+            self.fp32.bias,
+            stride=self.fp32.stride,
+            padding=self.fp32.padding,
+        )
+
+        # Check shape and values
+        self.assertEqual(q_out.shape, fp_out.shape)
+        self.assertTrue(torch.allclose(q_out, fp_out, atol=1e-6, rtol=1e-6))
+
+    def test_decomposition_various_shapes(self):
+        """Test decomposition correctness across various input shapes."""
+        test_cases = [
+            (1, 3, 4, 8, 8),  # Small input
+            (2, 3, 4, 8, 8),  # Reference shape
+            (4, 3, 4, 8, 8),  # Larger batch
+            (2, 3, 8, 16, 16),  # Larger spatial dimensions
+            (2, 3, 6, 8, 8),  # More frames
+        ]
+
+        for batch, in_ch, depth, height, width in test_cases:
+            with self.subTest(
+                batch=batch, in_ch=in_ch, depth=depth, height=height, width=width
+            ):
+                fp32 = nn.Conv3d(in_ch, 16, (2, 3, 3), padding=(0, 1, 1), bias=True)
+                q_conv = QuantConv3dDecomposed(fp32)
+
+                x = torch.randn(batch, in_ch, depth, height, width)
+
+                q_out = q_conv(x)
+                fp_out = F.conv3d(x, fp32.weight, fp32.bias, padding=(0, 1, 1))
+
+                self.assertEqual(q_out.shape, fp_out.shape)
+                self.assertTrue(torch.allclose(q_out, fp_out, atol=1e-6, rtol=1e-6))
+
+    def test_quantized_output_close(self):
+        """
+        After calibration and freeze, quantized output should:
+        - Differ from FP reference (quantization actually applied)
+        - Stay within reasonable error bounds
+        """
+        # Calibration
+        self.q_conv.enable_calibration()
+        for _ in range(5):
+            _ = self.q_conv(self.x)
+        self.q_conv.freeze_qparams()
+
+        # Compare outputs
+        with torch.no_grad():
+            q_out = self.q_conv(self.x)
+            fp_out = F.conv3d(
+                self.x,
+                self.fp32.weight,
+                self.fp32.bias,
+                stride=self.fp32.stride,
+                padding=self.fp32.padding,
+            )
+
+        diff = (fp_out - q_out).abs().mean().item()
+        self.assertGreater(diff, 0.0, "Quantized output should differ from FP32")
+        self.assertLess(diff, 0.5, "Quantization error should be reasonable")
+
+    def test_dynamic_observers_created(self):
+        """Test that dynamic observers are created during first forward pass."""
+        self.assertFalse(self.q_conv._dynamic_obs_calibrated)
+
+        # Enable calibration
+        self.q_conv.enable_calibration()
+        _ = self.q_conv(self.x)
+
+        # Observers should be created
+        self.assertTrue(self.q_conv._dynamic_obs_calibrated)
+
+        # Check observer counts
+        kT = self.fp32.kernel_size[0]  # 2
+        T_out = (4 + 0 - 1 * (2 - 1) - 1) // 1 + 1  # 3
+
+        self.assertEqual(len(self.q_conv._input_slice_obs), kT)
+        self.assertEqual(len(self.q_conv._conv2d_obs), kT)
+        self.assertEqual(len(self.q_conv._acc_obs), T_out)
+
+    def test_observers_reused_across_calls(self):
+        """Test that observers are reused when input shape doesn't change."""
+        # Calibration
+        self.q_conv.enable_calibration()
+        _ = self.q_conv(self.x)
+
+        # Get initial observer count
+        initial_count = len(self.q_conv._input_slice_obs)
+
+        # Second forward pass with same shape
+        _ = self.q_conv(self.x)
+
+        # Observer count should not change
+        self.assertEqual(len(self.q_conv._input_slice_obs), initial_count)
+
+    def test_per_channel_weight_quantization(self):
+        """
+        Test that per-channel weight quantization produces correct number of scales.
+        """
+        # Calibration
+        self.q_conv.enable_calibration()
+        self.q_conv.obs_weight.compute_qparams()
+        self.q_conv.freeze_qparams()
+
+        # Check that scale/zero_point have correct shape (per output channel)
+        expected_num_channels = self.fp32.out_channels
+        self.assertEqual(
+            self.q_conv.obs_weight._cached_scale.shape[0], expected_num_channels
+        )
+        self.assertEqual(
+            self.q_conv.obs_weight._cached_zp.shape[0], expected_num_channels
+        )
+
+    def test_activation_stats_collected(self):
+        """Test that activation statistics are collected during calibration."""
+        # Calibration
+        self.q_conv.enable_calibration()
+
+        # Run forward pass
+        _ = self.q_conv(self.x)
+
+        # Check that activation observers have collected stats
+        self.assertTrue(self.q_conv.obs_act_in.min_val.numel() > 0)
+        self.assertTrue(self.q_conv.obs_act_out.min_val.numel() > 0)
+
+        # Freeze and check qparams exist
+        self.q_conv.freeze_qparams()
+        self.assertTrue(self.q_conv.obs_act_in.has_qparams)
+        self.assertTrue(self.q_conv.obs_act_out.has_qparams)
+
+    def test_dynamic_activation_stats_collected(self):
+        """Test that dynamic activation observers collect stats."""
+        # Calibration
+        self.q_conv.enable_calibration()
+        _ = self.q_conv(self.x)
+
+        # Check that dynamic observers have collected stats
+        for obs in self.q_conv._input_slice_obs.values():
+            self.assertTrue(obs.min_val.numel() > 0)
+
+        for obs in self.q_conv._conv2d_obs.values():
+            self.assertTrue(obs.min_val.numel() > 0)
+
+        for obs in self.q_conv._acc_obs.values():
+            self.assertTrue(obs.min_val.numel() > 0)
+
+    def test_dtype_override(self):
+        """Test that PTQConfig overrides propagate to observers."""
+        cfg = PTQConfig(
+            default_dtype=DType.uint(8),
+            overrides={
+                "act_in": {"dtype": DType.uint(4)},
+                "act_out": {"dtype": DType.uint(4)},
+                "weight": {"dtype": DType.uint(4)},
+            },
+        )
+
+        qcustom = QuantConv3dDecomposed(self.fp32, qcfg=cfg)
+
+        # Check that overrides were applied
+        self.assertEqual(qcustom.obs_weight.dtype, DType.uint(4))
+        self.assertEqual(qcustom.obs_act_in.dtype, DType.uint(4))
+        self.assertEqual(qcustom.obs_act_out.dtype, DType.uint(4))
+
+    def test_conv3d_without_bias(self):
+        """Test that Conv3d without bias is handled correctly."""
+        fp32_no_bias = nn.Conv3d(3, 16, (2, 3, 3), bias=False)
+        q_conv_no_bias = QuantConv3dDecomposed(fp32_no_bias)
+
+        # Calibration and forward
+        q_conv_no_bias.enable_calibration()
+        _ = q_conv_no_bias(self.x)
+        q_conv_no_bias.freeze_qparams()
+
+        # Should not raise
+        with torch.no_grad():
+            _ = q_conv_no_bias(self.x)
+
+    def test_different_kernel_sizes(self):
+        """Test with various kernel sizes."""
+        kernel_sizes = [
+            (2, 3, 3),  # Temporal kernel = 2 (like Qwen3-VL)
+            (3, 3, 3),  # Standard 3D kernel
+            (1, 3, 3),  # No temporal kernel
+        ]
+
+        for ksize in kernel_sizes:
+            with self.subTest(kernel_size=ksize):
+                fp32 = nn.Conv3d(3, 16, ksize, padding=(ksize[0] // 2, 1, 1))
+                q_conv = QuantConv3dDecomposed(fp32)
+
+                # Test decomposition correctness
+                x = torch.randn(2, 3, 4, 8, 8)
+                q_out = q_conv(x)
+                fp_out = F.conv3d(
+                    x, fp32.weight, fp32.bias, padding=(ksize[0] // 2, 1, 1)
+                )
+
+                self.assertTrue(torch.allclose(q_out, fp_out, atol=1e-6, rtol=1e-6))
+
+    def test_different_padding(self):
+        """
+        Test that different padding schemes produce correct outputs.
+        Covers all branches in _parse_padding method.
+        """
+        # Define test cases: (padding, description)
+        test_cases = [
+            # String-based padding
+            ("same", "String padding='same'"),
+            ("valid", "String padding='valid'"),
+            # List/tuple-based padding
+            ([1], "Single-element list padding"),
+            ([1, 1, 1], "Three-element list padding"),
+            ((1,), "Single-element tuple padding"),
+            ((1, 1, 1), "Three-element tuple padding"),
+            ([1, 2, 3], "Asymmetric list padding"),
+            ((1, 2, 3), "Asymmetric tuple padding"),
+            # Integer padding
+            (1, "Integer padding=1"),
+            (2, "Integer padding=2"),
+            # Edge cases
+            ([2, 1, 2], "List padding with different values"),
+            ((2, 1, 2), "Tuple padding with different values"),
+        ]
+
+        for padding, description in test_cases:
+            with self.subTest(padding=padding, description=description):
+                try:
+                    # Create Conv3d with the given padding
+                    fp32 = nn.Conv3d(
+                        in_channels=3,
+                        out_channels=8,
+                        kernel_size=(2, 3, 3),
+                        padding=padding,
+                        bias=True,
+                    )
+
+                    # Create quantized wrapper
+                    q_conv = QuantConv3dDecomposed(fp32)
+
+                    # Test input
+                    x = torch.randn(2, 3, 4, 8, 8)
+
+                    # Get outputs
+                    q_out = q_conv(x)
+
+                    # Calculate expected padding for FP32 reference
+                    ref_padding = None
+                    if isinstance(padding, str):
+                        if padding == "same":
+                            ref_padding = (
+                                fp32.kernel_size[0] // 2,
+                                fp32.kernel_size[1] // 2,
+                                fp32.kernel_size[2] // 2,
+                            )
+                        elif padding == "valid":
+                            ref_padding = (0, 0, 0)
+                    elif isinstance(padding, (list, tuple)):
+                        if len(padding) == 1:
+                            ref_padding = (padding[0], padding[0], padding[0])
+                        elif len(padding) == 3:
+                            ref_padding = (padding[0], padding[1], padding[2])
+                        else:
+                            continue  # Skip unsupported padding format
+                    elif isinstance(padding, int):
+                        ref_padding = (padding, padding, padding)
+                    else:
+                        continue  # Skip unsupported padding type
+
+                    # Get FP32 reference output
+                    fp_out = F.conv3d(
+                        x,
+                        fp32.weight,
+                        fp32.bias,
+                        stride=(1, 1, 1),
+                        padding=ref_padding,
+                    )
+
+                    # Verify outputs match
+                    self.assertEqual(
+                        q_out.shape,
+                        fp_out.shape,
+                        f"Output shapes don't match for {description}",
+                    )
+                    self.assertTrue(
+                        torch.allclose(q_out, fp_out, atol=1e-6, rtol=1e-6),
+                        f"Output values don't match for {description}",
+                    )
+
+                except ValueError as e:
+                    # Expected for unsupported padding types (2D padding for 3D conv)
+                    if "Unsupported padding" in str(e):
+                        # This is expected behavior for invalid padding formats
+                        continue
+                    else:
+                        raise
+
+    def test_temporal_padding(self):
+        """Test temporal padding with zeros+cat."""
+        fp32_padded = nn.Conv3d(3, 16, (2, 3, 3), padding=(1, 1, 1), bias=True)
+        q_conv_padded = QuantConv3dDecomposed(fp32_padded)
+
+        x = torch.randn(2, 3, 4, 8, 8)
+
+        # Test decomposition correctness
+        q_out = q_conv_padded(x)
+        fp_out = F.conv3d(x, fp32_padded.weight, fp32_padded.bias, padding=(1, 1, 1))
+
+        self.assertTrue(torch.allclose(q_out, fp_out, atol=1e-6, rtol=1e-6))
+
+    def test_different_strides(self):
+        """Test with different stride configurations."""
+        strides = [(1, 1, 1), (1, 2, 2), (2, 1, 1)]
+
+        for stride in strides:
+            with self.subTest(stride=stride):
+                fp32 = nn.Conv3d(3, 16, (2, 3, 3), stride=stride, padding=(0, 1, 1))
+                q_conv = QuantConv3dDecomposed(fp32)
+
+                x = torch.randn(2, 3, 4, 8, 8)
+
+                # Test decomposition correctness
+                q_out = q_conv(x)
+                fp_out = F.conv3d(
+                    x, fp32.weight, fp32.bias, stride=stride, padding=(0, 1, 1)
+                )
+
+                self.assertTrue(torch.allclose(q_out, fp_out, atol=1e-6, rtol=1e-6))
+
+    def test_dilation(self):
+        """Test temporal dilation support."""
+        fp32_dilated = nn.Conv3d(
+            3, 16, (2, 3, 3), dilation=(2, 1, 1), padding=(0, 1, 1)
+        )
+        q_conv_dilated = QuantConv3dDecomposed(fp32_dilated)
+
+        x = torch.randn(2, 3, 8, 8, 8)  # Need more frames for dilation
+
+        # Test decomposition correctness
+        q_out = q_conv_dilated(x)
+        fp_out = F.conv3d(
+            x,
+            fp32_dilated.weight,
+            fp32_dilated.bias,
+            dilation=(2, 1, 1),
+            padding=(0, 1, 1),
+        )
+
+        self.assertTrue(torch.allclose(q_out, fp_out, atol=1e-6, rtol=1e-6))
+
+    def test_registration_in_registry(self):
+        """Test that nn.Conv3d is properly registered."""
+        import warnings
+
+        # Suppress warnings from PyTorch's Swig-generated types
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                message="builtin type SwigPyPacked has no __module__ attribute",
+            )
+            warnings.filterwarnings(
+                "ignore",
+                message="builtin type SwigPyObject has no __module__ attribute",
+            )
+
+            from tico.quantization.wrapq.wrappers.nn.quant_conv3d_decomposed import (
+                QuantConv3dDecomposed,
+            )
+            from tico.quantization.wrapq.wrappers.registry import lookup
+
+            # Verify Conv3d maps to QuantConv3dDecomposed
+            wrapper_cls = lookup(nn.Conv3d)
+            self.assertIs(wrapper_cls, QuantConv3dDecomposed)
+
+    def test_all_observers_yielded(self):
+        """Test that _all_observers returns all observers."""
+        # Calibration to create dynamic observers
+        self.q_conv.enable_calibration()
+        _ = self.q_conv(self.x)
+
+        # Get all observers
+        observers = list(self.q_conv._all_observers())
+
+        # Should include static observers
+        self.assertIn(self.q_conv.obs_weight, observers)
+        self.assertIn(self.q_conv.obs_act_in, observers)
+        self.assertIn(self.q_conv.obs_act_out, observers)
+
+        # Should include dynamic observers
+        self.assertGreater(len(observers), 3)
+
+    def test_multiple_calibration_cycles(self):
+        """Test that multiple calibration cycles work correctly."""
+        # First calibration
+        self.q_conv.enable_calibration()
+        for _ in range(3):
+            _ = self.q_conv(self.x)
+        self.q_conv.freeze_qparams()
+
+        # Get first output
+        with torch.no_grad():
+            q_out_1 = self.q_conv(self.x)
+
+        # Second calibration
+        self.q_conv.enable_calibration()
+        for _ in range(3):
+            _ = self.q_conv(self.x)
+        self.q_conv.freeze_qparams()
+
+        # Get second output
+        with torch.no_grad():
+            q_out_2 = self.q_conv(self.x)
+
+        # Outputs should be close (same calibration data)
+        self.assertTrue(torch.allclose(q_out_1, q_out_2, atol=1e-5, rtol=1e-5))
+
+    def test_output_shape_correctness(self):
+        """Test that output shape matches Conv3d formula."""
+        test_cases = [
+            (2, 3, 4, 8, 8),  # Reference case
+            (1, 3, 8, 16, 16),  # Larger input
+            (4, 3, 2, 4, 4),  # Small input
+        ]
+
+        for input_shape in test_cases:
+            with self.subTest(input_shape=input_shape):
+                q_conv = QuantConv3dDecomposed(self.fp32)
+
+                x = torch.randn(*input_shape)
+                q_out = q_conv(x)
+                fp_out = self.fp32(x)
+
+                self.assertEqual(q_out.shape, fp_out.shape)
+
+    def test_special_case_optimization(self):
+        """
+        Test the special case optimization where Conv3d can be converted to Conv2d
+        without addition operations.
+
+        Special case conditions:
+        - kernel_size[D] = input_size[D] for all dimensions D
+        - stride[D] = kernel_size[D] for all dimensions D
+        - padding[D] = 0 for all dimensions D
+        - groups = 1
+        - dilation = 1 for all dimensions D
+        """
+        # Create a Conv3d that meets the special case conditions
+        # Input: (N=2, C=3, T=2, H=16, W=16)
+        # Kernel: (2, 16, 16) - matches temporal and spatial dimensions
+        # Stride: (2, 16, 16) - equals kernel size
+        # Padding: 0
+        fp32_special = nn.Conv3d(
+            in_channels=3,
+            out_channels=1024,
+            kernel_size=(2, 16, 16),
+            stride=(2, 16, 16),
+            padding=0,
+            bias=True,
+            groups=1,
+        )
+
+        # Create quantized wrapper
+        q_conv_special = QuantConv3dDecomposed(fp32_special)
+
+        # Input that matches the kernel size in temporal and spatial dimensions
+        x_special = torch.randn(2, 3, 2, 16, 16)
+
+        # Test that the special case produces the same result as standard Conv3d
+        q_out = q_conv_special(x_special)
+        fp_out = F.conv3d(
+            x_special,
+            fp32_special.weight,
+            fp32_special.bias,
+            stride=(2, 16, 16),
+            padding=0,
+        )
+
+        # Check shape and values
+        self.assertEqual(q_out.shape, fp_out.shape)
+        diff = (fp_out - q_out).abs().mean().item()
+        self.assertGreater(diff, 0.0, "Quantized output should differ from FP32")
+        self.assertLess(diff, 0.7, "Quantization error should be reasonable")
+
+    def test_special_case_without_bias(self):
+        """
+        Test the special case optimization with bias=False.
+        """
+        # Create a Conv3d without bias that meets the special case conditions
+        fp32_special = nn.Conv3d(
+            in_channels=3,
+            out_channels=512,
+            kernel_size=(2, 8, 8),
+            stride=(2, 8, 8),
+            padding=0,
+            bias=False,
+            groups=1,
+        )
+
+        # Create quantized wrapper
+        q_conv_special = QuantConv3dDecomposed(fp32_special)
+
+        # Input that matches the kernel size in temporal and spatial dimensions
+        x_special = torch.randn(3, 3, 2, 8, 8)
+
+        # Test that the special case produces the same result as standard Conv3d
+        q_out = q_conv_special(x_special)
+        fp_out = F.conv3d(
+            x_special,
+            fp32_special.weight,
+            fp32_special.bias,
+            stride=(2, 8, 8),
+            padding=0,
+        )
+
+        # Check shape and values
+        self.assertEqual(q_out.shape, fp_out.shape)
+        diff = (fp_out - q_out).abs().mean().item()
+        self.assertGreater(diff, 0.0, "Quantized output should differ from FP32")
+        self.assertLess(diff, 0.7, "Quantization error should be reasonable")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/quantization/wrapq/wrappers/qwen_vl/test_quant_vision_patch_embed.py b/test/quantization/wrapq/wrappers/qwen_vl/test_quant_vision_patch_embed.py
index 604a0167..ee7b2f30 100644
--- a/test/quantization/wrapq/wrappers/qwen_vl/test_quant_vision_patch_embed.py
+++ b/test/quantization/wrapq/wrappers/qwen_vl/test_quant_vision_patch_embed.py
@@ -92,7 +92,7 @@ def test_forward_diff(self):
 
         diff = (fp_out - q_out).abs().mean().item()
         self.assertGreater(diff, 0.0)  # not identical
-        self.assertLess(diff, 0.4)  # acceptably close
+        self.assertLess(diff, 0.7)  # acceptably close
         self.assertEqual(fp_out.shape, q_out.shape)
 
     def test_proj_override(self):
@@ -112,7 +112,7 @@ def test_proj_override(self):
         q_patch = QuantQwen3VLVisionPatchEmbed(self.fp_patch_embed, qcfg=cfg)
         q_conv3d = q_patch.proj.wrapped
 
-        self.assertIsInstance(q_conv3d, QuantConv3d)
+        self.assertIn("QuantConv3d", type(q_conv3d).__name__)
         self.assertEqual(q_conv3d.obs_weight.dtype, DType.uint(4))
         self.assertEqual(q_conv3d.obs_act_in.dtype, DType.uint(4))
         self.assertEqual(q_conv3d.obs_act_out.dtype, DType.uint(4))
@@ -225,4 +225,4 @@ def test_different_batch_sizes(self):
 
             self.assertEqual(q_out.shape, fp_out.shape)
             diff = (fp_out - q_out).abs().mean().item()
-            self.assertLess(diff, 0.4)
+            self.assertLess(diff, 0.8)
diff --git a/tico/quantization/wrapq/examples/nn/quantize_conv3d.py b/tico/quantization/wrapq/examples/nn/quantize_conv3d.py
new file mode 100644
index 00000000..b1b0bfe6
--- /dev/null
+++ b/tico/quantization/wrapq/examples/nn/quantize_conv3d.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import sys
+
+import tico
+import tico.quantization
+import tico.quantization.config.ptq
+
+import torch
+import torch.nn as nn
+from tico.quantization.evaluation.metric import compute_peir
+from tico.quantization.evaluation.utils import plot_two_outputs
+
+
+def generate_calibration_data(
+    num_batches: int,
+    batch_size: int,
+    in_channels: int,
+    depth: int,
+    height: int,
+    width: int,
+) -> list:
+    """Generate calibration data for PTQ"""
+    calibration_data = []
+    for i in range(num_batches):
+        x = torch.randn(batch_size, in_channels, depth, height, width)
+        calibration_data.append(x)
+    return calibration_data
+
+
+def main():
+    # Create Conv3d model (matches Qwen3-VL patch embed structure)
+    # Input: (B, C, T, H, W) - Batch, Channels, Time (frames), Height, Width
+    # Output: (B, C_out, T_out, H_out, W_out)
+    model = nn.Conv3d(
+        in_channels=3,  # RGB channels
+        out_channels=1024,  # Hidden dimension (like Qwen3-VL)
+        kernel_size=(2, 16, 16),  # Temporal kernel=2, spatial kernel=16x16
+        stride=(2, 16, 16),
+        bias=True,
+    )
+    orig_model = copy.deepcopy(model)
+    model.eval()
+
+    # Model architecture:
+    # Conv3d(
+    #     (weight): Parameter [1024, 3, 2, 16, 16]
+    #     (bias): Parameter [1024]
+    # )
+
+    print(f"Input channels:  {model.in_channels}")
+    print(f"Output channels: {model.out_channels}")
+    print(f"Kernel size:     {model.kernel_size}")
+    print(f"Stride:          {model.stride}")
+    print(f"Padding:         {model.padding}")
+
+    # Generate calibration data
+    # Input shape: (batch_size, in_channels, depth, height, width)
+    # Example: (10, 3, 4, 64, 64) - 10 samples, 3 channels (RGB), 4 frames, 64×64 pixels
+    batch_size = 10
+    in_channels = 3
+    depth = 4
+    height = 64
+    width = 64
+    calibration_data = generate_calibration_data(
+        num_batches=2,
+        batch_size=batch_size,
+        in_channels=in_channels,
+        depth=depth,
+        height=height,
+        width=width,
+    )
+    example_input = calibration_data[0]
+
+    # Configure PTQ
+    ptq_config = tico.quantization.config.ptq.PTQConfig()
+
+    # Prepare the model for quantization
+    prepared_model = tico.quantization.prepare(
+        model, ptq_config, inplace=True  # Transform the model in place
+    )
+
+    # Calibrate the model (collect statistics)
+    with torch.no_grad():
+        for i, batch in enumerate(calibration_data):
+            prepared_model(batch)
+
+    # Convert to quantized model
+    quantized_model = tico.quantization.convert(prepared_model, inplace=True)
+
+    # Compute PEIR (Peak Error-to-Input Ratio) between quantized model and original model
+    with torch.no_grad():
+        quant_out = quantized_model(example_input)
+        fp_out = orig_model(example_input)
+
+    print(f"Input shape:              {example_input.shape}")
+    print(f"Output shape (FP32):      {fp_out.shape}")
+    print(f"Output shape (Quantized): {quant_out.shape}")
+    print(f"┌───────────── Quantization Error Summary ─────────────")
+    print(f"│ Mean |diff|: {(quant_out - fp_out).abs().mean().item():.6f}")
+    print(f"│ PEIR       : {compute_peir(fp_out, quant_out) * 100:.6f} %")
+    print(f"└──────────────────────────────────────────────────────")
+    print(plot_two_outputs(fp_out, quant_out))
+
+    # Convert to Circle format
+    circle_model = tico.convert(quantized_model.eval(), (example_input,))
+
+    # Save the Circle model
+    filename = "quantized_conv3d.circle"
+    circle_model.save(filename)
+    print(f"Circle model saved as '{filename}'")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tico/quantization/wrapq/examples/nn/quantize_conv3d_special_case.py b/tico/quantization/wrapq/examples/nn/quantize_conv3d_special_case.py
new file mode 100644
index 00000000..49c89a64
--- /dev/null
+++ b/tico/quantization/wrapq/examples/nn/quantize_conv3d_special_case.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import sys
+
+import tico
+import tico.quantization
+import tico.quantization.config.ptq
+
+import torch
+import torch.nn as nn
+from tico.quantization.evaluation.metric import compute_peir
+from tico.quantization.evaluation.utils import plot_two_outputs
+
+
+def generate_calibration_data(
+    num_batches: int,
+    batch_size: int,
+    in_channels: int,
+    depth: int,
+    height: int,
+    width: int,
+) -> list:
+    """Generate calibration data for PTQ"""
+    calibration_data = []
+    for i in range(num_batches):
+        x = torch.randn(batch_size, in_channels, depth, height, width)
+        calibration_data.append(x)
+    return calibration_data
+
+
+def main():
+    # Create a Conv3d that meets the special case conditions
+    # Input: (N=2, C=3, T=2, H=16, W=16)
+    # Kernel: (2, 16, 16) - matches temporal and spatial dimensions
+    # Stride: (2, 16, 16) - equals kernel size
+    # Padding: 0
+    model = nn.Conv3d(
+        in_channels=3,
+        out_channels=1024,
+        kernel_size=(2, 16, 16),
+        stride=(2, 16, 16),
+        padding=0,
+        bias=True,
+        groups=1,
+    )
+    orig_model = copy.deepcopy(model)
+    model.eval()
+
+    # Model architecture:
+    # Conv3d(
+    #     (weight): Parameter [1024, 3, 2, 16, 16]
+    #     (bias): Parameter [1024]
+    # )
+
+    print(f"Input channels:  {model.in_channels}")
+    print(f"Output channels: {model.out_channels}")
+    print(f"Kernel size:     {model.kernel_size}")
+    print(f"Stride:          {model.stride}")
+    print(f"Padding:         {model.padding}")
+
+    # Generate calibration data that matches the kernel size in temporal and spatial dimensions.
+    # Input shape: (batch_size, in_channels, depth, height, width)
+    # Example: (10, 3, 2, 16, 16) - 10 samples, 3 channels (RGB), 2 frames, 16×16 pixels
+    batch_size = 10
+    in_channels = 3
+    depth = 2
+    height = 16
+    width = 16
+    calibration_data = generate_calibration_data(
+        num_batches=2,
+        batch_size=batch_size,
+        in_channels=in_channels,
+        depth=depth,
+        height=height,
+        width=width,
+    )
+    example_input = calibration_data[0]
+
+    # Configure PTQ
+    ptq_config = tico.quantization.config.ptq.PTQConfig()
+
+    # Prepare the model for quantization
+    prepared_model = tico.quantization.prepare(
+        model, ptq_config, inplace=True  # Transform the model in place
+    )
+
+    # Calibrate the model (collect statistics)
+    with torch.no_grad():
+        for i, batch in enumerate(calibration_data):
+            prepared_model(batch)
+
+    # Convert to quantized model
+    quantized_model = tico.quantization.convert(prepared_model, inplace=True)
+
+    # Compute PEIR (Peak Error-to-Input Ratio) between quantized model and original model
+    with torch.no_grad():
+        quant_out = quantized_model(example_input)
+        fp_out = orig_model(example_input)
+
+    print(f"Input shape:              {example_input.shape}")
+    print(f"Output shape (FP32):      {fp_out.shape}")
+    print(f"Output shape (Quantized): {quant_out.shape}")
+    print(f"┌───────────── Quantization Error Summary ─────────────")
+    print(f"│ Mean |diff|: {(quant_out - fp_out).abs().mean().item():.6f}")
+    print(f"│ PEIR       : {compute_peir(fp_out, quant_out) * 100:.6f} %")
+    print(f"└──────────────────────────────────────────────────────")
+    print(plot_two_outputs(fp_out, quant_out))
+
+    # Convert to Circle format
+    print("\nConverting to Circle format...")
+    circle_model = tico.convert(quantized_model.eval(), (example_input,))
+
+    # Save the Circle model
+    filename = "quantized_conv3d.circle"
+    circle_model.save(filename)
+    print(f"Circle model saved as '{filename}'")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tico/quantization/wrapq/wrappers/nn/quant_conv3d_decomposed.py b/tico/quantization/wrapq/wrappers/nn/quant_conv3d_decomposed.py
new file mode 100644
index 00000000..bb176a36
--- /dev/null
+++ b/tico/quantization/wrapq/wrappers/nn/quant_conv3d_decomposed.py
@@ -0,0 +1,395 @@
+# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from tico.quantization.config.ptq import PTQConfig
+from tico.quantization.wrapq.mode import Mode
+from tico.quantization.wrapq.observers.base import ObserverBase
+from tico.quantization.wrapq.qscheme import QScheme
+from tico.quantization.wrapq.wrappers.quant_module_base import QuantModuleBase
+from tico.quantization.wrapq.wrappers.registry import register
+
+
+@register(nn.Conv3d)
+class QuantConv3dDecomposed(QuantModuleBase):
+    """
+    Quantization wrapper for nn.Conv3d with decomposition to Conv2d.
+
+    This class decomposes Conv3d into multiple Conv2d operations to ensure
+    all computations remain quantized. The decomposition follows the slice + Conv2d + Add
+    approach, avoiding graph passes that introduce floating-point operations.
+
+    Quantization:
+    - Per-channel weight quantization (asymmetric)
+    - Per-tensor input activation quantization
+    - Per-tensor output activation quantization
+    - Per-tensor quantization for all intermediate tensors (input slices, conv2d outputs, accumulators)
+    """
+
+    def __init__(
+        self,
+        fp: nn.Conv3d,
+        *,
+        qcfg: Optional[PTQConfig] = None,
+        fp_name: Optional[str] = None,
+    ):
+        super().__init__(qcfg, fp_name=fp_name)
+
+        # Static observers (always exist)
+        self.obs_weight = self._make_obs(
+            "weight", qscheme=QScheme.PER_CHANNEL_ASYMM, channel_axis=0
+        )
+        self.obs_act_in = self._make_obs("act_in")
+        self.obs_act_out = self._make_obs("act_out")
+
+        # Store original module
+        self.module = fp
+
+        # Dynamic observers (created lazily during first forward pass)
+        self._input_slice_obs: Dict[int, ObserverBase] = {}  # Maps k (int) -> observer
+        self._conv2d_obs: Dict[int, ObserverBase] = {}  # Maps k (int) -> observer
+        self._acc_obs: Dict[int, ObserverBase] = {}  # Maps t_out (int) -> observer
+
+        # Tracking for lazy observer creation
+        self._dynamic_obs_calibrated = False
+
+    def enable_calibration(self) -> None:
+        """Enable calibration mode."""
+        super().enable_calibration()
+
+        # Collect weight statistics immediately (weights are static)
+        self.obs_weight.collect(self.module.weight)
+
+        # Reset dynamic observers for new calibration
+        self._dynamic_obs_calibrated = False
+
+    def _create_dynamic_observers(
+        self,
+        kT: int,
+        T_out: int,
+    ):
+        """
+        Create dynamic observers for intermediate quantization points.
+
+        Args:
+            kT: Kernel temporal dimension
+            T_out: Number of output temporal positions
+        """
+
+        def create_observer(obs_name_prefix, obs_dictionary, dict_key):
+            obs_name = f"{obs_name_prefix}{dict_key}"
+            obs = self._make_obs(obs_name)
+            obs_dictionary[dict_key] = obs
+            self.add_module(obs_name, obs)
+            # self.add_module(obs_name, obs) is required for torch.export() to properly access
+            # the observer and its internal quantization parameters (cached_scale, cached_zp)
+            # during graph construction. When torch.export() traces the model, it creates
+            # 'get_attr' nodes to access module attributes. If observers are stored only in
+            # dictionaries or via setattr(), torch.export() cannot create valid get_attr nodes
+            # for the observer's cached_scale and cached_zp tensors, leading to warnings:
+            #   "Attempted to insert a get_attr Node with no underlying reference"
+            # By registering with add_module(), the observer becomes part of the module's
+            # named_modules() tree, making both the observer AND its quantization parameters
+            # accessible to the graph construction process. This ensures that the exported
+            # graph can properly reference quantization parameters during Circle conversion.
+
+        # Input slice observers (one for each temporal kernel position)
+        for k in range(kT):
+            create_observer(f"input_slice_k", self._input_slice_obs, k)
+
+        # Conv2d output observers (one for each temporal kernel position)
+        for k in range(kT):
+            create_observer(f"conv2d_out_k", self._conv2d_obs, k)
+
+        # Accumulator observers (one for each output temporal position)
+        for t_out in range(T_out):
+            create_observer(f"accumulator_t", self._acc_obs, t_out)
+
+        self._dynamic_obs_calibrated = True
+
+    def _parse_padding(self, padding) -> Tuple[int, int, int]:
+        """Parse padding parameter to (temporal, height, width) tuple."""
+        if isinstance(padding, str):
+            if padding == "same":
+                kT, kH, kW = self.module.kernel_size
+                return kT // 2, kH // 2, kW // 2
+            elif padding == "valid":
+                return 0, 0, 0
+            else:
+                raise ValueError(f"Unsupported padding string: {padding}")
+        elif isinstance(padding, (list, tuple)):
+            if len(padding) == 1:
+                return padding[0], padding[0], padding[0]
+            elif len(padding) == 3:
+                return padding[0], padding[1], padding[2]
+            else:
+                raise ValueError(f"Unsupported padding format: {padding}")
+        elif isinstance(padding, int):  # int
+            return padding, padding, padding
+        else:
+            raise ValueError(f"Unsupported padding type: {type(padding)}")
+
+    def _apply_temporal_padding(
+        self,
+        x: torch.Tensor,
+        temporal_padding: int,
+    ) -> torch.Tensor:
+        """Apply temporal padding using zeros and cat."""
+        if temporal_padding == 0:
+            return x
+
+        N, C_in, T_in, H_in, W_in = x.shape
+
+        # Create zero padding tensors
+        zero_pad = torch.zeros(
+            N, C_in, temporal_padding, H_in, W_in, dtype=x.dtype, device=x.device
+        )
+
+        # Cat: [zeros, input, zeros]
+        padded = torch.cat([zero_pad, x, zero_pad], dim=2)
+
+        return padded
+
+    def _get_padded_input_slice(
+        self,
+        padded_x: torch.Tensor,
+        t_idx: int,
+        k: int,
+    ) -> torch.Tensor:
+        """
+        Get and quantize input slice at temporal position.
+
+        Args:
+            padded_x: Temporally padded input tensor (N, C_in, T_padded, H_in, W_in)
+            t_idx: Temporal index to slice
+            k: Kernel temporal position (for observer lookup)
+
+        Returns:
+            Quantized input slice (N, C_in, 1, H_in, W_in)
+        """
+        # Slice at temporal position
+        input_slice = padded_x[:, :, t_idx : t_idx + 1, :, :]
+
+        # Quantize input slice
+        input_slice_q = self._fq(input_slice, self._input_slice_obs[k])
+
+        return input_slice_q
+
+    def _apply_conv2d_quantized(
+        self,
+        input_2d: torch.Tensor,
+        weight_slice: torch.Tensor,
+        bias: Optional[torch.Tensor],
+        k: int,
+        H_out: int,
+        W_out: int,
+        padding: Tuple[int, int, int],
+    ) -> torch.Tensor:
+        """
+        Apply quantized Conv2d operation.
+
+        Args:
+            input_2d: 2D input (N, C_in, H_in, W_in)
+            weight_slice: 2D weight slice (C_out, C_in, kH, kW)
+            bias: Optional bias tensor
+            k: Kernel temporal position (for observer lookup)
+            H_out: Output height
+            W_out: Output width
+
+        Returns:
+            Quantized Conv2d output (N, C_out, H_out, W_out)
+        """
+        # Apply Conv2d
+        conv_out = F.conv2d(
+            input_2d,
+            weight_slice,
+            bias=None,  # Bias added after accumulation
+            stride=(self.module.stride[1], self.module.stride[2]),
+            padding=(padding[1], padding[2]),
+            dilation=(self.module.dilation[1], self.module.dilation[2]),
+            groups=self.module.groups,
+        )
+
+        # Quantize Conv2d output
+        conv_out_q = self._fq(conv_out, self._conv2d_obs[k])
+
+        return conv_out_q
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass with quantized Conv3d decomposition.
+
+        Decomposes Conv3d into:
+        1. Temporal padding (if needed)
+        2. Slice input at each temporal kernel position
+        3. Apply Conv2d to each slice
+        4. Accumulate Conv2d results with quantization
+        5. Add bias (if present)
+        6. Stack temporal outputs
+
+        Special case optimization:
+        When kernel_size = input_size, stride = kernel_size,
+        padding = 0, groups = 1, and dilation = 1 for all dimensions,
+        the Conv3d operation reduces to matrix multiplication and is handled
+        with a more efficient direct approach.
+
+        All intermediate tensors are quantized to ensure integer-only computation.
+        """
+        N, C_in, T_in, H_in, W_in = x.shape
+        C_out, C_in_weight, kT, kH, kW = self.module.weight.shape
+        sT, sH, sW = self.module.stride
+        dT, dH, dW = self.module.dilation
+        groups = self.module.groups
+
+        if C_in != C_in_weight:
+            raise RuntimeError("Channels number mismatch")
+
+        # Parse padding
+        padding = self._parse_padding(self.module.padding)
+        temporal_padding, h_padding, w_padding = padding
+
+        # Quantize input activation
+        x_q = self._fq(x, self.obs_act_in)
+
+        # Get quantized weight
+        w = self.module.weight
+        if self._mode is Mode.QUANT:
+            w = self.obs_weight.fake_quant(w)
+
+        # Check for special case:
+        # kernel_size = input_size,
+        # stride = kernel_size,
+        # padding = 0,
+        # no dilation
+        # groups = 1
+        is_special_case = (
+            (kT, kH, kW) == (T_in, H_in, W_in)
+            and (sT, sH, sW) == (kT, kH, kW)
+            and (temporal_padding, h_padding, w_padding) == (0, 0, 0)
+            and (dT, dH, dW) == (1, 1, 1)
+            and groups == 1
+        )
+
+        # Special case: Conv3d reduces to matrix multiplication
+        if is_special_case:
+            # Reshape input: (N, C_in, T_in, H_in, W_in) -> (N, 1, 1, C_in*T_in*H_in*W_in)
+            x_q = x_q.reshape(N, 1, 1, -1)
+
+            # Reshape weights: (C_out, C_in, kT, kH, kW) -> (C_out, 1, 1, C_in*kT*kH*kW)
+            w = w.reshape(C_out, 1, 1, -1)
+
+            # Apply Conv2d directly
+            if self.module.bias is not None:
+                conv2d_result = F.conv2d(x_q, w, self.module.bias)
+            else:
+                conv2d_result = F.conv2d(x_q, w)
+
+            # Reshape output: (1, C_out, N*C_in, 1) -> (N, C_out, 1, 1, 1)
+            result = conv2d_result.reshape(N, C_out, 1, 1, 1)
+
+            # Quantize output activation
+            result_q = self._fq(result, self.obs_act_out)
+            return result_q
+
+        # Normal case: Conv3d is decomposed to multiple Conv2D and Add operations
+        else:
+            # Calculate output dimensions
+            T_padded = T_in + 2 * temporal_padding
+            T_out = (T_padded - dT * (kT - 1) - 1) // sT + 1
+            H_out = (H_in + 2 * h_padding - dH * (kH - 1) - 1) // sH + 1
+            W_out = (W_in + 2 * w_padding - dW * (kW - 1) - 1) // sW + 1
+
+            # Create dynamic observers on first forward pass
+            if not self._dynamic_obs_calibrated:
+                if self._mode is Mode.QUANT:
+                    raise RuntimeError(
+                        "Trying to quantize without calibration. Need to calibrate first."
+                    )
+                self._create_dynamic_observers(kT, T_out)
+
+            # Apply temporal padding
+            padded_input = self._apply_temporal_padding(x_q, temporal_padding)
+
+            # Temporal processing loop
+            temporal_outputs = []
+            for t_out in range(T_out):
+                t_in = t_out * sT
+                accumulator = None
+
+                for k in range(kT):
+                    t_idx = t_in + k * dT
+
+                    # Handle dilation: mask out-of-bounds positions
+                    if dT > 1 and t_idx >= T_padded:
+                        # Skip this kernel position (out of bounds)
+                        continue
+
+                    # Get and quantize input slice
+                    input_slice_q = self._get_padded_input_slice(padded_input, t_idx, k)
+
+                    # Remove temporal dimension: (N, C_in, 1, H_in, W_in) → (N, C_in, H_in, W_in)
+                    input_2d = input_slice_q.squeeze(2)
+
+                    # Slice weight at temporal position k
+                    weight_slice = w[:, :, k, :, :]  # (C_out, C_in, kH, kW)
+
+                    # Apply quantized Conv2d
+                    conv_out_q = self._apply_conv2d_quantized(
+                        input_2d,
+                        weight_slice,
+                        self.module.bias,
+                        k,
+                        H_out,
+                        W_out,
+                        padding,
+                    )
+
+                    # Accumulate with quantization
+                    if accumulator is None:
+                        accumulator = conv_out_q
+                    else:
+                        accumulator = self._fq(
+                            accumulator + conv_out_q, self._acc_obs[t_out]
+                        )
+
+                # Add bias if present
+                if self.module.bias is not None:
+                    bias_reshaped = self.module.bias.reshape(1, C_out, 1, 1)
+                    accumulator = accumulator + bias_reshaped
+
+                temporal_outputs.append(accumulator)
+
+            # Stack temporal outputs
+            unsqueezed = [t.unsqueeze(2) for t in temporal_outputs]  # type: ignore[union-attr]
+            stacked = torch.cat(unsqueezed, dim=2)  # (N, C_out, T_out, H_out, W_out)
+
+            # Quantize output activation
+            stacked_q = self._fq(stacked, self.obs_act_out)
+
+            return stacked_q
+
+    def _all_observers(self):
+        """Return all observers for this module."""
+        # Static observers
+        yield from (self.obs_weight, self.obs_act_in, self.obs_act_out)
+
+        # Dynamic observers (if created)
+        yield from self._input_slice_obs.values()
+        yield from self._conv2d_obs.values()
+        yield from self._acc_obs.values()
diff --git a/tico/quantization/wrapq/wrappers/registry.py b/tico/quantization/wrapq/wrappers/registry.py
index 08d23405..dfdd0186 100644
--- a/tico/quantization/wrapq/wrappers/registry.py
+++ b/tico/quantization/wrapq/wrappers/registry.py
@@ -27,7 +27,7 @@
     "tico.quantization.wrapq.wrappers.nn.quant_embedding",
     "tico.quantization.wrapq.wrappers.nn.quant_layernorm",
     "tico.quantization.wrapq.wrappers.nn.quant_linear",
-    "tico.quantization.wrapq.wrappers.nn.quant_conv3d",
+    "tico.quantization.wrapq.wrappers.nn.quant_conv3d_decomposed",
     # This includes not only `nn.SiLU` but also `SiLUActivation` from transformers
     # as they are same operation.
     "tico.quantization.wrapq.wrappers.nn.quant_silu",