Merged
2 changes: 1 addition & 1 deletion examples/mlperf/model_train.py
@@ -1395,7 +1395,7 @@ def train_llama3():
 
   params = get_parameters(model)
 
-  if getenv("FAKEDATA"):
+  if getenv("EMPTYWEIGHT"):
    for v in get_parameters(model):
      v = v.assign(Tensor.empty(v.shape, dtype=v.dtype))
 
@@ -17,7 +17,7 @@ export FP8=${FP8:-1}
 export ALLREDUCE_CAST=${ALLREDUCE_CAST:-1}
 
 export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
-export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-16} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
+export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
 export GBS=$((BS * GRADIENT_ACC_STEPS))
 
 export MODEL="llama3"
@@ -36,7 +36,7 @@ export DATA_SEED=${DATA_SEED:-5760}
 export JITBEAM=${JITBEAM:-3}
 export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1
 
-export FAKEDATA=1 BENCHMARK=${BENCHMARK:-10}
+export FAKEDATA=${FAKEDATA:-1} BENCHMARK=${BENCHMARK:-10}
 if [ -z "$FULL_LAYERS" ]; then
   export LLAMA_LAYERS=2
 fi
@@ -17,7 +17,7 @@ export FP8=${FP8:-1}
 export ALLREDUCE_CAST=${ALLREDUCE_CAST:-1}
 
 export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
-export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-16} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
+export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
 export GBS=$((BS * GRADIENT_ACC_STEPS))
 
 export MODEL="llama3"
@@ -18,7 +18,7 @@ export FP8=1
 export ALLREDUCE_CAST=1
 
 export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
-export DP=8 MP=1 BS=16 EVAL_BS=16 GRADIENT_ACC_STEPS=2
+export DP=8 MP=1 BS=16 EVAL_BS=8 GRADIENT_ACC_STEPS=2
 export GBS=$((BS * GRADIENT_ACC_STEPS))
 
 export MODEL="llama3"
19 changes: 11 additions & 8 deletions extra/sqtt/install_rocprof_decoder.py
@@ -1,20 +1,23 @@
 #!/usr/bin/env python3
-import os, shutil
+import os, platform, shutil, subprocess
 from pathlib import Path
 from tinygrad.helpers import fetch, OSX
 
+VERSION = "0.1.6"
 DEST = Path("/usr/local/lib")
 DEST.mkdir(exist_ok=True)
 
 if __name__ == "__main__":
   if OSX:
-    fp = fetch("https://github.com/ROCm/rocprof-trace-decoder/releases/download/0.1.4/rocprof-trace-decoder-macos-arm64-0.1.4-Darwin.sh")
-    lib = fp.parent/"rocprof-trace-decoder-macos-arm64-0.1.4-Darwin"/"lib"/"librocprof-trace-decoder.dylib"
-    os.chmod(fp, 0o755)
-    os.system(f"sudo {fp} --prefix={fp.parent} --include-subdir")
-    shutil.copy2(lib, DEST)
+    arch = "arm64" if platform.machine() == "arm64" else "x86_64"
+    dmg = fetch(f"https://github.com/ROCm/rocprof-trace-decoder/releases/download/{VERSION}/rocprof-trace-decoder-macos-{arch}-{VERSION}-Darwin.dmg")
+    mnt = Path(subprocess.check_output(["hdiutil", "attach", "-nobrowse", "-readonly", "-mountrandom", "/tmp", str(dmg)],
+                                       text=True).split("\t")[-1].strip())
+    try: shutil.copy2(next(mnt.rglob("librocprof-trace-decoder.dylib")), DEST)
+    finally: subprocess.run(["hdiutil", "detach", str(mnt)], check=True)
+    lib = DEST/"librocprof-trace-decoder.dylib"
   else:
     lib = DEST/"librocprof-trace-decoder.so"
-    os.system("sudo curl -L https://github.com/ROCm/rocprof-trace-decoder/raw/43bf0fef74a83c3c25badfc5a09c0bd39ed8c6f9/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so -o"+str(lib))
+    os.system(f"sudo curl -L https://github.com/ROCm/rocprof-trace-decoder/raw/{VERSION}/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so -o {lib}")
   os.system("sudo ldconfig")
-  print(f"Installed {lib.name} to", DEST)
+  print(f"Installed {lib.name} ({VERSION}) to", DEST)
2 changes: 2 additions & 0 deletions test/mockgpu/usb.py
@@ -202,6 +202,8 @@ def process_cdb(self, cdb:bytes, rlen:int, send_data:bytes|None) -> bytes|None:
     return None
 
 class MockUSB3:
+  @classmethod
+  def list_devices(cls, vendor, dev): return [(0, "usb:mock")]
   def __init__(self, *args, **kwargs):
     self.product, self.is_custom = "", False
   def send_batch(self, cdbs:list[bytes], idata:list[int]|None=None, odata:list[bytes|None]|None=None) -> list[bytes|None]:
9 changes: 9 additions & 0 deletions test/null/test_tensor.py
@@ -189,6 +189,15 @@ class TestTensorDevice(unittest.TestCase):
   def test_create_from_single_device_tuple(self):
     (Tensor([1.0], device=(Device.DEFAULT,)) + Tensor([2.0])).realize()
 
+class TestTensorPad(unittest.TestCase):
+  # padding int tensor with float-only value (like -inf) must promote dtype to fit value
+  def test_pad_int_with_neg_inf(self):
+    t = Tensor.arange(9).reshape(1, 1, 3, 3)
+    self.assertEqual(t.dtype, dtypes.int)
+    r = t.pad((1, 2, 0, -1), value=-float('inf'))
+    self.assertEqual(r.dtype, dtypes.float)
+    self.assertEqual(r.shape, (1, 1, 2, 6))
+
 class TestTensorDeviceMismatch(unittest.TestCase):
   def test_gather(self):
     x = Tensor.empty(3, 4, device="NULL")
19 changes: 19 additions & 0 deletions test/null/test_tensor_uop_mixin.py
@@ -151,6 +151,25 @@ def test_amin(self): self._check(_t(3, 4).float(), Tensor([[0, 1, 0, 1]]*3, dtyp
   def test_mean_exclude_self(self):
     self._check(_t(3, 4).float(), Tensor([[0, 1, 0, 1]]*3, dtype=dtypes.int32), Tensor.ones(3, 4).float(), reduce="mean", include_self=False)
 
+class TestTensorUOpPool(unittest.TestCase):
+  def test_avg_pool2d(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.avg_pool2d())
+  def test_avg_pool2d_padding(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.avg_pool2d(padding=1))
+  def test_avg_pool2d_ceil(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.avg_pool2d(ceil_mode=True))
+  def test_avg_pool2d_no_count_pad(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.avg_pool2d(padding=1, count_include_pad=False))
+  def test_max_pool2d(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.max_pool2d())
+  def test_max_pool2d_padding(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.max_pool2d(padding=1))
+  def test_max_pool2d_ceil(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.max_pool2d(ceil_mode=True))
+  def test_max_pool2d_return_indices(self):
+    t = _t(1, 1, 5, 5).float()
+    vt, it = t.max_pool2d(return_indices=True)
+    vu, iu = t.uop.max_pool2d(return_indices=True)
+    self.assertIs(_strip_unique(vt.uop), _strip_unique(vu))
+    self.assertIs(_strip_unique(it.uop), _strip_unique(iu))
+  def test_max_unpool2d(self):
+    t = _t(1, 1, 4, 4).float()
+    out, idx = t.max_pool2d(return_indices=True)
+    self.assertIs(_strip_unique(out.max_unpool2d(idx).uop), _strip_unique(out.uop.max_unpool2d(idx.uop)))
+
 class TestTensorUOpCat(unittest.TestCase):
   def test_cat_dim0(self): _check(self, _t(2, 3), lambda x: x.cat(x, dim=0))
   def test_cat_dim1(self): _check(self, _t(2, 3), lambda x: x.cat(x, dim=1))
4 changes: 1 addition & 3 deletions tinygrad/llm/model.py
@@ -219,9 +219,8 @@ def _attention(self, x:Tensor, start_pos:int|UOp) -> Tensor:
                       self.freqs_cis[start_pos:start_pos+T])
 
     k_store = c_kv.reshape(B, 1, T, self.config.kv_lora_rank).cat(k_rope.reshape(B, 1, T, self.config.rope_dim), dim=-1)
-    v_store = c_kv.reshape(B, 1, T, self.config.kv_lora_rank)
     k = Tensor(self.cache_k.uop.after(self.cache_k[:, :, start_pos:start_pos+T, :].uop.store(k_store.uop)))[:, :, 0:start_pos+T, :]
-    v = Tensor(self.cache_v.uop.after(self.cache_v[:, :, start_pos:start_pos+T, :].uop.store(v_store.uop)))[:, :, 0:start_pos+T, :]
+    v = k[..., :self.config.kv_lora_rank]
 
     mask = Tensor.full((1, 1, T, start_pos+T), float("-inf"), dtype=x.dtype, device=x.device).triu(start_pos+1) if resolve(T != 1) else None
     attn = q @ k.transpose(-1, -2) * (1.0 / self.config.head_dim ** 0.5)
@@ -233,7 +232,6 @@ def _attention(self, x:Tensor, start_pos:int|UOp) -> Tensor:
   def _init_state(self, x:Tensor):
     if not hasattr(self, "cache_k"):
       self.cache_k = Tensor.empty(x.shape[0], 1, self.config.max_context, self.config.kv_lora_rank + self.config.rope_dim, device=x.device)
-      self.cache_v = Tensor.empty(x.shape[0], 1, self.config.max_context, self.config.kv_lora_rank, device=x.device)
       self.freqs_cis = precompute_freqs_cis(self.config.rope_dim, self.config.max_context, self.config.rope_theta)
 
 class GatedDeltaNetBlock(FFNBlock):
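The separate `cache_v` can go because `k_store` is `c_kv` concatenated with `k_rope` along the last axis, so the old `v_store` (`c_kv`) is exactly the first `kv_lora_rank` channels of the cached keys. A minimal sketch of that identity, with stand-in sizes rather than the real config values:

```python
from tinygrad import Tensor

kv_lora_rank, rope_dim = 4, 2                 # stand-in sizes, not the real config
c_kv = Tensor.ones(1, 1, 2, kv_lora_rank)     # plays the role of the old v_store
k_rope = Tensor.zeros(1, 1, 2, rope_dim)
k_store = c_kv.cat(k_rope, dim=-1)            # what the single k cache now holds
assert (k_store[..., :kv_lora_rank] == c_kv).all().item()  # v recovered by slicing k
```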
145 changes: 143 additions & 2 deletions tinygrad/mixin/__init__.py
@@ -197,8 +197,10 @@ def _pad_constant(self, pX, value:float) -> Self:
     has_neg = not all(resolve(p >= 0) for p in flatten(pX))
     X = self.shrink(tuple((-smin(pB,0),smin(pA+s,s)) for (pB,pA),s in zip(pX, self.shape))) if has_neg else self
     pads = tuple((smax(pB,0), smax(pA,0)) for pB,pA in pX) if has_neg else pX
-    if value == 0: return MovementMixin.pad(X, pads)
-    return MovementMixin.pad(X, pads) + MovementMixin.pad(X.ones_like(), pads).cast(dtypes.bool).where(0, value)
+    base = MovementMixin.pad(X, pads)
+    if value == 0: return base
+    base = base.cast(least_upper_dtype(base.dtype, dtypes.from_py(value)))
+    return base + MovementMixin.pad(X.ones_like(), pads).cast(dtypes.bool).where(base.zeros_like(), base.full_like(value))
 
   def _ufix_keep_dtype(self, x) -> bool:
     # matches Tensor scalar-wrapping behavior: keep self.dtype for float self, or for int self with int/Invalid scalar
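This is the dtype promotion the new `TestTensorPad` case exercises: an int tensor padded with a float-only value like `-inf` now upcasts so the pad value is representable. A minimal sketch of the user-visible behavior, assuming the top-level `tinygrad` imports:

```python
from tinygrad import Tensor, dtypes

t = Tensor.arange(9).reshape(1, 1, 3, 3)       # dtypes.int
r = t.pad((1, 2, 0, -1), value=-float("inf"))  # -inf does not fit in int
assert r.dtype == dtypes.float                 # promoted via least_upper_dtype
assert r.shape == (1, 1, 2, 6)                 # rows: 3+0-1, cols: 3+1+2
```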
@@ -861,6 +863,145 @@ def linear(self, weight:Self, bias:Self|None=None, dtype:DTypeLike|None=None) ->
     x = self.mul(weight) if len(weight.shape) == 1 else self.dot(weight)
     return x.add(bias) if bias is not None else x
 
+  def _apply_ceil_mode(self, pads:Sequence[int], k_:tuple[sint, ...], s_:int|tuple[int, ...], d_:int|tuple[int, ...]) -> list[int]:
+    (d_,s_), i_ = (make_tuple(x, len(k_)) for x in (d_,s_)), self.shape[-len(k_):]
+    grouped_pads = list(flat_to_grouped(pads))
+    # https://arxiv.org/pdf/1603.07285 section 5.1, relationship 15.
+    o_ = [ceildiv(i+pB+pA - (d*(k-1)+1), s) + 1 for i,d,k,s,(pB,pA) in zip(i_,d_,k_,s_,grouped_pads)]
+    for dim,(o,i,s,k,d,(pB,pA)) in enumerate(zip(o_,i_,s_,k_,d_,grouped_pads)):
+      # we have to do additional padding before `_pool` so that `o_` in `_pool` is calculated correctly
+      # `s*(o-1) + (d*(k-1)+1) - (i+pB+pA)` -> last_sliding_window_start + full_kernel_size - padded_input_shape
+      # we decrease padding in the case that a sliding window starts in the end padded region, thereby decreasing `o_` in `_pool`
+      # `smax(s*(o-1) - (pB+i-1), 0)` -> last_sliding_window_start - (pad_before + input_size - zero_offset)
+      grouped_pads[dim] = (pB, pA + s*(o-1) + (d*(k-1)+1) - (i+pB+pA) - smax(s*(o-1) - (pB+i-1), 0))
+    return flatten(reversed(grouped_pads))
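A quick check of the relationship-15 arithmetic referenced above, for one spatial dim of the docstrings' 5x5 example (2x2 kernel, stride 2, no dilation or padding):

```python
from math import ceil

i, k, s, d, pB, pA = 5, 2, 2, 1, 0, 0                # one spatial dim of the 5x5 input
o_floor = (i + pB + pA - (d*(k-1)+1)) // s + 1       # ceil_mode=False -> 2
o_ceil  = ceil((i + pB + pA - (d*(k-1)+1)) / s) + 1  # ceil_mode=True  -> 3
assert (o_floor, o_ceil) == (2, 3)  # matches the 2x2 vs 3x3 outputs in the docstrings
```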

+  # NOTE: these work for more than 2D
+  def avg_pool2d(self, kernel_size:tuple[int, ...]=(2,2), stride=None, dilation=1, padding:int|tuple[int, ...]=0,
+                 ceil_mode=False, count_include_pad=True) -> Self:
+    """
+    Applies average pooling over a tensor.
+
+    This function supports three different types of `padding`
+
+    1. `int` (single value):
+       Applies the same padding value uniformly to all spatial dimensions.
+
+    2. `tuple[int, ...]` (length = number of spatial dimensions):
+       Specifies a distinct padding value for each spatial dimension in the form `(padding_height, padding_width, ...)`.
+
+    3. `tuple[int, ...]` (length = 2 * number of spatial dimensions):
+       Specifies explicit padding for each side of each spatial dimension in the form
+       `(padding_left, padding_right, padding_top, padding_bottom, ...)`.
+
+    When `ceil_mode` is set to `True`, output shape will be determined using ceil division.
+    When `count_include_pad` is set to `False`, zero padding will not be included in the averaging calculation.
+
+    NOTE: unlike PyTorch, this implementation is not limited to only 2d pooling and instead works for any number of dimensions.
+
+    ```python exec="true" source="above" session="tensor" result="python"
+    t = Tensor.arange(25).reshape(1, 1, 5, 5)
+    print(t.avg_pool2d().numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    print(t.avg_pool2d(ceil_mode=True).numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    print(t.avg_pool2d(padding=1).numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    print(t.avg_pool2d(padding=1, count_include_pad=False).numpy())
+    ```
+    """
+    axis = tuple(range(-len(k_ := make_tuple(kernel_size, 2)), 0))
+    def pool(x:Self, padding_:Sequence[int]) -> Self:
+      return x._pad_constant(((0,0),)*(x.ndim-len(k_)) + flat_to_grouped(padding_), 0.0)._pool(k_, stride if stride is not None else k_, dilation)
+    reg_pads = resolve_pool_pads(padding, len(k_))
+    ceil_pads = self._apply_ceil_mode(reg_pads, k_, stride if stride is not None else k_, dilation)
+    if not count_include_pad:
+      pads = ceil_pads if ceil_mode else reg_pads
+      return pool(self, pads).sum(axis) / pool(self.ones_like(), pads).sum(axis)
+    if not ceil_mode: return pool(self, reg_pads).mean(axis)
+    return pool(self, ceil_pads).sum(axis) / pool(self._pad_constant(((0,0),)*(self.ndim-len(k_)) + flat_to_grouped(reg_pads), 0.0).ones_like(),
+                                                  tuple(cp-rp for cp,rp in zip(ceil_pads, reg_pads))).sum(axis)
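A note on the `count_include_pad=False` branch: pooling `ones_like` with the same pads counts, per window, only the real (non-pad) elements, so the division averages over those alone. A worked corner window for the docstring's `padding=1` example, with a made-up element value:

```python
# top-left window of a 5x5 input, padding=1, 2x2 kernel, stride 2:
# three zero pads plus one real element (say t[0, 0] == 5.0)
window, real_mask = [0.0, 0.0, 0.0, 5.0], [0, 0, 0, 1]
avg_incl = sum(window) / len(window)        # count_include_pad=True  -> 1.25
avg_excl = sum(window) / sum(real_mask)     # count_include_pad=False -> 5.0
assert (avg_incl, avg_excl) == (1.25, 5.0)
```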

+  def max_pool2d(self, kernel_size:tuple[int, ...]=(2,2), stride=None, dilation=1, padding:int|tuple[int, ...]=0,
+                 ceil_mode=False, return_indices=False) -> Self | tuple[Self, Self]:
+    """
+    Applies max pooling over a tensor.
+
+    This function supports three different types of `padding`
+
+    1. `int` (single value):
+       Applies the same padding value uniformly to all spatial dimensions.
+
+    2. `tuple[int, ...]` (length = number of spatial dimensions):
+       Specifies a distinct padding value for each spatial dimension in the form `(padding_height, padding_width, ...)`.
+
+    3. `tuple[int, ...]` (length = 2 * number of spatial dimensions):
+       Specifies explicit padding for each side of each spatial dimension in the form
+       `(padding_left, padding_right, padding_top, padding_bottom, ...)`.
+
+    When `ceil_mode` is set to `True`, output shape will be determined using ceil division.
+    When `return_indices` is set to `True`, the argmax will be returned along with the max values.
+
+    NOTE: unlike PyTorch, this implementation is not limited to only 2d pooling and instead works for any number of dimensions.
+
+    ```python exec="true" source="above" session="tensor" result="python"
+    t = Tensor.arange(25).reshape(1, 1, 5, 5)
+    print(t.max_pool2d().numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    print(t.max_pool2d(ceil_mode=True).numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    print(t.max_pool2d(padding=1).numpy())
+    ```
+    """
+    axis = tuple(range(-len(k_ := make_tuple(kernel_size, 2)), 0))
+    pads = resolve_pool_pads(padding, len(k_))
+    if ceil_mode: pads = self._apply_ceil_mode(pads, k_, stride if stride is not None else k_, dilation)
+    s_ = stride if stride is not None else k_
+    pooled = self._pad_constant(((0,0),)*(self.ndim-len(k_)) + flat_to_grouped(pads), self.dtype.min)._pool(k_, s_, dilation)
+    if not return_indices: return pooled.max(axis)
+    spatial_sz = int(prod(spatial_shape := self.shape[-len(k_):]))
+    idx = type(self).arange(spatial_sz, 0, -1, device=self.device).reshape(spatial_shape)
+    m = pooled.eq(pooled.max(axis, keepdim=True))
+    idx = m * idx._pad_constant(((0,0),)*(idx.ndim-len(k_)) + flat_to_grouped(pads), idx.dtype.min)._pool(k_, s_, dilation)
+    return pooled.max(axis), spatial_sz - idx.max(axis)
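The `return_indices` path rests on a countdown trick: `arange(spatial_sz, 0, -1)` is largest at the smallest flat position, so taking the max of the masked countdown and subtracting from `spatial_sz` yields the first flat index of the window max (the usual tie-breaking convention). A plain-Python sketch for one flattened window spanning the whole spatial input:

```python
vals, spatial_sz = [3, 7, 7, 1], 4                 # tie on the max at flat positions 1 and 2
countdown = list(range(spatial_sz, 0, -1))         # [4, 3, 2, 1]
masked = [c if v == max(vals) else 0 for v, c in zip(vals, countdown)]
assert spatial_sz - max(masked) == 1               # first occurrence wins
```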

+  def max_unpool2d(self, indices:Self, kernel_size:tuple[int, ...]=(2,2), stride=None, dilation=1, padding:int|tuple[int, ...]=0,
+                   output_size=None) -> Self:
+    """
+    Performs a partial inverse of `max_pool2d` using the indices from the argmax.
+
+    When `output_size` is provided, the output shape disambiguates to the provided shape.
+
+    NOTE: unlike PyTorch, this implementation is not limited to only 2d pooling and instead works for any number of dimensions.
+
+    ```python exec="true" source="above" session="tensor" result="python"
+    t = Tensor.arange(1, 17).reshape(1, 1, 4, 4)
+    print(t.numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    output, indices = Tensor.max_pool2d(t, return_indices=True)
+    print(output.numpy())
+    print(indices.numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    print(Tensor.max_unpool2d(output, indices).numpy())
+    ```
+    """
+    bs,c,*spatial_shape = self.shape
+    if output_size is None:
+      k_,d_,s_ = (make_tuple(x, len(spatial_shape)) for x in (kernel_size, dilation, stride if stride is not None else kernel_size))
+      p_ = flat_to_grouped(resolve_pool_pads(padding, len(spatial_shape)))
+      # https://arxiv.org/pdf/1603.07285 inverse of relationship 15 in section 5.1.
+      output_size = tuple((i-1)*s - (pB+pA) + (d*(k-1)+1) for i,k,d,s,(pA,pB) in zip(spatial_shape,k_,d_,s_,p_))
+    else: output_size = output_size[-len(spatial_shape):]
+    ret = (indices.reshape(bs,c,1,-1)._one_hot_along_dim(prod(output_size), 2).where(self.reshape(bs,c,1,-1), 0)).sum(3)
+    return ret.reshape(bs,c,*output_size)
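When `output_size` is omitted, each spatial dim is recovered by inverting relationship 15. For the docstring's 4x4 example (2x2 kernel, stride 2, no padding or dilation), a pooled dim of size 2 maps back to 4:

```python
i, k, d, s, pB, pA = 2, 2, 1, 2, 0, 0     # pooled dim size and pool hyperparameters
out = (i-1)*s - (pB+pA) + (d*(k-1)+1)     # inverse of relationship 15
assert out == 4
```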

   def conv2d(self, weight:Self, bias:Self|None=None, groups=1, stride=1, dilation=1, padding:int|Sequence[int]=0,
              dtype:DTypeLike|None=None) -> Self:
     (bs,cin_), (cout,cin), HW = self.shape[:2], weight.shape[:2], weight.shape[2:]
4 changes: 2 additions & 2 deletions tinygrad/renderer/amd/sqtt.py
@@ -573,7 +573,7 @@ def _build_decode_tables(packet_types: dict[int, type[PacketType]]) -> tuple[dic
   sorted_types = sorted(packet_types.items(), key=lambda x: (-bin(x[1].encoding.mask).count('1'), x[0] == 16))
   state_table = bytes(next((op for op, cls in sorted_types if (b & cls.encoding.mask) == cls.encoding.default), 16) for b in range(256))
   # Build decode info: opcode -> (pkt_cls, nib_count, delta_lo, delta_mask, special_case)
-  # special_case: 0=none, 1=TS_DELTA_OR_MARK (check is_marker), 2=TS_DELTA_SHORT (add 8), 3=CDNA_MISC (*4), 4=CDNA_TIMESTAMP (absolute)
+  # special_case: 0=none, 1=TS_DELTA_OR_MARK (check is_marker), 2=TS_DELTA_SHORT (add 4), 3=CDNA_MISC (*4), 4=CDNA_TIMESTAMP (absolute)
   _special = {TS_DELTA_OR_MARK: 1, TS_DELTA_OR_MARK_RDNA4: 1, TS_DELTA_SHORT: 2, CDNA_MISC: 3, CDNA_TIMESTAMP: 4}
   decode_info = {}
   for opcode, pkt_cls in packet_types.items():
@@ -609,7 +609,7 @@ def decode(data: bytes) -> Iterator[PacketType]:
       if special == 1: # TS_DELTA_OR_MARK
         pkt = pkt_cls.from_raw(reg, 0) # create packet to check is_marker
         if pkt.is_marker: delta = 0
-      elif special == 2: delta += 8 # TS_DELTA_SHORT
+      elif special == 2: delta += 4 # TS_DELTA_SHORT
       elif special == 3: delta *= 4 # CDNA_DELTA
       elif special == 4: # CDNA_TIMESTAMP (absolute timestamp anchoring)
         if (reg >> 4) & 0xfff == 0: # unk_0 == 0 means absolute timestamp