Merged
2 changes: 1 addition & 1 deletion examples/mlperf/model_train.py
@@ -1395,7 +1395,7 @@ def train_llama3():
 
   params = get_parameters(model)
 
-  if getenv("FAKEDATA"):
+  if getenv("EMPTYWEIGHT"):
    for v in get_parameters(model):
      v = v.assign(Tensor.empty(v.shape, dtype=v.dtype))
 
@@ -17,7 +17,7 @@ export FP8=${FP8:-1}
 export ALLREDUCE_CAST=${ALLREDUCE_CAST:-1}
 
 export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
-export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-16} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
+export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
 export GBS=$((BS * GRADIENT_ACC_STEPS))
 
 export MODEL="llama3"
@@ -36,7 +36,7 @@ export DATA_SEED=${DATA_SEED:-5760}
 export JITBEAM=${JITBEAM:-3}
 export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=1
 
-export FAKEDATA=1 BENCHMARK=${BENCHMARK:-10}
+export FAKEDATA=${FAKEDATA:-1} BENCHMARK=${BENCHMARK:-10}
 if [ -z "$FULL_LAYERS" ]; then
   export LLAMA_LAYERS=2
 fi
@@ -17,7 +17,7 @@ export FP8=${FP8:-1}
 export ALLREDUCE_CAST=${ALLREDUCE_CAST:-1}
 
 export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
-export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-16} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
+export DP=${DP:-8} MP=${MP:-1} BS=${BS:-16} EVAL_BS=${EVAL_BS:-8} GRADIENT_ACC_STEPS=${GRADIENT_ACC_STEPS:-2}
 export GBS=$((BS * GRADIENT_ACC_STEPS))
 
 export MODEL="llama3"
@@ -18,7 +18,7 @@ export FP8=1
 export ALLREDUCE_CAST=1
 
 export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
-export DP=8 MP=1 BS=16 EVAL_BS=16 GRADIENT_ACC_STEPS=2
+export DP=8 MP=1 BS=16 EVAL_BS=8 GRADIENT_ACC_STEPS=2
 export GBS=$((BS * GRADIENT_ACC_STEPS))
 
 export MODEL="llama3"
19 changes: 11 additions & 8 deletions extra/sqtt/install_rocprof_decoder.py
@@ -1,20 +1,23 @@
 #!/usr/bin/env python3
-import os, shutil
+import os, platform, shutil, subprocess
 from pathlib import Path
 from tinygrad.helpers import fetch, OSX
 
+VERSION = "0.1.6"
 DEST = Path("/usr/local/lib")
 DEST.mkdir(exist_ok=True)
 
 if __name__ == "__main__":
   if OSX:
-    fp = fetch("https://github.com/ROCm/rocprof-trace-decoder/releases/download/0.1.4/rocprof-trace-decoder-macos-arm64-0.1.4-Darwin.sh")
-    lib = fp.parent/"rocprof-trace-decoder-macos-arm64-0.1.4-Darwin"/"lib"/"librocprof-trace-decoder.dylib"
-    os.chmod(fp, 0o755)
-    os.system(f"sudo {fp} --prefix={fp.parent} --include-subdir")
-    shutil.copy2(lib, DEST)
+    arch = "arm64" if platform.machine() == "arm64" else "x86_64"
+    dmg = fetch(f"https://github.com/ROCm/rocprof-trace-decoder/releases/download/{VERSION}/rocprof-trace-decoder-macos-{arch}-{VERSION}-Darwin.dmg")
+    mnt = Path(subprocess.check_output(["hdiutil", "attach", "-nobrowse", "-readonly", "-mountrandom", "/tmp", str(dmg)],
+                                       text=True).split("\t")[-1].strip())
+    try: shutil.copy2(next(mnt.rglob("librocprof-trace-decoder.dylib")), DEST)
+    finally: subprocess.run(["hdiutil", "detach", str(mnt)], check=True)
+    lib = DEST/"librocprof-trace-decoder.dylib"
   else:
     lib = DEST/"librocprof-trace-decoder.so"
-    os.system("sudo curl -L https://github.com/ROCm/rocprof-trace-decoder/raw/43bf0fef74a83c3c25badfc5a09c0bd39ed8c6f9/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so -o"+str(lib))
+    os.system(f"sudo curl -L https://github.com/ROCm/rocprof-trace-decoder/raw/{VERSION}/releases/linux_glibc_2_28_x86_64/librocprof-trace-decoder.so -o {lib}")
   os.system("sudo ldconfig")
-  print(f"Installed {lib.name} to", DEST)
+  print(f"Installed {lib.name} ({VERSION}) to", DEST)
2 changes: 2 additions & 0 deletions test/mockgpu/usb.py
@@ -202,6 +202,8 @@ def process_cdb(self, cdb:bytes, rlen:int, send_data:bytes|None) -> bytes|None:
     return None
 
 class MockUSB3:
+  @classmethod
+  def list_devices(cls, vendor, dev): return [(0, "usb:mock")]
   def __init__(self, *args, **kwargs):
     self.product, self.is_custom = "", False
   def send_batch(self, cdbs:list[bytes], idata:list[int]|None=None, odata:list[bytes|None]|None=None) -> list[bytes|None]:
9 changes: 9 additions & 0 deletions test/null/test_tensor.py
@@ -189,6 +189,15 @@ class TestTensorDevice(unittest.TestCase):
   def test_create_from_single_device_tuple(self):
     (Tensor([1.0], device=(Device.DEFAULT,)) + Tensor([2.0])).realize()
 
+class TestTensorPad(unittest.TestCase):
+  # padding int tensor with float-only value (like -inf) must promote dtype to fit value
+  def test_pad_int_with_neg_inf(self):
+    t = Tensor.arange(9).reshape(1, 1, 3, 3)
+    self.assertEqual(t.dtype, dtypes.int)
+    r = t.pad((1, 2, 0, -1), value=-float('inf'))
+    self.assertEqual(r.dtype, dtypes.float)
+    self.assertEqual(r.shape, (1, 1, 2, 6))
+
 class TestTensorDeviceMismatch(unittest.TestCase):
   def test_gather(self):
     x = Tensor.empty(3, 4, device="NULL")
19 changes: 19 additions & 0 deletions test/null/test_tensor_uop_mixin.py
@@ -151,6 +151,25 @@ def test_amin(self): self._check(_t(3, 4).float(), Tensor([[0, 1, 0, 1]]*3, dtyp
   def test_mean_exclude_self(self):
     self._check(_t(3, 4).float(), Tensor([[0, 1, 0, 1]]*3, dtype=dtypes.int32), Tensor.ones(3, 4).float(), reduce="mean", include_self=False)
 
+class TestTensorUOpPool(unittest.TestCase):
+  def test_avg_pool2d(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.avg_pool2d())
+  def test_avg_pool2d_padding(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.avg_pool2d(padding=1))
+  def test_avg_pool2d_ceil(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.avg_pool2d(ceil_mode=True))
+  def test_avg_pool2d_no_count_pad(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.avg_pool2d(padding=1, count_include_pad=False))
+  def test_max_pool2d(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.max_pool2d())
+  def test_max_pool2d_padding(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.max_pool2d(padding=1))
+  def test_max_pool2d_ceil(self): _check(self, _t(1, 1, 5, 5).float(), lambda x: x.max_pool2d(ceil_mode=True))
+  def test_max_pool2d_return_indices(self):
+    t = _t(1, 1, 5, 5).float()
+    vt, it = t.max_pool2d(return_indices=True)
+    vu, iu = t.uop.max_pool2d(return_indices=True)
+    self.assertIs(_strip_unique(vt.uop), _strip_unique(vu))
+    self.assertIs(_strip_unique(it.uop), _strip_unique(iu))
+  def test_max_unpool2d(self):
+    t = _t(1, 1, 4, 4).float()
+    out, idx = t.max_pool2d(return_indices=True)
+    self.assertIs(_strip_unique(out.max_unpool2d(idx).uop), _strip_unique(out.uop.max_unpool2d(idx.uop)))
+
 class TestTensorUOpCat(unittest.TestCase):
   def test_cat_dim0(self): _check(self, _t(2, 3), lambda x: x.cat(x, dim=0))
   def test_cat_dim1(self): _check(self, _t(2, 3), lambda x: x.cat(x, dim=1))
4 changes: 1 addition & 3 deletions tinygrad/llm/model.py
@@ -219,9 +219,8 @@ def _attention(self, x:Tensor, start_pos:int|UOp) -> Tensor:
                       self.freqs_cis[start_pos:start_pos+T])
 
     k_store = c_kv.reshape(B, 1, T, self.config.kv_lora_rank).cat(k_rope.reshape(B, 1, T, self.config.rope_dim), dim=-1)
-    v_store = c_kv.reshape(B, 1, T, self.config.kv_lora_rank)
     k = Tensor(self.cache_k.uop.after(self.cache_k[:, :, start_pos:start_pos+T, :].uop.store(k_store.uop)))[:, :, 0:start_pos+T, :]
-    v = Tensor(self.cache_v.uop.after(self.cache_v[:, :, start_pos:start_pos+T, :].uop.store(v_store.uop)))[:, :, 0:start_pos+T, :]
+    v = k[..., :self.config.kv_lora_rank]
 
     mask = Tensor.full((1, 1, T, start_pos+T), float("-inf"), dtype=x.dtype, device=x.device).triu(start_pos+1) if resolve(T != 1) else None
     attn = q @ k.transpose(-1, -2) * (1.0 / self.config.head_dim ** 0.5)
@@ -233,7 +232,6 @@ def _attention(self, x:Tensor, start_pos:int|UOp) -> Tensor:
   def _init_state(self, x:Tensor):
     if not hasattr(self, "cache_k"):
       self.cache_k = Tensor.empty(x.shape[0], 1, self.config.max_context, self.config.kv_lora_rank + self.config.rope_dim, device=x.device)
-      self.cache_v = Tensor.empty(x.shape[0], 1, self.config.max_context, self.config.kv_lora_rank, device=x.device)
       self.freqs_cis = precompute_freqs_cis(self.config.rope_dim, self.config.max_context, self.config.rope_theta)
 
 class GatedDeltaNetBlock(FFNBlock):
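The separate `cache_v` can go because `k_store` is `c_kv` concatenated with `k_rope` along the last axis, so the old `v_store` (`c_kv`) is exactly the first `kv_lora_rank` channels of the cached keys. A minimal sketch of that identity, with stand-in sizes rather than the real config values:

```python
from tinygrad import Tensor

kv_lora_rank, rope_dim = 4, 2                 # stand-in sizes, not the real config
c_kv = Tensor.ones(1, 1, 2, kv_lora_rank)     # plays the role of the old v_store
k_rope = Tensor.zeros(1, 1, 2, rope_dim)
k_store = c_kv.cat(k_rope, dim=-1)            # what the single k cache now holds
assert (k_store[..., :kv_lora_rank] == c_kv).all().item()  # v recovered by slicing k
```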
145 changes: 143 additions & 2 deletions tinygrad/mixin/__init__.py
@@ -197,8 +197,10 @@ def _pad_constant(self, pX, value:float) -> Self:
     has_neg = not all(resolve(p >= 0) for p in flatten(pX))
     X = self.shrink(tuple((-smin(pB,0),smin(pA+s,s)) for (pB,pA),s in zip(pX, self.shape))) if has_neg else self
     pads = tuple((smax(pB,0), smax(pA,0)) for pB,pA in pX) if has_neg else pX
-    if value == 0: return MovementMixin.pad(X, pads)
-    return MovementMixin.pad(X, pads) + MovementMixin.pad(X.ones_like(), pads).cast(dtypes.bool).where(0, value)
+    base = MovementMixin.pad(X, pads)
+    if value == 0: return base
+    base = base.cast(least_upper_dtype(base.dtype, dtypes.from_py(value)))
+    return base + MovementMixin.pad(X.ones_like(), pads).cast(dtypes.bool).where(base.zeros_like(), base.full_like(value))
 
   def _ufix_keep_dtype(self, x) -> bool:
     # matches Tensor scalar-wrapping behavior: keep self.dtype for float self, or for int self with int/Invalid scalar
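This is the dtype promotion the new `TestTensorPad` case exercises: an int tensor padded with a float-only value like `-inf` now upcasts so the pad value is representable. A minimal sketch of the user-visible behavior, assuming the top-level `tinygrad` imports:

```python
from tinygrad import Tensor, dtypes

t = Tensor.arange(9).reshape(1, 1, 3, 3)       # dtypes.int
r = t.pad((1, 2, 0, -1), value=-float("inf"))  # -inf does not fit in int
assert r.dtype == dtypes.float                 # promoted via least_upper_dtype
assert r.shape == (1, 1, 2, 6)                 # rows: 3+0-1, cols: 3+1+2
```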
@@ -861,6 +863,145 @@ def linear(self, weight:Self, bias:Self|None=None, dtype:DTypeLike|None=None) ->
     x = self.mul(weight) if len(weight.shape) == 1 else self.dot(weight)
     return x.add(bias) if bias is not None else x
 
+  def _apply_ceil_mode(self, pads:Sequence[int], k_:tuple[sint, ...], s_:int|tuple[int, ...], d_:int|tuple[int, ...]) -> list[int]:
+    (d_,s_), i_ = (make_tuple(x, len(k_)) for x in (d_,s_)), self.shape[-len(k_):]
+    grouped_pads = list(flat_to_grouped(pads))
+    # https://arxiv.org/pdf/1603.07285 section 5.1, relationship 15.
+    o_ = [ceildiv(i+pB+pA - (d*(k-1)+1), s) + 1 for i,d,k,s,(pB,pA) in zip(i_,d_,k_,s_,grouped_pads)]
+    for dim,(o,i,s,k,d,(pB,pA)) in enumerate(zip(o_,i_,s_,k_,d_,grouped_pads)):
+      # we have to do additional padding before `_pool` so that `o_` in `_pool` is calculated correctly
+      # `s*(o-1) + (d*(k-1)+1) - (i+pB+pA)` -> last_sliding_window_start + full_kernel_size - padded_input_shape
+      # we decrease padding in the case that a sliding window starts in the end padded region, thereby decreasing `o_` in `_pool`
+      # `smax(s*(o-1) - (pB+i-1), 0)` -> last_sliding_window_start - (pad_before + input_size - zero_offset)
+      grouped_pads[dim] = (pB, pA + s*(o-1) + (d*(k-1)+1) - (i+pB+pA) - smax(s*(o-1) - (pB+i-1), 0))
+    return flatten(reversed(grouped_pads))
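A quick check of the relationship-15 arithmetic referenced above, for one spatial dim of the docstrings' 5x5 example (2x2 kernel, stride 2, no dilation or padding):

```python
from math import ceil

i, k, s, d, pB, pA = 5, 2, 2, 1, 0, 0                # one spatial dim of the 5x5 input
o_floor = (i + pB + pA - (d*(k-1)+1)) // s + 1       # ceil_mode=False -> 2
o_ceil  = ceil((i + pB + pA - (d*(k-1)+1)) / s) + 1  # ceil_mode=True  -> 3
assert (o_floor, o_ceil) == (2, 3)  # matches the 2x2 vs 3x3 outputs in the docstrings
```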

+  # NOTE: these work for more than 2D
+  def avg_pool2d(self, kernel_size:tuple[int, ...]=(2,2), stride=None, dilation=1, padding:int|tuple[int, ...]=0,
+                 ceil_mode=False, count_include_pad=True) -> Self:
+    """
+    Applies average pooling over a tensor.
+
+    This function supports three different types of `padding`
+
+    1. `int` (single value):
+       Applies the same padding value uniformly to all spatial dimensions.
+
+    2. `tuple[int, ...]` (length = number of spatial dimensions):
+       Specifies a distinct padding value for each spatial dimension in the form `(padding_height, padding_width, ...)`.
+
+    3. `tuple[int, ...]` (length = 2 * number of spatial dimensions):
+       Specifies explicit padding for each side of each spatial dimension in the form
+       `(padding_left, padding_right, padding_top, padding_bottom, ...)`.
+
+    When `ceil_mode` is set to `True`, output shape will be determined using ceil division.
+    When `count_include_pad` is set to `False`, zero padding will not be included in the averaging calculation.
+
+    NOTE: unlike PyTorch, this implementation is not limited to only 2d pooling and instead works for any number of dimensions.
+
+    ```python exec="true" source="above" session="tensor" result="python"
+    t = Tensor.arange(25).reshape(1, 1, 5, 5)
+    print(t.avg_pool2d().numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    print(t.avg_pool2d(ceil_mode=True).numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    print(t.avg_pool2d(padding=1).numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    print(t.avg_pool2d(padding=1, count_include_pad=False).numpy())
+    ```
+    """
+    axis = tuple(range(-len(k_ := make_tuple(kernel_size, 2)), 0))
+    def pool(x:Self, padding_:Sequence[int]) -> Self:
+      return x._pad_constant(((0,0),)*(x.ndim-len(k_)) + flat_to_grouped(padding_), 0.0)._pool(k_, stride if stride is not None else k_, dilation)
+    reg_pads = resolve_pool_pads(padding, len(k_))
+    ceil_pads = self._apply_ceil_mode(reg_pads, k_, stride if stride is not None else k_, dilation)
+    if not count_include_pad:
+      pads = ceil_pads if ceil_mode else reg_pads
+      return pool(self, pads).sum(axis) / pool(self.ones_like(), pads).sum(axis)
+    if not ceil_mode: return pool(self, reg_pads).mean(axis)
+    return pool(self, ceil_pads).sum(axis) / pool(self._pad_constant(((0,0),)*(self.ndim-len(k_)) + flat_to_grouped(reg_pads), 0.0).ones_like(),
+                                                  tuple(cp-rp for cp,rp in zip(ceil_pads, reg_pads))).sum(axis)
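A note on the `count_include_pad=False` branch: pooling `ones_like` with the same pads counts, per window, only the real (non-pad) elements, so the division averages over those alone. A worked corner window for the docstring's `padding=1` example, with a made-up element value:

```python
# top-left window of a 5x5 input, padding=1, 2x2 kernel, stride 2:
# three zero pads plus one real element (say t[0, 0] == 5.0)
window, real_mask = [0.0, 0.0, 0.0, 5.0], [0, 0, 0, 1]
avg_incl = sum(window) / len(window)        # count_include_pad=True  -> 1.25
avg_excl = sum(window) / sum(real_mask)     # count_include_pad=False -> 5.0
assert (avg_incl, avg_excl) == (1.25, 5.0)
```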

+  def max_pool2d(self, kernel_size:tuple[int, ...]=(2,2), stride=None, dilation=1, padding:int|tuple[int, ...]=0,
+                 ceil_mode=False, return_indices=False) -> Self | tuple[Self, Self]:
+    """
+    Applies max pooling over a tensor.
+
+    This function supports three different types of `padding`
+
+    1. `int` (single value):
+       Applies the same padding value uniformly to all spatial dimensions.
+
+    2. `tuple[int, ...]` (length = number of spatial dimensions):
+       Specifies a distinct padding value for each spatial dimension in the form `(padding_height, padding_width, ...)`.
+
+    3. `tuple[int, ...]` (length = 2 * number of spatial dimensions):
+       Specifies explicit padding for each side of each spatial dimension in the form
+       `(padding_left, padding_right, padding_top, padding_bottom, ...)`.
+
+    When `ceil_mode` is set to `True`, output shape will be determined using ceil division.
+    When `return_indices` is set to `True`, the argmax will be returned along with the max values.
+
+    NOTE: unlike PyTorch, this implementation is not limited to only 2d pooling and instead works for any number of dimensions.
+
+    ```python exec="true" source="above" session="tensor" result="python"
+    t = Tensor.arange(25).reshape(1, 1, 5, 5)
+    print(t.max_pool2d().numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    print(t.max_pool2d(ceil_mode=True).numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    print(t.max_pool2d(padding=1).numpy())
+    ```
+    """
+    axis = tuple(range(-len(k_ := make_tuple(kernel_size, 2)), 0))
+    pads = resolve_pool_pads(padding, len(k_))
+    if ceil_mode: pads = self._apply_ceil_mode(pads, k_, stride if stride is not None else k_, dilation)
+    s_ = stride if stride is not None else k_
+    pooled = self._pad_constant(((0,0),)*(self.ndim-len(k_)) + flat_to_grouped(pads), self.dtype.min)._pool(k_, s_, dilation)
+    if not return_indices: return pooled.max(axis)
+    spatial_sz = int(prod(spatial_shape := self.shape[-len(k_):]))
+    idx = type(self).arange(spatial_sz, 0, -1, device=self.device).reshape(spatial_shape)
+    m = pooled.eq(pooled.max(axis, keepdim=True))
+    idx = m * idx._pad_constant(((0,0),)*(idx.ndim-len(k_)) + flat_to_grouped(pads), idx.dtype.min)._pool(k_, s_, dilation)
+    return pooled.max(axis), spatial_sz - idx.max(axis)
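The `return_indices` path rests on a countdown trick: `arange(spatial_sz, 0, -1)` is largest at the smallest flat position, so taking the max of the masked countdown and subtracting from `spatial_sz` yields the first flat index of the window max (the usual tie-breaking convention). A plain-Python sketch for one flattened window spanning the whole spatial input:

```python
vals, spatial_sz = [3, 7, 7, 1], 4                 # tie on the max at flat positions 1 and 2
countdown = list(range(spatial_sz, 0, -1))         # [4, 3, 2, 1]
masked = [c if v == max(vals) else 0 for v, c in zip(vals, countdown)]
assert spatial_sz - max(masked) == 1               # first occurrence wins
```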

+  def max_unpool2d(self, indices:Self, kernel_size:tuple[int, ...]=(2,2), stride=None, dilation=1, padding:int|tuple[int, ...]=0,
+                   output_size=None) -> Self:
+    """
+    Performs a partial inverse of `max_pool2d` using the indices from the argmax.
+
+    When `output_size` is provided, the output shape disambiguates to the provided shape.
+
+    NOTE: unlike PyTorch, this implementation is not limited to only 2d pooling and instead works for any number of dimensions.
+
+    ```python exec="true" source="above" session="tensor" result="python"
+    t = Tensor.arange(1, 17).reshape(1, 1, 4, 4)
+    print(t.numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    output, indices = Tensor.max_pool2d(t, return_indices=True)
+    print(output.numpy())
+    print(indices.numpy())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    print(Tensor.max_unpool2d(output, indices).numpy())
+    ```
+    """
+    bs,c,*spatial_shape = self.shape
+    if output_size is None:
+      k_,d_,s_ = (make_tuple(x, len(spatial_shape)) for x in (kernel_size, dilation, stride if stride is not None else kernel_size))
+      p_ = flat_to_grouped(resolve_pool_pads(padding, len(spatial_shape)))
+      # https://arxiv.org/pdf/1603.07285 inverse of relationship 15 in section 5.1.
+      output_size = tuple((i-1)*s - (pB+pA) + (d*(k-1)+1) for i,k,d,s,(pA,pB) in zip(spatial_shape,k_,d_,s_,p_))
+    else: output_size = output_size[-len(spatial_shape):]
+    ret = (indices.reshape(bs,c,1,-1)._one_hot_along_dim(prod(output_size), 2).where(self.reshape(bs,c,1,-1), 0)).sum(3)
+    return ret.reshape(bs,c,*output_size)
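When `output_size` is omitted, each spatial dim is recovered by inverting relationship 15. For the docstring's 4x4 example (2x2 kernel, stride 2, no padding or dilation), a pooled dim of size 2 maps back to 4:

```python
i, k, d, s, pB, pA = 2, 2, 1, 2, 0, 0     # pooled dim size and pool hyperparameters
out = (i-1)*s - (pB+pA) + (d*(k-1)+1)     # inverse of relationship 15
assert out == 4
```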

   def conv2d(self, weight:Self, bias:Self|None=None, groups=1, stride=1, dilation=1, padding:int|Sequence[int]=0,
              dtype:DTypeLike|None=None) -> Self:
     (bs,cin_), (cout,cin), HW = self.shape[:2], weight.shape[:2], weight.shape[2:]
4 changes: 2 additions & 2 deletions tinygrad/renderer/amd/sqtt.py
@@ -573,7 +573,7 @@ def _build_decode_tables(packet_types: dict[int, type[PacketType]]) -> tuple[dic
   sorted_types = sorted(packet_types.items(), key=lambda x: (-bin(x[1].encoding.mask).count('1'), x[0] == 16))
   state_table = bytes(next((op for op, cls in sorted_types if (b & cls.encoding.mask) == cls.encoding.default), 16) for b in range(256))
   # Build decode info: opcode -> (pkt_cls, nib_count, delta_lo, delta_mask, special_case)
-  # special_case: 0=none, 1=TS_DELTA_OR_MARK (check is_marker), 2=TS_DELTA_SHORT (add 8), 3=CDNA_MISC (*4), 4=CDNA_TIMESTAMP (absolute)
+  # special_case: 0=none, 1=TS_DELTA_OR_MARK (check is_marker), 2=TS_DELTA_SHORT (add 4), 3=CDNA_MISC (*4), 4=CDNA_TIMESTAMP (absolute)
   _special = {TS_DELTA_OR_MARK: 1, TS_DELTA_OR_MARK_RDNA4: 1, TS_DELTA_SHORT: 2, CDNA_MISC: 3, CDNA_TIMESTAMP: 4}
   decode_info = {}
   for opcode, pkt_cls in packet_types.items():
@@ -609,7 +609,7 @@ def decode(data: bytes) -> Iterator[PacketType]:
       if special == 1: # TS_DELTA_OR_MARK
         pkt = pkt_cls.from_raw(reg, 0) # create packet to check is_marker
         if pkt.is_marker: delta = 0
-      elif special == 2: delta += 8 # TS_DELTA_SHORT
+      elif special == 2: delta += 4 # TS_DELTA_SHORT
       elif special == 3: delta *= 4 # CDNA_DELTA
       elif special == 4: # CDNA_TIMESTAMP (absolute timestamp anchoring)
         if (reg >> 4) & 0xfff == 0: # unk_0 == 0 means absolute timestamp