41 changes: 17 additions & 24 deletions .github/workflows/test.yml
@@ -643,8 +643,7 @@ jobs:
     runs-on: ubuntu-24.04
     timeout-minutes: 20
     env:
-      DEV: AMD
-      MOCKGPU: 1
+      DEV: MOCKKFD+AMD
     steps:
     - name: Checkout Code
       uses: actions/checkout@v6
@@ -670,7 +669,7 @@ jobs:
     - name: Run AMD renderer tests
       run: python -m pytest -n=auto test/amd/ --durations 20
     - name: Run AMD renderer tests (AMD:LLVM)
-      run: DEV=AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
+      run: DEV=MOCKKFD+AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
     - name: Run SQTT profiling tests
       run: PROFILE=1 SQTT=1 python3 -m pytest -n=auto test/amd/test_sqtt_profiler.py
     - name: Run AMD emulated tests on NULL backend
@@ -679,20 +678,19 @@ jobs:
       run: |
         PYTHONPATH=. DEV=NULL::gfx1100 python extra/mmapeak/mmapeak.py
         PYTHONPATH=. DEV=NULL::gfx1201 python3 -m pytest -n=auto test/testextra/test_tk.py test/backend/test_asm_gemm.py
-    - name: Run matmul on MOCKGPU
+    - name: Run matmul on MOCKKFD
       run: |
-        PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_asm_matmul.py
-        PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_copy_matmul.py
+        PYTHONPATH="." DEV=MOCKKFD+AMD N=256 python3 extra/gemm/amd_asm_matmul.py
+        PYTHONPATH="." DEV=MOCKKFD+AMD N=256 python3 extra/gemm/amd_copy_matmul.py
     - name: Run LLVM test
-      run: DEV=AMD:LLVM python test/device/test_amd_llvm.py
+      run: DEV=MOCKKFD+AMD:LLVM python test/device/test_amd_llvm.py

   testmockam:
     name: Linux (am)
     runs-on: ubuntu-24.04
     timeout-minutes: 15
     env:
-      DEV: PCI+AMD
-      MOCKGPU: 1
+      DEV: MOCKPCI+AMD
     steps:
     - name: Checkout Code
       uses: actions/checkout@v6
@@ -704,13 +702,13 @@ jobs:
         amd: 'true'
     - name: Run test_tiny on MOCKAM
       run: python test/test_tiny.py
-    - name: Run test_tiny on MOCKAM USB
-      run: GMMU=0 DEV=USB+AMD python test/test_tiny.py
-    - name: Run test_hcq on MOCKAM
+    - name: Run test_tiny on MOCKUSB
+      run: GMMU=0 DEV=MOCKUSB+AMD python test/test_tiny.py
+    - name: Run test_hcq on MOCKPCI
       run: python -m pytest test/device/test_hcq.py
-    - name: Run disk copy tests on MOCKAM
+    - name: Run disk copy tests on MOCKPCI
       run: python -m pytest test/unit/test_disk_tensor.py -k test_copy_from_disk
-    - name: Run test_tiny on MOCKAM Remote
+    - name: Run test_tiny on MOCKPCI Remote
       run: |
         python extra/remote/serve.py 6667 &
         sleep 2
@@ -728,8 +726,7 @@ jobs:
     runs-on: ubuntu-22.04
     timeout-minutes: 15
     env:
-      DEV: AMD:${{ matrix.backend == 'amdllvm' && 'LLVM' || '' }}:${{ matrix.arch }}
-      MOCKGPU: 1
+      DEV: MOCKKFD+AMD:${{ matrix.backend == 'amdllvm' && 'LLVM' || '' }}:${{ matrix.arch }}
       SKIP_SLOW_TEST: 1
     steps:
     - name: Checkout Code
@@ -764,7 +761,6 @@ jobs:
     runs-on: ubuntu-22.04
     timeout-minutes: 20
     env:
-      MOCKGPU: 1
       FORWARD_ONLY: 1
     steps:
     - name: Checkout Code
@@ -777,7 +773,7 @@ jobs:
         cuda: 'true'
         ocelot: 'true'
     - name: Set env
-      run: printf "${{ matrix.backend == 'ptx' && 'DEV=CUDA:PTX' || matrix.backend == 'nv' && 'DEV=NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
+      run: printf "${{ matrix.backend == 'ptx' && 'DEV=MOCK+CUDA:PTX' || matrix.backend == 'nv' && 'DEV=MOCKNVK+NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
     - name: Check Device.DEFAULT and print some source
       run: |
         python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
@@ -862,22 +858,19 @@ jobs:
       run: DEV=METAL TRANSCENDENTAL=2 python -m pytest -n=auto test/backend/test_ops.py::TestOps::test_sin test/backend/test_ops.py::TestOps::test_cos test/backend/test_ops.py::TestOps::test_tan test/backend/test_ops.py::TestOps::test_exp test/backend/test_ops.py::TestOps::test_log --durations=20
     - name: Run pytest (amd)
       env:
-        MOCKGPU: 1
-        DEV: AMD
+        DEV: MOCKKFD+AMD
         FORWARD_ONLY: 1
       run: |
         python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
     - name: Run pytest (amd with llvm backend)
       env:
-        MOCKGPU: 1
-        DEV: "AMD:LLVM"
+        DEV: "MOCKKFD+AMD:LLVM"
         FORWARD_ONLY: 1
       run: |
         python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
     - name: Run pytest (ptx)
       env:
-        MOCKGPU: 1
-        DEV: "NV:PTX"
+        DEV: "MOCKNVK+NV:PTX"
         FORWARD_ONLY: 1
         # TODO: failing due to library loading error
         CAPTURE_PROCESS_REPLAY: 0
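The hunks above replace the old MOCKGPU=1 switch with mock interfaces named directly in the device spec (MOCKKFD, MOCKPCI, MOCKUSB, MOCKNVK). Tests then detect an emulated run from the spec itself. Below is a minimal sketch of that check, using only what the later hunks show (the DEV helper exported from tinygrad.helpers with an interface attribute); the function name running_on_mock is made up for illustration, not part of the PR:

# sketch: the device spec now carries the mock interface, so tests inspect
# DEV.interface instead of reading a MOCKGPU env var
from tinygrad.helpers import DEV  # parsed device spec, per the imports below

def running_on_mock() -> bool:  # hypothetical helper, not in the PR
    # True for e.g. DEV=MOCKKFD+AMD, DEV=MOCKPCI+AMD, DEV=MOCKNVK+NV
    return DEV.interface.startswith("MOCK")

Usage mirrors the workflow steps above: DEV=MOCKKFD+AMD python -m pytest -n=auto test/amd/ runs the AMD tests on the emulator with no extra flag.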
6 changes: 3 additions & 3 deletions docs/abstractions4.py
@@ -1,9 +1,9 @@
 # tinygrad allows you to write kernels at many different abstractions levels.
 # This is for RDNA3, but if you don't have one you can run with the emulator
-# PYTHONPATH="." MOCKGPU=1 DEV=AMD
+# PYTHONPATH="." DEV=MOCKPCI+AMD

 from tinygrad import Tensor, Context, GlobalCounters, UOp, Device
-from tinygrad.helpers import DEBUG, getenv
+from tinygrad.helpers import DEV, DEBUG, getenv
 from tinygrad.uop.ops import AxisType, KernelInfo, Ops
 from tinygrad.dtype import AddrSpace, dtypes
 from tinygrad.runtime.autogen.amd.rdna3.ins import *
@@ -16,7 +16,7 @@ def eval_harness(name, tensor, fxn, check=None):
   print(f"computed in {GlobalCounters.time_sum_s*1000:.2f} ms, {(a.nbytes()/1e9)/GlobalCounters.time_sum_s:.2f} GB/s")
   return out

-SZ = 256*1024 if getenv("MOCKGPU") else 1024*1024*1024
+SZ = 256*1024 if DEV.interface.startswith("MOCK") else 1024*1024*1024

 def example_2_hip(a:Tensor, correct):
   GLOBALS = 1024
2 changes: 1 addition & 1 deletion examples/mlperf/model_train.py
@@ -1438,7 +1438,7 @@ def train_llama3():
   if FP8:
     from tinygrad.nn.state import get_state_dict
     model_state = get_state_dict(model)
-    for wname in ["wqkv", "wo", "w1", "w2", "w3"]:
+    for wname in ["wqkv", "wo", "w13", "w2"]:
       w = model_state[wname]
       w._inv_scale = model._fp8_inv_scale[wname]
   if optim.master_params:
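For context on the _inv_scale wiring above: the FP8 matmuls in flat_llama.py (next file) quantize activations and weights with per-tensor scales, then undo both scales on the float accumulator, i.e. out = (x_fp8 @ w_fp8.T) * x_inv_scale * w_inv_scale. A minimal numeric sketch of that identity, with plain float arrays standing in for the fp8 casts; the shapes, values, and the 448.0 constant (the e4m3 maximum, standing in for flat_llama's FP8_MAX) are illustrative assumptions, not from the PR:

import numpy as np

FP8_MAX = 448.0  # e4m3 max magnitude; a stand-in for flat_llama's FP8_MAX
x = np.random.randn(4, 8).astype(np.float32)
w = np.random.randn(16, 8).astype(np.float32)

# quantize: scale so the largest magnitude maps to FP8_MAX (fp8 cast omitted)
x_scale = FP8_MAX / np.abs(x).max()
w_scale = FP8_MAX / np.abs(w).max()
x_q, w_q = x * x_scale, w * w_scale

# dequantize the product with both inverse scales, as matmul_fp8_precomputed does
out = (x_q @ w_q.T) * (1.0 / x_scale) * (1.0 / w_scale)
assert np.allclose(out, x @ w.T, atol=1e-3)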
54 changes: 30 additions & 24 deletions examples/mlperf/models/flat_llama.py
@@ -55,6 +55,12 @@ def matmul(x:Tensor, w:Tensor, fp8=FP8, amax_x:Tensor|None=None, w_inv_scale:Ten
     if can_use_asm_gemm(x_fp8, w.T): return asm_gemm(x_fp8, w.T, x_scale=x_scale, w_scale=w_inv_scale), x_new_amax, x_fp8, w
   return x_fp8.dot(w.T, dtype=dtypes.float) * x_scale * w_inv_scale, x_new_amax, x_fp8, w

+def matmul_fp8_precomputed(x_fp8:Tensor, x_inv_scale:Tensor, x_new_amax:Tensor, w:Tensor, w_inv_scale:Tensor) -> tuple[Tensor,...]:
+  if getenv("ASM_GEMM"):
+    from extra.gemm.cdna_asm_gemm import can_use_asm_gemm, asm_gemm
+    if can_use_asm_gemm(x_fp8, w.T): return asm_gemm(x_fp8, w.T, x_scale=x_inv_scale, w_scale=w_inv_scale), x_new_amax, x_fp8, w
+  return x_fp8.dot(w.T, dtype=dtypes.float) * x_inv_scale * w_inv_scale, x_new_amax, x_fp8, w
+
 def _rmsnorm_fwd(x_in:Tensor, eps:float) -> tuple[Tensor, Tensor]:
   x = x_in.float()
   rrms = (x.square().mean(-1, keepdim=True) + eps).rsqrt()
@@ -84,6 +90,7 @@ def __init__(self, dim:int, hidden_dim:int, n_heads:int, n_layers:int, norm_eps:
     self.n_kv_heads = n_kv_heads if n_kv_heads is not None else n_heads # n_kv_heads != n_heads implies MQA [arxiv/2307.09288, A.2.1]
     self.head_dim = dim // n_heads
     self.n_rep = self.n_heads // self.n_kv_heads
+    self.hidden_dim = hidden_dim

     scaled_std = 0.02 / math.sqrt(2 * n_layers)

@@ -93,9 +100,8 @@ def __init__(self, dim:int, hidden_dim:int, n_heads:int, n_layers:int, norm_eps:
     self.wo = self.lin_per_layer(self.n_heads * self.head_dim, dim, std=scaled_std)

     # FeedForward
-    self.w1 = self.lin_per_layer(dim, hidden_dim)
+    self.w13 = self.lin_per_layer(dim, hidden_dim * 2)
     self.w2 = self.lin_per_layer(hidden_dim, dim, std=scaled_std)
-    self.w3 = self.lin_per_layer(dim, hidden_dim)

     self.norm_eps = norm_eps
     self.attention_norm = Tensor.ones(n_layers, dim).contiguous()
@@ -110,10 +116,10 @@ def __init__(self, dim:int, hidden_dim:int, n_heads:int, n_layers:int, norm_eps:

     if FP8:
       def _amax(): return Tensor.full((), FP8_MAX).contiguous().requires_grad_(False)
-      names = ["xqkv", "xo", "x1", "x2", "x3"]
+      names = ["xqkv", "xo", "x13", "x2"]
       self._fp8_amax = {name: [_amax() for _ in range(n_layers)] for name in names}
       # per-weight inv_scale: single (n_layers,) float32 tensor per weight (kernel reads float* pointers)
-      w_names = ["wqkv", "wo", "w1", "w2", "w3"]
+      w_names = ["wqkv", "wo", "w13", "w2"]
       self._fp8_inv_scale = {}
       for wname, inv_scales in zip(w_names, self._init_inv_scales):
         self._fp8_inv_scale[wname] = inv_scales.float().contiguous().requires_grad_(False)
@@ -162,20 +168,21 @@ def attention(self, x:Tensor, freqs_cis:Tensor, attention_norm:Tensor, wqkv:Tens
     saves.extend(ret[1:] + [out])
     return (out, *new_amaxs, *saves)

-  def feed_forward(self, x:Tensor, ffn_norm:Tensor, w1:Tensor, w2:Tensor, w3:Tensor,
-                   amax_x1=None, amax_x2=None, amax_x3=None, s_1=None, s_2=None, s_3=None):
+  def feed_forward(self, x:Tensor, ffn_norm:Tensor, w13:Tensor, w2:Tensor,
+                   amax_x13=None, amax_x2=None, s_13=None, s_2=None):
     new_amaxs, saves = [], []

     x, rrms = rmsnorm(x, self.norm_eps)
     saves.extend([x, rrms])
     x = x * ffn_norm

-    x_w1, *ret = matmul(x, w1, amax_x=amax_x1, w_inv_scale=s_1)
+    x_w13, *ret = matmul(x, w13, amax_x=amax_x13, w_inv_scale=s_13)
     new_amaxs.extend(ret[:1])
-    saves.extend(ret[1:] + [x_w1])
-    x_w3, *ret = matmul(x.contiguous_backward(), w3, amax_x=amax_x3, w_inv_scale=s_3)
-    new_amaxs.extend(ret[:1])
-    saves.extend(ret[1:] + [x_w3])
+    saves.extend(ret[1:] + [x_w13])
+
+    x_w1 = x_w13[..., :self.hidden_dim]
+    x_w3 = x_w13[..., self.hidden_dim:]
+
     out, *ret = matmul(x_w1.silu() * x_w3, w2, amax_x=amax_x2, w_inv_scale=s_2)
     new_amaxs.extend(ret[:1])
     saves.extend(ret[1:] + [out])
@@ -184,19 +191,19 @@ def feed_forward(self, x:Tensor, ffn_norm:Tensor, w1:Tensor, w2:Tensor, w3:Tenso
   @function(precompile=True, precompile_backward=True)
   def run_layer(self, x:Tensor, freqs_cis:Tensor,
                 attention_norm:Tensor, wqkv:Tensor, wo:Tensor,
-                ffn_norm:Tensor, w1:Tensor, w2:Tensor, w3:Tensor,
+                ffn_norm:Tensor, w13:Tensor, w2:Tensor,
                 amax_xqkv=None, amax_xo=None,
-                amax_x1=None, amax_x2=None, amax_x3=None,
-                s_qkv=None, s_o=None, s_1=None, s_2=None, s_3=None):
+                amax_x13=None, amax_x2=None,
+                s_qkv=None, s_o=None, s_13=None, s_2=None):
     attn, *attn_ret = self.attention(x, freqs_cis, attention_norm, wqkv, wo,
                                      amax_xqkv=amax_xqkv, amax_xo=amax_xo,
                                      s_qkv=s_qkv, s_o=s_o)
     attn_amaxs, attn_saves = attn_ret[:2], attn_ret[2:]
     h = x + attn
-    ffn, *ffn_ret = self.feed_forward(h, ffn_norm, w1, w2, w3,
-                                      amax_x1=amax_x1, amax_x2=amax_x2, amax_x3=amax_x3,
-                                      s_1=s_1, s_2=s_2, s_3=s_3)
-    ffn_amaxs, ffn_saves = ffn_ret[:3], ffn_ret[3:]
+    ffn, *ffn_ret = self.feed_forward(h, ffn_norm, w13, w2,
+                                      amax_x13=amax_x13, amax_x2=amax_x2,
+                                      s_13=s_13, s_2=s_2)
+    ffn_amaxs, ffn_saves = ffn_ret[:2], ffn_ret[2:]
     h = h + ffn
     return (h, *attn_amaxs, *ffn_amaxs, *attn_saves, *ffn_saves)

@@ -208,9 +215,8 @@ def shard(self, device:tuple[str, ...], mp:bool=False):
     # flat per-layer weights: axis 0 is n_layers, so shard axes are +1 vs per-layer Transformer
     self.wqkv.shard_(device, axis=1).realize() # (n_layers, out, dim) shard out
     self.wo.shard_(device, axis=2).realize() # (n_layers, dim, in) shard in
-    self.w1.shard_(device, axis=1).realize() # (n_layers, hidden, dim) shard out
+    self.w13.shard_(device, axis=1).realize() # (n_layers, hidden*2, dim) shard out
     self.w2.shard_(device, axis=2).realize() # (n_layers, dim, hidden) shard in
-    self.w3.shard_(device, axis=1).realize() # (n_layers, hidden, dim) shard out
     self.attention_norm.shard_(device, axis=None).realize()
     self.ffn_norm.shard_(device, axis=None).realize()
     self.norm.weight.shard_(device, axis=None).realize()
@@ -231,16 +237,16 @@ def __call__(self, tokens:Tensor):
     s = self._fp8_inv_scale if FP8 else None
     for i in range(self.n_layers):
       amax_layer = {"amax_xqkv": a["xqkv"][i], "amax_xo": a["xo"][i],
-                    "amax_x1": a["x1"][i], "amax_x2": a["x2"][i], "amax_x3": a["x3"][i]} if a else {}
+                    "amax_x13": a["x13"][i], "amax_x2": a["x2"][i]} if a else {}
       scale_layer = {"s_qkv": s["wqkv"][i], "s_o": s["wo"][i],
-                     "s_1": s["w1"][i], "s_2": s["w2"][i], "s_3": s["w3"][i]} if s else {}
+                     "s_13": s["w13"][i], "s_2": s["w2"][i]} if s else {}
       h, *ret = self.run_layer(h, freqs_cis,
                                self.attention_norm[i], self.wqkv[i], self.wo[i],
-                               self.ffn_norm[i], self.w1[i], self.w2[i], self.w3[i],
+                               self.ffn_norm[i], self.w13[i], self.w2[i],
                                **amax_layer, **scale_layer)
       if a:
         amaxs = ret[:5]
-        amax_names = ["xqkv", "xo", "x1", "x3", "x2"]
+        amax_names = ["xqkv", "xo", "x13", "x2"]
         for name, new_val in zip(amax_names, amaxs):
           a[name][i].assign(new_val)

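The model change above fuses the SwiGLU gate (w1) and up (w3) projections into a single w13 weight: one matmul produces both halves, which feed_forward then slices apart at self.hidden_dim. A minimal sketch of the equivalence in tinygrad, assuming w13 stacks the gate half above the up half on the output axis (the order feed_forward's slicing implies); sizes here are illustrative:

from tinygrad import Tensor

dim, hidden = 64, 128
w1, w3 = Tensor.randn(hidden, dim), Tensor.randn(hidden, dim)
w13 = w1.cat(w3, dim=0)  # (hidden*2, dim): gate rows first, up rows second

x = Tensor.randn(4, dim)
x_w13 = x.matmul(w13.T)                                 # one fused matmul
x_w1, x_w3 = x_w13[..., :hidden], x_w13[..., hidden:]   # split like feed_forward
fused = x_w1.silu() * x_w3                              # SwiGLU activation

ref = x.matmul(w1.T).silu() * x.matmul(w3.T)            # two-matmul baseline
assert ((fused - ref).abs().max() < 1e-4).item()

Besides halving the FFN matmul launches, the fusion lets one amax/scale pair (amax_x13, s_13) cover what previously needed separate amax_x1/amax_x3 and s_1/s_3 entries, which is why the FP8 bookkeeping lists above shrink from five names to four.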
3 changes: 1 addition & 2 deletions test/amd/test_mockgpu_invalid.py
@@ -37,8 +37,7 @@ def test_unsupported_instruction_raises(self):
     '''

     env = os.environ.copy()
-    env["AMD"] = "1"
-    env["MOCKGPU"] = "1"
+    env["DEV"] = "MOCKKFD+AMD"
     env["HCQDEV_WAIT_TIMEOUT_MS"] = "10000"

     st = time.perf_counter()
2 changes: 1 addition & 1 deletion test/amd/test_sqtt_encoder.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """Tests for SQTT encoder: verifies the emulator produces correct SQTT traces for known kernels.

-Run with: DEV=AMD MOCKGPU=1 python -m pytest test/amd/test_sqtt_encoder.py -v
+Run with: DEV=MOCKKFD+AMD python -m pytest test/amd/test_sqtt_encoder.py -v
 """
 import ctypes, unittest
 from tinygrad.helpers import Context
4 changes: 2 additions & 2 deletions test/backend/test_asm_gemm.py
@@ -1,7 +1,7 @@
 import unittest
 from tinygrad import Tensor, Device, dtypes, Context
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import getenv, system
+from tinygrad.helpers import getenv, system, DEV
 from extra.gemm.cdna_asm_gemm import asm_gemm
 from test.helpers import needs_second_gpu
 from examples.mlperf.models.flat_llama import FP8_DTYPE
@@ -131,7 +131,7 @@ class TestGemmLlama(unittest.TestCase):
   dtype = dtypes.bfloat16

   def setUp(self):
-    if not is_cdna4() or getenv("MOCKGPU"):
+    if not is_cdna4() or DEV.interface.startswith("MOCK"):
       self.skipTest("very slow on non mi350x")

   def test_empty(self): asm_gemm(Tensor.empty(N:=getenv("N", 4096), N, dtype=self.dtype), Tensor.empty(N, N, dtype=self.dtype)).realize()
5 changes: 3 additions & 2 deletions test/backend/test_dtype_alu.py
@@ -1,7 +1,7 @@
 import unittest, operator, math
 from tinygrad import Context, Tensor, dtypes, Device
 from tinygrad.dtype import DType, truncate, fp8_to_float
-from tinygrad.helpers import CI, EMULATED_DTYPES, getenv
+from tinygrad.helpers import CI, EMULATED_DTYPES, DEV, getenv
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.device import is_dtype_supported
 from tinygrad.runtime.ops_python import from_storage_scalar
@@ -32,7 +32,8 @@
 #binary_operations.append(operator.truediv)

 # TODO: CI CUDA segfaults on sin, WEBGPU and NIR sines are not precise enough for large numbers
-if (getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}) or Device.DEFAULT == "WEBGPU" or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer):
+if ((DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"})
+    or Device.DEFAULT == "WEBGPU" or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer)):
   unary_operations.remove((Tensor.sin, np.sin))
   unary_operations.remove((Tensor.cos, np.cos))

4 changes: 2 additions & 2 deletions test/backend/test_edgecases.py
@@ -27,10 +27,10 @@
 import torch
 from tinygrad import Tensor, dtypes, nn
 from tinygrad.device import Device
-from tinygrad.helpers import getenv
+from tinygrad.helpers import DEV
 from tinygrad.renderer.nir import NIRRenderer

-MOCKGPU = getenv("MOCKGPU")
+MOCKGPU = DEV.interface.startswith("MOCK")

 class TestNaNEdgeCases(unittest.TestCase):
   # we don't need more of these. it's unclear if torch's behavior is desired here
4 changes: 2 additions & 2 deletions test/backend/test_interop.py
@@ -3,12 +3,12 @@
 import torch
 import numpy as np

-from tinygrad.helpers import getenv, CI
+from tinygrad.helpers import CI, DEV
 from tinygrad.tensor import Tensor
 from tinygrad.device import Device
 from tinygrad.dtype import _from_torch_dtype, _to_torch_dtype

-MOCKGPU = getenv("MOCKGPU")
+MOCKGPU = DEV.interface.startswith("MOCK")

 @unittest.skipIf(Device.DEFAULT not in ["METAL", "CUDA"] or MOCKGPU, f"no support on {Device.DEFAULT}")
 class TestInterop(unittest.TestCase):
4 changes: 2 additions & 2 deletions test/backend/test_jit.py
@@ -8,7 +8,7 @@
 from tinygrad.engine.jit import TinyJit, JitError, GraphRunner, MultiGraphRunner, graph_class
 from tinygrad.engine.realize import CompiledRunner, BufferCopy, BufferXfer
 from tinygrad.device import Device
-from tinygrad.helpers import Context, JIT, GlobalCounters, getenv
+from tinygrad.helpers import Context, JIT, DEV, GlobalCounters
 from tinygrad.dtype import dtypes
 from extra.models.unet import ResBlock

@@ -812,7 +812,7 @@ def f(inp, inp_d1):
                hcqgraph=[self.ji_graph(6)])

   @unittest.skip("this fails if you don't have SDMA or are using AMD_DISABLE_SDMA=1")
-  @unittest.skipIf(getenv("MOCKGPU"), "MockGPU does not support parallel copies")
+  @unittest.skipIf(DEV.interface.startswith("MOCK"), "MockGPU does not support parallel copies")
   def test_jit_multidev_copy(self):
     if Device.DEFAULT in {"CPU"}: raise unittest.SkipTest("CPU/LLVM is not a valid default device for this test (zero-copies)")

4 changes: 2 additions & 2 deletions test/backend/test_linearizer.py
@@ -7,12 +7,12 @@
 from tinygrad.device import Device, Buffer, is_dtype_supported
 from tinygrad.tensor import Tensor, _to_np_dtype
 from tinygrad.engine.realize import run_schedule, CompiledRunner, get_program
-from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT, getenv
+from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT, DEV
 from tinygrad.dtype import DType, dtypes, PtrDType, AddrSpace
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.cstyle import CUDARenderer
 from test.helpers import replace_opts
-MOCKGPU = getenv("MOCKGPU")
+MOCKGPU = DEV.interface.startswith("MOCK")

 from tinygrad.uop.ops import print_uops # noqa: F401 # pylint: disable=unused-import
