41 changes: 17 additions & 24 deletions .github/workflows/test.yml
@@ -643,8 +643,7 @@ jobs:
     runs-on: ubuntu-24.04
     timeout-minutes: 20
     env:
-      DEV: AMD
-      MOCKGPU: 1
+      DEV: MOCKKFD+AMD
     steps:
     - name: Checkout Code
       uses: actions/checkout@v6
@@ -670,7 +669,7 @@ jobs:
     - name: Run AMD renderer tests
       run: python -m pytest -n=auto test/amd/ --durations 20
     - name: Run AMD renderer tests (AMD:LLVM)
-      run: DEV=AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
+      run: DEV=MOCKKFD+AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
     - name: Run SQTT profiling tests
       run: PROFILE=1 SQTT=1 python3 -m pytest -n=auto test/amd/test_sqtt_profiler.py
     - name: Run AMD emulated tests on NULL backend
@@ -679,20 +678,19 @@ jobs:
       run: |
         PYTHONPATH=. DEV=NULL::gfx1100 python extra/mmapeak/mmapeak.py
         PYTHONPATH=. DEV=NULL::gfx1201 python3 -m pytest -n=auto test/testextra/test_tk.py test/backend/test_asm_gemm.py
-    - name: Run matmul on MOCKGPU
+    - name: Run matmul on MOCKKFD
       run: |
-        PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_asm_matmul.py
-        PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_copy_matmul.py
+        PYTHONPATH="." DEV=MOCKKFD+AMD N=256 python3 extra/gemm/amd_asm_matmul.py
+        PYTHONPATH="." DEV=MOCKKFD+AMD N=256 python3 extra/gemm/amd_copy_matmul.py
     - name: Run LLVM test
-      run: DEV=AMD:LLVM python test/device/test_amd_llvm.py
+      run: DEV=MOCKKFD+AMD:LLVM python test/device/test_amd_llvm.py

   testmockam:
     name: Linux (am)
     runs-on: ubuntu-24.04
     timeout-minutes: 15
     env:
-      DEV: PCI+AMD
-      MOCKGPU: 1
+      DEV: MOCKPCI+AMD
     steps:
     - name: Checkout Code
       uses: actions/checkout@v6
@@ -704,13 +702,13 @@ jobs:
         amd: 'true'
     - name: Run test_tiny on MOCKAM
       run: python test/test_tiny.py
-    - name: Run test_tiny on MOCKAM USB
-      run: GMMU=0 DEV=USB+AMD python test/test_tiny.py
-    - name: Run test_hcq on MOCKAM
+    - name: Run test_tiny on MOCKUSB
+      run: GMMU=0 DEV=MOCKUSB+AMD python test/test_tiny.py
+    - name: Run test_hcq on MOCKPCI
       run: python -m pytest test/device/test_hcq.py
-    - name: Run disk copy tests on MOCKAM
+    - name: Run disk copy tests on MOCKPCI
       run: python -m pytest test/unit/test_disk_tensor.py -k test_copy_from_disk
-    - name: Run test_tiny on MOCKAM Remote
+    - name: Run test_tiny on MOCKPCI Remote
       run: |
         python extra/remote/serve.py 6667 &
         sleep 2
@@ -728,8 +726,7 @@ jobs:
     runs-on: ubuntu-22.04
     timeout-minutes: 15
     env:
-      DEV: AMD:${{ matrix.backend == 'amdllvm' && 'LLVM' || '' }}:${{ matrix.arch }}
-      MOCKGPU: 1
+      DEV: MOCKKFD+AMD:${{ matrix.backend == 'amdllvm' && 'LLVM' || '' }}:${{ matrix.arch }}
       SKIP_SLOW_TEST: 1
     steps:
     - name: Checkout Code
@@ -764,7 +761,6 @@ jobs:
     runs-on: ubuntu-22.04
     timeout-minutes: 20
     env:
-      MOCKGPU: 1
       FORWARD_ONLY: 1
     steps:
     - name: Checkout Code
@@ -777,7 +773,7 @@ jobs:
         cuda: 'true'
         ocelot: 'true'
     - name: Set env
-      run: printf "${{ matrix.backend == 'ptx' && 'DEV=CUDA:PTX' || matrix.backend == 'nv' && 'DEV=NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
+      run: printf "${{ matrix.backend == 'ptx' && 'DEV=MOCK+CUDA:PTX' || matrix.backend == 'nv' && 'DEV=MOCKNVK+NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
     - name: Check Device.DEFAULT and print some source
       run: |
         python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
@@ -862,22 +858,19 @@ jobs:
       run: DEV=METAL TRANSCENDENTAL=2 python -m pytest -n=auto test/backend/test_ops.py::TestOps::test_sin test/backend/test_ops.py::TestOps::test_cos test/backend/test_ops.py::TestOps::test_tan test/backend/test_ops.py::TestOps::test_exp test/backend/test_ops.py::TestOps::test_log --durations=20
     - name: Run pytest (amd)
       env:
-        MOCKGPU: 1
-        DEV: AMD
+        DEV: MOCKKFD+AMD
         FORWARD_ONLY: 1
       run: |
         python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
     - name: Run pytest (amd with llvm backend)
       env:
-        MOCKGPU: 1
-        DEV: "AMD:LLVM"
+        DEV: "MOCKKFD+AMD:LLVM"
         FORWARD_ONLY: 1
       run: |
         python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
     - name: Run pytest (ptx)
       env:
-        MOCKGPU: 1
-        DEV: "NV:PTX"
+        DEV: "MOCKNVK+NV:PTX"
         FORWARD_ONLY: 1
         # TODO: failing due to library loading error
         CAPTURE_PROCESS_REPLAY: 0
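The hunks above replace the old MOCKGPU=1 switch with mock interfaces named directly in the device spec (MOCKKFD, MOCKPCI, MOCKUSB, MOCKNVK). Tests then detect an emulated run from the spec itself. Below is a minimal sketch of that check, using only what the later hunks show (the DEV helper exported from tinygrad.helpers with an interface attribute); the function name running_on_mock is made up for illustration, not part of the PR:

# sketch: the device spec now carries the mock interface, so tests inspect
# DEV.interface instead of reading a MOCKGPU env var
from tinygrad.helpers import DEV  # parsed device spec, per the imports below

def running_on_mock() -> bool:  # hypothetical helper, not in the PR
    # True for e.g. DEV=MOCKKFD+AMD, DEV=MOCKPCI+AMD, DEV=MOCKNVK+NV
    return DEV.interface.startswith("MOCK")

Usage mirrors the workflow steps above: DEV=MOCKKFD+AMD python -m pytest -n=auto test/amd/ runs the AMD tests on the emulator with no extra flag.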
6 changes: 3 additions & 3 deletions docs/abstractions4.py
@@ -1,9 +1,9 @@
 # tinygrad allows you to write kernels at many different abstractions levels.
 # This is for RDNA3, but if you don't have one you can run with the emulator
-# PYTHONPATH="." MOCKGPU=1 DEV=AMD
+# PYTHONPATH="." DEV=MOCKPCI+AMD

 from tinygrad import Tensor, Context, GlobalCounters, UOp, Device
-from tinygrad.helpers import DEBUG, getenv
+from tinygrad.helpers import DEV, DEBUG, getenv
 from tinygrad.uop.ops import AxisType, KernelInfo, Ops
 from tinygrad.dtype import AddrSpace, dtypes
 from tinygrad.runtime.autogen.amd.rdna3.ins import *
@@ -16,7 +16,7 @@ def eval_harness(name, tensor, fxn, check=None):
   print(f"computed in {GlobalCounters.time_sum_s*1000:.2f} ms, {(a.nbytes()/1e9)/GlobalCounters.time_sum_s:.2f} GB/s")
   return out

-SZ = 256*1024 if getenv("MOCKGPU") else 1024*1024*1024
+SZ = 256*1024 if DEV.interface.startswith("MOCK") else 1024*1024*1024

 def example_2_hip(a:Tensor, correct):
   GLOBALS = 1024
2 changes: 1 addition & 1 deletion examples/mlperf/model_train.py
@@ -1438,7 +1438,7 @@ def train_llama3():
   if FP8:
     from tinygrad.nn.state import get_state_dict
     model_state = get_state_dict(model)
-    for wname in ["wqkv", "wo", "w1", "w2", "w3"]:
+    for wname in ["wqkv", "wo", "w13", "w2"]:
       w = model_state[wname]
       w._inv_scale = model._fp8_inv_scale[wname]
   if optim.master_params:
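For context on the _inv_scale wiring above: the FP8 matmuls in flat_llama.py (next file) quantize activations and weights with per-tensor scales, then undo both scales on the float accumulator, i.e. out = (x_fp8 @ w_fp8.T) * x_inv_scale * w_inv_scale. A minimal numeric sketch of that identity, with plain float arrays standing in for the fp8 casts; the shapes, values, and the 448.0 constant (the e4m3 maximum, standing in for flat_llama's FP8_MAX) are illustrative assumptions, not from the PR:

import numpy as np

FP8_MAX = 448.0  # e4m3 max magnitude; a stand-in for flat_llama's FP8_MAX
x = np.random.randn(4, 8).astype(np.float32)
w = np.random.randn(16, 8).astype(np.float32)

# quantize: scale so the largest magnitude maps to FP8_MAX (fp8 cast omitted)
x_scale = FP8_MAX / np.abs(x).max()
w_scale = FP8_MAX / np.abs(w).max()
x_q, w_q = x * x_scale, w * w_scale

# dequantize the product with both inverse scales, as matmul_fp8_precomputed does
out = (x_q @ w_q.T) * (1.0 / x_scale) * (1.0 / w_scale)
assert np.allclose(out, x @ w.T, atol=1e-3)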
54 changes: 30 additions & 24 deletions examples/mlperf/models/flat_llama.py
@@ -55,6 +55,12 @@ def matmul(x:Tensor, w:Tensor, fp8=FP8, amax_x:Tensor|None=None, w_inv_scale:Ten
     if can_use_asm_gemm(x_fp8, w.T): return asm_gemm(x_fp8, w.T, x_scale=x_scale, w_scale=w_inv_scale), x_new_amax, x_fp8, w
   return x_fp8.dot(w.T, dtype=dtypes.float) * x_scale * w_inv_scale, x_new_amax, x_fp8, w

+def matmul_fp8_precomputed(x_fp8:Tensor, x_inv_scale:Tensor, x_new_amax:Tensor, w:Tensor, w_inv_scale:Tensor) -> tuple[Tensor,...]:
+  if getenv("ASM_GEMM"):
+    from extra.gemm.cdna_asm_gemm import can_use_asm_gemm, asm_gemm
+    if can_use_asm_gemm(x_fp8, w.T): return asm_gemm(x_fp8, w.T, x_scale=x_inv_scale, w_scale=w_inv_scale), x_new_amax, x_fp8, w
+  return x_fp8.dot(w.T, dtype=dtypes.float) * x_inv_scale * w_inv_scale, x_new_amax, x_fp8, w
+
 def _rmsnorm_fwd(x_in:Tensor, eps:float) -> tuple[Tensor, Tensor]:
   x = x_in.float()
   rrms = (x.square().mean(-1, keepdim=True) + eps).rsqrt()
@@ -84,6 +90,7 @@ def __init__(self, dim:int, hidden_dim:int, n_heads:int, n_layers:int, norm_eps:
     self.n_kv_heads = n_kv_heads if n_kv_heads is not None else n_heads # n_kv_heads != n_heads implies MQA [arxiv/2307.09288, A.2.1]
     self.head_dim = dim // n_heads
     self.n_rep = self.n_heads // self.n_kv_heads
+    self.hidden_dim = hidden_dim

     scaled_std = 0.02 / math.sqrt(2 * n_layers)

@@ -93,9 +100,8 @@ def __init__(self, dim:int, hidden_dim:int, n_heads:int, n_layers:int, norm_eps:
     self.wo = self.lin_per_layer(self.n_heads * self.head_dim, dim, std=scaled_std)

     # FeedForward
-    self.w1 = self.lin_per_layer(dim, hidden_dim)
+    self.w13 = self.lin_per_layer(dim, hidden_dim * 2)
     self.w2 = self.lin_per_layer(hidden_dim, dim, std=scaled_std)
-    self.w3 = self.lin_per_layer(dim, hidden_dim)

     self.norm_eps = norm_eps
     self.attention_norm = Tensor.ones(n_layers, dim).contiguous()
@@ -110,10 +116,10 @@ def __init__(self, dim:int, hidden_dim:int, n_heads:int, n_layers:int, norm_eps:

     if FP8:
       def _amax(): return Tensor.full((), FP8_MAX).contiguous().requires_grad_(False)
-      names = ["xqkv", "xo", "x1", "x2", "x3"]
+      names = ["xqkv", "xo", "x13", "x2"]
       self._fp8_amax = {name: [_amax() for _ in range(n_layers)] for name in names}
       # per-weight inv_scale: single (n_layers,) float32 tensor per weight (kernel reads float* pointers)
-      w_names = ["wqkv", "wo", "w1", "w2", "w3"]
+      w_names = ["wqkv", "wo", "w13", "w2"]
       self._fp8_inv_scale = {}
       for wname, inv_scales in zip(w_names, self._init_inv_scales):
         self._fp8_inv_scale[wname] = inv_scales.float().contiguous().requires_grad_(False)
@@ -162,20 +168,21 @@ def attention(self, x:Tensor, freqs_cis:Tensor, attention_norm:Tensor, wqkv:Tens
     saves.extend(ret[1:] + [out])
     return (out, *new_amaxs, *saves)

-  def feed_forward(self, x:Tensor, ffn_norm:Tensor, w1:Tensor, w2:Tensor, w3:Tensor,
-                   amax_x1=None, amax_x2=None, amax_x3=None, s_1=None, s_2=None, s_3=None):
+  def feed_forward(self, x:Tensor, ffn_norm:Tensor, w13:Tensor, w2:Tensor,
+                   amax_x13=None, amax_x2=None, s_13=None, s_2=None):
     new_amaxs, saves = [], []

     x, rrms = rmsnorm(x, self.norm_eps)
     saves.extend([x, rrms])
     x = x * ffn_norm

-    x_w1, *ret = matmul(x, w1, amax_x=amax_x1, w_inv_scale=s_1)
+    x_w13, *ret = matmul(x, w13, amax_x=amax_x13, w_inv_scale=s_13)
     new_amaxs.extend(ret[:1])
-    saves.extend(ret[1:] + [x_w1])
-    x_w3, *ret = matmul(x.contiguous_backward(), w3, amax_x=amax_x3, w_inv_scale=s_3)
-    new_amaxs.extend(ret[:1])
-    saves.extend(ret[1:] + [x_w3])
+    saves.extend(ret[1:] + [x_w13])
+
+    x_w1 = x_w13[..., :self.hidden_dim]
+    x_w3 = x_w13[..., self.hidden_dim:]
+
     out, *ret = matmul(x_w1.silu() * x_w3, w2, amax_x=amax_x2, w_inv_scale=s_2)
     new_amaxs.extend(ret[:1])
     saves.extend(ret[1:] + [out])
@@ -184,19 +191,19 @@ def feed_forward(self, x:Tensor, ffn_norm:Tensor, w1:Tensor, w2:Tensor, w3:Tenso
   @function(precompile=True, precompile_backward=True)
   def run_layer(self, x:Tensor, freqs_cis:Tensor,
                 attention_norm:Tensor, wqkv:Tensor, wo:Tensor,
-                ffn_norm:Tensor, w1:Tensor, w2:Tensor, w3:Tensor,
+                ffn_norm:Tensor, w13:Tensor, w2:Tensor,
                 amax_xqkv=None, amax_xo=None,
-                amax_x1=None, amax_x2=None, amax_x3=None,
-                s_qkv=None, s_o=None, s_1=None, s_2=None, s_3=None):
+                amax_x13=None, amax_x2=None,
+                s_qkv=None, s_o=None, s_13=None, s_2=None):
     attn, *attn_ret = self.attention(x, freqs_cis, attention_norm, wqkv, wo,
                                      amax_xqkv=amax_xqkv, amax_xo=amax_xo,
                                      s_qkv=s_qkv, s_o=s_o)
     attn_amaxs, attn_saves = attn_ret[:2], attn_ret[2:]
     h = x + attn
-    ffn, *ffn_ret = self.feed_forward(h, ffn_norm, w1, w2, w3,
-                                      amax_x1=amax_x1, amax_x2=amax_x2, amax_x3=amax_x3,
-                                      s_1=s_1, s_2=s_2, s_3=s_3)
-    ffn_amaxs, ffn_saves = ffn_ret[:3], ffn_ret[3:]
+    ffn, *ffn_ret = self.feed_forward(h, ffn_norm, w13, w2,
+                                      amax_x13=amax_x13, amax_x2=amax_x2,
+                                      s_13=s_13, s_2=s_2)
+    ffn_amaxs, ffn_saves = ffn_ret[:2], ffn_ret[2:]
     h = h + ffn
     return (h, *attn_amaxs, *ffn_amaxs, *attn_saves, *ffn_saves)

@@ -208,9 +215,8 @@ def shard(self, device:tuple[str, ...], mp:bool=False):
     # flat per-layer weights: axis 0 is n_layers, so shard axes are +1 vs per-layer Transformer
     self.wqkv.shard_(device, axis=1).realize() # (n_layers, out, dim) shard out
     self.wo.shard_(device, axis=2).realize() # (n_layers, dim, in) shard in
-    self.w1.shard_(device, axis=1).realize() # (n_layers, hidden, dim) shard out
+    self.w13.shard_(device, axis=1).realize() # (n_layers, hidden*2, dim) shard out
     self.w2.shard_(device, axis=2).realize() # (n_layers, dim, hidden) shard in
-    self.w3.shard_(device, axis=1).realize() # (n_layers, hidden, dim) shard out
     self.attention_norm.shard_(device, axis=None).realize()
     self.ffn_norm.shard_(device, axis=None).realize()
     self.norm.weight.shard_(device, axis=None).realize()
@@ -231,16 +237,16 @@ def __call__(self, tokens:Tensor):
     s = self._fp8_inv_scale if FP8 else None
     for i in range(self.n_layers):
       amax_layer = {"amax_xqkv": a["xqkv"][i], "amax_xo": a["xo"][i],
-                    "amax_x1": a["x1"][i], "amax_x2": a["x2"][i], "amax_x3": a["x3"][i]} if a else {}
+                    "amax_x13": a["x13"][i], "amax_x2": a["x2"][i]} if a else {}
       scale_layer = {"s_qkv": s["wqkv"][i], "s_o": s["wo"][i],
-                     "s_1": s["w1"][i], "s_2": s["w2"][i], "s_3": s["w3"][i]} if s else {}
+                     "s_13": s["w13"][i], "s_2": s["w2"][i]} if s else {}
       h, *ret = self.run_layer(h, freqs_cis,
                                self.attention_norm[i], self.wqkv[i], self.wo[i],
-                               self.ffn_norm[i], self.w1[i], self.w2[i], self.w3[i],
+                               self.ffn_norm[i], self.w13[i], self.w2[i],
                                **amax_layer, **scale_layer)
       if a:
         amaxs = ret[:5]
-        amax_names = ["xqkv", "xo", "x1", "x3", "x2"]
+        amax_names = ["xqkv", "xo", "x13", "x2"]
         for name, new_val in zip(amax_names, amaxs):
           a[name][i].assign(new_val)

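The model change above fuses the SwiGLU gate (w1) and up (w3) projections into a single w13 weight: one matmul produces both halves, which feed_forward then slices apart at self.hidden_dim. A minimal sketch of the equivalence in tinygrad, assuming w13 stacks the gate half above the up half on the output axis (the order feed_forward's slicing implies); sizes here are illustrative:

from tinygrad import Tensor

dim, hidden = 64, 128
w1, w3 = Tensor.randn(hidden, dim), Tensor.randn(hidden, dim)
w13 = w1.cat(w3, dim=0)  # (hidden*2, dim): gate rows first, up rows second

x = Tensor.randn(4, dim)
x_w13 = x.matmul(w13.T)                                 # one fused matmul
x_w1, x_w3 = x_w13[..., :hidden], x_w13[..., hidden:]   # split like feed_forward
fused = x_w1.silu() * x_w3                              # SwiGLU activation

ref = x.matmul(w1.T).silu() * x.matmul(w3.T)            # two-matmul baseline
assert ((fused - ref).abs().max() < 1e-4).item()

Besides halving the FFN matmul launches, the fusion lets one amax/scale pair (amax_x13, s_13) cover what previously needed separate amax_x1/amax_x3 and s_1/s_3 entries, which is why the FP8 bookkeeping lists above shrink from five names to four.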
3 changes: 1 addition & 2 deletions test/amd/test_mockgpu_invalid.py
@@ -37,8 +37,7 @@ def test_unsupported_instruction_raises(self):
     '''

     env = os.environ.copy()
-    env["AMD"] = "1"
-    env["MOCKGPU"] = "1"
+    env["DEV"] = "MOCKKFD+AMD"
     env["HCQDEV_WAIT_TIMEOUT_MS"] = "10000"

     st = time.perf_counter()
2 changes: 1 addition & 1 deletion test/amd/test_sqtt_encoder.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """Tests for SQTT encoder: verifies the emulator produces correct SQTT traces for known kernels.

-Run with: DEV=AMD MOCKGPU=1 python -m pytest test/amd/test_sqtt_encoder.py -v
+Run with: DEV=MOCKKFD+AMD python -m pytest test/amd/test_sqtt_encoder.py -v
 """
 import ctypes, unittest
 from tinygrad.helpers import Context
4 changes: 2 additions & 2 deletions test/backend/test_asm_gemm.py
@@ -1,7 +1,7 @@
 import unittest
 from tinygrad import Tensor, Device, dtypes, Context
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import getenv, system
+from tinygrad.helpers import getenv, system, DEV
 from extra.gemm.cdna_asm_gemm import asm_gemm
 from test.helpers import needs_second_gpu
 from examples.mlperf.models.flat_llama import FP8_DTYPE
@@ -131,7 +131,7 @@ class TestGemmLlama(unittest.TestCase):
   dtype = dtypes.bfloat16

   def setUp(self):
-    if not is_cdna4() or getenv("MOCKGPU"):
+    if not is_cdna4() or DEV.interface.startswith("MOCK"):
       self.skipTest("very slow on non mi350x")

   def test_empty(self): asm_gemm(Tensor.empty(N:=getenv("N", 4096), N, dtype=self.dtype), Tensor.empty(N, N, dtype=self.dtype)).realize()
5 changes: 3 additions & 2 deletions test/backend/test_dtype_alu.py
@@ -1,7 +1,7 @@
 import unittest, operator, math
 from tinygrad import Context, Tensor, dtypes, Device
 from tinygrad.dtype import DType, truncate, fp8_to_float
-from tinygrad.helpers import CI, EMULATED_DTYPES, getenv
+from tinygrad.helpers import CI, EMULATED_DTYPES, DEV, getenv
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.device import is_dtype_supported
 from tinygrad.runtime.ops_python import from_storage_scalar
@@ -32,7 +32,8 @@
 #binary_operations.append(operator.truediv)

 # TODO: CI CUDA segfaults on sin, WEBGPU and NIR sines are not precise enough for large numbers
-if (getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}) or Device.DEFAULT == "WEBGPU" or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer):
+if ((DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"})
+    or Device.DEFAULT == "WEBGPU" or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer)):
   unary_operations.remove((Tensor.sin, np.sin))
   unary_operations.remove((Tensor.cos, np.cos))

4 changes: 2 additions & 2 deletions test/backend/test_edgecases.py
@@ -27,10 +27,10 @@
 import torch
 from tinygrad import Tensor, dtypes, nn
 from tinygrad.device import Device
-from tinygrad.helpers import getenv
+from tinygrad.helpers import DEV
 from tinygrad.renderer.nir import NIRRenderer

-MOCKGPU = getenv("MOCKGPU")
+MOCKGPU = DEV.interface.startswith("MOCK")

 class TestNaNEdgeCases(unittest.TestCase):
   # we don't need more of these. it's unclear if torch's behavior is desired here
4 changes: 2 additions & 2 deletions test/backend/test_interop.py
@@ -3,12 +3,12 @@
 import torch
 import numpy as np

-from tinygrad.helpers import getenv, CI
+from tinygrad.helpers import CI, DEV
 from tinygrad.tensor import Tensor
 from tinygrad.device import Device
 from tinygrad.dtype import _from_torch_dtype, _to_torch_dtype

-MOCKGPU = getenv("MOCKGPU")
+MOCKGPU = DEV.interface.startswith("MOCK")

 @unittest.skipIf(Device.DEFAULT not in ["METAL", "CUDA"] or MOCKGPU, f"no support on {Device.DEFAULT}")
 class TestInterop(unittest.TestCase):
4 changes: 2 additions & 2 deletions test/backend/test_jit.py
@@ -8,7 +8,7 @@
 from tinygrad.engine.jit import TinyJit, JitError, GraphRunner, MultiGraphRunner, graph_class
 from tinygrad.engine.realize import CompiledRunner, BufferCopy, BufferXfer
 from tinygrad.device import Device
-from tinygrad.helpers import Context, JIT, GlobalCounters, getenv
+from tinygrad.helpers import Context, JIT, DEV, GlobalCounters
 from tinygrad.dtype import dtypes
 from extra.models.unet import ResBlock

@@ -812,7 +812,7 @@ def f(inp, inp_d1):
                hcqgraph=[self.ji_graph(6)])

   @unittest.skip("this fails if you don't have SDMA or are using AMD_DISABLE_SDMA=1")
-  @unittest.skipIf(getenv("MOCKGPU"), "MockGPU does not support parallel copies")
+  @unittest.skipIf(DEV.interface.startswith("MOCK"), "MockGPU does not support parallel copies")
   def test_jit_multidev_copy(self):
     if Device.DEFAULT in {"CPU"}: raise unittest.SkipTest("CPU/LLVM is not a valid default device for this test (zero-copies)")

4 changes: 2 additions & 2 deletions test/backend/test_linearizer.py
@@ -7,12 +7,12 @@
 from tinygrad.device import Device, Buffer, is_dtype_supported
 from tinygrad.tensor import Tensor, _to_np_dtype
 from tinygrad.engine.realize import run_schedule, CompiledRunner, get_program
-from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT, getenv
+from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT, DEV
 from tinygrad.dtype import DType, dtypes, PtrDType, AddrSpace
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.cstyle import CUDARenderer
 from test.helpers import replace_opts
-MOCKGPU = getenv("MOCKGPU")
+MOCKGPU = DEV.interface.startswith("MOCK")

 from tinygrad.uop.ops import print_uops # noqa: F401 # pylint: disable=unused-import
