CrazyForks · pull · Apr 25, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026
diff --git a/test/null/test_attention.py b/test/null/test_attention.py
@@ -16,9 +16,9 @@ def test_half_qkv_buffers(self):
     k = Tensor.ones(BS, seqlen, dim, dtype=dtypes.half).contiguous().realize()
     v = Tensor.ones(BS, seqlen, dim, dtype=dtypes.half).contiguous().realize()
     attn = q.scaled_dot_product_attention(k, v)
-    sched = attn.schedule()
+    sched = attn.schedule_linear()
     # attention has 4 kernels now
-    self.assertEqual(len(sched), 4)
+    self.assertEqual(len(sched.src), 4)
 
   def test_apply_rope_jit_prune(self):
     def rope_fn(x_in, pos): return apply_rope(x_in, pos)

diff --git a/test/null/test_compile_failures.py b/test/null/test_compile_failures.py
@@ -3,11 +3,11 @@
 from tinygrad import Tensor, dtypes, Device
 from tinygrad.helpers import OSX, DEV
 from tinygrad.device import is_dtype_supported
-from tinygrad.engine.realize import get_program
+from tinygrad.engine.realize import get_program, compile_linear
 
 class TestCompileFailures(unittest.TestCase):
   def compile(self, out:Tensor):
-    for si in out.schedule(): si.lower()
+    compile_linear(out.schedule_linear())
 
   @unittest.skipUnless(is_dtype_supported(dtypes.uchar), f"no uint8 on {Device.DEFAULT}")
   def test_interpolate_atari(self):
@@ -21,8 +21,8 @@ class TestDisassembly(unittest.TestCase):
   @unittest.skipUnless(Device.DEFAULT in ("CPU",) and DEV.renderer not in ("LLVM", "LVP") and OSX, "m series cpus support fp16 arithmetic")
   def test_float16_alu(self):
     c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16)
-    s = c.schedule()[-1]
-    p = get_program(s.ast, Device[Device.DEFAULT].renderer)
+    s = c.schedule_linear().src[-1]
+    p = get_program(s.src[0], Device[Device.DEFAULT].renderer)
     lib = Device[Device.DEFAULT].compiler.compile(p.src)
     out = io.StringIO()
     with redirect_stdout(out): Device[Device.DEFAULT].compiler.disassemble(lib)

diff --git a/test/null/test_const_folding.py b/test/null/test_const_folding.py
@@ -7,8 +7,8 @@
 
 def _check_ast_count(desired_count:int, t:Tensor):
   # NOTE: this has side effect because everything can be scheduled only once
-  schedule = t.schedule()
-  asts = [s for s in schedule if s.ast.op is Ops.SINK]
+  linear = t.schedule_linear()
+  asts = [s for s in linear.src if s.src[0].op is Ops.SINK]
   len(asts)
   # NOT SUPPORTED ANYMORE
   #assert len(asts) == desired_count, f"{len(asts)} != {desired_count}"

diff --git a/test/null/test_gc.py b/test/null/test_gc.py
@@ -60,7 +60,7 @@ def test_schedule_gc(self):
     init = bufs_allocated()
     x = Tensor.ones(256).contiguous().realize()
     y = Tensor.ones(5, 5).contiguous()
-    y.schedule()
+    y.schedule_linear()
     del x
     del y
     self.assertEqual(bufs_allocated()-init, 0)

diff --git a/test/null/test_linearizer_rewrite.py b/test/null/test_linearizer_rewrite.py
@@ -9,29 +9,29 @@ def test_reduction(self):
     t = Tensor.ones((64,64), device="NULL").contiguous().realize()
     out = (t*2).sum(axis=1)
     with Context(SPLIT_REDUCEOP=0, DEVECTORIZE=0):
-      si = out.schedule()[-1]
+      si = out.schedule_linear().src[-1]
       opts_to_apply = []
       opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4))
       opts_to_apply.append(Opt(OptOps.UNROLL, 0, 4))
-      ast = si.ast.replace(arg=KernelInfo(opts_to_apply=tuple(opts_to_apply)))
+      ast = si.src[0].replace(arg=KernelInfo(opts_to_apply=tuple(opts_to_apply)))
       prg = get_program(ast, Device["CPU"].renderer)
       print(prg.src)
 
   def test_arange(self):
     out = Tensor.arange(32, device="NULL")
     with Context(SPLIT_REDUCEOP=0, DEVECTORIZE=0):
-      si = out.schedule()[-1]
+      si = out.schedule_linear().src[-1]
       opts_to_apply = []
       opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4))
-      ast = si.ast.replace(arg=KernelInfo(opts_to_apply=tuple(opts_to_apply)))
+      ast = si.src[0].replace(arg=KernelInfo(opts_to_apply=tuple(opts_to_apply)))
       prg = get_program(ast, Device["CPU"].renderer)
       print(prg.src)
 
   def test_kernel_info(self):
     out = Tensor.arange(4, device="NULL")
-    si = out.schedule()[-1]
+    si = out.schedule_linear().src[-1]
 
-    ast = si.ast.replace(arg=KernelInfo(opts_to_apply=()))
+    ast = si.src[0].replace(arg=KernelInfo(opts_to_apply=()))
     prg = get_program(ast, Device["CPU"].renderer)
     assert prg.applied_opts == (), f"expected no opts, got {prg}"
 

diff --git a/test/null/test_process_replay.py b/test/null/test_process_replay.py
@@ -9,7 +9,7 @@
 class TestProcessReplay(unittest.TestCase):
   @classmethod
   def setUpClass(cls):
-    cls.ast = (Tensor.empty(N, N) @ Tensor.empty(N, N)).schedule()[-1].ast
+    cls.ast = (Tensor.empty(N, N) @ Tensor.empty(N, N)).schedule_linear().src[-1].src[0]
     cls.renderer = Device[Device.DEFAULT].renderer
 
   def test_replay_no_opts(self):
@@ -35,9 +35,9 @@ def test_replay_with_opt(self):
 
   def test_beam(self):
     with Context(BEAM=1):
-      si = (Tensor.empty(N, N) @ Tensor.empty(N, N)).schedule()[-1]
-    p = do_to_program(si.ast, self.renderer)
-    good, compare, _ = replay_to_program(p, si.ast, self.renderer)
+      ast = (Tensor.empty(N, N) @ Tensor.empty(N, N)).schedule_linear().src[-1].src[0]
+    p = do_to_program(ast, self.renderer)
+    good, compare, _ = replay_to_program(p, ast, self.renderer)
     self.assertEqual(good, compare)
 
 if __name__ == '__main__':

diff --git a/test/null/test_schedule.py b/test/null/test_schedule.py
@@ -87,7 +87,7 @@ def test_unused_var_not_in_var_vals(self):
     # unused variable should not appear in var_vals even when there's other work
     a = Tensor(UOp.variable("unused", 0, 10).bind(1))
     b = Tensor.empty(3) + 1
-    _, var_vals = Tensor.schedule_with_vars(a, b)
+    _, var_vals = Tensor.linear_with_vars(a, b)
     self.assertEqual(var_vals, {})
     self.assertIsNone(a.uop.base.realized)
 
@@ -208,8 +208,8 @@ def test_realize_view_of_realized_has_empty_schedule(self):
     t = Tensor.zeros((3, 3)).contiguous().realize()
     v = t[1]  # view - is_realized but not has_buffer_identity
     assert v.uop.is_realized
-    sched, _ = Tensor.schedule_with_vars(v)
-    self.assertEqual(len(sched), 0)
+    linear, _ = Tensor.linear_with_vars(v)
+    self.assertEqual(len(linear.src), 0)
 
   # NOTE: because empty does not have a lowered ExecItem if realize is called on a childless empty, it never gets allocated.
   def test_childless_empty_never_allocates(self):

diff --git a/test/null/test_schedule_cache.py b/test/null/test_schedule_cache.py
@@ -4,15 +4,15 @@
 from tinygrad.schedule import schedule_cache
 
 def schedule_one():
-  Tensor([1]).schedule()
+  Tensor([1]).schedule_linear()
 
 class TestScheduleCache(unittest.TestCase):
   def test_bound_variable_var_vals(self):
     v = Variable('pos', 1, 100)
     x = Tensor.ones(10).contiguous().realize()
 
     t = x + Tensor(v.bind(42))
-    _, var_vals = t.schedule_with_vars()
+    _, var_vals = t.linear_with_vars()
     self.assertEqual(var_vals, {'pos': 42})
 
   def test_disable_schedule_cache(self):

diff --git a/test/null/test_tensor.py b/test/null/test_tensor.py
@@ -62,11 +62,12 @@ def _find_op(self, ast: UOp, op: Ops):
     for src in ast.src:
       if (ret:=self._find_op(src, op)) is not None: return ret
   def _schedule_render(self, a: Tensor):
-    schedule, _ = a.schedule_with_vars()
-    for s in schedule:
-      if s.ast.op is Ops.SINK:
-        renderer = Device[s.bufs[0].device].renderer
-        prg = get_program(s.ast, renderer)
+    linear, _ = a.linear_with_vars()
+    for si in linear.src:
+      ast = si.src[0]
+      if ast.op is Ops.SINK:
+        renderer = Device[si.src[1].buffer.device].renderer
+        prg = get_program(ast, renderer)
         return prg.uops
 
   def _assert(self, dtype: DType, a: Tensor):
@@ -162,9 +163,9 @@ class TestRand(unittest.TestCase):
   def test_rand_large_tensor(self):
     # large tensor rand (num > uint32.max) should not crash in frontend
     Tensor.manual_seed(0)
-    Tensor.rand(2**17, 2**17).schedule()
-    Tensor.rand(2**17, 2**17).schedule()
-    Tensor.rand(2**17, 2**17).schedule()
+    Tensor.rand(2**17, 2**17).schedule_linear()
+    Tensor.rand(2**17, 2**17).schedule_linear()
+    Tensor.rand(2**17, 2**17).schedule_linear()
 
 class TestTensorConstLike(unittest.TestCase):
   def test_const_like_shape(self):

diff --git a/test/null/test_tensor_uop_mixin.py b/test/null/test_tensor_uop_mixin.py
@@ -163,6 +163,48 @@ def test_sparse_categorical_crossentropy_ignore_index(self):
     t, Y = _t(2, 3).float(), Tensor([1, 2], dtype=dtypes.int32)
     self.assertIs(_strip_unique(t.sparse_categorical_crossentropy(Y, ignore_index=0).uop),
                   _strip_unique(t.uop.sparse_categorical_crossentropy(Y.uop, ignore_index=0)))
+  def test_nll_loss(self):
+    t, Y = _t(2, 3).float().log_softmax(), Tensor([1, 2], dtype=dtypes.int32)
+    self.assertIs(_strip_unique(t.nll_loss(Y).uop), _strip_unique(t.uop.nll_loss(Y.uop)))
+  def test_nll_loss_weight(self):
+    t, Y, w = _t(2, 3).float().log_softmax(), Tensor([1, 2], dtype=dtypes.int32), _t(3).float()
+    self.assertIs(_strip_unique(t.nll_loss(Y, weight=w).uop), _strip_unique(t.uop.nll_loss(Y.uop, weight=w.uop)))
+  def test_nll_loss_ignore_index(self):
+    t, Y = _t(2, 3).float().log_softmax(), Tensor([1, 2], dtype=dtypes.int32)
+    self.assertIs(_strip_unique(t.nll_loss(Y, ignore_index=1).uop), _strip_unique(t.uop.nll_loss(Y.uop, ignore_index=1)))
+  def test_nll_loss_none_reduction(self):
+    t, Y = _t(2, 3).float().log_softmax(), Tensor([1, 2], dtype=dtypes.int32)
+    self.assertIs(_strip_unique(t.nll_loss(Y, reduction="none").uop), _strip_unique(t.uop.nll_loss(Y.uop, reduction="none")))
+  def test_nll_loss_weight_ignore_index(self):
+    t, Y, w = _t(2, 3).float().log_softmax(), Tensor([1, 2], dtype=dtypes.int32), _t(3).float()
+    self.assertIs(_strip_unique(t.nll_loss(Y, weight=w, ignore_index=1).uop),
+                  _strip_unique(t.uop.nll_loss(Y.uop, weight=w.uop, ignore_index=1)))
+
+class TestTensorUOpScatter(unittest.TestCase):
+  def test_scatter(self):
+    x, idx, src = _t(3, 4).float(), Tensor([[0, 1, 2, 0]], dtype=dtypes.int32), _t(1, 4).float()
+    self.assertIs(_strip_unique(x.scatter(0, idx, src).uop), _strip_unique(x.uop.scatter(0, idx.uop, src.uop)))
+  def test_scatter_scalar_src(self):
+    x, idx = _t(3, 4).float(), Tensor([[0, 1]], dtype=dtypes.int32)
+    self.assertIs(_strip_unique(x.scatter(1, idx, 3.14).uop), _strip_unique(x.uop.scatter(1, idx.uop, 3.14)))
+  # inf cannot be cast to int — this regresses if scalar src is routed through index.dtype first
+  def test_scatter_inf_src(self):
+    x, idx = _t(3, 4).float(), Tensor([[0, 1]], dtype=dtypes.int32)
+    self.assertIs(_strip_unique(x.scatter(1, idx, float("inf")).uop),
+                  _strip_unique(x.uop.scatter(1, idx.uop, float("inf"))))
+  def test_scatter_add(self):
+    x, idx = _t(3, 4).float(), Tensor([[0, 1]], dtype=dtypes.int32)
+    self.assertIs(_strip_unique(x.scatter(1, idx, 3.14, reduce="add").uop),
+                  _strip_unique(x.uop.scatter(1, idx.uop, 3.14, reduce="add")))
+  def test_scatter_multiply(self):
+    x, idx = _t(3, 4).float(), Tensor([[0, 1]], dtype=dtypes.int32)
+    self.assertIs(_strip_unique(x.scatter(1, idx, 3.14, reduce="multiply").uop),
+                  _strip_unique(x.uop.scatter(1, idx.uop, 3.14, reduce="multiply")))
+  # tensor src with reduce hits the "elif reduce: raise" branch in both Tensor and UOp paths
+  def test_scatter_tensor_src_with_reduce_raises(self):
+    x, idx, src = _t(3, 4).float(), Tensor([[0, 1]], dtype=dtypes.int32), _t(1, 2).float()
+    with self.assertRaises(TypeError): x.scatter(1, idx, src, reduce="add")
+    with self.assertRaises(TypeError): x.uop.scatter(1, idx.uop, src.uop, reduce="add")
 
 class TestTensorUOpScatterReduce(unittest.TestCase):
   def _check(self, x, idx, src, **kw):

diff --git a/test/null/test_tensor_uop_representation.py b/test/null/test_tensor_uop_representation.py
@@ -16,7 +16,7 @@ def test_mutate_add(self):
     pa = a.uop
     pb = b.uop
     pr = ret.uop
-    ret.schedule()
+    ret.schedule_linear()
     self.assertIsNot(pa, a.uop)
     self.assertIsNot(pb, b.uop)
     self.assertIsNot(pr, ret.uop)

diff --git a/test/null/test_tinyfs.py b/test/null/test_tinyfs.py
@@ -5,22 +5,22 @@ class TestLoadStore(unittest.TestCase):
   def test_load_shape(self):
     t = Tensor(bytes(16)).fs_load(1024)
     assert t.shape == (1024,), t.shape
-    t.schedule()
+    t.schedule_linear()
 
   def test_store_shape(self):
     t = Tensor.zeros(1024).fs_store()
     assert t.shape == (16,), t.shape
-    t.schedule()
+    t.schedule_linear()
 
   def test_load_large_shape(self):
     t = Tensor(bytes(16)).fs_load(10_000_000)
     assert t.shape == (10_000_000,), t.shape
-    t.schedule()
+    t.schedule_linear()
 
   def test_store_large_shape(self):
     t = Tensor.zeros(10_000_000).fs_store()
     assert t.shape == (16,), t.shape
-    t.schedule()
+    t.schedule_linear()
 
 if __name__ == "__main__":
   unittest.main()
diff --git a/test/null/test_uops.py b/test/null/test_uops.py
@@ -228,7 +228,7 @@ def test_uop_variables(self):
     a = UOp.variable("a", 1, 10)
     uop_var = Tensor(a.bind(1))
     st_var = Tensor.empty((2, 10))[:, :a.bind(1)]
-    _, var_vals = (uop_var+st_var).schedule_with_vars()
+    _, var_vals = (uop_var+st_var).linear_with_vars()
     self.assertEqual(len(var_vals), 1)
     self.assertEqual(list(var_vals)[0], a.expr)
 

diff --git a/test/null/test_uops_stats.py b/test/null/test_uops_stats.py
@@ -1,7 +1,7 @@
 import unittest
 from tinygrad import Tensor
 from tinygrad.helpers import GlobalCounters, DEV
-from tinygrad.engine.realize import get_program
+from tinygrad.engine.realize import get_program, compile_linear, estimate_uop
 from tinygrad.renderer import ProgramSpec
 from tinygrad.renderer import Estimates
 from tinygrad.uop.ops import Ops, UOp
@@ -18,8 +18,8 @@ def flops_mem(uops, ignore_indexing=False):
 # **************** new FlopCounter ****************
 
 def get_stats(x:Tensor):
-  si = x.schedule()[-1].lower()
-  return si.prg.estimates.ops, si.prg.estimates.mem
+  est = estimate_uop(compile_linear(x.schedule_linear()).src[-1])
+  return est.ops, est.mem
 
 @unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu does extra load/store for packed types")
 class TestMemoryCount(unittest.TestCase):
@@ -165,8 +165,8 @@ def test_mulacc(self):
 class TestStatsOptimized(unittest.TestCase):
   @classmethod
   def setUpClass(cls):
-    cls.ast_gemm = (Tensor.empty(N, N) @ Tensor.empty(N, N)).schedule()[-1].ast
-    cls.ast_reduce = (Tensor.empty(N*N).sum()).schedule()[-1].ast
+    cls.ast_gemm = (Tensor.empty(N, N) @ Tensor.empty(N, N)).schedule_linear().src[-1].src[0]
+    cls.ast_reduce = (Tensor.empty(N*N).sum()).schedule_linear().src[-1].src[0]
 
   def check_gemm(self, p:ProgramSpec, extra_flops=0):
     #p.uops.print()