diff --git a/docs/env_vars.md b/docs/env_vars.md
index 58efa542fbc6d..92cd5c79906f6 100644
--- a/docs/env_vars.md
+++ b/docs/env_vars.md
@@ -57,6 +57,8 @@ AMD:LLVM      | use the AMD device with the LLVM renderer
 NV:CUDA:sm_70 | use the NV device with the CUDA renderer targetting sm_70
 AMD::gfx950   | use the AMD device targetting gfx950
 USB+AMD       | use the AMD device over the USB interface
+CPU:LLVM      | use the CPU device with the LLVM renderer
+CPU:LLVM:x86_64,znver2,avx2,-avx512f | use the CPU device with the LLVM renderer, with [additional arch flags](runtime.md#cpu-arch)
 
 ### Debug breakdown
 
diff --git a/docs/runtime.md b/docs/runtime.md
index 257ad0ff0c744..ac2d0e7d60658 100644
--- a/docs/runtime.md
+++ b/docs/runtime.md
@@ -10,7 +10,7 @@ tinygrad supports various runtimes, enabling your code to scale across a wide ra
 | [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | - | M1+ Macs; Metal 3.0+ for `bfloat` support |
 | [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | nvrtc (default)<br> PTX (`DEV=CUDA:PTX`) | NVIDIA GPU with CUDA support |
 | [CL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cl.py) | Accelerates computations using OpenCL on GPUs | - | OpenCL 2.0 compatible device |
-| [CPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang or llvm compiler | Clang JIT (default)<br>LLVM IR (`DEV=CPU:LLVM`) | `clang` compiler in system `PATH` |
+| [CPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang or llvm compiler | Clang JIT (default)<br>LLVM IR (`DEV=CPU:LLVM`) | `clang` compiler in system `PATH`<br>You can specify additional arch parameters via [the `DEV` variable](env_vars.md#dev-variable). See [CPU arch](#cpu-arch) for details. |
 | [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | - | Dawn library installed and discoverable. Binaries: [pydawn v0.3.0](https://github.com/wpmed92/pydawn/releases/tag/v0.3.0) |
 
 
@@ -79,3 +79,13 @@ NV backend supports several interfaces for communicating with devices:
 
 * `NVK`: uses the nvidia driver
 * `PCI`: uses the [NV driver](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/support/nv/nvdev.py)
+
+## CPU Arch
+The CPU renderers may be additionally configured using the arch component of [the `DEV` environment variable](env_vars.md#dev-variable).
+CPU arch should be specified as a comma-separated list of parameters, and must contain at least two values: the architecture family (ie. x86_64, arm64, or riscv64) and the cpu type (as accepted by `clang`'s `-march`).
+If native is specified as the cpu type, tinygrad (or delegate compiler) will query the host cpu type. Additional comma-separated values may be specified as follows:
+
+* `AMX`: emit Apple silicon AMX instructions
+
+All other additional values are interpreted as cpu feature flags. When a value is preceded by a `-` character, the corresponding feature flag will be disabled, otherwise the flag will be enabled.
+Note that enabled feature flags should not be preceded by a `+`.
diff --git a/extra/optimization/extract_dataset.py b/extra/optimization/extract_dataset.py
index b33530b2097cb..327773008dced 100755
--- a/extra/optimization/extract_dataset.py
+++ b/extra/optimization/extract_dataset.py
@@ -10,4 +10,4 @@ def extract_ast(*args) -> None:
   return None
 
 if __name__ == "__main__":
-  _pmap({"get_program":extract_ast})
+  _pmap({"do_to_program":extract_ast})
diff --git a/test/external/process_replay/process_replay.py b/test/external/process_replay/process_replay.py
index 7602820d0dfa4..7d63217d73977 100755
--- a/test/external/process_replay/process_replay.py
+++ b/test/external/process_replay/process_replay.py
@@ -8,8 +8,8 @@
 if not int(os.getenv("ASSERT_PROCESS_REPLAY", "1")): ASSERT_DIFF = 0
 
 try:
-  from tinygrad.renderer import Renderer, ProgramSpec
-  from tinygrad.engine.realize import get_program
+  from tinygrad.renderer import Renderer
+  from tinygrad.codegen import to_program
   from tinygrad.uop.ops import UOp, Ops
   from tinygrad.helpers import VERSION, Context, ContextVar, colored, db_connection, getenv, tqdm
 except ImportError as e:
@@ -41,23 +41,25 @@ class ProcessReplayWarning(Warning): pass
 
 # *** replay the function and convert return values to string
 
-def replay_get_program(p:ProgramSpec, ast:UOp, renderer:Renderer) -> tuple[str, str, tuple[Any, ...]]:
+def replay_to_program(p:UOp, ast:UOp, renderer:Renderer) -> tuple[str, str, tuple[Any, ...]]:
   if ast.op is Ops.PROGRAM: input_ast = ast
   else:
     sink_arg = ast.arg
-    if sink_arg.beam: sink_arg = replace(sink_arg, opts_to_apply=p.applied_opts)
-    input_ast = ast.replace(arg=replace(sink_arg, name=p.name))
-  p2 = get_program(input_ast, renderer=renderer)
-  def to_str(ret:ProgramSpec) -> str:
+    if sink_arg.beam: sink_arg = replace(sink_arg, opts_to_apply=p.src[0].arg.applied_opts)
+    input_ast = ast.replace(arg=replace(sink_arg, name=p.src[0].arg.name))
+  p2 = to_program(input_ast, renderer=renderer)
+  device = p.src[1].arg
+  def to_str(ret:UOp) -> str:
+    src = ret.src[3].arg
     # PYTHON renderer pickles UOps, first unpickle and decode here
-    if p.device.startswith("PYTHON"): return "\n".join([str(x) for x in pickle.loads(base64.b64decode(ret.src))])
-    return ret.src
+    if device.startswith("PYTHON"): return "\n".join([str(x) for x in pickle.loads(base64.b64decode(src))])
+    return src
   # properly color the name arg
   ast_repr = codecs.decode(str(input_ast), "unicode_escape")
   return to_str(p2), to_str(p), (ast_repr, renderer)
 
 replayers: dict[str, Callable[..., tuple[str, str, tuple[Any, ...]]]] = {}
-replayers["get_program"] = replay_get_program
+replayers["do_to_program"] = replay_to_program
 
 # *** run replayers on captured rows and print diffs
 
diff --git a/test/null/test_device.py b/test/null/test_device.py
index 44716240d701c..90f8588fef15a 100644
--- a/test/null/test_device.py
+++ b/test/null/test_device.py
@@ -132,6 +132,7 @@ def test_parse(self):
     for d, t in [("AMD", Target(device="AMD", renderer="")), ("AMD:LLVM", Target(device="AMD", renderer="LLVM")),
                  (":LLVM", Target(device="", renderer="LLVM")), ("AMD::gfx1100", Target(device="AMD", arch="gfx1100")),
                  ("AMD:LLVM:gfx1100", Target(device="AMD", renderer="LLVM", arch="gfx1100")), ("::gfx1100", Target(arch="gfx1100")),
+                 ("CPU:LLVM:arm64,native,AMX", Target(device="CPU", renderer="LLVM", arch="arm64,native,AMX")),
                  ("USB+", Target(interface="USB")), ("USB+AMD", Target(device="AMD", interface="USB")),
                  ("PCI:0+AMD", Target(device="AMD", interface="PCI", indices="0")), (":0+AMD", Target(device="AMD", indices="0")),
                  ("PCI:0,1+AMD", Target(device="AMD", interface="PCI", indices="0,1")),
diff --git a/test/null/test_elf.py b/test/null/test_elf.py
index c8ab2820855ca..f7d350bd34e36 100644
--- a/test/null/test_elf.py
+++ b/test/null/test_elf.py
@@ -23,7 +23,7 @@ def test_clang_jit_compiler_external_raise(self):
       }
     '''
     with self.assertRaisesRegex(RuntimeError, 'evil_external_function'):
-      ClangJITCompiler({'AMD64':'x86_64', 'aarch64':'arm64'}.get(m:=platform.machine(), m)+",native").compile(src)
+      ClangJITCompiler([{'AMD64':'x86_64', 'aarch64':'arm64'}.get(m:=platform.machine(), m), "native"]).compile(src)
   def test_link(self):
     src = '''
       float powf(float, float); // from libm
diff --git a/test/null/test_process_replay.py b/test/null/test_process_replay.py
index 46343795a2de8..87c032ed88378 100644
--- a/test/null/test_process_replay.py
+++ b/test/null/test_process_replay.py
@@ -1,8 +1,8 @@
 import unittest
 from tinygrad import Tensor, Device, Context
-from tinygrad.engine.realize import get_program
+from tinygrad.codegen import do_to_program
 from tinygrad.codegen.opt import Opt, OptOps
-from test.external.process_replay.process_replay import replay_get_program
+from test.external.process_replay.process_replay import replay_to_program
 from test.helpers import replace_opts
 
 N = 16
@@ -14,30 +14,30 @@ def setUpClass(cls):
 
   def test_replay_no_opts(self):
     # opts=None means use default heuristic path
-    p = get_program(self.ast, self.renderer)
-    good, compare, _ = replay_get_program(p, self.ast, self.renderer)
+    p = do_to_program(self.ast, self.renderer)
+    good, compare, _ = replay_to_program(p, self.ast, self.renderer)
     self.assertEqual(good, compare)
 
   def test_replay_empty_opts(self):
     # opts=[] means explicitly apply zero opts (unoptimized)
     ast = replace_opts(self.ast, [])
-    p = get_program(ast, self.renderer)
-    good, compare, _ = replay_get_program(p, ast, self.renderer)
+    p = do_to_program(ast, self.renderer)
+    good, compare, _ = replay_to_program(p, ast, self.renderer)
     self.assertEqual(good, compare)
 
   def test_replay_with_opt(self):
     # opts=[Opt(...)] means apply a specific opt
     opts = [Opt(OptOps.UPCAST, 0, 4)]
     ast = replace_opts(self.ast, opts)
-    p = get_program(ast, self.renderer)
-    good, compare, _ = replay_get_program(p, ast, self.renderer)
+    p = do_to_program(ast, self.renderer)
+    good, compare, _ = replay_to_program(p, ast, self.renderer)
     self.assertEqual(good, compare)
 
   def test_beam(self):
     with Context(BEAM=1):
       si = (Tensor.empty(N, N) @ Tensor.empty(N, N)).schedule()[-1]
-    p = get_program(si.ast, self.renderer)
-    good, compare, _ = replay_get_program(p, si.ast, self.renderer)
+    p = do_to_program(si.ast, self.renderer)
+    good, compare, _ = replay_to_program(p, si.ast, self.renderer)
     self.assertEqual(good, compare)
 
 if __name__ == '__main__':
diff --git a/test/null/test_tensor_uop_mixin.py b/test/null/test_tensor_uop_mixin.py
index bafc25725357f..23be0b6be4947 100644
--- a/test/null/test_tensor_uop_mixin.py
+++ b/test/null/test_tensor_uop_mixin.py
@@ -12,6 +12,17 @@ def _t(*shape):
 def _check(tc: unittest.TestCase, t: Tensor, fn):
   tc.assertIs(fn(t).uop, fn(t.uop), f"\ntensor.uop = {fn(t).uop}\nuop = {fn(t.uop)}")
 
+class TestTensorUOpBinop(unittest.TestCase):
+  # Tensor's binop upcasts mixed dtypes via least_upper_dtype + explicit CAST; UOp should match.
+  def test_mul_float_int(self):
+    t = _t(3).float()
+    self.assertIs(_strip_unique((t * Tensor.arange(3)).uop), _strip_unique(t.uop * UOp.arange(3)))
+  def test_mul_bool_int(self):
+    t = _t(3)
+    self.assertIs(_strip_unique((t.eq(1) * Tensor.arange(3)).uop), _strip_unique(t.uop.eq(1) * UOp.arange(3)))
+  # Tensor's ufix picks float dtype when scalar is float and self is int; UOp should match.
+  def test_add_scalar_float_on_int(self): _check(self, _t(3), lambda x: x + 1.5)
+
 class TestTensorUOpGetitem(unittest.TestCase):
   # ---- pure slice patterns ----
   def test_slice_full(self):           _check(self, _t(4), lambda x: x[slice(None)])
diff --git a/test/null/test_uop_symbolic.py b/test/null/test_uop_symbolic.py
index 51c4ebe714529..29854de302e89 100644
--- a/test/null/test_uop_symbolic.py
+++ b/test/null/test_uop_symbolic.py
@@ -6,7 +6,7 @@
 from tinygrad.helpers import Context
 from test.helpers import get_uops
 from tinygrad.uop.ops import UOp, Ops, graph_rewrite, sym_infer
-from tinygrad.uop.symbolic import sym, commutative, pm_simplify_valid
+from tinygrad.uop.symbolic import sym, commutative, pm_simplify_valid, pm_move_where_on_load
 from tinygrad.uop.validate import uops_to_z3
 
 def check_uop_against_string(self, v:UOp, s:str):
@@ -851,6 +851,19 @@ def test_simplex_lt(self):
     self.helper_test_variable((a+b+c*2<1).ne(True), 0, 1, "((((a+b)+c)<1)!=True)")
     self.helper_test_variable((a+b*2+c*4<1).ne(True), 0, 1, "((((a+b)+c)<1)!=True)")
 
+  def test_cast_bool_to_int_ne_const(self):
+    cond = Variable("a", 0, 3) < 2
+    # CAST(bool -> int) != 0  ->  cond
+    self.helper_test_variable(cond.cast(dtypes.int).ne(0), 0, 1, "(a<2)")
+    # CAST(bool -> int) != 1  ->  !cond
+    self.helper_test_variable(cond.cast(dtypes.int).ne(1), 0, 1, "((a<2)!=True)")
+    # CAST(bool -> int) != c (c not in {0,1})  ->  always True (CAST is 0 or 1)
+    self.helper_test_variable(cond.cast(dtypes.int).ne(2), 1, 1, "True")
+    self.helper_test_variable(cond.cast(dtypes.int).ne(-1), 1, 1, "True")
+    # CAST(bool -> weakint) folds too
+    self.helper_test_variable(cond.cast(dtypes.weakint).ne(0), 0, 1, "(a<2)")
+    self.helper_test_variable(cond.cast(dtypes.weakint).ne(1), 0, 1, "((a<2)!=True)")
+
   def test_where_removal(self):
     cond = Variable("a", 0, 3) < 2
     u1, u0 = cond.const_like(True), cond.const_like(False)
@@ -1234,6 +1247,23 @@ def test_store_load_folding(self):
     # Negative: store(idx, load(idx) + 1) should NOT fold
     self.assertEqual(graph_rewrite(index.store(index.load() + UOp.const(dtypes.int, 1)), sym).op, Ops.STORE)
 
+class TestMoveWhereOnLoad(unittest.TestCase):
+  def test_bool_index_preserves_dtype(self):
+    buf = UOp.param(0, dtypes.bool.ptr(8))
+    a = Variable("a", 0, 7)
+    r = UOp.range(8, 0)
+    # cond has a range that the rewrite can move into the valid: gate (a<4) goes into load valid
+    cond = (a < 4) & (r < 2)
+    valid = (a < 2)  # pre-existing valid on the load (to pass can_move check for the r-only clause)
+    idx = buf.index(a.valid(valid), ptr=True)
+    expr = cond.where(idx, 0)
+    out = graph_rewrite(expr, pm_move_where_on_load)
+    # any WHERE in the rewritten graph must have matched-dtype branches
+    for u in out.toposort():
+      if u.op is Ops.WHERE:
+        self.assertEqual(u.dtype, u.src[1].dtype, f"WHERE branch 1 dtype mismatch: {u}")
+        self.assertEqual(u.dtype, u.src[2].dtype, f"WHERE branch 2 dtype mismatch: {u}")
+
 class TestSymbolicRealWorld(unittest.TestCase):
   def test_resnet_half(self):
     gidx0 = Variable("gidx0", 0, 3)
diff --git a/test/null/test_viz.py b/test/null/test_viz.py
index fed098efba901..aa7ddfc36a595 100644
--- a/test/null/test_viz.py
+++ b/test/null/test_viz.py
@@ -12,6 +12,7 @@
 
 from tinygrad.uop.ops import tracked_keys, tracked_ctxs, uop_fields, active_rewrites, active_group, _name_cnt, RewriteTrace
 from tinygrad.viz.serve import load_rewrites, get_full_rewrite, uop_to_json, VizData
+from tinygrad.codegen import to_program_cache
 
 @track_rewrites(name=True)
 def exec_rewrite(sink:UOp, pm_lst:list[PatternMatcher], names:None|list[str]=None) -> UOp:
@@ -39,6 +40,7 @@ def get_details(self, rewrite_idx:int, step:int) -> Generator[dict, None, None]:
 @contextlib.contextmanager
 def save_viz():
   for lst in [tracked_keys, tracked_ctxs, active_rewrites, active_group, _name_cnt]: lst.clear()
+  to_program_cache.clear()
   Buffer.profile_events.clear()
   cpu_events.clear()
   viz = VizTrace()
@@ -225,6 +227,10 @@ def test_const_node_visibility(self):
     self.assertEqual(list(graphs[0]), [id(a), id(alu)])
     self.assertEqual(list(graphs[1]), [id(z)])
 
+  # TODO: DEFINE_VAR (shape ()) now gets wrapped in RESHAPE+EXPAND when broadcast against a shaped operand
+  # (due to shared OpMixin._binop using _broadcasted). Either extend viz to fold RESHAPE/EXPAND around
+  # DEFINE_VAR/RANGE/SPECIAL the way it does for CONST, or redesign scalar-compiler-op broadcasting.
+  @unittest.expectedFailure
   def test_const_reshape_expand_folded(self):
     # CONST->RESHAPE->EXPAND should be folded into the ALU node, not shown as separate RESHAPE/EXPAND nodes
     c = UOp.const(dtypes.float, 1.0, device="CPU", shape=(3,4))  # creates CONST->RESHAPE->EXPAND chain
diff --git a/test/opt/test_gen_float4.py b/test/opt/test_gen_float4.py
index 8e8111c7b03ca..61f66f2ecfe15 100644
--- a/test/opt/test_gen_float4.py
+++ b/test/opt/test_gen_float4.py
@@ -3,9 +3,11 @@
 from tinygrad.uop.ops import UOp, Ops
 from tinygrad.codegen.opt import Opt, OptOps
 from tinygrad.engine.realize import get_program
-from tinygrad.helpers import AMX
+from tinygrad.helpers import DEV
 from test.helpers import replace_opts
 
+AMX = "AMX" in DEV.arch
+
 @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "need backends that support float4")
 class TestFloat4(unittest.TestCase):
   @staticmethod
diff --git a/test/opt/test_tensor_cores.py b/test/opt/test_tensor_cores.py
index 736c26d110b34..2d04574441591 100644
--- a/test/opt/test_tensor_cores.py
+++ b/test/opt/test_tensor_cores.py
@@ -7,7 +7,7 @@
 from tinygrad.uop.ops import Ops
 from tinygrad.dtype import DType
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import AMX, DEV, Context
+from tinygrad.helpers import DEV, Context
 from test.helpers import slow, replace_opts
 from tinygrad.engine.realize import CompiledRunner, get_program
 from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
@@ -18,6 +18,8 @@
 
 # NOTE: get_program always passes in Device[Device.DEFAULT].renderer explicitly for process_replay!!!
 
+AMX = "AMX" in DEV.arch
+
 def helper_tc_ensure_uops_and_opts_count(N: int, M:int, K:int, dtype_in:DType, dtype_out:DType, axis:int=0, tc_select:int=-1, tc_opt:int=0,
                                          ensure_triggered:bool=True):
   a, b = Tensor.rand(M, K, dtype=dtype_in), Tensor.rand(K, N, dtype=dtype_in)
diff --git a/tinygrad/codegen/__init__.py b/tinygrad/codegen/__init__.py
index 0ee01d8e35f42..95042992fb7b0 100644
--- a/tinygrad/codegen/__init__.py
+++ b/tinygrad/codegen/__init__.py
@@ -1,7 +1,8 @@
 from typing import cast
 from dataclasses import replace
-import itertools
-from tinygrad.helpers import DISABLE_FAST_IDIV, DEVECTORIZE, TRANSCENDENTAL, SPEC, DEBUG, VIZ, IMAGE, TracingKey, Context, Target, panic
+import itertools, weakref
+from tinygrad.helpers import DISABLE_FAST_IDIV, DEVECTORIZE, TRANSCENDENTAL, SPEC, DEBUG, VIZ, IMAGE, NOOPT, EMULATED_DTYPES
+from tinygrad.helpers import TracingKey, Context, Target, panic
 from tinygrad.uop.ops import PatternMatcher, graph_rewrite, UOp, pm_lower_index_dtype, Ops, UPat, track_rewrites, KernelInfo, pyrender
 from tinygrad.uop.spec import type_verify, program_spec, kernel_spec
 from tinygrad.renderer import Renderer, ProgramSpec, Estimates
@@ -152,31 +153,34 @@ def do_compile(ctx:Renderer, prg:UOp, source:UOp) -> UOp|None:
   (UPat(Ops.PROGRAM, src=(UPat(), UPat(Ops.DEVICE), UPat(Ops.LINEAR), UPat(Ops.SOURCE, name="source")), name="prg"), do_compile),
 ])
 
+@track_rewrites(name=lambda ast,renderer,ret,**kwargs: TracingKey(ret.src[0].arg.name,(ret.src[0].arg.function_name, ast), ret=renderer), replay=True)
 @Context(ALLOW_DEVICE_USAGE=0)
-@track_rewrites(name=lambda ast,renderer,ret,**kwargs: TracingKey(ret.name, (ret.function_name, ast), ret=renderer), replay=True)
-def get_program(ast:UOp, renderer:Renderer) -> ProgramSpec:
+def do_to_program(ast:UOp, renderer:Renderer) -> UOp:
   """
-  Transform an AST into a ProgramSpec. May trigger BEAM search.
+  Transform an AST into a compiled PROGRAM. May trigger BEAM search.
 
   Args:
-    ast: The Ops.SINK rooted AST
+    ast: The Ops.SINK/Ops.PROGRAM rooted AST
     renderer: The renderer used to generate the code
 
   Returns:
-    The ProgramSpec of the program.
+    The Ops.PROGRAM with SINK/DEVICE/LINEAR/SOURCE/BINARY.
   """
-
   if ast.op is Ops.PROGRAM: prg = ast
   elif ast.op is Ops.SINK:
-    # rewrite to prg
-    assert isinstance(ast.arg, KernelInfo), "requires KernelInfo on arg to get_program"
+    assert isinstance(ast.arg, KernelInfo), "requires KernelInfo on arg to to_program"
     full_sink = full_rewrite_to_sink(ast, renderer, optimize=ast.tag is None, beam=ast.arg.beam)
     prg = UOp(Ops.PROGRAM, src=(full_sink, UOp(Ops.DEVICE, arg=renderer.target.device)))
-  else:
-    raise RuntimeError(f"can't call get_program on {ast.op}")
-
+  else: raise RuntimeError(f"can't call to_program on {ast.op}")
   prg = graph_rewrite(prg, pm_to_program, ctx=renderer, name="linearize/render")
   if VIZ: graph_rewrite(prg, PatternMatcher([]), name="View Program")
+  return prg
+
+to_program_cache: weakref.WeakValueDictionary[tuple, UOp] = weakref.WeakValueDictionary()
+def to_program(ast:UOp, renderer:Renderer) -> UOp:
+  if ast.op is Ops.PROGRAM and len(ast.src) >= 5 and ast.src[4].op is Ops.BINARY: return ast
+  key = (ast.key, type(renderer), renderer.target, NOOPT.value, DEVECTORIZE.value, EMULATED_DTYPES.value)
+  if (prg:=to_program_cache.get(key)) is None: to_program_cache[key] = prg = do_to_program(ast, renderer)
+  return prg
 
-  # create the ProgramSpec
-  return ProgramSpec.from_uop(prg)
+def get_program(ast:UOp, renderer:Renderer) -> ProgramSpec: return ProgramSpec.from_uop(to_program(ast, renderer))
diff --git a/tinygrad/codegen/late/devectorizer.py b/tinygrad/codegen/late/devectorizer.py
index 36190c5196baf..992dd9bd3fdf9 100644
--- a/tinygrad/codegen/late/devectorizer.py
+++ b/tinygrad/codegen/late/devectorizer.py
@@ -5,7 +5,7 @@
 from tinygrad.dtype import dtypes, ImageDType, DType, AddrSpace, Invalid, PtrDType
 from tinygrad.uop.ops import UOp, Ops, UPat, PatternMatcher, GroupOp, identity_element
 from tinygrad.uop.symbolic import uop_given_valid, parse_valid, invalid_gate
-from tinygrad.helpers import getenv, flatten, AMX, prod
+from tinygrad.helpers import getenv, flatten, prod
 from tinygrad.renderer import Renderer
 
 # ***** image load valid simplification *****
@@ -171,7 +171,7 @@ def split_load_store(ctx:Renderer|None, ls:UOp, idx:UOp):
     lengths = [4]
   elif ctx is not None and ctx.supports_float4:
     # TODO: a better way to get this than ctx
-    lengths = [8,4,2] if buf.dtype.base == dtypes.half and getenv("ALLOW_HALF8") else ([16,8,4,2] if AMX else [4,2])
+    lengths = [8,4,2] if buf.dtype.base == dtypes.half and getenv("ALLOW_HALF8") else ([16,8,4,2] if "AMX" in ctx.target.arch else [4,2])
   lengths.append(1)  # worst case, it's not folded
 
   # filter fold lengths that don't divide
@@ -319,13 +319,13 @@ def reduce_to_acc(ctx:ReduceContext, red:UOp):
     input_ranges = tuple([x for x in topo if x.op is Ops.RANGE and x not in reduce_range and x not in ended_ranges])
     identity = red.const(red.dtype, identity_element(red.arg, red.dtype.scalar()))
     acc = UOp.placeholder((1,), red.dtype, ctx.acc_num, AddrSpace.REG)
-    acc_init = acc.after(*input_ranges).index(UOp.const(dtypes.int, 0)).store(identity)
-    lst = [acc.after(acc_init, *reduce_range).index(UOp.const(dtypes.int, 0))] + lst  # put acc as the first element
+    acc_init = acc.after(*input_ranges).index(UOp.const(dtypes.weakint, 0)).store(identity)
+    lst = [acc.after(acc_init, *reduce_range).index(UOp.const(dtypes.weakint, 0))] + lst  # put acc as the first element
     ctx.acc_num += 1
   ret = functools.reduce(lambda x,y: x.alu(red.arg, y), lst)
   if len(reduce_range) == 0: return ret
-  end = acc.index(UOp.const(dtypes.int, 0)).store(ret).end(*reduce_range).rtag("mergeable")
-  return acc.after(end).index(UOp.const(dtypes.int, 0))
+  end = acc.index(UOp.const(dtypes.weakint, 0)).store(ret).end(*reduce_range).rtag("mergeable")
+  return acc.after(end).index(UOp.const(dtypes.weakint, 0))
 
 def merge_reduce_ends(ctx:ReduceContext, sink:UOp):
   # merge ENDs that share the same range and nesting context (only those created by reduce_to_acc)
diff --git a/tinygrad/codegen/opt/heuristic.py b/tinygrad/codegen/opt/heuristic.py
index 7a5b5cee890da..1049b3bbf2f94 100644
--- a/tinygrad/codegen/opt/heuristic.py
+++ b/tinygrad/codegen/opt/heuristic.py
@@ -1,6 +1,6 @@
 import itertools
 from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
-from tinygrad.helpers import getenv, DEBUG, prod, NOLOCALS, TC_OPT, TC_SELECT, USE_TC, AMX, IMAGE
+from tinygrad.helpers import getenv, DEBUG, prod, NOLOCALS, TC_OPT, TC_SELECT, USE_TC, IMAGE
 from tinygrad.dtype import PtrDType, ImageDType
 from tinygrad.uop.ops import Ops, resolve, AxisType
 from tinygrad.codegen.opt.postrange import Scheduler
@@ -34,7 +34,7 @@ def hand_coded_optimizations(k:Scheduler) -> Scheduler:
     except KernelOptError:
       pass
     # skip hand-coded TC opts if AMX, upcasting will make kernel slower
-    if good_tc_opt and not AMX:
+    if good_tc_opt and "AMX" not in k.ren.target.arch:
       if rngs is not None:
         for tc_dim in [1,0]: # attempt to upcast M and N
           szs = [sz for sz in [5,4,3,2] if rngs[tc_dim].src[0].divides(sz) is not None]
diff --git a/tinygrad/engine/jit.py b/tinygrad/engine/jit.py
index 92f137f050fbb..7708bde9e7a19 100644
--- a/tinygrad/engine/jit.py
+++ b/tinygrad/engine/jit.py
@@ -5,7 +5,7 @@
 from tinygrad.device import Buffer, Compiled, Device, MultiBuffer
 from tinygrad.dtype import DType, dtypes
 from tinygrad.uop.ops import UOp, PatternMatcher, Variable, sym_infer, Ops, buffers, track_rewrites, graph_rewrite
-from tinygrad.engine.realize import ExecItem, capturing, CompiledRunner, Runner, Estimates, pm_beam, run_linear, get_runner, graph_cache
+from tinygrad.engine.realize import ExecItem, capturing, CompiledRunner, Runner, Estimates, compile_linear, run_linear, get_runner, graph_cache
 from tinygrad.schedule.memory import memory_plan_rewrite, _collect_bufs
 from tinygrad.schedule import linear_to_schedule
 from tinygrad.nn.state import get_parameters
@@ -45,7 +45,7 @@ def flush_batch():
   for si in linear.src:
     if si.src[0].op is Ops.BUFFER_VIEW: continue
 
-    devs = [Device[x] for x in (si.device if isinstance(si.device, tuple) else (si.device,))]
+    devs = dedup([Device[x] for b in si.src[1:] if b.op is not Ops.BIND for x in (b.device if isinstance(b.device, tuple) else (b.device,))])
     graph_t = graph_class(devs[0]) if devs[0].graph is not None else None
 
     can_graph = graph_t is not None and graph_t.supports_exec_item(devs, si)
@@ -79,10 +79,7 @@ def jit_lower(linear:UOp, held_bufs:set[UOp], input_uops:list[UOp]) -> UOp:
 
   # parametrize input buffers: map each input buffer UOp to a PARAM with the correct slot index
   linear = linear.substitute({u: UOp.param(i, u.dtype, u.shape, u.device) for i,u in enumerate(input_uops)}, walk=True)
-
-  # set KernelInfo.beam on SINKs if jitbeam is set
-  if (jitbeam:=getenv("JITBEAM", BEAM.value)) >= 1: linear = graph_rewrite(linear, pm_beam, ctx=jitbeam, walk=True)
-
+  linear = compile_linear(linear, beam=getenv("JITBEAM", BEAM.value))
   linear = memory_plan_rewrite(linear, held_bufs)
   if JIT < 2: linear = graph_split_rewrite(linear, max_batch_size=JIT_BATCH_SIZE.value)
   if VIZ: graph_rewrite(linear, PatternMatcher([]), name="View graphed linear")
@@ -108,8 +105,9 @@ def get_input_replace(jit_cache: list[ExecItem], input_buffers:list[Buffer]) ->
   return input_replace
 
 class GraphRunner(Runner):
-  def __init__(self, linear:UOp, input_buffers:list[Buffer]):
-    self.jit_cache = [ei.lower() for ei in linear_to_schedule(linear.src[0])]
+  def __init__(self, linear:UOp, input_buffers:list[Buffer], input_uops:tuple[UOp, ...]=()):
+    self.linear = linear.src[0]
+    self.jit_cache = [ei.lower() for ei in linear_to_schedule(self.linear.substitute({p: input_uops[p.arg] for p in linear.src[1:]}))]
     for ei in self.jit_cache:
       for b in ei.bufs:
         if b is not None: b.ensure_allocated()
diff --git a/tinygrad/engine/realize.py b/tinygrad/engine/realize.py
index 5816a7c5a9959..4456ffa13da03 100644
--- a/tinygrad/engine/realize.py
+++ b/tinygrad/engine/realize.py
@@ -7,7 +7,7 @@
 from tinygrad.uop.ops import Ops, PatternMatcher, UOp, UPat, sym_infer, buffers, graph_rewrite
 from tinygrad.device import Device, Buffer, MultiBuffer
 from tinygrad.renderer import ProgramSpec, Estimates
-from tinygrad.codegen import get_program
+from tinygrad.codegen import get_program, to_program
 
 # **************** Stat ****************
 
@@ -205,7 +205,7 @@ class ExecContext:
 def _resolve(b:UOp, inputs:tuple[UOp, ...]) -> UOp:
   if b.op is Ops.BUFFER_VIEW and b.src[0].op is Ops.PARAM: return b.replace(src=(inputs[b.src[0].arg], *b.src[1:]))
   return inputs[b.arg] if b.op is Ops.PARAM else b
-def resolve_params(ctx:ExecContext, call:UOp) -> list[UOp]: return [_resolve(b, ctx.input_uops) for b in call.src[1:] if b.op is not Ops.BIND]
+def resolve_params(call:UOp, inputs:tuple[UOp, ...]) -> list[UOp]: return [_resolve(b, inputs) for b in call.src[1:] if b.op is not Ops.BIND]
 
 @contextlib.contextmanager
 def track_stats(ctx:ExecContext, call:UOp, device:str, display_name:str, estimates:Estimates, bufs:list[Buffer], var_vals:dict[str, int],
@@ -229,14 +229,14 @@ def unwrap_multi(call:UOp, resolved:list[UOp]) -> Iterator[tuple[list[Buffer], d
     for j, per_dev in enumerate(zip(*[cast(MultiBuffer, b).bufs for b in bufs])): yield list(per_dev), {dnum: j} if dnum else {}
 
 def exec_view(ctx:ExecContext, call, ast):
-  resolved = resolve_params(ctx, call)
+  resolved = resolve_params(call, ctx.input_uops)
   bufs = [cast(Buffer, b.buffer) for b in resolved]
   bv = bufs[1].view(resolved[0].arg, ast.dtype, ast.arg[1]*bufs[1].dtype.itemsize)
   with track_stats(ctx, call, bv.device, colored(f"view {bv.nbytes:8d} @ {bv.offset:<10d}", "yellow"), Estimates(), [bv, bufs[1]], ctx.var_vals):
     buffers[resolved[0]] = bv
 
 def exec_copy(ctx:ExecContext, call, ast):
-  for bufs, device_vars in unwrap_multi(call, resolve_params(ctx, call)):
+  for bufs, device_vars in unwrap_multi(call, resolve_params(call, ctx.input_uops)):
     dest, src = bufs[0].ensure_allocated(), bufs[1].ensure_allocated()
     xfer = hasattr(alc:=Device[dest.device].allocator,'_transfer') and alc.supports_transfer and dest.device.split(":")[0]==src.device.split(":")[0]
     prg = (BufferXfer if xfer else BufferCopy)(dest.nbytes, dest.device, src.device)
@@ -244,7 +244,7 @@ def exec_copy(ctx:ExecContext, call, ast):
       prg.copy(dest, src)
 
 def exec_kernel(ctx:ExecContext, call, ast):
-  for bufs, device_vars in unwrap_multi(call, resolve_params(ctx, call)):
+  for bufs, device_vars in unwrap_multi(call, resolve_params(call, ctx.input_uops)):
     var_vals = {**ctx.var_vals, **device_vars}
     prg = get_runner(bufs[0].device, ast)
     prg_bufs = [bufs[i].ensure_allocated() for i in prg.p.globals]
@@ -264,7 +264,7 @@ def exec_kernel(ctx:ExecContext, call, ast):
       for i in prg.p.outs: np.testing.assert_allclose(prg_bufs[i].numpy(), cpu_bufs[i].numpy(), rtol=1e-3, atol=1e-3)
 
 def exec_encdec(ctx:ExecContext, call, ast):
-  bufs = [cast(Buffer, b.buffer).ensure_allocated() for b in resolve_params(ctx, call)]
+  bufs = [cast(Buffer, b.buffer).ensure_allocated() for b in resolve_params(call, ctx.input_uops)]
   shape, pos_var = tuple(s.arg for s in ast.src if s.op is Ops.CONST), ast.variables()[0].expr
   with track_stats(ctx, call, bufs[0].device, colored(f"enc/dec {size_to_str(bufs[0].nbytes)}", "yellow"),
                    Estimates(lds=bufs[0].nbytes, mem=bufs[0].nbytes), bufs, ctx.var_vals):
@@ -272,13 +272,11 @@ def exec_encdec(ctx:ExecContext, call, ast):
 
 graph_cache:weakref.WeakKeyDictionary[UOp, Runner] = weakref.WeakKeyDictionary()
 def exec_graph(ctx:ExecContext, call, cf):
-  inputs = resolve_params(ctx, call)
-  bufs = flatten([b.bufs if isinstance(b, MultiBuffer) else [b] for b in (u.buffer for u in inputs)])
+  bufs = flatten([b.bufs if isinstance(b, MultiBuffer) else [b] for b in (u.buffer for u in resolve_params(call, ctx.input_uops))])
   if (runner:=graph_cache.get(cf)) is None:
-    sub = cf.substitute(dict(zip(cf.src[1:], inputs)))
-    graph_cache[cf] = runner = Device[cf.device if isinstance(cf.device, str) else cf.device[0]].graph(sub, bufs)
+    graph_cache[cf] = runner = Device[cf.device if isinstance(cf.device, str) else cf.device[0]].graph(cf, bufs, input_uops=ctx.input_uops)
   with track_stats(ctx, call, runner.device, runner.display_name, runner.estimates, bufs, ctx.var_vals) as t:
-    t[0] = runner(bufs, ctx.var_vals, wait=DEBUG >= 2)
+    t[0] = runner(bufs, ctx.var_vals, wait=DEBUG >= 2, input_uops=ctx.input_uops) # type: ignore[call-arg]
 
 # ctx is beam value
 pm_beam = PatternMatcher([
@@ -286,15 +284,24 @@ def exec_graph(ctx:ExecContext, call, cf):
    lambda ctx,call,sink: call.replace(src=(sink.replace(arg=replace(sink.arg, beam=ctx)), *call.src[1:])) if sink.arg.beam == 0 else None),
 ])
 
+pm_compile = PatternMatcher([
+  (UPat(Ops.CALL, src=(UPat((Ops.SINK, Ops.PROGRAM), name="ast"),), name="call", allow_any_len=True), lambda call,ast:
+    call.replace(src=(to_program(ast, Device[call.device if isinstance(call.device, str) else call.device[0]].renderer), *call.src[1:]))),
+])
+
 pm_exec = PatternMatcher([
   (UPat(Ops.CALL, src=(UPat(Ops.BUFFER_VIEW, name="ast"),), name="call", allow_any_len=True), exec_view),
   (UPat(Ops.CALL, src=(UPat(Ops.COPY, name="ast"),), name="call", allow_any_len=True), exec_copy),
-  (UPat(Ops.CALL, src=(UPat((Ops.SINK, Ops.PROGRAM), name="ast"),), name="call", allow_any_len=True), exec_kernel),
+  (UPat(Ops.CALL, src=(UPat((Ops.PROGRAM, Ops.SINK), name="ast"),), name="call", allow_any_len=True), exec_kernel),
   (UPat(Ops.CALL, src=(UPat(Ops.CUSTOM_FUNCTION, arg="encdec", name="ast"),), name="call", allow_any_len=True), exec_encdec),
   (UPat(Ops.CALL, src=(UPat(Ops.CUSTOM_FUNCTION, arg="graph", name="cf"),), name="call", allow_any_len=True), exec_graph),
 ])
 
+def compile_linear(linear:UOp, beam=0) -> UOp:
+  if (beam_val:=(beam or BEAM.value)) >= 1: linear = graph_rewrite(linear, pm_beam, ctx=beam_val, walk=True)
+  return graph_rewrite(linear, pm_compile, name="precompile kernels", walk=True) if not VALIDATE_WITH_CPU else linear
+
 def run_linear(linear:UOp, var_vals:dict[str, int]|None=None, input_uops:tuple[UOp, ...]=(), do_update_stats=True, jit=False):
-  if BEAM >= 1: linear = graph_rewrite(linear, pm_beam, ctx=BEAM.value, name="add beam")
+  if not jit: linear = compile_linear(linear)
   ctx = ExecContext(var_vals or {}, input_uops, do_update_stats, jit)
   for call in linear.src: pm_exec.rewrite(call, ctx)
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 326d42b20788f..2cc58cc68df34 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -232,7 +232,7 @@ def target(self, dev:str, **kwargs) -> Target:
 IMAGE, FLOAT16, OPENPILOT_HACKS = ContextVar("IMAGE", 0), ContextVar("FLOAT16", 0), ContextVar("OPENPILOT_HACKS", 0)
 JIT, JIT_BATCH_SIZE = ContextVar("JIT", 2 if OSX and ARCH_X86 else 1), ContextVar("JIT_BATCH_SIZE", 32)
 WINO, CAPTURING, TRACEMETA, NO_COLOR = ContextVar("WINO", 0), ContextVar("CAPTURING", 1), ContextVar("TRACEMETA", 1), ContextVar("NO_COLOR", 0)
-USE_TC, TC_SELECT, TC_OPT, AMX = ContextVar("TC", 1), ContextVar("TC_SELECT", -1), ContextVar("TC_OPT", 0), ContextVar("AMX", 0)
+USE_TC, TC_SELECT, TC_OPT = ContextVar("TC", 1), ContextVar("TC_SELECT", -1), ContextVar("TC_OPT", 0)
 TRANSCENDENTAL, NOLOCALS = ContextVar("TRANSCENDENTAL", 1), ContextVar("NOLOCALS", 0)
 SPLIT_REDUCEOP, NO_MEMORY_PLANNER, LRU = ContextVar("SPLIT_REDUCEOP", 1), ContextVar("NO_MEMORY_PLANNER", 0), ContextVar("LRU", 1)
 RING, ALL2ALL, ALLREDUCE_CAST = ContextVar("RING", 1), ContextVar("ALL2ALL", 0), ContextVar("ALLREDUCE_CAST", 1)
diff --git a/tinygrad/mixin/__init__.py b/tinygrad/mixin/__init__.py
index cb32ded72e9df..f63fb147a044d 100644
--- a/tinygrad/mixin/__init__.py
+++ b/tinygrad/mixin/__init__.py
@@ -6,7 +6,7 @@
 from tinygrad.mixin.reduce import ReduceMixin
 from tinygrad.uop import Ops
 from tinygrad.uop.ops import _broadcast_shape, resolve, smax, smin, identity_element
-from tinygrad.dtype import ConstType, DTypeLike, Invalid, dtypes, least_upper_dtype, sum_acc_dtype, to_dtype
+from tinygrad.dtype import ConstType, DTypeLike, Invalid, InvalidType, PtrDType, dtypes, least_upper_dtype, sum_acc_dtype, to_dtype
 from tinygrad.helpers import argfix, ceildiv, flatten, flat_to_grouped, make_tuple, prod, resolve_pool_pads, round_up
 
 if TYPE_CHECKING:
@@ -200,15 +200,25 @@ def _pad_constant(self, pX, value:float) -> Self:
     if value == 0: return MovementMixin.pad(X, pads)
     return MovementMixin.pad(X, pads) + MovementMixin.pad(X.ones_like(), pads).cast(dtypes.bool).where(0, value)
 
+  def _ufix_keep_dtype(self, x) -> bool:
+    # matches Tensor scalar-wrapping behavior: keep self.dtype for float self, or for int self with int/Invalid scalar
+    return dtypes.is_float(self.dtype) or (dtypes.is_int(self.dtype) and isinstance(x, (int, InvalidType)))
+
   def _broadcasted(self, y, reverse=False) -> tuple[Self, Self]:
     if not isinstance(y, type(self)): y = self.ufix(y)
     x, y = (self, y) if not reverse else (y, self)
+    # ValueError: unsized ptr has shape (-1,) which can't broadcast; RuntimeError: shape mismatch
     try:
       out_shape = _broadcast_shape(x.shape, y.shape)
       x, y = x._broadcast_to(out_shape), y._broadcast_to(out_shape)
-    except RuntimeError: pass
-    out_dtype = least_upper_dtype(x.dtype, y.dtype)
-    return x.cast(out_dtype), y.cast(out_dtype)
+    except (RuntimeError, ValueError): pass
+    # ptr dtypes aren't in the promo lattice
+    if x.dtype == y.dtype or any(isinstance(d, PtrDType) for d in (x.dtype, y.dtype)): return x, y
+    return x.cast(out_dtype := least_upper_dtype(x.dtype, y.dtype)), y.cast(out_dtype)
+
+  def _binop(self, op:Ops, x, reverse:bool) -> Self:
+    lhs, rhs = self._broadcasted(x, reverse)
+    return lhs.alu(op, rhs)
 
   def dot(self, w:Self, dtype:DTypeLike|None=None) -> Self:
     """
diff --git a/tinygrad/renderer/__init__.py b/tinygrad/renderer/__init__.py
index 4886ccf8b38e9..cc3f66e520d3a 100644
--- a/tinygrad/renderer/__init__.py
+++ b/tinygrad/renderer/__init__.py
@@ -67,6 +67,7 @@ class ProgramSpec:
   src:str
   device:str
   ast:UOp  # save the base ast (this is method cache key)
+  prg:UOp|None=None
   uops:list[UOp]|None=None
   lib:bytes|None=None
   aux:list=field(default_factory=list)
@@ -127,7 +128,7 @@ def from_uop(prg:UOp) -> ProgramSpec:
         if special_size is not None: special_size[int(u.arg[-1])] = cast(int, u.src[0].ssimplify())
       if u.op is Ops.DEFINE_VAR and u.arg[0] == 'core_id': global_size[0] = u.arg[2] + 1
 
-    return ProgramSpec(sink.arg.name, source.arg, device.arg, sink, uops, lib, list(prg.arg) if prg.arg else [], global_size, local_size,
+    return ProgramSpec(sink.arg.name, source.arg, device.arg, sink, prg, uops, lib, list(prg.arg) if prg.arg else [], global_size, local_size,
                        sorted(_vars, key=lambda v: v.arg), sorted(dedup(_globals)), sorted(dedup(outs)), sorted(dedup(ins)))
 
 class Renderer:
diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index 98b3118d50907..cc4ba8622206f 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -3,7 +3,7 @@
 from collections import defaultdict, Counter
 from tinygrad.codegen.opt import tc
 from tinygrad.uop.ops import GroupOp, Ops, UOp, PatternMatcher, UPat, range_str, axis_letters
-from tinygrad.helpers import strip_parens, getenv, prod, dedup, Target, AMX, CPU_COUNT
+from tinygrad.helpers import strip_parens, getenv, prod, dedup, Target, CPU_COUNT
 from tinygrad.dtype import ImageDType, dtypes, DType, PtrDType, AddrSpace, truncate, float_to_bf16
 from tinygrad.renderer import Renderer
 from tinygrad.codegen.late.devectorizer import no_vectorized_alu
@@ -226,7 +226,6 @@ class ClangRenderer(CStyleLanguage):
   global_max = (CPU_COUNT.value, 0, 0)
   infinity = "__builtin_inff()"
   nan = '__builtin_nanf("")'
-  if AMX: tensor_cores = tc.amx
 
   # language options
   buffer_suffix = " restrict"
@@ -280,7 +279,8 @@ class ClangJITRenderer(ClangRenderer):
   def __init__(self, target:Target):
     super().__init__(target)
     from tinygrad.runtime.support.compiler_cpu import ClangJITCompiler
-    self.compiler = ClangJITCompiler(target.arch)
+    if "AMX" in target.arch: self.tensor_cores = tc.amx
+    self.compiler = ClangJITCompiler([x for x in target.arch.split(",") if x != "AMX"])
 
 class OpenCLRenderer(CStyleLanguage):
   has_aux = True
diff --git a/tinygrad/renderer/llvmir.py b/tinygrad/renderer/llvmir.py
index d21ef31319690..e7b2d1b358c97 100644
--- a/tinygrad/renderer/llvmir.py
+++ b/tinygrad/renderer/llvmir.py
@@ -6,7 +6,7 @@
 from tinygrad.uop.decompositions import xexp2, xlog2
 from tinygrad.uop.ops import UOp, PatternMatcher, UPat, Ops, GroupOp, range_str
 from tinygrad.dtype import dtypes, float_to_fp8, DType, PtrDType, truncate
-from tinygrad.helpers import prod, Target, AMX, CPU_COUNT, getenv
+from tinygrad.helpers import prod, Target, CPU_COUNT, getenv
 
 def ldt(dt:DType):
   if dt.vcount > 1: return f"<{dt.vcount} x {ldt(dt.scalar())}>"
@@ -134,7 +134,6 @@ class LLVMRenderer(Renderer):
   abi: str | None
   string_rewrite: PatternMatcher
   code_for_op = {k:lambda:None for v in lop.values() for k in v.keys()}
-  if AMX: tensor_cores = tc.amx
 
   extra_matcher = create_non_native_float_pats((dtypes.bfloat16,)) + pm_manual_bf16_cast
   def _render_fn(self, name:str, args:list[tuple[str,DType]], kernel:list[str], prefix:list[str]|None=None) -> str:
@@ -149,7 +148,7 @@ def _render_kernel(self, uops: list[UOp], prefix:list[str]|None=None) -> tuple[t
 
     local_args: list[str] = []
     for u in uops:
-      if AMX and u.op is Ops.WMMA: # prealloc aux buffers as AMX can only load from memory
+      if self.tensor_cores == tc.amx and u.op is Ops.WMMA: # prealloc aux buffers as AMX can only load from memory
         vc += 1
         r[u] = f"%wmma{vc}"
         for i, dtype in enumerate(u.arg[2].vec(sz) for sz in [prod(size for _, size in upcast) for upcast in u.arg[6]]):
@@ -204,7 +203,8 @@ def _render_footer(self, uops: list[UOp]) -> str: return 'attributes #0 = { alwa
   def __init__(self, target:Target):
     super().__init__(target)
     from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler
-    self.compiler = CPULLVMCompiler(target.arch)
+    if "AMX" in target.arch: self.tensor_cores = tc.amx
+    self.compiler = CPULLVMCompiler([x for x in target.arch.split(",") if x != "AMX"])
 
 barrier = 'fence syncscope("workgroup") release\ntail call void @llvm.amdgcn.s.barrier()\nfence syncscope("workgroup") acquire\n'
 code_for_workitem = {"g": lambda x: f"tail call i32 @llvm.amdgcn.workgroup.id.{chr(120+int(x))}()",
diff --git a/tinygrad/runtime/graph/cuda.py b/tinygrad/runtime/graph/cuda.py
index 48292175de481..5a23e74a63f23 100644
--- a/tinygrad/runtime/graph/cuda.py
+++ b/tinygrad/runtime/graph/cuda.py
@@ -1,72 +1,73 @@
 import ctypes
 from typing import Any, cast
 import tinygrad.runtime.autogen.cuda as cuda
-from tinygrad.helpers import dedup
 from tinygrad.runtime.support.c import init_c_var
-from tinygrad.device import Buffer, Device
+from tinygrad.device import Device, MultiBuffer
+from tinygrad.uop.ops import Ops
 from tinygrad.runtime.ops_cuda import CUDADevice, check, encode_args, cu_time_execution
-from tinygrad.engine.realize import BufferXfer, CompiledRunner
-from tinygrad.engine.jit import MultiGraphRunner, GraphException
+from tinygrad.engine.realize import get_runner, unwrap_multi, resolve_params
+from tinygrad.engine.jit import MultiGraphRunner
 
 class CUDAGraph(MultiGraphRunner):
-  def __init__(self, *args, **kwargs):
-    super().__init__(*args, **kwargs)
-
-    # Check all jit items are compatible.
-    if not all(isinstance(ji.prg, (CompiledRunner, BufferXfer)) for ji in self.jit_cache): raise GraphException
-
-    self.jc_idx_with_updatable_bufs = dedup([x[0] for x in self.input_replace.keys()])
-    self.updatable_nodes: dict[int, tuple[Any, Any, Any, bool]] = {} # dict[jc index] = tuple(graph node, node params, input kernel params, is memcpy)
+  def __init__(self, linear, input_buffers, input_uops=()):
+    super().__init__(linear, input_buffers, input_uops)
 
+    self.nodes: list[tuple[Any, ...]] = [] # list of tuple(graph node, node params, c_args/context, is memcpy, replace, dev_idx)
     self.graph = init_c_var(cuda.CUgraph, lambda x: check(cuda.cuGraphCreate(ctypes.byref(x), 0)))
 
-    for j,ji in enumerate(self.jit_cache):
-      if isinstance(ji.prg, CompiledRunner):
-        global_size, local_size = ji.prg.p.launch_dims({v: 0 for v in self.vars})
+    for call in self.linear.src:
+      replace = [(p, b.arg) for p, b in enumerate(b for b in call.src[1:] if b.op is not Ops.BIND) if b.op is Ops.PARAM]
+      for dev_idx, (bufs, device_vars) in enumerate(unwrap_multi(call, resolve_params(call, input_uops))):
+        for b in bufs: b.ensure_allocated()
+        if call.src[0].op in (Ops.SINK, Ops.PROGRAM):
+          prg = get_runner(bufs[0].device, call.src[0])
+          global_size, local_size = prg.p.launch_dims({v: 0 for v in self.vars})
 
-        new_node = cuda.CUgraphNode()
-        deps = self._access_resources([x.base for x in ji.bufs if x is not None], ji.prg.p.outs, new_dependency=new_node)
-        c_deps = (cuda.CUgraphNode*len(deps))(*deps) if deps else None
+          c_deps, new_node = self.new_node([b.base for b in bufs], prg.p.outs)
+          c_args, vargs = encode_args([b._buf for b in bufs], [device_vars.get(x.expr, 0) for x in prg.p.vars])
+          kern_params = cuda.CUDA_KERNEL_NODE_PARAMS_v1(prg._prg.prg, *global_size, *local_size, 0,
+                                                        ctypes.cast(0, ctypes.POINTER(ctypes.c_void_p)), vargs)
+          check(cuda.cuGraphAddKernelNode(ctypes.byref(new_node), self.graph, c_deps, len(c_deps or []), ctypes.byref(kern_params)))
 
-        c_args, vargs = encode_args([cast(Buffer, x)._buf for x in ji.bufs], [ji.fixedvars.get(x.expr, 0) for x in ji.prg.p.vars])
-        kern_params = cuda.CUDA_KERNEL_NODE_PARAMS_v1(ji.prg._prg.prg, *global_size, *local_size, 0, ctypes.cast(0, ctypes.POINTER(ctypes.c_void_p)),
-                                                      vargs)
-        check(cuda.cuGraphAddKernelNode(ctypes.byref(new_node), self.graph, c_deps, len(deps), ctypes.byref(kern_params)))
+          self.nodes.append((new_node, kern_params, c_args, False, replace, dev_idx))
+        elif call.src[0].op is Ops.COPY:
+          dest, src = bufs[0], bufs[1]
+          src_dev = cast(CUDADevice, Device[src.device])
+          c_deps, new_node = self.new_node([dest.base, src.base], [0])
+          cp_params = cuda.CUDA_MEMCPY3D_v2(srcMemoryType=cuda.CU_MEMORYTYPE_DEVICE, srcDevice=src._buf, srcPitch=src.nbytes, srcHeight=1,
+                                            dstMemoryType=cuda.CU_MEMORYTYPE_DEVICE, dstDevice=dest._buf, dstPitch=dest.nbytes, dstHeight=1,
+                                            WidthInBytes=dest.nbytes, Height=1, Depth=1)
+          check(cuda.cuGraphAddMemcpyNode(ctypes.byref(new_node), self.graph, c_deps, len(c_deps or []), ctypes.byref(cp_params), src_dev.context))
 
-        if j in self.launch_dims_replace or j in self.var_vals_replace or j in self.jc_idx_with_updatable_bufs:
-          self.updatable_nodes[j] = (new_node, kern_params, c_args, False)
-      elif isinstance(ji.prg, BufferXfer):
-        dest, src = [cast(Buffer, x) for x in ji.bufs[0:2]]
-        src_dev = cast(CUDADevice, Device[src.device])
-        node_from = cuda.CUgraphNode()
-        deps = self._access_resources(bufs=[dest.base, src.base], write=[0], new_dependency=node_from)
-        c_deps = (cuda.CUgraphNode*len(deps))(*deps) if deps else None
-        cp_params = cuda.CUDA_MEMCPY3D_v2(srcMemoryType=cuda.CU_MEMORYTYPE_DEVICE, srcDevice=src._buf, srcPitch=src.nbytes, srcHeight=1,
-                                          dstMemoryType=cuda.CU_MEMORYTYPE_DEVICE, dstDevice=dest._buf, dstPitch=dest.nbytes, dstHeight=1,
-                                          WidthInBytes=dest.nbytes, Height=1, Depth=1)
-        check(cuda.cuGraphAddMemcpyNode(ctypes.byref(node_from), self.graph, c_deps, len(deps), ctypes.byref(cp_params), src_dev.context))
-        if j in self.jc_idx_with_updatable_bufs: self.updatable_nodes[j] = (node_from, cp_params, src_dev.context, True)
+          self.nodes.append((new_node, cp_params, src_dev.context, True, [x for x in replace if x[0] < 2], dev_idx))
 
     self.instance = init_c_var(cuda.CUgraphExec, lambda x: check(cuda.cuGraphInstantiate_v2(ctypes.byref(x), self.graph, None, None, 0)))
+    self.updatable = sorted(set(j for j,n in enumerate(self.nodes) if n[4]) | self.var_vals_replace.keys() | self.launch_dims_replace.keys())
+
+  def new_node(self, bufs, write):
+    deps = self._access_resources(bufs, write, new_dependency=(node:=cuda.CUgraphNode()))
+    return (cuda.CUgraphNode*len(deps))(*deps) if deps else None, node
 
-  def __call__(self, input_buffers: list[Buffer], var_vals: dict[str, int], wait=False) -> float|None:
+  def __call__(self, input_buffers, var_vals, wait=False, input_uops=None):
     # Update buffers in the c_args struct.
-    for (j,i),input_idx in self.input_replace.items():
-      if not self.updatable_nodes[j][3]: setattr(self.updatable_nodes[j][2], f'f{i}', input_buffers[input_idx]._buf)
-      else:
-        if i == 0: self.updatable_nodes[j][1].destDevice = input_buffers[input_idx]._buf
-        elif i == 1: self.updatable_nodes[j][1].srcDevice = input_buffers[input_idx]._buf
+    for j in self.updatable:
+      _, params, c_args, is_copy, replace, dev_idx = self.nodes[j]
+      for pos, iidx in replace:
+        buf = b.bufs[dev_idx] if isinstance(b:=input_uops[iidx].buffer, MultiBuffer) else b
+        if not is_copy: setattr(c_args, f'f{pos}', buf._buf)
+        else: setattr(params, 'srcDevice' if pos == 1 else 'dstDevice', buf._buf)
 
     # Update var_vals in the c_args struct.
-    for j, i, v in self.updated_vars(var_vals): setattr(self.updatable_nodes[j][2], f'v{i}', v)
+    for j, i, v in self.updated_vars(var_vals): setattr(self.nodes[j][2], f'v{i}', v)
 
     # Update launch dims in the kern_params struct.
     for j, global_dims, local_dims in self.updated_launch_dims(var_vals):
-      node = self.updatable_nodes[j][1]
+      node = self.nodes[j][1]
       node.blockDimX, node.blockDimY, node.blockDimZ, node.gridDimX, node.gridDimY, node.gridDimZ = *local_dims, *global_dims # type: ignore[misc]
 
     # Update graph nodes with the updated structs.
-    for node, c_node_params, c_args, is_copy in self.updatable_nodes.values():
+    for j in self.updatable:
+      node, c_node_params, c_args, is_copy, _, _ = self.nodes[j]
       if not is_copy: check(cuda.cuGraphExecKernelNodeSetParams(self.instance, node, ctypes.byref(c_node_params)))
       else: check(cuda.cuGraphExecMemcpyNodeSetParams(self.instance, node, ctypes.byref(c_node_params), c_args))
 
diff --git a/tinygrad/runtime/graph/hcq.py b/tinygrad/runtime/graph/hcq.py
index 77ce6c8faa1aa..d2ad964faceec 100644
--- a/tinygrad/runtime/graph/hcq.py
+++ b/tinygrad/runtime/graph/hcq.py
@@ -258,7 +258,7 @@ def _resolve_deps(self, bufs, outs, enqueue_queue, enqueue_dev, out_signal, j, i
 
   def _dev_copy_queues(self, dev): return [q for (d, _), q in self.copy_queues.items() if d == dev]
 
-  def __call__(self, input_buffers: list[Buffer], var_vals: dict[str, int], wait=False) -> float|None:
+  def __call__(self, input_buffers: list[Buffer], var_vals: dict[str, int], wait=False, input_uops=None) -> float|None:
     # Map input buffers
     for dev in self.devices:
       for idx_to_map in self.input_replace_map[dev]: cast(HCQAllocator, dev.allocator).map(input_buffers[idx_to_map]._buf)
diff --git a/tinygrad/runtime/graph/metal.py b/tinygrad/runtime/graph/metal.py
index c881bdd19703e..d246f46b5e349 100644
--- a/tinygrad/runtime/graph/metal.py
+++ b/tinygrad/runtime/graph/metal.py
@@ -51,7 +51,7 @@ def __init__(self, *args, **kwargs):
     for var in self.fixedvars: self.int_buf_view[self.varlist.index(var)] = self.fixedvars[var]
     self.range = metal.NSRange(0, len(self.jit_cache))
 
-  def __call__(self, input_buffers: list[Buffer], var_vals: dict[str, int], wait=False) -> float|None:
+  def __call__(self, input_buffers: list[Buffer], var_vals: dict[str, int], wait=False, input_uops=None) -> float|None:
     if self.command_buffer is not None and self.command_buffer in self.dev.mtl_buffers_in_flight: wait_check(self.command_buffer)
     # NOTE: old command buffer may not be inflight anymore
     if self.command_buffer is not None and PROFILE: self.collect_timestamps()
diff --git a/tinygrad/runtime/ops_null.py b/tinygrad/runtime/ops_null.py
index 9d10cc1049bd7..b068e3a405130 100644
--- a/tinygrad/runtime/ops_null.py
+++ b/tinygrad/runtime/ops_null.py
@@ -27,7 +27,7 @@ def _transfer(self, dest, src, sz:int, src_dev, dest_dev):
   def _offset(self, buf, offset:int, size:int): pass
 
 class NullGraph(MultiGraphRunner):
-  def __call__(self, input_buffers, var_vals, wait=False) -> float|None: return 1e-1
+  def __call__(self, input_buffers, var_vals, wait=False, input_uops=None) -> float|None: return 1e-1
 
 class NullDevice(Compiled):
   def __init__(self, device:str):
diff --git a/tinygrad/runtime/support/compiler_cpu.py b/tinygrad/runtime/support/compiler_cpu.py
index d468ef668eb79..2777d25b4880d 100644
--- a/tinygrad/runtime/support/compiler_cpu.py
+++ b/tinygrad/runtime/support/compiler_cpu.py
@@ -5,17 +5,17 @@
 from tinygrad.runtime.autogen import llvm
 
 class ClangJITCompiler(Compiler):
-  def __init__(self, arch, cachekey="compile_clang_jit"):
-    self.arch, cpu, feats = (sp:=arch.split(',', 2)) + [""] * (3 - len(sp))
+  def __init__(self, arch:list[str], cachekey="compile_clang_jit"):
+    self.arch, cpu, *feats = arch
     assert self.arch and cpu, f"invalid arch string: {arch!r}, expected '<arch>,<cpu>,[<feats>]' (eg. 'x86_64,znver2')"
     match self.arch:
-      case "x86_64": self.args = [f"-march={cpu}"] + [f"-mno{f}" if f.startswith("-") else f"-m{f}" for f in feats.split(',') if f]
+      case "x86_64": self.args = [f"-march={cpu}"] + [f"-mno{f}" if f.startswith("-") else f"-m{f}" for f in feats]
       # on arm march means "runs on this arch and superset" instead of "optimize for this arch". x86 march == arm mcpu
       # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm
-      case "arm64": self.args = ["-ffixed-x18", "-mcpu=" + "+".join([cpu] + ["no"+f[1:] if f.startswith("-") else f for f in feats.split(',') if f])]
-      case "riscv64": self.args = ["-march=" + "_".join(["rv64g" if cpu == "native" else cpu] + [f for f in feats.split(',') if f])]
+      case "arm64": self.args = ["-ffixed-x18", "-mcpu=" + "+".join([cpu] + ["no"+f[1:] if f.startswith("-") else f for f in feats])]
+      case "riscv64": self.args = ["-march=" + "_".join(["rv64g" if cpu == "native" else cpu] + feats)]
       case _: raise RuntimeError(f"unsupported arch: {self.arch!r}")
-    super().__init__(f"{cachekey}_{arch}")
+    super().__init__(f"{cachekey}_{'_'.join(arch)}")
 
   def compile_to_obj(self, src:str) -> bytes:
     """Compile C source to ELF object file (before linking)."""
@@ -91,14 +91,14 @@ def compile(self, src:str) -> bytes: return jit_loader(self.compile_to_obj(src))
 
 
 class CPULLVMCompiler(LLVMCompiler):
-  def __init__(self, arch, cache_key=None):
-    self.arch, cpu, feats = (sp:=arch.split(',', 2)) + [""] * (3 - len(sp))
+  def __init__(self, arch:list[str], cache_key=None):
+    self.arch, cpu, *feats = arch
     assert self.arch and cpu, f"invalid arch string: {arch!r}, expected '<arch>,<cpu>,[<feats>]' (eg. 'x86_64,znver2')"
-    feats = ','.join(f if f.startswith('-') else '+'+f for f in feats.split(',') if f)
+    featstr = ','.join(f if f.startswith('-') else '+'+f for f in feats)
     if cpu == "native":
       cpu = ctypes.string_at(llvm.LLVMGetHostCPUName()).decode()
-      feats = (feats + "," if feats else "") + ctypes.string_at(llvm.LLVMGetHostCPUFeatures()).decode()
+      featstr = (featstr + "," if featstr else "") + ctypes.string_at(llvm.LLVMGetHostCPUFeatures()).decode()
     # +reserve-x18 here does the same thing as -ffixed-x18 in ClangJITCompiler, see comments there for why it's needed on arm osx
-    super().__init__(self.arch, cpu, ('+reserve-x18,' if self.arch == "arm64" else '') + feats, cache_key)
+    super().__init__(self.arch, cpu, ('+reserve-x18,' if self.arch == "arm64" else '') + featstr, cache_key)
 
   def disassemble(self, lib:bytes): capstone_flatdump(lib, self.arch)
diff --git a/tinygrad/runtime/support/compiler_mesa.py b/tinygrad/runtime/support/compiler_mesa.py
index 2479bf0b1f902..922605a9f15f8 100644
--- a/tinygrad/runtime/support/compiler_mesa.py
+++ b/tinygrad/runtime/support/compiler_mesa.py
@@ -17,7 +17,7 @@ def deserialize(enc_src, opts):
   return mesa.nir_deserialize(None, ctypes.cast(opts, ctypes.POINTER(mesa.nir_shader_compiler_options)), blobreader)
 
 class LVPCompiler(CPULLVMCompiler):
-  def __init__(self, arch): CPULLVMCompiler.__init__(self, arch, cache_key="compile_lvp")
+  def __init__(self, arch): CPULLVMCompiler.__init__(self, arch.split(","), cache_key="compile_lvp")
 
   def compile(self, src) -> bytes:
     shader, ctx = deserialize(src, mesa.lvp_nir_options), llvm.LLVMGetGlobalContext()
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index c93ab490eaa61..68d1122d39888 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -5,7 +5,7 @@
 from typing import Any, Callable, ClassVar, Sequence, cast, get_args, Literal, ParamSpec, TypeVar, Generic, TYPE_CHECKING
 if TYPE_CHECKING: import numpy
 from tinygrad.dtype import DType, DTypeLike, dtypes, ConstType, least_upper_float, least_upper_dtype, to_dtype, truncate
-from tinygrad.dtype import _from_np_dtype, _to_np_dtype, PyConst, Invalid, InvalidType
+from tinygrad.dtype import _from_np_dtype, _to_np_dtype, PyConst, Invalid
 from tinygrad.helpers import argfix, make_tuple, flatten, prod, all_int, round_up, getenv, all_same, fully_flatten, ceildiv, fetch, flat_to_grouped
 from tinygrad.helpers import resolve_pool_pads, IMAGE, FLOAT16, WINO, Metadata, TRACEMETA, is_numpy_ndarray, TracingKey, cpu_profile
 from tinygrad.helpers import suppress_finalizing, disable_gc
@@ -169,10 +169,7 @@ def _apply_uop(self, fxn:Callable[..., UOp], *x:Tensor, extra_args=(), **kwargs)
     all_tensors[weakref.ref(ret)] = None
     return ret
 
-  # _binop, alu, and const_like are used by the mixins
-  def _binop(self, op, x, reverse):
-    lhs,rhs = self._broadcasted(x, reverse)
-    return lhs._apply_uop(lambda *u: u[0].alu(op, *u[1:]), rhs)
+  # alu and const_like are used by the mixins
   def alu(self, op: Ops, *src: Tensor) -> Tensor: return self._apply_uop(lambda *u: u[0].alu(op, *u[1:]), *src)
   def const_like(self, b:ConstType) -> Tensor: return Tensor(self.uop.const_like(b), requires_grad=False)
   @staticmethod
@@ -1860,8 +1857,7 @@ def contiguous(self, *args, **kwargs) -> Tensor:
   def ufix(self, x) -> Tensor:
     # TODO: x:ConstType|UOp does not work because mixin only accepts Self | ConstType
     assert isinstance(x, (*get_args(ConstType), UOp)), f"{type(x)=}, {x=}"
-    dtype = self.dtype if dtypes.is_float(self.dtype) or (dtypes.is_int(self.dtype) and isinstance(x, (int, InvalidType))) else None
-    return Tensor(x, self.device, dtype, requires_grad=False)
+    return Tensor(x, self.device, self.dtype if self._ufix_keep_dtype(x) else None, requires_grad=False)
 
   def div(self, x:Tensor|ConstType|UOp, reverse=False, rounding_mode:Literal["trunc", "floor"]|None=None) -> Tensor:
     """
diff --git a/tinygrad/uop/ops.py b/tinygrad/uop/ops.py
index bee6701161924..5093bf089506d 100644
--- a/tinygrad/uop/ops.py
+++ b/tinygrad/uop/ops.py
@@ -27,7 +27,7 @@ def __repr__(self): return str(self)
                AxisType.GROUP_REDUCE: 2, AxisType.REDUCE: 4, AxisType.UNROLL: 5}
 
 range_start = {Ops.BUFFERIZE: 1, Ops.REDUCE: 1, Ops.STORE: 2, Ops.WMMA: 3, Ops.END: 1, Ops.CALL: 1, Ops.FUNCTION: 1,
-               Ops.COPY: 2, Ops.BUFFER_VIEW: 1}
+               Ops.COPY: 2, Ops.BUFFER_VIEW: 1, Ops.LINEAR: 0}
 
 # https://en.wikipedia.org/wiki/Identity_element
 def identity_element(op:Ops, dt:DType) -> PyConst: return dt.const({Ops.ADD:0, Ops.MUL:1, Ops.MAX:dt.min}[op])
@@ -439,10 +439,13 @@ def __getitem__(self, idx):
       perm = src.permute(tuple([i for i in range(src.ndim) if i not in slice_idx] + slice_idx))
       return perm.index(*non_slice_args, ptr=True)
     return self.index(*[UOp.const(dtypes.weakint, x) if isinstance(x, int) else x for x in idx])
-  def const_like(self, b:ConstLike):
+  def const_like(self, b:ConstLike, dtype:DType|None=None):
     # constants can optionally have a DEVICE source
-    ret = UOp.const(self.dtype.base, b, device=self._device, shape=self.shard_shape if self.axis is not None else self._shape)
+    ret = UOp.const(dtype or self.dtype.base, b, device=self._device, shape=self.shard_shape if self.axis is not None else self._shape)
     return ret.multi(self.axis) if self.axis is not None else ret
+  def ufix(self, x):
+    if isinstance(x, UOp): return x
+    return self.const_like(x, None if self._ufix_keep_dtype(x) else dtypes.from_py(x).vec(self.dtype.vcount))
   def broadcast(self, count:int):
     assert self.dtype.vcount == 1
     if count == 1: return self
@@ -1101,6 +1104,9 @@ def after(self, *src:UPat, **kwargs): return UPat(Ops.AFTER, self.match_dtype, (
   def end(self, *src:UPat, **kwargs): return UPat(Ops.END, self.match_dtype, (self,)+src, **kwargs)
 
   def const_like(self, b:ConstLike): return UPat.const(self.match_dtype, cast(ConstType, b))
+  # UPat patterns are built with `upat + 1`-style operators; don't insert CAST nodes like _broadcasted does
+  def _binop(self, op:Ops, x, reverse:bool) -> UPat:
+    return self.ufix(x).alu(op, self) if reverse else self.alu(op, self.ufix(x))
   def alu(self, op:Ops, *src:UPat):
     asrc = (self,)+src
     return UPat(op, dtypes.bool if op in {Ops.CMPLT, Ops.CMPNE} else asrc[-1].match_dtype, list(asrc) if op in GroupOp.Commutative else asrc)
diff --git a/tinygrad/uop/symbolic.py b/tinygrad/uop/symbolic.py
index 4f8aeb358fb88..673c3f668541e 100644
--- a/tinygrad/uop/symbolic.py
+++ b/tinygrad/uop/symbolic.py
@@ -91,6 +91,9 @@ def fold_add_divmod_recombine(x:UOp) -> UOp|None:
   (UPat.var("x", dtype=dtypes.bool).logical_not().logical_not(), lambda x: x),
   (UPat.var("x", dtype=dtypes.bool).where(UPat.const(dtypes.bool, True), UPat.const(dtypes.bool, False)), lambda x: x),
   (UPat.var("x", dtype=dtypes.bool).where(UPat.const(dtypes.bool, False), UPat.const(dtypes.bool, True)), lambda x: x.logical_not()),
+  # CAST(bool -> int) != const — CAST(True)=1, CAST(False)=0, so fold based on const value
+  (UPat.var("x", dtype=dtypes.bool).cast(dtypes.ints+(dtypes.weakint,)) != UPat.cvar("c", vec=False),
+   lambda x,c: x if c.arg == 0 else x.logical_not() if c.arg == 1 else x.const_like(True)),
   (UPat.var("x", dtype=dtypes.ints+(dtypes.bool, dtypes.weakint)).trunc(), lambda x: x),
   # ** zero folding **
   (UPat.var("x") < UPat.var("x"), lambda x: x.const_like(False).cast(dtypes.bool.vec(x.dtype.count))), # x < x -> False
@@ -392,7 +395,8 @@ def can_move(c:UOp) -> bool:
   moved, keep = partition([c for c in where_clauses if c not in in_load], can_move)
   if len(keep) == len(where_clauses): return None
   idx = buf.index(idx.get_idx().valid(load_valid.uprod(*moved)))
-  return UOp.const(dtypes.bool, True).uprod(*keep).where(idx.cast(or_cast.dtype) if or_cast.op is Ops.CAST else idx, 0)
+  ret_idx = idx.cast(or_cast.dtype) if or_cast.op is Ops.CAST else idx
+  return UOp.const(dtypes.bool, True).uprod(*keep).where(ret_idx, ret_idx.const_like(0))
 
 # where after gated load becomes alt value, TODO: this is sort of duplicated with rules in devectorizer
 pm_move_where_on_load = PatternMatcher([