Grillcheese-AI · grillcheese123 · Apr 7, 2026 · Mar 31, 2026 · Mar 31, 2026 · Apr 2, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -23,6 +23,15 @@ jobs:
       - name: Lint
         run: ruff check .
 
+      - name: Guard deprecated Compute usage in runtime paths
+        run: |
+          hits="$(rg -n "from grilly import Compute|from \.\.backend\.compute import Compute|from \.compute import Compute|Compute\(" functional nn/module.py utils/tensor_conversion.py --glob "*.py")" && rc=0 || rc=$?  # rg exit codes: 0 = match, 1 = no match; anything else = rg failed or is not installed
+          if [ "$rc" -ne 1 ]; then
+            echo "Deprecated Compute usage detected in runtime paths (or rg failed; exit code $rc):"
+            echo "$hits"
+            exit 1
+          fi
+
   test:
     runs-on: ubuntu-latest
     steps:

diff --git a/.gitignore b/.gitignore
@@ -201,3 +201,10 @@ docs/symp_former.md
 /third_party
 third_party/BLAKE3
 third_party/VulkanMemoryAllocator
+docs/BRIDGE_MIGRATION_CHECKLIST.md
+docs/GPU_OPTIMIZATION_REVIEW.md
+docs/PYTORCH_PARITY_STATUS.md
+docs/PYTORCH_PARITY_TASKLIST.md
+/build_verify_pybind
+/build_verify_pybind2
+docs/MIGRATION_PYTORCH.md
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,32 @@ This changelog follows the spirit of **Keep a Changelog** and uses the terms **A
 
 ---
 
+## [Unreleased]
+
+### Added
+
+- **`tests/sentencepiece/test_sentencepiece_parity.py`** — Adaptive SentencePiece parity tests vs Hugging Face references (T5/MT5-style tokenizers; auto-skip until `grilly.tokenizers` SentencePiece support is available).
+- **`tests/tokenizers/test_gpu_tokenizer_parity.py`** — Adaptive real tokenizer parity tests vs Hugging Face references (auto-skip until `grilly.tokenizers` lands and assets are available).
+- **`tests/sentence_transformers/`**, **`tests/transformers_compat/`**, **`tests/converter/`**, **`tests/autograd_chain/`**, **`tests/moe_quant/`** — CI-safe skip-marked scaffold suites to make pre-v1.0 roadmap targets executable and incrementally fillable.
+- **`docs/pre-v1.0/OPTIMIZATION_PARITY_TASKLIST.md`** — Post-upgrade pre-v1.0 optimization + feature parity execution plan (P0/P1/P2 workstreams, acceptance criteria, and verification commands).
+- **`backend/fnn_chain.py`** — `FnnChainRecorder` / `ChainBufferHandle`: batched `linear` / `relu` / `softmax` recording with `read()` / `read_multiple()` for one submit+wait (then one or many downloads); `VulkanCompute.record_commands(fnn_chain=True)` or `VulkanFNN.chain_record()`. See `docs/PERF_DISPATCH.md`.
+- **`tests/parity/`** — Numerical parity tests for `grilly.functional` (numpy reference; optional PyTorch `F.linear` / `F.relu` when `torch` is installed). See `tests/parity/README.md`.
+- **`tests/parity/test_optimizers_parity.py`** — SGD and Adam (CPU) stepping vs `torch.optim`.
+- **`docs/MIGRATION_PYTORCH.md`** — PyTorch → Grilly migration cookbook (device model, functional layout, module backend lifecycle, debugging).
+- **`docs/PERF_DISPATCH.md`** — Vulkan dispatch/batching (`record_commands`, async dispatch), Sequential fusion notes, pybind GIL checklist.
+
+### Changed
+
+- **`docs/pre-v1.0/OPTIMIZATION_PARITY_TASKLIST.md`** — Re-prioritized roadmap to feature-delivery order: GPU tokenizer -> sentence-transformers -> transformers compatibility -> PyTorch→Grilly converter, with supporting throughput track; expanded tokenizer interoperability to include SentencePiece compatibility targets.
+- **`utils/tensor_conversion.py`**, **`backend/base.py`** — `VulkanTensor.prepare_for_dispatch()` and C++ `Tensor.gpu_handle_if_valid()` binding so GPU-resident tensors reuse buffers in `BufferMixin._prepare_input` without redundant uploads when possible.
+- **`docs/PYTORCH_PARITY_TASKLIST.md`**, **`docs/PYTORCH_PARITY_STATUS.md`**, **`docs/GPU_OPTIMIZATION_REVIEW.md`** — Workstream C (kernel/throughput milestone) marked complete with scoped deliverables; follow-ups consolidated under **Workstream C — future**.
+- **`docs/api/functional.md`** — PyTorch parity notes and links to migration docs and parity tests.
+- **`backend/compute.py`** — `VulkanCompute` exposes `record_commands`, `dispatch_compute`, `dispatch_compute_async`, `wait_async`, `wait_fence`.
+- **`backend/fnn.py`** — `_linear_relu_recorded_chain`: Linear→ReLU in one submit when `fused-linear-relu` shader is absent.
+- **`cpp/python/bindings_{activations,attention,linear}.cpp`** — GIL release around heavy ops; `require_c_contiguous_float` on `linear` inputs.
+
+---
+
 ## [0.5.0] — 2026-03-18 — "GPU-First"
 
 ### Added

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -90,6 +90,10 @@ else()
 endif()
 
 # ── pybind11 ─────────────────────────────────────────────────────────────
+# Default to pybind11's FindPython mode unless the user already chose one; this silences "Using compatibility mode for Python, set PYBIND11_FINDPYTHON to NEW/OLD"
+if(NOT DEFINED PYBIND11_FINDPYTHON)
+    set(PYBIND11_FINDPYTHON NEW)
+endif()
 set(PYBIND11_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third_party/pybind11")
 if(EXISTS "${PYBIND11_DIR}/CMakeLists.txt")
     add_subdirectory("${PYBIND11_DIR}" pybind11)
@@ -214,6 +218,9 @@ add_library(grilly_core_lib STATIC
     cpp/src/ops/perceiver.cpp
     cpp/src/ops/perceiver_encoder.cpp
     cpp/src/ops/moqe_train.cpp
+    cpp/src/ops/moe_forward.cpp
+    cpp/src/ops/vsa_lm_forward.cpp
+    cpp/src/ops/prefix_scan.cpp
     cpp/src/shader_fusion.cpp
     # ── Experimental ──
     cpp/src/experimental/paged_latent_pool.cpp
@@ -243,6 +250,7 @@ add_library(grilly_core_lib STATIC
     cpp/src/nn/containers.cpp
     cpp/src/nn/optimizer.cpp
     cpp/src/nn/dataloader.cpp
+    cpp/src/io/grl_checkpoint.cpp
 )
 
 target_include_directories(grilly_core_lib PUBLIC
@@ -299,7 +307,11 @@ pybind11_add_module(grilly_core
     cpp/python/bindings_siglip.cpp
     cpp/python/bindings_perceiver.cpp
     cpp/python/bindings_moqe_train.cpp
+    cpp/python/bindings_moe.cpp
     cpp/python/bindings_fusion.cpp
+    cpp/python/bindings_vsa_lm.cpp
+    cpp/python/bindings_grl.cpp
+    cpp/python/bindings_prefix_scan.cpp
 )
 target_link_libraries(grilly_core PRIVATE grilly_core_lib)
 

diff --git a/README.md b/README.md
@@ -53,8 +53,8 @@ curl -sSL https://raw.githubusercontent.com/Grillcheese-AI/grilly/main/scripts/i
 On Colab:
 
 ```python
-# Recommended: fast mode for Colab (5 min instead of 30)
-!wget -qO- https://raw.githubusercontent.com/Grillcheese-AI/grilly/main/scripts/install.sh | bash -s -- --fast
+# Recommended: Colab mode (Vulkan 1.3 + fast build + NVIDIA ICD, ~5 min)
+!wget -qO- https://raw.githubusercontent.com/Grillcheese-AI/grilly/main/scripts/install.sh | bash -s -- --colab
 ```
 
 This installs system deps, downloads and builds Vulkan SDK 1.4, compiles the grilly C++ extension, and installs the Python package. The `--fast` flag builds only the components grilly needs (shaderc, loader, headers) and skips validation layers.

diff --git a/__init__.py b/__init__.py
@@ -19,11 +19,29 @@
 - Hippocampal transformer with capsule memory
 """
 
+# The compiled grilly_core extension sits next to this file. Editable (PEP 660)
+# installs route `import grilly` through a path hook that does NOT add this
+# directory to sys.path, so `import grilly_core` would fail and the Vulkan probe
+# in backend/base.py would silently report VULKAN_AVAILABLE=False. Add the
+# directory before any submodule import triggers the probe; append rather than
+# prepend so grilly's `utils`/`nn`/... never shadow same-named top-level modules.
+import os as _os
+import sys as _sys
+_pkg_dir = _os.path.dirname(_os.path.abspath(__file__))
+if _pkg_dir not in _sys.path:
+    _sys.path.append(_pkg_dir)
+
 import grilly.functional as functional
 import grilly.nn as nn
 import grilly.optim as optim
 import grilly.utils as utils
-from grilly.backend.base import VULKAN_AVAILABLE
+
+from . import torch_api
+from grilly.backend.base import (
+    VULKAN_AVAILABLE,
+    VULKAN_PYTHON_BINDINGS_AVAILABLE,
+    VULKAN_PYTHON_LEGACY_BACKEND_AVAILABLE,
+)
 from grilly.backend.capsule_transformer import (
     CapsuleMemory,
     CapsuleTransformerConfig,
@@ -65,6 +83,8 @@
 
 __all__ = [
     "VULKAN_AVAILABLE",
+    "VULKAN_PYTHON_BINDINGS_AVAILABLE",
+    "VULKAN_PYTHON_LEGACY_BACKEND_AVAILABLE",
     "VulkanCompute",
     "Compute",
     "SNNCompute",
@@ -79,6 +99,7 @@
     "functional",
     "optim",
     "utils",
+    "torch_api",
 ]
 
 # Conditionally add compatibility exports

diff --git a/backend/__init__.py b/backend/__init__.py
@@ -12,7 +12,11 @@
 - Bridge operations (continuous ↔ spike)
 """
 
-from .base import VULKAN_AVAILABLE
+from .base import (
+    VULKAN_AVAILABLE,
+    VULKAN_PYTHON_BINDINGS_AVAILABLE,
+    VULKAN_PYTHON_LEGACY_BACKEND_AVAILABLE,
+)
 from .capsule_transformer import (
     CapsuleMemory,
     CapsuleTransformerConfig,
@@ -26,6 +30,8 @@
 
 __all__ = [
     "VULKAN_AVAILABLE",
+    "VULKAN_PYTHON_BINDINGS_AVAILABLE",
+    "VULKAN_PYTHON_LEGACY_BACKEND_AVAILABLE",
     "VulkanCompute",
     "SNNCompute",
     "VulkanLearning",