RhizoNymph · maxsloef-goodfire · Jun 11, 2026
diff --git a/docs/design/capture_consumers.md b/docs/design/capture_consumers.md
@@ -117,7 +117,7 @@ VllmInternalRequestId = NewType("VllmInternalRequestId", str)
 CaptureKey = tuple[VllmInternalRequestId, int, str]
 # (request id, layer index, hook name)
 
-HookName = Literal["pre_attn", "post_attn", "post_mlp", "mlp_in", "mlp_out"]
+HookName = Literal["pre_attn", "post_attn", "post_block", "mlp_in", "mlp_out"]
 PositionSelector = (
     Literal["last_prompt", "all_prompt", "all_generated", "all"]
     | list[int]
@@ -678,7 +678,7 @@ Writer details (`writer.py`):
 - TP / PP / EP / DP are all accepted for the replicated residual hooks
   (no parallel-size rejection). See
   [Capture Consumers under Parallelism](capture_parallelism.md).
-- Every hook name is in `{pre_attn, post_attn, post_mlp, mlp_in,
+- Every hook name is in `{pre_attn, post_attn, post_block, mlp_in,
   mlp_out}`.
 - Every resolved layer is in `[0, num_hidden_layers)`, the **global**
   layer count (admission validates the full layer space; the runner then

diff --git a/docs/design/capture_parallelism.md b/docs/design/capture_parallelism.md
@@ -18,7 +18,7 @@ guide see [Capture Consumers](../features/capture_consumers.md).
 ## TL;DR
 
 - **The capturable hooks are replicated.** The three hooks that fire
-  today — `pre_attn`, `post_attn`, `post_mlp` — read the residual
+  today — `pre_attn`, `post_attn`, `post_block` — read the residual
   stream *after* the TP all-reduce and the MoE combine, so the tensor
   is full `hidden_size`, **byte-identical on every TP and every EP
   rank**. For these hooks, TP/EP support is a *rank gate*, not a
@@ -52,10 +52,10 @@ downstream of the reducing collectives:
   (`vllm/model_executor/layers/linear.py:1558-1559`), so attention- and
   MLP-output projections produce full `hidden_size` on every TP rank.
 - MoE paths all-gather/all-reduce before the residual add
-  (`vllm/model_executor/models/deepseek_v2.py:384`), so `post_mlp` on an
+  (`vllm/model_executor/models/deepseek_v2.py:384`), so `post_block` on an
   EP rank also sees the full residual.
 
-Hence `pre_attn` / `post_attn` / `post_mlp` are `[num_rows,
+Hence `pre_attn` / `post_attn` / `post_block` are `[num_rows,
 hidden_size]` and identical across the TP×EP plane of a PP stage.
 
 What is **genuinely sharded** (and not captured today):
@@ -166,7 +166,7 @@ declared `location` can select between them.
   filesystem consumer writes to a path keyed by **global** layer index
   + request_id on a **shared** mount.
 - The on-disk layout merges naturally: stage 0 writes
-  `…/req/12_post_mlp.bin`, stage 1 writes `…/req/40_post_mlp.bin`, no
+  `…/req/12_post_block.bin`, stage 1 writes `…/req/40_post_block.bin`, no
   collision. The `packed`/`sharded` layouts (one file per request / per
   tag) cannot merge by global layer index alone, so under PP each stage
   writes its **own** file keyed by stage rank

diff --git a/docs/features/capture_consumers.md b/docs/features/capture_consumers.md
@@ -290,7 +290,7 @@ llm = LLM(
     model="meta-llama/Llama-3-8B",
     capture_consumers=[
         {"name": "filesystem", "params": {"root": "/tmp/captures"}},
-        {"name": "logging", "params": {"hooks": {"post_mlp": [0]}}},
+        {"name": "logging", "params": {"hooks": {"post_block": [0]}}},
     ],
 )
 ```
@@ -323,7 +323,7 @@ sampling_params = SamplingParams(
         "filesystem": FilesystemCaptureRequest(
             request_id="probe_0001",
             tag="mnist-probe-v1",
-            hooks={"post_mlp": [12]},
+            hooks={"post_block": [12]},
             positions="last_prompt",
         ),
     },
@@ -355,7 +355,7 @@ response = httpx.post(
                 "filesystem": {
                     "request_id": "probe_train_0001",
                     "tag": "capital-probe",
-                    "hooks": {"post_mlp": [12, 16, 20, 24]},
+                    "hooks": {"post_block": [12, 16, 20, 24]},
                     "positions": "last_prompt",
                     "layout": "packed",
                 },
@@ -392,7 +392,7 @@ sampling_params = SamplingParams(
         "filesystem": FilesystemCaptureRequest(
             request_id="req1",
             tag="demo",
-            hooks={"post_mlp": [0]},
+            hooks={"post_block": [0]},
             positions="last_prompt",
         ),
     },
@@ -428,7 +428,7 @@ response body as `capture_results`, mirroring the structure above.
 ## Parallelism
 
 Capturing the residual-stream hooks (`pre_attn`, `post_attn`,
-`post_mlp`) is supported under **tensor, pipeline, expert, and data
+`post_block`) is supported under **tensor, pipeline, expert, and data
 parallelism** for worker-location consumers — including the built-in
 `filesystem` consumer. How it works:
 

diff --git a/docs/features/steering.md b/docs/features/steering.md
@@ -50,7 +50,7 @@ Also supported:
 - Global steering through HTTP endpoints
 - Per-request steering through `SamplingParams`
 - Three additive tiers (base / prefill-specific / decode-specific)
-- Three hook points: `pre_attn`, `post_attn`, `post_mlp`
+- Three hook points: `pre_attn`, `post_attn`, `post_block`
 - Phase-aware scheduler admission for per-request steering
 - Prefix-cache separation for different prefill steering configs
 - Continuous batching
@@ -109,7 +109,7 @@ activation that is discarded immediately afterward.
 | --- | --- |
 | `pre_attn` | Residual stream before attention |
 | `post_attn` | Residual stream after attention |
-| `post_mlp` | Residual stream after MLP |
+| `post_block` | Residual stream after MLP (end of the decoder block) |
 
 For supported models, these hooks are wired directly into each decoder
 layer's forward path. Unused hook points are zero-valued no-ops.
@@ -157,7 +157,7 @@ curl -X POST http://localhost:8000/v1/steering/set \
   -H "Content-Type: application/json" \
   -d '{
     "vectors": {
-      "post_mlp": {
+      "post_block": {
         "15": {"vector": [0.1, 0.2], "scale": 2.0}
       }
     },
@@ -209,7 +209,7 @@ packed_hook = {
 
 requests.post(
     "http://localhost:8000/v1/steering/set",
-    json={"vectors": {"post_mlp": packed_hook}},
+    json={"vectors": {"post_block": packed_hook}},
 )
 ```
 
@@ -248,7 +248,7 @@ params = SamplingParams(
     max_tokens=64,
     temperature=0.0,
     steering_vectors={
-        "post_mlp": {
+        "post_block": {
             15: {"vector": [0.1, 0.2], "scale": 2.0},
         },
     },
@@ -288,7 +288,7 @@ vec = np.random.standard_normal(2560).astype(np.float16)
 stacked = np.stack([vec], axis=0)  # (num_layers, hidden_size)
 
 base = {
-    "post_mlp": {
+    "post_block": {
         "dtype": str(stacked.dtype),  # "float16" | "float32" | "float64"
         "shape": list(stacked.shape),
         "layer_indices": [15],
@@ -335,7 +335,7 @@ The JSON file uses the same three-tier format as the global steering API:
 ```json
 {
   "vectors": {
-    "post_mlp": {
+    "post_block": {
       "15": [0.1, 0.2, 0.3],
       "20": {"vector": [0.4, 0.5, 0.6], "scale": 2.0}
     }
@@ -357,7 +357,7 @@ startup cost:
 ```json
 {
   "vectors": {
-    "post_mlp": {
+    "post_block": {
       "dtype": "float32",
       "shape": [2, 2560],
       "layer_indices": [15, 20],
@@ -379,7 +379,7 @@ curl -X POST http://localhost:8000/v1/steering/modules/register \
   -d '{
     "name": "creativity",
     "vectors": {
-      "post_mlp": {"15": [0.1, 0.2, 0.3]}
+      "post_block": {"15": [0.1, 0.2, 0.3]}
     }
   }'
 
@@ -412,7 +412,7 @@ requests.post(
     json={
         "name": "creativity",
         "vectors": {
-            "post_mlp": {
+            "post_block": {
                 "dtype": str(stacked.dtype),
                 "shape": list(stacked.shape),
                 "layer_indices": [15],
@@ -454,7 +454,7 @@ response = client.chat.completions.create(
     extra_body={
         "steering_name": "creativity",
         "steering_vectors": {
-            "post_mlp": {15: [0.05, 0.1, 0.15]},
+            "post_block": {15: [0.05, 0.1, 0.15]},
         },
     },
 )
@@ -546,7 +546,7 @@ Returns per-layer hook-point availability aggregated across TP × PP ranks:
 
 ```bash
 curl http://localhost:8000/v1/steering/layers
-# {"layers": {"0": {"hook_points": ["post_mlp"]}, "1": {"hook_points": ["post_mlp", "pre_attn"]}, ...}}
+# {"layers": {"0": {"hook_points": ["post_block"]}, "1": {"hook_points": ["post_block", "pre_attn"]}, ...}}
 ```
 
 Useful to confirm which layers of the loaded model are steerable before

diff --git a/examples/capture_consumers/activation_reward_producer/README.md b/examples/capture_consumers/activation_reward_producer/README.md
@@ -48,7 +48,7 @@ llm = LLM(
             "name": "activation_reward",
             "params": {
                 "layer": 12,
-                "hook": "post_mlp",
+                "hook": "post_block",
                 "vector_path": "/models/happy/sadness.pt",
                 "position_slice": {"start": 10, "end": None, "stride": 1},
                 "scale": 5.0,
@@ -64,15 +64,15 @@ llm = LLM(
 
 ```bash
 vllm serve meta-llama/Llama-3-8B \
-    --capture-consumers activation_reward:layer=12,hook=post_mlp,vector_path=/models/happy/sadness.pt,scale=5.0,nonlinearity=tanh
+    --capture-consumers activation_reward:layer=12,hook=post_block,vector_path=/models/happy/sadness.pt,scale=5.0,nonlinearity=tanh
 ```
 
 ### Parameters
 
 | Field | Type | Default | Purpose |
 | --- | --- | --- | --- |
 | `layer` | `int` | required | Layer index to capture at. |
-| `hook` | `str` | required | One of `pre_attn`, `post_attn`, `post_mlp`, `mlp_in`, `mlp_out`. |
+| `hook` | `str` | required | One of `pre_attn`, `post_attn`, `post_block`, `mlp_in`, `mlp_out`. |
 | `vector_path` | `str` | required | Path to a `.pt` file holding a 1-D tensor of shape `(hidden_size,)`. L2-normalized at load. |
 | `position_slice` | `dict` | `{start: 10, end: null, stride: 1}` | Applied to the `all_generated` span before mean-pooling. |
 | `scale` | `float` | `1.0` | Multiplicative factor on the raw cosine. |
@@ -120,7 +120,7 @@ llm = LLM(
             "name": "activation_reward",
             "instance_name": "sadness_reward",
             "params": {
-                "layer": 12, "hook": "post_mlp",
+                "layer": 12, "hook": "post_block",
                 "vector_path": "/models/happy/sadness.pt",
             },
         },

diff --git a/examples/capture_consumers/activation_reward_producer/activation_reward_producer/__init__.py b/examples/capture_consumers/activation_reward_producer/activation_reward_producer/__init__.py
@@ -35,7 +35,7 @@
     from vllm.v1.capture.types import CaptureContext
 
 
-_HOOK_NAMES = frozenset({"pre_attn", "post_attn", "post_mlp", "mlp_in", "mlp_out"})
+_HOOK_NAMES = frozenset({"pre_attn", "post_attn", "post_block", "mlp_in", "mlp_out"})
 _NONLIN = {
     "tanh": math.tanh,
     "sigmoid": lambda x: 1.0 / (1.0 + math.exp(-x)),

diff --git a/examples/capture_consumers/activation_reward_producer/test.py b/examples/capture_consumers/activation_reward_producer/test.py
@@ -63,7 +63,7 @@ def test_payload_shape_and_lifecycle(tmp: Path) -> None:
         _mock_config(),
         {
             "layer": 12,
-            "hook": "post_mlp",
+            "hook": "post_block",
             "vector_path": str(vec_path),
             "position_slice": {"start": 2, "end": None, "stride": 1},
             "scale": 5.0,
@@ -73,11 +73,11 @@ def test_payload_shape_and_lifecycle(tmp: Path) -> None:
 
     # Validator returns the pinned spec.
     spec = producer.validate_client_spec({}, _ctx())
-    assert spec.hooks == {"post_mlp": [12]}
+    assert spec.hooks == {"post_block": [12]}
     assert spec.positions == "all_generated"
 
     # Two chunks across two steps; total 6 rows; slice starts at 2.
-    key = (VllmInternalRequestId("req-1"), 12, "post_mlp")
+    key = (VllmInternalRequestId("req-1"), 12, "post_block")
     chunk_a = CaptureChunk(
         key=key,
         tensor=torch.randn(3, HIDDEN),
@@ -127,12 +127,12 @@ def test_empty_window_payload(tmp: Path) -> None:
         _mock_config(),
         {
             "layer": 0,
-            "hook": "post_mlp",
+            "hook": "post_block",
             "vector_path": str(vec_path),
             "position_slice": {"start": 100, "end": None, "stride": 1},
         },
     )
-    key = (VllmInternalRequestId("short"), 0, "post_mlp")
+    key = (VllmInternalRequestId("short"), 0, "post_block")
     producer.submit_chunk(
         CaptureChunk(
             key=key,
@@ -156,9 +156,9 @@ def test_no_chunks_partial_error(tmp: Path) -> None:
 
     producer = ActivationRewardProducer(
         _mock_config(),
-        {"layer": 0, "hook": "post_mlp", "vector_path": str(vec_path)},
+        {"layer": 0, "hook": "post_block", "vector_path": str(vec_path)},
     )
-    key = (VllmInternalRequestId("ghost"), 0, "post_mlp")
+    key = (VllmInternalRequestId("ghost"), 0, "post_block")
     producer.submit_finalize(CaptureFinalize(key=key))
     result = producer.get_result(key)
     assert result.status == "partial_error"
@@ -172,7 +172,7 @@ def test_non_empty_client_spec_rejected(tmp: Path) -> None:
 
     producer = ActivationRewardProducer(
         _mock_config(),
-        {"layer": 0, "hook": "post_mlp", "vector_path": str(vec_path)},
+        {"layer": 0, "hook": "post_block", "vector_path": str(vec_path)},
     )
     try:
         producer.validate_client_spec({"layer": 99}, _ctx())
@@ -189,7 +189,7 @@ def test_tp_pp_rejected(tmp: Path) -> None:
 
     producer = ActivationRewardProducer(
         _mock_config(),
-        {"layer": 0, "hook": "post_mlp", "vector_path": str(vec_path)},
+        {"layer": 0, "hook": "post_block", "vector_path": str(vec_path)},
     )
     try:
         producer.validate_client_spec({}, _ctx(tp=2))
@@ -207,7 +207,7 @@ def test_bad_layer_rejected(tmp: Path) -> None:
     try:
         ActivationRewardProducer(
             _mock_config(),
-            {"layer": NUM_LAYERS + 5, "hook": "post_mlp", "vector_path": str(vec_path)},
+            {"layer": NUM_LAYERS + 5, "hook": "post_block", "vector_path": str(vec_path)},
         )
     except ValueError as e:
         assert "out of range" in str(e)
@@ -223,7 +223,7 @@ def test_vector_hidden_size_mismatch(tmp: Path) -> None:
     try:
         ActivationRewardProducer(
             _mock_config(),
-            {"layer": 0, "hook": "post_mlp", "vector_path": str(vec_path)},
+            {"layer": 0, "hook": "post_block", "vector_path": str(vec_path)},
         )
     except ValueError as e:
         assert "hidden_size" in str(e)

diff --git a/examples/capture_consumers/minimal_plugin/my_plugin/__init__.py b/examples/capture_consumers/minimal_plugin/my_plugin/__init__.py
@@ -28,7 +28,7 @@ def __init__(self, vllm_config: Any, params: dict[str, Any]) -> None:
 
     def global_capture_spec(self) -> CaptureSpec:
         return CaptureSpec(
-            hooks={"post_mlp": self._layers},
+            hooks={"post_block": self._layers},
             positions="last_prompt",
         )
 

diff --git a/examples/capture_consumers/plugin_authoring.md b/examples/capture_consumers/plugin_authoring.md
@@ -40,7 +40,7 @@ class MyConsumer(CaptureConsumer):
 
     def global_capture_spec(self) -> CaptureSpec:
         return CaptureSpec(
-            hooks={"post_mlp": self._layers},
+            hooks={"post_block": self._layers},
             positions="last_prompt",
         )
 
@@ -86,7 +86,7 @@ pattern — the consumer always needs the same data.
 ```python
 def global_capture_spec(self) -> CaptureSpec:
     return CaptureSpec(
-        hooks={"post_mlp": [0, 15, 31]},
+        hooks={"post_block": [0, 15, 31]},
         positions="last_prompt",
     )
 ```
@@ -104,7 +104,7 @@ class FlexConsumer(CaptureConsumer):
     reads_client_spec = True
 
     def validate_client_spec(self, raw_spec, ctx):
-        hooks = raw_spec.get("hooks", {"post_mlp": list(range(ctx.num_hidden_layers))})
+        hooks = raw_spec.get("hooks", {"post_block": list(range(ctx.num_hidden_layers))})
         positions = raw_spec.get("positions", "all_prompt")
         return CaptureSpec(hooks=hooks, positions=positions)
 ```
@@ -197,7 +197,7 @@ import torch
 consumer = MyConsumer(MagicMock(), {"layers": [0]})
 adapter = _BatchedAdapter(consumer)
 
-key = (VllmInternalRequestId("test-req"), 0, "post_mlp")
+key = (VllmInternalRequestId("test-req"), 0, "post_block")
 
 adapter.submit_chunk(CaptureChunk(
     key=key,
@@ -247,7 +247,7 @@ class SumConsumer(CaptureConsumer):
 
     def global_capture_spec(self) -> CaptureSpec:
         return CaptureSpec(
-            hooks={"post_mlp": self._layers},
+            hooks={"post_block": self._layers},
             positions="last_prompt",
         )
 

diff --git a/examples/online_serving/openai_steering_client.py b/examples/online_serving/openai_steering_client.py
@@ -80,7 +80,7 @@ def main() -> None:
 
     # Base steering applied to both prefill and decode.
     base = {
-        "post_mlp": pack_hook(
+        "post_block": pack_hook(
             {15: rng.standard_normal(HIDDEN_SIZE).astype(PACK_DTYPE)},
             # Per-layer scales: the server multiplies row-by-row without
             # re-encoding the bytes, so the same vector can be reused at