Leberkas-org · st0o0 · Jun 22, 2026 · Jun 20, 2026 · Jun 20, 2026 · Jun 20, 2026
diff --git a/.gitattributes b/.gitattributes
@@ -15,5 +15,3 @@
 # Other binary assets — LFS + no text diff
 *.ico  filter=lfs diff=lfs merge=lfs -text
 *.pdf  filter=lfs diff=lfs merge=lfs -text
-*.zip  filter=lfs diff=lfs merge=lfs -text
-*.gz   filter=lfs diff=lfs merge=lfs -text
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -153,9 +153,11 @@ Single source of truth for all non-code knowledge. **Use Obsidian MCP tools** (`
 
 ## Performance Patterns
 
-- **Snapshot semantics**: Decoder/FrameDecoder return values are held across calls by tests —
-  cannot return reused lists directly. Use `.ToArray()` or `new List<>(buffer)` for public APIs.
-  Akka back-pressure guarantees consumption in production, but test contracts require copies.
+- **Reused decode buffers**: `FrameDecoder.Decode` returns its reused `_frames` list directly (no
+  per-read array alloc); the client/server state machines consume it synchronously within the same
+  actor message under Akka back-pressure. A caller (or test) that needs to hold a result across a
+  later `Decode` MUST snapshot it (`.ToArray()`). When adding a decoder return that is consumed
+  asynchronously or retained, copy instead — never hand out a reused buffer to such a caller.
 - **List reuse pattern**: Http2/RequestEncoder has `_reusableHeaders`/`_reusableFrames` —
   follow this pattern for any per-request collection (clear + repopulate, not new).
 - **`string.Concat` over `$""`** for simple 2-3 part joins (avoids handler alloc)

diff --git a/docs/when-to-use.md b/docs/when-to-use.md
@@ -1,34 +1,41 @@
 # When to Use TurboHTTP
 
 TurboHTTP is not a drop-in "faster HttpClient/Kestrel". It is an HTTP stack built on Akka.Streams
-whose strengths are **streaming, backpressure, large payloads under concurrency, and actor
-integration** — and whose trade-off is per-request overhead on tiny, latency-critical requests.
-This page summarizes where each side of the stack wins, based on the benchmark suite
-(BenchmarkDotNet, loopback, 2026-06).
+whose strengths are **HTTP/2 multiplexing, streaming, backpressure, and actor integration** — and
+whose trade-off is per-request overhead on tiny, latency-critical requests and a heavier cold start.
+This page summarizes where each side of the stack wins, based on measured BenchmarkDotNet results
+(Ryzen 7 5800X, .NET 10.0.8, loopback, 2026-06-21).
 
 ## TL;DR
 
 | Your workload | Recommendation |
 |---|---|
 | Many small GETs, lowest possible latency | HttpClient / Kestrel |
-| Large request bodies (uploads) under concurrency | **TurboHTTP client** (H2/H3: up to 2–3.5× HttpClient) |
-| Upload-heavy server endpoints (HTTP/1.1) | **TurboServer** (+10–34 % vs Kestrel) |
+| HTTP/2 server endpoints (plaintext, JSON) | **TurboServer** (1.4–1.5× Kestrel) |
+| Concurrent downloads over HTTP/2 or HTTP/3 | **TurboHTTP client** (2–3.5× HttpClient) |
+| HTTP/1.1 pipelined requests on a single connection | **TurboHTTP client** (up to 4.7× HttpClient) |
 | Streaming, SSE, backpressure end-to-end | **TurboHTTP (both sides)** |
 | Actor-based backends (Akka.NET) | **TurboServer** — shares your `ActorSystem` |
 | Bulk request pipelines (fire thousands, drain results) | **TurboHTTP client channel API** |
+| HTTP/3 (QUIC) outside concurrent downloads | HttpClient / Kestrel (TurboHTTP H3 is 1.4–7.3× slower) |
 
 ## As a Client
 
 ### Where it wins
 
-- **Concurrent uploads over HTTP/2 and HTTP/3.** With many in-flight POSTs, the multiplexed
-  upload path clearly beats `SocketsHttpHandler`: at 512–4096 concurrent 10 KB uploads the
-  benchmark shows **+12 % to +58 % (H2)** and **+123 % to +247 % (H3)** throughput, with up to
-  **84 % fewer allocations** (H2, CL=4096). Tail latency follows: p99 is 40–70 % lower in these
-  scenarios.
-- **HTTP/1.1 uploads at scale** run close to HttpClient (within ~30–40 % at high concurrency)
-  with bounded memory — the request body pump is backpressured against the socket instead of
-  buffering whole bodies.
+- **HTTP/1.1 pipelining on a single connection.** At 256 concurrent requests over one connection,
+  TurboHTTP delivers **4.7× the throughput** (73K vs 15K req/s) of HttpClient. At 64 concurrent
+  it is 1.5× faster. This makes it ideal for connection-constrained scenarios and serial
+  keep-alive workloads.
+- **HTTP/2 and HTTP/3 concurrent downloads.** Downloading 1 MB payloads across 32 connections,
+  TurboHTTP is **2.3× faster on H2** (2,727 vs 1,199 req/s) and **3.5× faster on H3** (613 vs
+  176 req/s). For 8 MB payloads the advantage holds: **2.4× on H2** and **2.9× on H3**. The
+  streams-based body consumption handles flow-controlled data more efficiently than
+  SocketsHttpHandler.
+- **HTTP/1.1 concurrent light requests at moderate scale.** At 512 concurrent light GETs,
+  TurboHTTP is **1.6× faster** (66K vs 42K req/s) than HttpClient.
+- **HTTP/2 single-connection multiplexing.** At 64 concurrent requests on one H2 connection,
+  TurboHTTP delivers **1.5× the throughput** (49K vs 33K req/s).
 - **Resilience built into the pipeline.** Retries, reconnect with request replay, redirects,
   cookies, HTTP caching, and content encoding are stream stages, not handler wrappers — and all
   of it is observable through permanent `Servus.Senf` tracing.
@@ -39,20 +46,28 @@ This page summarizes where each side of the stack wins, based on the benchmark s
 
 ### Where HttpClient is the better tool
 
-- **Single-request latency on light GETs.** A lone ~3 B GET costs ~150–160 µs vs HttpClient's
-  ~74 µs; light-GET fan-out at very high concurrency is also slower (H2/H3 light concurrent).
-- **The channel API has a latency floor** (~1.3–1.6 ms per isolated request) from its
-  stream-materialization hops — it amortizes over bulk work, not single calls.
+- **Single-request latency.** A warm light GET costs **114 µs vs HttpClient's 67 µs** on H1.1
+  (~47 µs GraphInterpreter overhead), 123 vs 77 µs on H2, 228 vs 180 µs on H3.
+- **Cold start.** First request takes **6.4 ms vs 480 µs** (13× slower) on H1.1/H2, allocating
+  ~3 MB for actor system and streams graph materialization vs HttpClient's 33 KB.
+- **Very high concurrency (4096+).** TurboHTTP's SendAsync API currently crashes at 4096
+  concurrent requests across all protocols and at 512 for HTTP/3.
+- **HTTP/3 (QUIC) on a single connection.** Single-connection H3 is **4.5–7.3× slower** than
+  HttpClient. This is a known transport-layer limitation being worked on.
+- **HTTP/1.1 concurrent downloads.** At 32 connections downloading 1 MB, HttpClient is **3.1×
+  faster** (11,413 vs 3,692 req/s) — the connection pool management overhead currently hurts
+  on H1.1 download workloads.
 
 ## As a Server
 
 ### Where it wins
 
-- **HTTP/1.1 upload endpoints.** 1 MB POSTs run **+10 % to +34 %** faster than Kestrel
-  (sequential and CL=1 concurrent; +10–20 % at CL=64/256 sequential).
-- **HTTP/2 / HTTP/3 request handling at parity.** Plaintext/JSON/Fortunes sequential are within
-  ±5–15 % of Kestrel across protocols; several H2 concurrent scenarios (plaintext, JSON) are
-  ahead at p95/p99.
+- **HTTP/2 plaintext and JSON at high concurrency.** At 256 concurrent requests, TurboServer
+  delivers **1.5× Kestrel's throughput on plaintext** (80K vs 54K req/s) and **1.4× on JSON**
+  (79K vs 57K req/s). At 64 concurrent it is 1.2–1.3× faster. HTTP/2 multiplexing is
+  TurboServer's sweet spot.
+- **HTTP/1.1 at near-parity.** Plaintext/JSON/Fortunes are within 5–10% of Kestrel on H1.1
+  across all concurrency levels — competitive enough for most workloads.
 - **Streaming responses with real backpressure.** Return an Akka Streams `Source` (SSE, long
   downloads) and flow control runs end-to-end — a slow client slows the producer instead of
   growing a buffer.
@@ -61,19 +76,23 @@ This page summarizes where each side of the stack wins, based on the benchmark s
 
 ### Where Kestrel is the better tool
 
-- **Small-response throughput/latency records.** Plaintext/JSON-style endpoints are ~6–16 %
-  slower at p50 and allocate more per request (managed allocations are roughly 3–4× Kestrel's
-  2.7 KB; native/pooled buffers excluded on both sides).
-- **Very high fan-out on HTTP/3.** Light-request concurrency over QUIC currently trails Kestrel
-  significantly (-50 % to -74 %) — a known limitation of the shared pipeline, being worked on.
+- **HTTP/3 (QUIC) — significantly.** TurboServer is **1.4–3.9× slower** than Kestrel across
+  all H3 workloads (JSON concurrent @256: 29K vs 114K req/s = 26% of Kestrel).
+- **Larger response bodies on HTTP/2.** The Fortunes benchmark (larger HTML responses) shows a
+  dramatic **4.2× deficit** at 256 concurrent H2 requests (TurboServer 22K vs Kestrel 92K req/s), compared to
+  plaintext/JSON where TurboServer leads. This points to a body-write or serialization
+  bottleneck specific to larger response payloads.
+- **Per-request allocations.** TurboServer allocates roughly **2.5–3× more** per request than
+  Kestrel (6.9 KB vs 2.6 KB on H1.1 plaintext). Kestrel pools its HttpContext, feature
+  collections, and header dictionaries more aggressively.
+- **Uploads at scale.** Upload endpoints are 1.3–1.4× slower than Kestrel on H1.1/H2.
 
 ## In Combination
 
 Running TurboHTTP on both ends pays off when the *pipeline* is the product:
 
-- **Service-to-service with large payloads.** TurboHTTP client → TurboServer keeps uploads
-  backpressured on both sides; neither end buffers whole bodies, so memory stays flat under
-  load spikes.
+- **HTTP/2 service-to-service.** TurboHTTP client's H2 download advantage (2.3×) combined with
+  TurboServer's H2 serving advantage (1.4–1.5×) makes a compelling end-to-end H2 story.
 - **End-to-end streaming.** An Akka Streams `Source` on the server feeds an Akka Streams
   consumer on the client — one flow-controlled graph across the network, including SSE.
 - **Gateways and proxies.** Forward-proxy and CONNECT tunneling are supported; combined with
@@ -84,9 +103,11 @@ Running TurboHTTP on both ends pays off when the *pipeline* is the product:
 
 ## Benchmark Context
 
-Numbers above come from the repo's benchmark suite (`TurboHTTP.Benchmarks`): localhost loopback,
-BenchmarkDotNet, HTTP/1.1 + h2c cleartext, HTTP/3 with self-signed TLS, run 2026-06. Loopback
-isolates protocol-stack overhead and exaggerates per-request costs relative to real networks —
-over WAN latencies, the gaps on light requests shrink while the streaming/backpressure advantages
-remain. Memory figures count managed allocations only. Re-run with
-`dotnet run -c Release --project TurboHTTP.Benchmarks` to reproduce on your hardware.
+Numbers above come from the repo's benchmark suite (`TurboHTTP.Benchmarks`): Ryzen 7 5800X
+(8C/16T), .NET 10.0.8, BenchmarkDotNet v0.15.8, localhost loopback, HTTP/1.1 + h2c cleartext,
+HTTP/3 with self-signed TLS, measured 2026-06-21 on branch `feat/dispatcher-analysis` after 12+
+optimization commits. Loopback isolates protocol-stack overhead and exaggerates per-request costs
+relative to real networks — over WAN latencies, the gaps on light requests shrink while the
+streaming/backpressure/multiplexing advantages remain. Memory figures count managed allocations
+only. Re-run with `dotnet run -c Release --project TurboHTTP.Benchmarks` to reproduce on your
+hardware.
diff --git a/lib/servus.akka b/lib/servus.akka
diff --git a/src/TurboHTTP.IntegrationTests.Client/H11/SingleConnectionConcurrencyRegressionSpec.cs b/src/TurboHTTP.IntegrationTests.Client/H11/SingleConnectionConcurrencyRegressionSpec.cs
@@ -0,0 +1,77 @@
+using System.Net;
+using TurboHTTP.IntegrationTests.Client.Shared;
+using TurboHTTP.Tests.Shared;
+
+namespace TurboHTTP.IntegrationTests.Client.H11;
+
+/// <summary>
+/// Repro for the single-connection HTTP/1.1 concurrency deadlock in the 2026-06-19 benchmark run
+/// (KestrelTurboSingleConnectionBenchmarks [ConcurrencyLevel=64 and 256, HttpVersion=1.1] → NA).
+/// With MaxConnectionsPerServer forced to 1, the benchmark completed a few iterations of N concurrent
+/// GETs (~1.5 ms each) and then HUNG to the 60 s WaitAsync — an intermittent pipelining/dispatch
+/// deadlock when many requests share one H1.1 connection. The H2 and H3 single-connection variants
+/// produced results; only H1.1 went NA.
+///
+/// The deadlock is intermittent, so the spec drives many rounds of concurrent bursts on the single
+/// connection and fails the first round that does not drain within a generous per-round budget.
+///
+/// ROOT CAUSE (2026-06-20, fixed): under heavy pipelining the server streams many responses back to
+/// back, so a response's status line or header block is frequently split across two TCP reads. The
+/// H1.1 client decoder kept no cross-read remainder, so the unconsumed prefix of a split header was
+/// discarded and the next read's continuation parsed as garbage ("Malformed header field"), faulting
+/// that request and stranding its in-flight pipelined siblings. Fixed by retaining the unconsumed
+/// prefix in Http11ClientStateMachine (_partialResponse) and prepending it to the next read; the
+/// deterministic repro lives in Http11ClientFragmentedResponseSpec. This stress guard now drains 256 ×
+/// 40 cleanly; it failed ~50-80% of runs before the fix.
+/// </summary>
+[Collection("H11")]
+public sealed class SingleConnectionConcurrencyRegressionSpec : IntegrationSpecBase
+{
+    public SingleConnectionConcurrencyRegressionSpec(ServerContainerFixture server, ActorSystemFixture systemFixture)
+        : base(server, systemFixture)
+    {
+    }
+
+    // Build our own single-connection client below; do not use the default multi-connection Client.
+    protected override ProtocolVariant? Variant => null;
+
+    /// <summary>
+    /// Fires 40 rounds of 256 concurrent GETs through a client limited to one HTTP/1.1 connection.
+    /// Every round must drain within 15 seconds and every response must be 200 OK.
+    /// </summary>
+    [Fact(Timeout = 180_000)]
+    public async Task SingleConnection_should_not_deadlock_under_concurrent_H11_requests()
+    {
+        await using var helper = CreateClient(
+            new ProtocolVariant(TestHttpVersion.H11, tls: false),
+            configureOptions: o => o.Http1.MaxConnectionsPerServer = 1);
+        var client = helper.Client;
+
+        // 256 concurrency matches the heavier of the two NA configs ([256, 1.1]); many rounds give the
+        // intermittent single-connection deadlock repeated chances to surface.
+        const int concurrency = 256;
+        const int rounds = 40;
+
+        for (var round = 0; round < rounds; round++)
+        {
+            var tasks = new Task<HttpResponseMessage>[concurrency];
+            for (var i = 0; i < concurrency; i++)
+            {
+                tasks[i] = client.SendAsync(
+                    new HttpRequestMessage(HttpMethod.Get, "/get"), CancellationToken);
+            }
+
+            try
+            {
+                var responses = await Task.WhenAll(tasks).WaitAsync(TimeSpan.FromSeconds(15), CancellationToken);
+                try
+                {
+                    Assert.All(responses, r => Assert.Equal(HttpStatusCode.OK, r.StatusCode));
+                }
+                finally
+                {
+                    // Release the responses even when the status assertion fails, so a failing
+                    // round does not leave the whole burst undisposed.
+                    foreach (var r in responses)
+                    {
+                        r.Dispose();
+                    }
+                }
+            }
+            catch (TimeoutException)
+            {
+                Assert.Fail(
+                    $"REPRO: round {round} of {concurrency} concurrent HTTP/1.1 GETs on a single connection " +
+                    "did not complete within 15 s — single-connection request dispatch deadlocked.");
+            }
+        }
+    }
+}
diff --git a/src/TurboHTTP.IntegrationTests.Client/H2/LargeDownloadRegressionSpec.cs b/src/TurboHTTP.IntegrationTests.Client/H2/LargeDownloadRegressionSpec.cs
@@ -0,0 +1,83 @@
+using System.Net;
+using TurboHTTP.Client;
+using TurboHTTP.IntegrationTests.Client.Shared;
+using TurboHTTP.Tests.Shared;
+
+namespace TurboHTTP.IntegrationTests.Client.H2;
+
+/// <summary>
+/// Repro for the HTTP/2 large-download hang in the 2026-06-19 benchmark run
+/// (KestrelTurboDownloadBenchmarks [ConcurrencyLevel=1, DownloadBytes=8388608, HttpVersion=2.0] → NA,
+/// "System.TimeoutException: The operation has timed out"). A SINGLE 8 MB response over one H2 stream
+/// hung to the 120 s WaitAsync, while 1 MB over H2 — and 8 MB over H1.1 and H3 — all completed.
+/// Suspected receive-path flow-control / WINDOW_UPDATE stall on a single large stream.
+///
+/// 1 MB is included first as a sanity check (it completes in the benchmark); the 8 MB download is the
+/// configuration that hung.
+/// </summary>
+[Collection("H2")]
+public sealed class LargeDownloadRegressionSpec : IntegrationSpecBase
+{
+    public LargeDownloadRegressionSpec(ServerContainerFixture server, ActorSystemFixture systemFixture)
+        : base(server, systemFixture)
+    {
+    }
+
+    // Build our own client below so we can pin a single H2 connection (one stream at a time),
+    // matching the benchmark's ConcurrencyLevel=1 over the default pool.
+    protected override ProtocolVariant? Variant => null;
+
+    /// <summary>
+    /// Downloads one 1 MB body and then thirteen 8 MB bodies, one after another, through a client
+    /// limited to a single HTTP/2 connection. Each download is checked by <see cref="DownloadAsync"/>.
+    /// </summary>
+    [Fact(Timeout = 180_000)]
+    public async Task LargeDownload_should_complete_8MB_body_over_single_H2_stream()
+    {
+        await using var helper = CreateClient(
+            new ProtocolVariant(TestHttpVersion.H2, tls: true),
+            configureOptions: o => o.Http2.MaxConnectionsPerServer = 1);
+        var client = helper.Client;
+
+        // Warmup + iterations: the benchmark drained 8 MB ~13 times in sequence before it hung.
+        await DownloadAsync(client, 1 * 1024 * 1024);
+        for (var i = 0; i < 13; i++)
+        {
+            await DownloadAsync(client, 8 * 1024 * 1024);
+        }
+    }
+
+    /// <summary>
+    /// Requests <c>/bytes/{size}</c> and drains the body to <see cref="Stream.Null"/>. Fails the test
+    /// when the exchange does not finish within 30 seconds, and skips it when the backend rejects the
+    /// requested size with 400 or 413.
+    /// </summary>
+    /// <param name="client">Client used to send the GET.</param>
+    /// <param name="size">Number of body bytes requested from the backend.</param>
+    private async Task DownloadAsync(ITurboHttpClient client, int size)
+    {
+        using var cts = CancellationTokenSource.CreateLinkedTokenSource(CancellationToken);
+        cts.CancelAfter(TimeSpan.FromSeconds(30));
+
+        try
+        {
+            // A using declaration releases the response on every exit path: skip, failed
+            // assertion, cancellation while draining, or normal completion.
+            using var response = await client.SendAsync(
+                new HttpRequestMessage(HttpMethod.Get, $"/bytes/{size}"), cts.Token);
+
+            // This guard needs a server that streams an arbitrary-size body. The Kestrel backend's
+            // /bytes/{n} does; the Docker (httpbin) backend caps the size and rejects it up front with 400
+            // (some servers use 413). Skip ONLY on those size-rejection statuses — any other non-200
+            // (404, 5xx, ...) is a real failure and must not be masked. The stall this guards against
+            // surfaces as the 30 s timeout below, never as a status code.
+            if (response.StatusCode is HttpStatusCode.BadRequest or HttpStatusCode.RequestEntityTooLarge)
+            {
+                // Assert.Skip throws, so nothing after it runs; the status code is read while the
+                // response is still alive.
+                Assert.Skip(
+                    $"Backend rejected /bytes/{size} with {(int)response.StatusCode} (size cap); "
+                    + "run with the Kestrel backend to exercise the H2 receive flow-control fix.");
+            }
+
+            Assert.Equal(HttpStatusCode.OK, response.StatusCode);
+
+            // Drain exactly like the benchmark (Content.CopyToAsync(Stream.Null)).
+            await response.Content.CopyToAsync(Stream.Null, cts.Token);
+        }
+        catch (OperationCanceledException) when (cts.IsCancellationRequested && !CancellationToken.IsCancellationRequested)
+        {
+            // Only the local 30 s budget fired (the test-level token is still live): report a stall.
+            Assert.Fail(
+                $"REPRO: a {size / (1024 * 1024)} MB HTTP/2 download did not complete within 30 s — " +
+                "the receive path stalls on a large single stream (suspected missing/stuck WINDOW_UPDATE).");
+        }
+    }
+}
diff --git a/src/TurboHTTP.IntegrationTests.End2End/H10/LargePayloadSpec.cs b/src/TurboHTTP.IntegrationTests.End2End/H10/LargePayloadSpec.cs
@@ -23,12 +23,14 @@ protected override void ConfigureEndpoints(WebApplication app)
             await ctx.Request.Body.CopyToAsync(stream, ctx.RequestAborted);
             var data = stream.ToArray();
             ctx.Response.ContentType = "application/octet-stream";
+            ctx.Response.ContentLength = data.Length;
             await ctx.Response.Body.WriteAsync(data, ctx.RequestAborted);
         });
 
         app.MapGet("/generate", async (int size, HttpContext ctx) =>
         {
             ctx.Response.ContentType = "application/octet-stream";
+            ctx.Response.ContentLength = size;
             var buffer = new byte[1024];
             Array.Fill(buffer, (byte)0xAB);
             var remaining = size;