From f85a79061e8ea3e13d492d9362bfefbe44d3b7f6 Mon Sep 17 00:00:00 2001 From: Matias Palma Date: Tue, 21 Apr 2026 00:37:52 -0400 Subject: [PATCH 1/2] fix: preserve UTF-8 codepoints across PTY 4 KiB read boundaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PTY reader thread was calling `String::from_utf8_lossy` on each raw 4096-byte chunk straight from the OS. That works for ASCII but breaks on any multi-byte UTF-8 sequence that happens to straddle a chunk boundary: the last byte(s) of a codepoint end up decoded as the lead of an "incomplete" sequence and get replaced with U+FFFD, so the terminal shows `�` for accented letters, CJK text, emoji, box-drawing characters, and most non-English shell prompts whenever the read splits on a bad offset. The fix is a small leftover buffer: on every read we combine any held bytes with the new chunk, find the longest valid UTF-8 prefix, emit that, and keep the 1–3 trailing truncated bytes for the next read. Actually invalid sequences are still replaced with U+FFFD so one bad byte cannot stall the stream forever, and the leftover is flushed on EOF/error so the final prompt byte is never silently swallowed. Extracted the decode rule as `split_utf8_safely` and covered it with unit tests for ASCII, 4-byte codepoints split at every internal boundary, mixed multibyte runs with 1-byte reads, invalid bytes, empty input, and a truncated tail on close. Fixes #70 --- apps/desktop/src-tauri/src/pty.rs | 150 +++++++++++++++++++++++++++++- 1 file changed, 147 insertions(+), 3 deletions(-) diff --git a/apps/desktop/src-tauri/src/pty.rs b/apps/desktop/src-tauri/src/pty.rs index a80e4beb..43786b4c 100644 --- a/apps/desktop/src-tauri/src/pty.rs +++ b/apps/desktop/src-tauri/src/pty.rs @@ -7,6 +7,25 @@ use std::{ }; use tauri::{AppHandle, Emitter, Runtime}; +/// Returns the length in bytes of the longest prefix of `buf` that is safe to +/// decode right now. Callers are expected to emit `buf[..split]` and retain +/// `buf[split..]` as leftover bytes to prepend to the next chunk. +/// +/// Rules: +/// - If the whole buffer is valid UTF-8, the whole buffer is emitted. +/// - If the tail is a *truncated* multi-byte sequence (incomplete but still +/// possibly valid), the truncated tail is held back for the next read. +/// - If the tail is *actually invalid*, the invalid bytes are included in the +/// emit so lossy decoding replaces them with U+FFFD — otherwise one bad byte +/// would stall the stream forever. +fn split_utf8_safely(buf: &[u8]) -> usize { + match std::str::from_utf8(buf) { + Ok(_) => buf.len(), + Err(e) if e.error_len().is_none() => e.valid_up_to(), + Err(e) => e.valid_up_to() + e.error_len().unwrap_or(1), + } +} + pub struct PtyState { pub writer: Arc>>, pub master: Box, @@ -96,15 +115,34 @@ pub fn spawn_pty( err })? = Some(pty_state); - // Spawn a thread to read from the PTY and emit to the frontend + // Spawn a thread to read from the PTY and emit to the frontend. + // + // Why the leftover buffer: a single UTF-8 codepoint can span up to 4 + // bytes, and we read in fixed 4 KiB chunks. When a multi-byte character + // straddles the chunk boundary, decoding each chunk independently with + // `from_utf8_lossy` produces U+FFFD replacement characters (showing up + // as `�` in the terminal) even though the byte stream itself is + // perfectly valid. We hold back the trailing truncated bytes and + // prepend them to the next read so codepoints are never split. See + // `split_utf8_safely` for the decoding rules. thread::spawn(move || { let mut buffer = [0u8; 4096]; + // At most 3 bytes can be waiting for completion (4-byte UTF-8 + // codepoints have 3 bytes after the lead), so capacity 4 is plenty. + let mut leftover: Vec = Vec::with_capacity(4); loop { match reader.read(&mut buffer) { Ok(0) => break, Ok(n) => { - let data = String::from_utf8_lossy(&buffer[..n]).to_string(); - let _ = app.emit("pty-output", data); + let mut combined = std::mem::take(&mut leftover); + combined.extend_from_slice(&buffer[..n]); + + let split = split_utf8_safely(&combined); + if split > 0 { + let chunk = String::from_utf8_lossy(&combined[..split]).into_owned(); + let _ = app.emit("pty-output", chunk); + } + leftover = combined[split..].to_vec(); } Err(e) => { error!("PTY reader error: {}", e); @@ -112,6 +150,13 @@ pub fn spawn_pty( } } } + // Flush any trailing truncated bytes the child never completed so + // users still see the final prompt/output byte-for-byte (lossy at + // worst on a deliberately malformed tail). + if !leftover.is_empty() { + let tail = String::from_utf8_lossy(&leftover).into_owned(); + let _ = app.emit("pty-output", tail); + } }); Ok(()) @@ -161,3 +206,102 @@ pub fn kill_pty(state: tauri::State<'_, Arc>>>) -> Result *guard = None; // Drops PtyState — closes master PTY and terminates child Ok(()) } + +#[cfg(test)] +mod tests { + use super::split_utf8_safely; + + /// Walks `bytes` in fixed `chunk` sized reads, feeding each slice through + /// the same leftover-aware decoder the PTY thread uses, and returns the + /// concatenated emitted string. Mirrors the reader loop so tests exercise + /// the full assembly path, not just the split helper in isolation. + fn drain(bytes: &[u8], chunk: usize) -> String { + let mut out = String::new(); + let mut leftover: Vec = Vec::new(); + for window in bytes.chunks(chunk) { + let mut combined = std::mem::take(&mut leftover); + combined.extend_from_slice(window); + let split = split_utf8_safely(&combined); + if split > 0 { + out.push_str(&String::from_utf8_lossy(&combined[..split])); + } + leftover = combined[split..].to_vec(); + } + if !leftover.is_empty() { + out.push_str(&String::from_utf8_lossy(&leftover)); + } + out + } + + #[test] + fn pure_ascii_passes_through_any_chunking() { + let input = b"hello world from the pty"; + for chunk in 1..=input.len() { + assert_eq!(drain(input, chunk), "hello world from the pty"); + } + } + + #[test] + fn four_byte_codepoint_split_across_reads() { + // U+1F600 "😀" encodes to F0 9F 98 80 — a four-byte sequence. + let smiley = "😀"; + let bytes = smiley.as_bytes(); + assert_eq!(bytes.len(), 4); + + // Any split 1..=3 would have been substituted with U+FFFD by the + // previous naive `from_utf8_lossy(&buffer[..n])` call. + for boundary in 1..=3 { + let mut input = b"pre-".to_vec(); + input.extend_from_slice(&bytes[..boundary]); + // Simulate the read coming back in two chunks exactly at `boundary` + // inside the codepoint. + input.extend_from_slice(&bytes[boundary..]); + input.extend_from_slice(b"-post"); + + // Feed at a chunk size that guarantees the boundary lands inside + // the emoji. + let chunk = "pre-".len() + boundary; + assert_eq!(drain(&input, chunk), "pre-😀-post"); + } + } + + #[test] + fn multiple_multibyte_codepoints_split_repeatedly() { + // Mix 2-, 3- and 4-byte sequences, and force 1-byte reads so every + // boundary is mid-codepoint. + let input = "café — 日本語 — 😀🚀".as_bytes(); + assert_eq!(drain(input, 1), "café — 日本語 — 😀🚀"); + } + + #[test] + fn invalid_bytes_become_replacement_and_do_not_stall() { + // 0xFF is never legal in UTF-8. We want the decoder to replace it + // with U+FFFD and keep making forward progress instead of holding + // it back as "maybe the next read completes it". + let mut input = b"ok-".to_vec(); + input.push(0xFF); + input.extend_from_slice(b"-more"); + let out = drain(&input, 4); + assert!(out.starts_with("ok-")); + assert!(out.ends_with("-more")); + assert!(out.contains('\u{FFFD}')); + } + + #[test] + fn empty_input_emits_nothing() { + assert_eq!(drain(b"", 1), ""); + assert_eq!(drain(b"", 4096), ""); + } + + #[test] + fn truncated_tail_is_flushed_on_close() { + // Stream ends mid-codepoint (the child crashed or we hit EOF). The + // incomplete bytes should still surface, just lossily — better than + // silently swallowing the tail. + let mut input = b"done ".to_vec(); + input.extend_from_slice(&"😀".as_bytes()[..2]); + let out = drain(&input, 8); + assert!(out.starts_with("done ")); + assert!(out.contains('\u{FFFD}')); + } +} From dc63b040c67755b4b245f6c288743b01f318ff13 Mon Sep 17 00:00:00 2001 From: Matias Palma Date: Tue, 21 Apr 2026 01:17:26 -0400 Subject: [PATCH 2/2] perf(pty): reuse assembly buffer and stop withholding post-invalid bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot review on #173 flagged three issues on the reader thread, all valid: 1. Per-iteration allocations — `combined` was rebuilt with `std::mem::take(&mut leftover)` + `extend_from_slice` + a `combined[split..].to_vec()` on every read, so a busy shell paid two Vec allocations per 4 KiB chunk. 2. Post-invalid bytes delayed until the next read — when a chunk was `[valid][invalid][valid]`, the old split rule used `valid_up_to + error_len` as the emit boundary and kept the trailing valid suffix as `leftover`. That did not lose data but delayed it by a full read. 3. Docstring for `split_utf8_safely` said "truncated tail" but the code also branched on mid-buffer invalid sequences, which did not match the contract. Fix: - `split_utf8_safely` now returns `buf.len()` for any invalid UTF-8 that is not a truncated tail, so callers emit the whole chunk (`from_utf8_lossy` substitutes `U+FFFD` for the bad bytes) instead of holding the tail back. - The reader thread now keeps a single `Vec` assembly buffer with capacity `4096 + 4` and uses `drain(..split)` to consume emitted bytes in place — no allocation per read. - Docstring rewritten to match the new contract. - Added a regression test (`invalid_byte_mid_chunk_does_not_delay_valid_trailing_bytes`) that would fail under the old implementation. --- apps/desktop/src-tauri/src/pty.rs | 74 +++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 23 deletions(-) diff --git a/apps/desktop/src-tauri/src/pty.rs b/apps/desktop/src-tauri/src/pty.rs index 43786b4c..e8f9cce9 100644 --- a/apps/desktop/src-tauri/src/pty.rs +++ b/apps/desktop/src-tauri/src/pty.rs @@ -7,22 +7,23 @@ use std::{ }; use tauri::{AppHandle, Emitter, Runtime}; -/// Returns the length in bytes of the longest prefix of `buf` that is safe to -/// decode right now. Callers are expected to emit `buf[..split]` and retain -/// `buf[split..]` as leftover bytes to prepend to the next chunk. +/// Returns the length in bytes of the longest prefix of `buf` that should be +/// emitted right now. Callers are expected to emit `buf[..split]` through +/// `from_utf8_lossy` and retain `buf[split..]` as leftover to prepend to the +/// next read. /// -/// Rules: -/// - If the whole buffer is valid UTF-8, the whole buffer is emitted. -/// - If the tail is a *truncated* multi-byte sequence (incomplete but still -/// possibly valid), the truncated tail is held back for the next read. -/// - If the tail is *actually invalid*, the invalid bytes are included in the -/// emit so lossy decoding replaces them with U+FFFD — otherwise one bad byte -/// would stall the stream forever. +/// The only case we hold bytes back is a *truncated* UTF-8 sequence at the +/// very end of `buf` (where `str::from_utf8` reports `error_len() == None`): +/// the next read may complete it into a valid codepoint. If the buffer +/// contains invalid UTF-8 *anywhere else* we return `buf.len()` and let the +/// caller emit the whole thing — `from_utf8_lossy` replaces the bad bytes +/// with `U+FFFD` and valid bytes that follow the invalid byte in the same +/// chunk are not delayed to the next read. fn split_utf8_safely(buf: &[u8]) -> usize { match std::str::from_utf8(buf) { Ok(_) => buf.len(), Err(e) if e.error_len().is_none() => e.valid_up_to(), - Err(e) => e.valid_up_to() + e.error_len().unwrap_or(1), + Err(_) => buf.len(), } } @@ -127,22 +128,26 @@ pub fn spawn_pty( // `split_utf8_safely` for the decoding rules. thread::spawn(move || { let mut buffer = [0u8; 4096]; - // At most 3 bytes can be waiting for completion (4-byte UTF-8 - // codepoints have 3 bytes after the lead), so capacity 4 is plenty. - let mut leftover: Vec = Vec::with_capacity(4); + // Reuse the same assembly buffer across reads so a busy shell does + // not force one allocation per iteration. Capacity is chunk size + + // the maximum UTF-8 sequence length (4), so a full read plus a + // 3-byte truncated leftover never has to grow. + let mut combined: Vec = Vec::with_capacity(buffer.len() + 4); loop { match reader.read(&mut buffer) { Ok(0) => break, Ok(n) => { - let mut combined = std::mem::take(&mut leftover); combined.extend_from_slice(&buffer[..n]); let split = split_utf8_safely(&combined); if split > 0 { let chunk = String::from_utf8_lossy(&combined[..split]).into_owned(); let _ = app.emit("pty-output", chunk); + // Drop the emitted prefix in place; the tail (at + // most 3 bytes of a truncated codepoint) stays put + // for the next read. + combined.drain(..split); } - leftover = combined[split..].to_vec(); } Err(e) => { error!("PTY reader error: {}", e); @@ -153,8 +158,8 @@ pub fn spawn_pty( // Flush any trailing truncated bytes the child never completed so // users still see the final prompt/output byte-for-byte (lossy at // worst on a deliberately malformed tail). - if !leftover.is_empty() { - let tail = String::from_utf8_lossy(&leftover).into_owned(); + if !combined.is_empty() { + let tail = String::from_utf8_lossy(&combined).into_owned(); let _ = app.emit("pty-output", tail); } }); @@ -217,18 +222,17 @@ mod tests { /// the full assembly path, not just the split helper in isolation. fn drain(bytes: &[u8], chunk: usize) -> String { let mut out = String::new(); - let mut leftover: Vec = Vec::new(); + let mut combined: Vec = Vec::new(); for window in bytes.chunks(chunk) { - let mut combined = std::mem::take(&mut leftover); combined.extend_from_slice(window); let split = split_utf8_safely(&combined); if split > 0 { out.push_str(&String::from_utf8_lossy(&combined[..split])); + combined.drain(..split); } - leftover = combined[split..].to_vec(); } - if !leftover.is_empty() { - out.push_str(&String::from_utf8_lossy(&leftover)); + if !combined.is_empty() { + out.push_str(&String::from_utf8_lossy(&combined)); } out } @@ -287,6 +291,30 @@ mod tests { assert!(out.contains('\u{FFFD}')); } + #[test] + fn invalid_byte_mid_chunk_does_not_delay_valid_trailing_bytes() { + // Regression guard for the "hold everything after invalid as leftover" + // bug: when a read contains `[valid_prefix, invalid_byte, valid_suffix]` + // we must emit all of it in the same iteration (the valid suffix + // shouldn't be withheld until the next read). We simulate a single + // read by giving `drain` a chunk size larger than the whole input. + let mut input = b"before-".to_vec(); + input.push(0xFF); + input.extend_from_slice(b"-after"); + // Chunk size = input.len() means drain hits the last iteration with + // the full buffer present; if the decoder withheld "-after" it would + // only surface in the final post-loop flush, but the emit-before- + // flush assertion still catches it. + let out = drain(&input, input.len()); + assert_eq!( + out.chars().filter(|&c| c == '\u{FFFD}').count(), + 1, + "exactly one replacement char expected: {out:?}" + ); + assert!(out.starts_with("before-")); + assert!(out.ends_with("-after")); + } + #[test] fn empty_input_emits_nothing() { assert_eq!(drain(b"", 1), "");