diff --git a/apps/desktop/src-tauri/src/pty.rs b/apps/desktop/src-tauri/src/pty.rs index a80e4beb..e8f9cce9 100644 --- a/apps/desktop/src-tauri/src/pty.rs +++ b/apps/desktop/src-tauri/src/pty.rs @@ -7,6 +7,26 @@ use std::{ }; use tauri::{AppHandle, Emitter, Runtime}; +/// Returns the length in bytes of the longest prefix of `buf` that should be +/// emitted right now. Callers are expected to emit `buf[..split]` through +/// `from_utf8_lossy` and retain `buf[split..]` as leftover to prepend to the +/// next read. +/// +/// The only case we hold bytes back is a *truncated* UTF-8 sequence at the +/// very end of `buf` (where `str::from_utf8` reports `error_len() == None`): +/// the next read may complete it into a valid codepoint. If the buffer +/// contains invalid UTF-8 *anywhere else* we return `buf.len()` and let the +/// caller emit the whole thing — `from_utf8_lossy` replaces the bad bytes +/// with `U+FFFD` and valid bytes that follow the invalid byte in the same +/// chunk are not delayed to the next read. +fn split_utf8_safely(buf: &[u8]) -> usize { + match std::str::from_utf8(buf) { + Ok(_) => buf.len(), + Err(e) if e.error_len().is_none() => e.valid_up_to(), + Err(_) => buf.len(), + } +} + pub struct PtyState { pub writer: Arc>>, pub master: Box, @@ -96,15 +116,38 @@ pub fn spawn_pty( err })? = Some(pty_state); - // Spawn a thread to read from the PTY and emit to the frontend + // Spawn a thread to read from the PTY and emit to the frontend. + // + // Why the leftover buffer: a single UTF-8 codepoint can span up to 4 + // bytes, and we read in fixed 4 KiB chunks. When a multi-byte character + // straddles the chunk boundary, decoding each chunk independently with + // `from_utf8_lossy` produces U+FFFD replacement characters (showing up + // as `�` in the terminal) even though the byte stream itself is + // perfectly valid. We hold back the trailing truncated bytes and + // prepend them to the next read so codepoints are never split. See + // `split_utf8_safely` for the decoding rules. thread::spawn(move || { let mut buffer = [0u8; 4096]; + // Reuse the same assembly buffer across reads so a busy shell does + // not force one allocation per iteration. Capacity is chunk size + + // the maximum UTF-8 sequence length (4), so a full read plus a + // 3-byte truncated leftover never has to grow. + let mut combined: Vec = Vec::with_capacity(buffer.len() + 4); loop { match reader.read(&mut buffer) { Ok(0) => break, Ok(n) => { - let data = String::from_utf8_lossy(&buffer[..n]).to_string(); - let _ = app.emit("pty-output", data); + combined.extend_from_slice(&buffer[..n]); + + let split = split_utf8_safely(&combined); + if split > 0 { + let chunk = String::from_utf8_lossy(&combined[..split]).into_owned(); + let _ = app.emit("pty-output", chunk); + // Drop the emitted prefix in place; the tail (at + // most 3 bytes of a truncated codepoint) stays put + // for the next read. + combined.drain(..split); + } } Err(e) => { error!("PTY reader error: {}", e); @@ -112,6 +155,13 @@ pub fn spawn_pty( } } } + // Flush any trailing truncated bytes the child never completed so + // users still see the final prompt/output byte-for-byte (lossy at + // worst on a deliberately malformed tail). + if !combined.is_empty() { + let tail = String::from_utf8_lossy(&combined).into_owned(); + let _ = app.emit("pty-output", tail); + } }); Ok(()) @@ -161,3 +211,125 @@ pub fn kill_pty(state: tauri::State<'_, Arc>>>) -> Result *guard = None; // Drops PtyState — closes master PTY and terminates child Ok(()) } + +#[cfg(test)] +mod tests { + use super::split_utf8_safely; + + /// Walks `bytes` in fixed `chunk` sized reads, feeding each slice through + /// the same leftover-aware decoder the PTY thread uses, and returns the + /// concatenated emitted string. Mirrors the reader loop so tests exercise + /// the full assembly path, not just the split helper in isolation. + fn drain(bytes: &[u8], chunk: usize) -> String { + let mut out = String::new(); + let mut combined: Vec = Vec::new(); + for window in bytes.chunks(chunk) { + combined.extend_from_slice(window); + let split = split_utf8_safely(&combined); + if split > 0 { + out.push_str(&String::from_utf8_lossy(&combined[..split])); + combined.drain(..split); + } + } + if !combined.is_empty() { + out.push_str(&String::from_utf8_lossy(&combined)); + } + out + } + + #[test] + fn pure_ascii_passes_through_any_chunking() { + let input = b"hello world from the pty"; + for chunk in 1..=input.len() { + assert_eq!(drain(input, chunk), "hello world from the pty"); + } + } + + #[test] + fn four_byte_codepoint_split_across_reads() { + // U+1F600 "😀" encodes to F0 9F 98 80 — a four-byte sequence. + let smiley = "😀"; + let bytes = smiley.as_bytes(); + assert_eq!(bytes.len(), 4); + + // Any split 1..=3 would have been substituted with U+FFFD by the + // previous naive `from_utf8_lossy(&buffer[..n])` call. + for boundary in 1..=3 { + let mut input = b"pre-".to_vec(); + input.extend_from_slice(&bytes[..boundary]); + // Simulate the read coming back in two chunks exactly at `boundary` + // inside the codepoint. + input.extend_from_slice(&bytes[boundary..]); + input.extend_from_slice(b"-post"); + + // Feed at a chunk size that guarantees the boundary lands inside + // the emoji. + let chunk = "pre-".len() + boundary; + assert_eq!(drain(&input, chunk), "pre-😀-post"); + } + } + + #[test] + fn multiple_multibyte_codepoints_split_repeatedly() { + // Mix 2-, 3- and 4-byte sequences, and force 1-byte reads so every + // boundary is mid-codepoint. + let input = "café — 日本語 — 😀🚀".as_bytes(); + assert_eq!(drain(input, 1), "café — 日本語 — 😀🚀"); + } + + #[test] + fn invalid_bytes_become_replacement_and_do_not_stall() { + // 0xFF is never legal in UTF-8. We want the decoder to replace it + // with U+FFFD and keep making forward progress instead of holding + // it back as "maybe the next read completes it". + let mut input = b"ok-".to_vec(); + input.push(0xFF); + input.extend_from_slice(b"-more"); + let out = drain(&input, 4); + assert!(out.starts_with("ok-")); + assert!(out.ends_with("-more")); + assert!(out.contains('\u{FFFD}')); + } + + #[test] + fn invalid_byte_mid_chunk_does_not_delay_valid_trailing_bytes() { + // Regression guard for the "hold everything after invalid as leftover" + // bug: when a read contains `[valid_prefix, invalid_byte, valid_suffix]` + // we must emit all of it in the same iteration (the valid suffix + // shouldn't be withheld until the next read). We simulate a single + // read by giving `drain` a chunk size larger than the whole input. + let mut input = b"before-".to_vec(); + input.push(0xFF); + input.extend_from_slice(b"-after"); + // Chunk size = input.len() means drain hits the last iteration with + // the full buffer present; if the decoder withheld "-after" it would + // only surface in the final post-loop flush, but the emit-before- + // flush assertion still catches it. + let out = drain(&input, input.len()); + assert_eq!( + out.chars().filter(|&c| c == '\u{FFFD}').count(), + 1, + "exactly one replacement char expected: {out:?}" + ); + assert!(out.starts_with("before-")); + assert!(out.ends_with("-after")); + } + + #[test] + fn empty_input_emits_nothing() { + assert_eq!(drain(b"", 1), ""); + assert_eq!(drain(b"", 4096), ""); + } + + #[test] + fn truncated_tail_is_flushed_on_close() { + // Stream ends mid-codepoint (the child crashed or we hit EOF). The + // incomplete bytes should still surface, just lossily — better than + // silently swallowing the tail. + let mut input = b"done ".to_vec(); + input.extend_from_slice(&"😀".as_bytes()[..2]); + let out = drain(&input, 8); + assert!(out.starts_with("done ")); + assert!(out.contains('\u{FFFD}')); + } +}