From f85a79061e8ea3e13d492d9362bfefbe44d3b7f6 Mon Sep 17 00:00:00 2001
From: Matias Palma <matiaspalma2594@gmail.com>
Date: Tue, 21 Apr 2026 00:37:52 -0400
Subject: [PATCH 1/2] fix: preserve UTF-8 codepoints across PTY 4 KiB read
 boundaries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PTY reader thread was calling `String::from_utf8_lossy` on each raw
4096-byte chunk straight from the OS. That works for ASCII but breaks
on any multi-byte UTF-8 sequence that happens to straddle a chunk
boundary: the last byte(s) of a codepoint end up decoded as the lead
of an "incomplete" sequence and get replaced with U+FFFD, so the
terminal shows `�` for accented letters, CJK text, emoji, box-drawing
characters, and most non-English shell prompts whenever the read
splits on a bad offset.

The fix is a small leftover buffer: on every read we combine any held
bytes with the new chunk, find the longest valid UTF-8 prefix, emit
that, and keep the 1–3 trailing truncated bytes for the next read.
Actually invalid sequences are still replaced with U+FFFD so one bad
byte cannot stall the stream forever, and the leftover is flushed on
EOF/error so the final prompt byte is never silently swallowed.

Extracted the decode rule as `split_utf8_safely` and covered it with
unit tests for ASCII, 4-byte codepoints split at every internal
boundary, mixed multibyte runs with 1-byte reads, invalid bytes,
empty input, and a truncated tail on close.

Fixes #70
---
 apps/desktop/src-tauri/src/pty.rs | 150 +++++++++++++++++++++++++++++-
 1 file changed, 147 insertions(+), 3 deletions(-)
diff --git a/apps/desktop/src-tauri/src/pty.rs b/apps/desktop/src-tauri/src/pty.rs
index a80e4beb..43786b4c 100644
--- a/apps/desktop/src-tauri/src/pty.rs
+++ b/apps/desktop/src-tauri/src/pty.rs
@@ -7,6 +7,25 @@ use std::{
 };
 use tauri::{AppHandle, Emitter, Runtime};
 
+/// Returns the length in bytes of the longest prefix of `buf` that is safe to
+/// decode right now. Callers are expected to emit `buf[..split]` and retain
+/// `buf[split..]` as leftover bytes to prepend to the next chunk.
+///
+/// Rules:
+/// - If the whole buffer is valid UTF-8, the whole buffer is emitted.
+/// - If the tail is a *truncated* multi-byte sequence (incomplete but still
+///   possibly valid), the truncated tail is held back for the next read.
+/// - If the tail is *actually invalid*, the invalid bytes are included in the
+///   emit so lossy decoding replaces them with U+FFFD — otherwise one bad byte
+///   would stall the stream forever.
+fn split_utf8_safely(buf: &[u8]) -> usize {
+    match std::str::from_utf8(buf) {
+        Ok(_) => buf.len(),
+        Err(e) if e.error_len().is_none() => e.valid_up_to(),
+        Err(e) => e.valid_up_to() + e.error_len().unwrap_or(1),
+    }
+}
+
 pub struct PtyState {
     pub writer: Arc<Mutex<Box<dyn Write + Send>>>,
     pub master: Box<dyn portable_pty::MasterPty + Send>,
@@ -96,15 +115,34 @@ pub fn spawn_pty<R: Runtime>(
         err
     })? = Some(pty_state);
 
-    // Spawn a thread to read from the PTY and emit to the frontend
+    // Spawn a thread to read from the PTY and emit to the frontend.
+    //
+    // Why the leftover buffer: a single UTF-8 codepoint can span up to 4
+    // bytes, and we read in fixed 4 KiB chunks. When a multi-byte character
+    // straddles the chunk boundary, decoding each chunk independently with
+    // `from_utf8_lossy` produces U+FFFD replacement characters (showing up
+    // as `�` in the terminal) even though the byte stream itself is
+    // perfectly valid. We hold back the trailing truncated bytes and
+    // prepend them to the next read so codepoints are never split. See
+    // `split_utf8_safely` for the decoding rules.
     thread::spawn(move || {
         let mut buffer = [0u8; 4096];
+        // At most 3 bytes can be waiting for completion (4-byte UTF-8
+        // codepoints have 3 bytes after the lead), so capacity 4 is plenty.
+        let mut leftover: Vec<u8> = Vec::with_capacity(4);
         loop {
             match reader.read(&mut buffer) {
                 Ok(0) => break,
                 Ok(n) => {
-                    let data = String::from_utf8_lossy(&buffer[..n]).to_string();
-                    let _ = app.emit("pty-output", data);
+                    let mut combined = std::mem::take(&mut leftover);
+                    combined.extend_from_slice(&buffer[..n]);
+
+                    let split = split_utf8_safely(&combined);
+                    if split > 0 {
+                        let chunk = String::from_utf8_lossy(&combined[..split]).into_owned();
+                        let _ = app.emit("pty-output", chunk);
+                    }
+                    leftover = combined[split..].to_vec();
                 }
                 Err(e) => {
                     error!("PTY reader error: {}", e);
@@ -112,6 +150,13 @@ pub fn spawn_pty<R: Runtime>(
                 }
             }
         }
+        // Flush any trailing truncated bytes the child never completed so
+        // users still see the final prompt/output byte-for-byte (lossy at
+        // worst on a deliberately malformed tail).
+        if !leftover.is_empty() {
+            let tail = String::from_utf8_lossy(&leftover).into_owned();
+            let _ = app.emit("pty-output", tail);
+        }
     });
 
     Ok(())
@@ -161,3 +206,102 @@ pub fn kill_pty(state: tauri::State<'_, Arc<Mutex<Option<PtyState>>>>) -> Result
     *guard = None; // Drops PtyState — closes master PTY and terminates child
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::split_utf8_safely;
+
+    /// Walks `bytes` in fixed `chunk` sized reads, feeding each slice through
+    /// the same leftover-aware decoder the PTY thread uses, and returns the
+    /// concatenated emitted string. Mirrors the reader loop so tests exercise
+    /// the full assembly path, not just the split helper in isolation.
+    fn drain(bytes: &[u8], chunk: usize) -> String {
+        let mut out = String::new();
+        let mut leftover: Vec<u8> = Vec::new();
+        for window in bytes.chunks(chunk) {
+            let mut combined = std::mem::take(&mut leftover);
+            combined.extend_from_slice(window);
+            let split = split_utf8_safely(&combined);
+            if split > 0 {
+                out.push_str(&String::from_utf8_lossy(&combined[..split]));
+            }
+            leftover = combined[split..].to_vec();
+        }
+        if !leftover.is_empty() {
+            out.push_str(&String::from_utf8_lossy(&leftover));
+        }
+        out
+    }
+
+    #[test]
+    fn pure_ascii_passes_through_any_chunking() {
+        let input = b"hello world from the pty";
+        for chunk in 1..=input.len() {
+            assert_eq!(drain(input, chunk), "hello world from the pty");
+        }
+    }
+
+    #[test]
+    fn four_byte_codepoint_split_across_reads() {
+        // U+1F600 "😀" encodes to F0 9F 98 80 — a four-byte sequence.
+        let smiley = "😀";
+        let bytes = smiley.as_bytes();
+        assert_eq!(bytes.len(), 4);
+
+        // Any split 1..=3 would have been substituted with U+FFFD by the
+        // previous naive `from_utf8_lossy(&buffer[..n])` call.
+        for boundary in 1..=3 {
+            let mut input = b"pre-".to_vec();
+            input.extend_from_slice(&bytes[..boundary]);
+            // Simulate the read coming back in two chunks exactly at `boundary`
+            // inside the codepoint.
+            input.extend_from_slice(&bytes[boundary..]);
+            input.extend_from_slice(b"-post");
+
+            // Feed at a chunk size that guarantees the boundary lands inside
+            // the emoji.
+            let chunk = "pre-".len() + boundary;
+            assert_eq!(drain(&input, chunk), "pre-😀-post");
+        }
+    }
+
+    #[test]
+    fn multiple_multibyte_codepoints_split_repeatedly() {
+        // Mix 2-, 3- and 4-byte sequences, and force 1-byte reads so every
+        // boundary is mid-codepoint.
+        let input = "café — 日本語 — 😀🚀".as_bytes();
+        assert_eq!(drain(input, 1), "café — 日本語 — 😀🚀");
+    }
+
+    #[test]
+    fn invalid_bytes_become_replacement_and_do_not_stall() {
+        // 0xFF is never legal in UTF-8. We want the decoder to replace it
+        // with U+FFFD and keep making forward progress instead of holding
+        // it back as "maybe the next read completes it".
+        let mut input = b"ok-".to_vec();
+        input.push(0xFF);
+        input.extend_from_slice(b"-more");
+        let out = drain(&input, 4);
+        assert!(out.starts_with("ok-"));
+        assert!(out.ends_with("-more"));
+        assert!(out.contains('\u{FFFD}'));
+    }
+
+    #[test]
+    fn empty_input_emits_nothing() {
+        assert_eq!(drain(b"", 1), "");
+        assert_eq!(drain(b"", 4096), "");
+    }
+
+    #[test]
+    fn truncated_tail_is_flushed_on_close() {
+        // Stream ends mid-codepoint (the child crashed or we hit EOF). The
+        // incomplete bytes should still surface, just lossily — better than
+        // silently swallowing the tail.
+        let mut input = b"done ".to_vec();
+        input.extend_from_slice(&"😀".as_bytes()[..2]);
+        let out = drain(&input, 8);
+        assert!(out.starts_with("done "));
+        assert!(out.contains('\u{FFFD}'));
+    }
+}

From dc63b040c67755b4b245f6c288743b01f318ff13 Mon Sep 17 00:00:00 2001
From: Matias Palma <matiaspalma2594@gmail.com>
Date: Tue, 21 Apr 2026 01:17:26 -0400
Subject: [PATCH 2/2] perf(pty): reuse assembly buffer and stop withholding
 post-invalid bytes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Copilot review on #173 flagged three issues on the reader thread, all
valid:

1. Per-iteration allocations — `combined` was rebuilt with
   `std::mem::take(&mut leftover)` + `extend_from_slice` + a
   `combined[split..].to_vec()` on every read, so a busy shell paid
   two Vec allocations per 4 KiB chunk.

2. Post-invalid bytes delayed until the next read — when a chunk was
   `[valid][invalid][valid]`, the old split rule used
   `valid_up_to + error_len` as the emit boundary and kept the
   trailing valid suffix as `leftover`. That did not lose data but
   delayed it by a full read.

3. Docstring for `split_utf8_safely` said "truncated tail" but the
   code also branched on mid-buffer invalid sequences, which did not
   match the contract.

Fix:

- `split_utf8_safely` now returns `buf.len()` for any invalid UTF-8
  that is not a truncated tail, so callers emit the whole chunk
  (`from_utf8_lossy` substitutes `U+FFFD` for the bad bytes) instead
  of holding the tail back.
- The reader thread now keeps a single `Vec<u8>` assembly buffer with
  capacity `4096 + 4` and uses `drain(..split)` to consume emitted
  bytes in place — no allocation per read.
- Docstring rewritten to match the new contract.
- Added a regression test (`invalid_byte_mid_chunk_does_not_delay_valid_trailing_bytes`)
  that would fail under the old implementation.
---
 apps/desktop/src-tauri/src/pty.rs | 74 +++++++++++++++++++++----------
 1 file changed, 51 insertions(+), 23 deletions(-)

diff --git a/apps/desktop/src-tauri/src/pty.rs b/apps/desktop/src-tauri/src/pty.rs
index 43786b4c..e8f9cce9 100644
--- a/apps/desktop/src-tauri/src/pty.rs
+++ b/apps/desktop/src-tauri/src/pty.rs
@@ -7,22 +7,23 @@ use std::{
 };
 use tauri::{AppHandle, Emitter, Runtime};
 
-/// Returns the length in bytes of the longest prefix of `buf` that is safe to
-/// decode right now. Callers are expected to emit `buf[..split]` and retain
-/// `buf[split..]` as leftover bytes to prepend to the next chunk.
+/// Returns the length in bytes of the longest prefix of `buf` that should be
+/// emitted right now. Callers are expected to emit `buf[..split]` through
+/// `from_utf8_lossy` and retain `buf[split..]` as leftover to prepend to the
+/// next read.
 ///
-/// Rules:
-/// - If the whole buffer is valid UTF-8, the whole buffer is emitted.
-/// - If the tail is a *truncated* multi-byte sequence (incomplete but still
-///   possibly valid), the truncated tail is held back for the next read.
-/// - If the tail is *actually invalid*, the invalid bytes are included in the
-///   emit so lossy decoding replaces them with U+FFFD — otherwise one bad byte
-///   would stall the stream forever.
+/// The only case we hold bytes back is a *truncated* UTF-8 sequence at the
+/// very end of `buf` (where `str::from_utf8` reports `error_len() == None`):
+/// the next read may complete it into a valid codepoint. If the buffer
+/// contains invalid UTF-8 *anywhere else* we return `buf.len()` and let the
+/// caller emit the whole thing — `from_utf8_lossy` replaces the bad bytes
+/// with `U+FFFD` and valid bytes that follow the invalid byte in the same
+/// chunk are not delayed to the next read.
 fn split_utf8_safely(buf: &[u8]) -> usize {
     match std::str::from_utf8(buf) {
         Ok(_) => buf.len(),
         Err(e) if e.error_len().is_none() => e.valid_up_to(),
-        Err(e) => e.valid_up_to() + e.error_len().unwrap_or(1),
+        Err(_) => buf.len(),
     }
 }
 
@@ -127,22 +128,26 @@ pub fn spawn_pty<R: Runtime>(
     // `split_utf8_safely` for the decoding rules.
     thread::spawn(move || {
         let mut buffer = [0u8; 4096];
-        // At most 3 bytes can be waiting for completion (4-byte UTF-8
-        // codepoints have 3 bytes after the lead), so capacity 4 is plenty.
-        let mut leftover: Vec<u8> = Vec::with_capacity(4);
+        // Reuse the same assembly buffer across reads so a busy shell does
+        // not force one allocation per iteration. Capacity is chunk size +
+        // the maximum UTF-8 sequence length (4), so a full read plus a
+        // 3-byte truncated leftover never has to grow.
+        let mut combined: Vec<u8> = Vec::with_capacity(buffer.len() + 4);
         loop {
             match reader.read(&mut buffer) {
                 Ok(0) => break,
                 Ok(n) => {
-                    let mut combined = std::mem::take(&mut leftover);
                     combined.extend_from_slice(&buffer[..n]);
 
                     let split = split_utf8_safely(&combined);
                     if split > 0 {
                         let chunk = String::from_utf8_lossy(&combined[..split]).into_owned();
                         let _ = app.emit("pty-output", chunk);
+                        // Drop the emitted prefix in place; the tail (at
+                        // most 3 bytes of a truncated codepoint) stays put
+                        // for the next read.
+                        combined.drain(..split);
                     }
-                    leftover = combined[split..].to_vec();
                 }
                 Err(e) => {
                     error!("PTY reader error: {}", e);
@@ -153,8 +158,8 @@ pub fn spawn_pty<R: Runtime>(
         // Flush any trailing truncated bytes the child never completed so
         // users still see the final prompt/output byte-for-byte (lossy at
         // worst on a deliberately malformed tail).
-        if !leftover.is_empty() {
-            let tail = String::from_utf8_lossy(&leftover).into_owned();
+        if !combined.is_empty() {
+            let tail = String::from_utf8_lossy(&combined).into_owned();
             let _ = app.emit("pty-output", tail);
         }
     });
@@ -217,18 +222,17 @@ mod tests {
     /// the full assembly path, not just the split helper in isolation.
     fn drain(bytes: &[u8], chunk: usize) -> String {
         let mut out = String::new();
-        let mut leftover: Vec<u8> = Vec::new();
+        let mut combined: Vec<u8> = Vec::new();
         for window in bytes.chunks(chunk) {
-            let mut combined = std::mem::take(&mut leftover);
             combined.extend_from_slice(window);
             let split = split_utf8_safely(&combined);
             if split > 0 {
                 out.push_str(&String::from_utf8_lossy(&combined[..split]));
+                combined.drain(..split);
             }
-            leftover = combined[split..].to_vec();
         }
-        if !leftover.is_empty() {
-            out.push_str(&String::from_utf8_lossy(&leftover));
+        if !combined.is_empty() {
+            out.push_str(&String::from_utf8_lossy(&combined));
         }
         out
     }
@@ -287,6 +291,30 @@ mod tests {
         assert!(out.contains('\u{FFFD}'));
     }
 
+    #[test]
+    fn invalid_byte_mid_chunk_does_not_delay_valid_trailing_bytes() {
+        // Regression guard for the "hold everything after invalid as leftover"
+        // bug: when a read contains `[valid_prefix, invalid_byte, valid_suffix]`
+        // we must emit all of it in the same iteration (the valid suffix
+        // shouldn't be withheld until the next read). We simulate a single
+        // read by giving `drain` a chunk size larger than the whole input.
+        let mut input = b"before-".to_vec();
+        input.push(0xFF);
+        input.extend_from_slice(b"-after");
+        // Chunk size = input.len() means drain hits the last iteration with
+        // the full buffer present; if the decoder withheld "-after" it would
+        // only surface in the final post-loop flush, but the emit-before-
+        // flush assertion still catches it.
+        let out = drain(&input, input.len());
+        assert_eq!(
+            out.chars().filter(|&c| c == '\u{FFFD}').count(),
+            1,
+            "exactly one replacement char expected: {out:?}"
+        );
+        assert!(out.starts_with("before-"));
+        assert!(out.ends_with("-after"));
+    }
+
     #[test]
     fn empty_input_emits_nothing() {
         assert_eq!(drain(b"", 1), "");