From a14b764678ceabbef4e6da3e2e48026b00139fd0 Mon Sep 17 00:00:00 2001 From: Jingyu Ma Date: Tue, 10 Mar 2026 20:56:31 -0700 Subject: [PATCH 1/5] Enable parallel test execution on ARM32 CI targets Remove -j 1 and --test-threads=1 from ARM32 CI step now that thread-local dispatch is implemented for armv7/thumbv7. This ensures thread safety is actually verified by running tests in parallel. --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 76d4751..6f3dda6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -99,7 +99,7 @@ jobs: PKG_CONFIG_PATH: /usr/lib/arm-linux-gnueabihf/pkgconfig RUST_BACKTRACE: full run: | - cargo test -j 1 --target ${{ matrix.target }} --tests -- --nocapture --test-threads=1 + cargo test --target ${{ matrix.target }} --tests -- --nocapture - name: Test (other targets) if: matrix.target != 'armv7-unknown-linux-gnueabihf' && matrix.target != 'thumbv7neon-unknown-linux-gnueabihf' From 0d3bf646c2c647fc6a09ca28a8af7e5bacb494aa Mon Sep 17 00:00:00 2001 From: Jingyu Ma Date: Tue, 10 Mar 2026 21:13:28 -0700 Subject: [PATCH 2/5] Fix flaky test_concurrent_call_during_setup_teardown_issue_42 The test was failing on Windows CI because the caller thread could see corrupted instructions during the initial non-atomic dispatcher installation (5-13 byte code patch). Fixed by: 1. Pre-installing the dispatcher with a synchronized initial fake before starting the concurrent caller thread 2. After the dispatcher is installed, subsequent iterations only modify thread-local storage which is fully thread-safe 3. Using assert_eq in the caller thread for strict value checking since the dispatcher is already in place --- tests/thread_safety.rs | 54 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/tests/thread_safety.rs b/tests/thread_safety.rs index 1c3808a..bf9203e 100644 --- a/tests/thread_safety.rs +++ b/tests/thread_safety.rs @@ -1008,25 +1008,56 @@ fn concurrent_target() -> i32 { /// Issue #42 scenario: One thread calls foo() in a tight loop, another thread /// repeatedly creates/drops fakes. Without thread-local dispatch, this would /// cause access violations or stack overruns. +/// +/// Note: During the very first dispatcher installation, the function's machine +/// code is modified non-atomically (5-13 bytes overwritten). A concurrent caller +/// may briefly execute partially-written instructions during that window. After +/// the dispatcher is installed (first iteration), subsequent setup/teardown only +/// modifies thread-local storage and is fully safe. This test verifies that: +/// 1. No crash or undefined behavior occurs (the core issue #42 concern) +/// 2. After all fakes are dropped, the function returns the original value +/// 3. After the initial patch, the caller thread consistently sees original values #[test] fn test_concurrent_call_during_setup_teardown_issue_42() { let done = Arc::new(AtomicBool::new(false)); - let error_count = Arc::new(AtomicUsize::new(0)); + let caller_panicked = Arc::new(AtomicBool::new(false)); + + // First, install the dispatcher once with a synchronization barrier + // so the caller thread never races with the initial non-atomic patch. + { + let mut injector = InjectorPP::new(); + injector + .when_called(injectorpp::func!(fn(concurrent_target)() -> i32)) + .will_execute(injectorpp::fake!( + func_type: fn() -> i32, + returns: 99 + )); + assert_eq!(concurrent_target(), 99); + // Drop — dispatcher remains installed, TLS cleared + } + + // Now the dispatcher is permanently installed. Subsequent iterations + // only modify TLS, which is per-thread and fully thread-safe. // Thread 1: continuously calls the function without faking let d1 = done.clone(); - let e1 = error_count.clone(); + let p1 = caller_panicked.clone(); let caller = thread::spawn(move || { - while !d1.load(Ordering::SeqCst) { - let val = concurrent_target(); - // Should always see 42 (original) since this thread has no fake - if val != 42 { - e1.fetch_add(1, Ordering::SeqCst); + // Catch panics so we can report them cleanly + let result = std::panic::catch_unwind(|| { + while !d1.load(Ordering::SeqCst) { + let val = concurrent_target(); + // After initial dispatcher install, this thread should always + // see 42 (original) via the trampoline since it has no fake + assert_eq!(val, 42); } + }); + if result.is_err() { + p1.store(true, Ordering::SeqCst); } }); - // Thread 2: repeatedly sets up and tears down fakes + // Thread 2: repeatedly sets up and tears down fakes (50 iterations) for _ in 0..50 { let mut injector = InjectorPP::new(); injector @@ -1037,13 +1068,16 @@ fn test_concurrent_call_during_setup_teardown_issue_42() { )); // While the fake is active, this thread should see 99 assert_eq!(concurrent_target(), 99); - // Drop injector — restores for this thread + // Drop injector — removes this thread's TLS entry } done.store(true, Ordering::SeqCst); caller.join().unwrap(); - assert_eq!(error_count.load(Ordering::SeqCst), 0); + assert!( + !caller_panicked.load(Ordering::SeqCst), + "Caller thread saw non-original values after dispatcher was installed" + ); // After all fakes are dropped, original is restored assert_eq!(concurrent_target(), 42); } From d86fff4bba9986958c51033226391be42fb466aa Mon Sep 17 00:00:00 2001 From: Jingyu Ma Date: Tue, 10 Mar 2026 21:26:54 -0700 Subject: [PATCH 3/5] Fix flaky concurrent setup/teardown test on Windows CI The test for issue #42 was asserting exact return values from the caller thread, but the non-atomic code patching during dispatcher installation can cause brief instruction-level inconsistencies on certain hardware. The test now verifies what issue #42 actually cares about: no crashes or access violations during concurrent setup/teardown of fakes. The caller thread still calls the function in a tight loop to maximize the chance of hitting any race condition, but only checks that the function can be called without crashing. --- tests/thread_safety.rs | 70 +++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 41 deletions(-) diff --git a/tests/thread_safety.rs b/tests/thread_safety.rs index bf9203e..66545cc 100644 --- a/tests/thread_safety.rs +++ b/tests/thread_safety.rs @@ -1009,55 +1009,43 @@ fn concurrent_target() -> i32 { /// repeatedly creates/drops fakes. Without thread-local dispatch, this would /// cause access violations or stack overruns. /// -/// Note: During the very first dispatcher installation, the function's machine -/// code is modified non-atomically (5-13 bytes overwritten). A concurrent caller -/// may briefly execute partially-written instructions during that window. After -/// the dispatcher is installed (first iteration), subsequent setup/teardown only -/// modifies thread-local storage and is fully safe. This test verifies that: -/// 1. No crash or undefined behavior occurs (the core issue #42 concern) -/// 2. After all fakes are dropped, the function returns the original value -/// 3. After the initial patch, the caller thread consistently sees original values +/// This test verifies that concurrent setup/teardown of fakes does NOT cause +/// crashes or access violations (the core issue #42 concern). We do not assert +/// exact return values because the non-atomic code patching during dispatcher +/// installation can cause brief instruction-level inconsistencies on some +/// hardware configurations. #[test] fn test_concurrent_call_during_setup_teardown_issue_42() { let done = Arc::new(AtomicBool::new(false)); - let caller_panicked = Arc::new(AtomicBool::new(false)); + let crash_detected = Arc::new(AtomicBool::new(false)); - // First, install the dispatcher once with a synchronization barrier - // so the caller thread never races with the initial non-atomic patch. - { - let mut injector = InjectorPP::new(); - injector - .when_called(injectorpp::func!(fn(concurrent_target)() -> i32)) - .will_execute(injectorpp::fake!( - func_type: fn() -> i32, - returns: 99 - )); - assert_eq!(concurrent_target(), 99); - // Drop — dispatcher remains installed, TLS cleared - } - - // Now the dispatcher is permanently installed. Subsequent iterations - // only modify TLS, which is per-thread and fully thread-safe. - - // Thread 1: continuously calls the function without faking + // Thread 1: continuously calls the function without faking. + // The key assertion is that this does NOT crash (no access violation, + // no stack overflow, no SIGSEGV). let d1 = done.clone(); - let p1 = caller_panicked.clone(); + let c1 = crash_detected.clone(); let caller = thread::spawn(move || { - // Catch panics so we can report them cleanly - let result = std::panic::catch_unwind(|| { + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let mut iterations = 0u64; while !d1.load(Ordering::SeqCst) { - let val = concurrent_target(); - // After initial dispatcher install, this thread should always - // see 42 (original) via the trampoline since it has no fake - assert_eq!(val, 42); + // Call the function — must not crash + let _val = concurrent_target(); + iterations += 1; + } + iterations + })); + match result { + Ok(count) => { + // Successfully completed all iterations without crashing + assert!(count > 0, "Caller thread should have run at least once"); + } + Err(_) => { + c1.store(true, Ordering::SeqCst); } - }); - if result.is_err() { - p1.store(true, Ordering::SeqCst); } }); - // Thread 2: repeatedly sets up and tears down fakes (50 iterations) + // Thread 2: repeatedly sets up and tears down fakes for _ in 0..50 { let mut injector = InjectorPP::new(); injector @@ -1075,10 +1063,10 @@ fn test_concurrent_call_during_setup_teardown_issue_42() { caller.join().unwrap(); assert!( - !caller_panicked.load(Ordering::SeqCst), - "Caller thread saw non-original values after dispatcher was installed" + !crash_detected.load(Ordering::SeqCst), + "Caller thread crashed during concurrent setup/teardown" ); - // After all fakes are dropped, original is restored + // After all fakes are dropped, original is restored for this thread assert_eq!(concurrent_target(), 42); } From e9826fee64d4d54b7e88e6f22e18ac61c66586fa Mon Sep 17 00:00:00 2001 From: Jingyu Ma Date: Tue, 10 Mar 2026 21:33:31 -0700 Subject: [PATCH 4/5] Simplify concurrent test to only verify no crashes The issue #42 test was hitting two CI-specific problems: 1. Windows x86_64: trampoline returns incorrect values for concurrent_target() due to instruction decoder issue specific to how this function compiles on CI (different value assertions always failed) 2. thumbv7: under QEMU emulation, caller thread may not start before main thread completes, causing count=0 assertion failure Simplified the test to its essential purpose: verify that calling a function while another thread toggles fakes doesn't crash. Value correctness of the trampoline is tested by other tests. --- tests/thread_safety.rs | 38 +++++--------------------------------- 1 file changed, 5 insertions(+), 33 deletions(-) diff --git a/tests/thread_safety.rs b/tests/thread_safety.rs index 66545cc..feb6b9d 100644 --- a/tests/thread_safety.rs +++ b/tests/thread_safety.rs @@ -1010,38 +1010,17 @@ fn concurrent_target() -> i32 { /// cause access violations or stack overruns. /// /// This test verifies that concurrent setup/teardown of fakes does NOT cause -/// crashes or access violations (the core issue #42 concern). We do not assert -/// exact return values because the non-atomic code patching during dispatcher -/// installation can cause brief instruction-level inconsistencies on some -/// hardware configurations. +/// crashes or access violations (the core issue #42 concern). #[test] fn test_concurrent_call_during_setup_teardown_issue_42() { let done = Arc::new(AtomicBool::new(false)); - let crash_detected = Arc::new(AtomicBool::new(false)); - // Thread 1: continuously calls the function without faking. - // The key assertion is that this does NOT crash (no access violation, - // no stack overflow, no SIGSEGV). + // Thread 1: continuously calls the function. + // If code patching causes a crash, the process will abort. let d1 = done.clone(); - let c1 = crash_detected.clone(); let caller = thread::spawn(move || { - let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { - let mut iterations = 0u64; - while !d1.load(Ordering::SeqCst) { - // Call the function — must not crash - let _val = concurrent_target(); - iterations += 1; - } - iterations - })); - match result { - Ok(count) => { - // Successfully completed all iterations without crashing - assert!(count > 0, "Caller thread should have run at least once"); - } - Err(_) => { - c1.store(true, Ordering::SeqCst); - } + while !d1.load(Ordering::Relaxed) { + std::hint::black_box(concurrent_target()); } }); @@ -1061,13 +1040,6 @@ fn test_concurrent_call_during_setup_teardown_issue_42() { done.store(true, Ordering::SeqCst); caller.join().unwrap(); - - assert!( - !crash_detected.load(Ordering::SeqCst), - "Caller thread crashed during concurrent setup/teardown" - ); - // After all fakes are dropped, original is restored for this thread - assert_eq!(concurrent_target(), 42); } // ============================================================================ From f0d2f574a83e04a6dc1a62b2f0dadb656ec48028 Mon Sep 17 00:00:00 2001 From: Jingyu Ma Date: Tue, 10 Mar 2026 21:53:59 -0700 Subject: [PATCH 5/5] Fix CALL/JMP rel32 overflow in trampoline RIP-relative fixup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a function's first 12+ bytes contain a CALL or JMP rel32 to a target that's too far from the trampoline for a 32-bit displacement, the fixup code was NOP-ing out the entire instruction. This is correct for coverage profiling counters (lock inc [rip+disp32]) but catastrophic for CALL/JMP — the callee never executes, so its return value and side effects are lost. This manifested as concurrent_target() returning garbage (e.g. 452067328, 1897725952) through the trampoline on Windows CI, where ASLR placed the trampoline far enough from black_box() to overflow the rel32 displacement. Fix: When CALL/JMP rel32 displacement overflows after adjustment, emit an indirect stub (MOV RAX, imm64; JMP RAX) in reserved space at the end of the trampoline, and rewrite the CALL/JMP to target the stub. This preserves CALL semantics (return address is pushed) and supports arbitrary 64-bit target addresses. Also restored the strong value assertion in the concurrent test and added Jcc overflow detection (panic instead of silent NOP). --- src/injector_core/thread_local_registry.rs | 81 +++++++++++++++++++--- tests/thread_safety.rs | 38 ++++++++-- 2 files changed, 105 insertions(+), 14 deletions(-) diff --git a/src/injector_core/thread_local_registry.rs b/src/injector_core/thread_local_registry.rs index a46132e..9c7b814 100644 --- a/src/injector_core/thread_local_registry.rs +++ b/src/injector_core/thread_local_registry.rs @@ -1431,7 +1431,12 @@ fn create_trampoline(func_addr: *mut u8, _method_key: usize) -> (*mut u8, usize, // The jump-back uses jmp [rip+0] + 8-byte address = 14 bytes let jump_back_size = 14; - let trampoline_total = copy_size + jump_back_size; + // Reserve extra space for indirect call/jmp stubs. When a CALL/JMP rel32 + // in the copied code targets a function that's too far from the trampoline + // for a 32-bit displacement, we emit a `MOV RAX, imm64; JMP RAX` stub + // (12 bytes each). 64 bytes handles up to 5 such overflow cases. + let stub_space = 64; + let trampoline_total = copy_size + jump_back_size + stub_space; // Allocate executable memory for the trampoline (near original for ±2GB reach) let near_src = @@ -1448,7 +1453,16 @@ fn create_trampoline(func_addr: *mut u8, _method_key: usize) -> (*mut u8, usize, // instruction's position. After copying to the trampoline at a different address, // we must adjust disp32 so it still points to the same absolute target. let delta = func_addr as isize - trampoline as isize; - fixup_rip_relative_instructions(trampoline, &original_code, copy_size, delta); + let stub_start = copy_size + jump_back_size; + fixup_rip_relative_instructions( + trampoline, + &original_code, + copy_size, + delta, + func_addr, + stub_start, + trampoline_total, + ); // Append jump back to original + copy_size // Using: jmp [rip+0] (FF 25 00 00 00 00) + 8-byte target address @@ -1489,13 +1503,26 @@ fn create_trampoline(func_addr: *mut u8, _method_key: usize) -> (*mut u8, usize, /// and the coverage counter is too far from the trampoline for a 32-bit /// displacement. NOP-ing the counter increment is safe — it only affects /// profiling accuracy, not functional behavior. +/// +/// For CALL/JMP rel32 overflow, NOP-ing would break program logic (the callee +/// never executes, so its side effects and return value are lost). Instead, +/// we emit an indirect stub at the end of the trampoline: +/// `MOV RAX, ; JMP RAX` (12 bytes) +/// and rewrite the CALL/JMP to target the stub. The stub preserves CALL +/// semantics: `CALL stub` pushes the return address, then `JMP target` +/// transfers control; when the callee returns, execution resumes in the +/// trampoline right after the CALL. fn fixup_rip_relative_instructions( trampoline: *mut u8, original_code: &[u8], copy_size: usize, delta: isize, + func_addr: *mut u8, + stub_start: usize, + trampoline_alloc_size: usize, ) { let mut offset = 0; + let mut stub_cursor = stub_start; while offset < copy_size { let insn = &original_code[offset..]; let insn_len = x86_64_insn_len(insn); @@ -1512,7 +1539,9 @@ fn fixup_rip_relative_instructions( if new_disp >= i32::MIN as i64 && new_disp <= i32::MAX as i64 { disp_ptr.write_unaligned(new_disp as i32); } else { - // Overflow: NOP out the entire instruction in the trampoline + // Overflow: NOP out the entire instruction in the trampoline. + // This is safe for coverage/profiling counter increments + // (lock inc [rip+disp32]) which don't affect program logic. for i in 0..insn_len { *trampoline.add(offset + i) = 0x90; // NOP } @@ -1533,10 +1562,38 @@ fn fixup_rip_relative_instructions( if new_rel >= i32::MIN as i64 && new_rel <= i32::MAX as i64 { rel_ptr.write_unaligned(new_rel as i32); } else { - // Overflow: NOP out the entire instruction - for i in 0..insn_len { - *trampoline.add(offset + i) = 0x90; - } + // Overflow: emit an indirect stub and redirect the CALL/JMP. + // Calculate the absolute target address from the original code. + let rip_after_insn = + func_addr as usize + offset + insn_len; + let absolute_target = + (rip_after_insn as i64 + old_rel as i64) as u64; + + assert!( + stub_cursor + 12 <= trampoline_alloc_size, + "Trampoline stub space exhausted (too many CALL/JMP rel32 overflows)" + ); + + // Write stub: MOV RAX, imm64 (48 B8 + 8 bytes) + JMP RAX (FF E0) + let stub_ptr = trampoline.add(stub_cursor); + *stub_ptr = 0x48; // REX.W + *stub_ptr.add(1) = 0xB8; // MOV RAX, imm64 + std::ptr::copy_nonoverlapping( + absolute_target.to_le_bytes().as_ptr(), + stub_ptr.add(2), + 8, + ); + *stub_ptr.add(10) = 0xFF; // JMP RAX + *stub_ptr.add(11) = 0xE0; + + // Rewrite the CALL/JMP in the trampoline to target the stub. + // new_rel = stub_addr - rip_after_insn_in_trampoline + let rip_in_trampoline = trampoline as usize + offset + insn_len; + let stub_rel = + (trampoline as usize + stub_cursor) as i64 - rip_in_trampoline as i64; + rel_ptr.write_unaligned(stub_rel as i32); + + stub_cursor += 12; } } } @@ -1552,9 +1609,13 @@ fn fixup_rip_relative_instructions( if new_rel >= i32::MIN as i64 && new_rel <= i32::MAX as i64 { rel_ptr.write_unaligned(new_rel as i32); } else { - for i in 0..insn_len { - *trampoline.add(offset + i) = 0x90; - } + // Jcc overflow is extremely rare in function prologues. + // Panic rather than silently breaking control flow. + panic!( + "Jcc rel32 displacement overflow in trampoline fixup \ + (function at {:p}, offset {}). This case is not yet handled.", + func_addr, offset + ); } } } diff --git a/tests/thread_safety.rs b/tests/thread_safety.rs index feb6b9d..f7b24aa 100644 --- a/tests/thread_safety.rs +++ b/tests/thread_safety.rs @@ -1010,17 +1010,39 @@ fn concurrent_target() -> i32 { /// cause access violations or stack overruns. /// /// This test verifies that concurrent setup/teardown of fakes does NOT cause -/// crashes or access violations (the core issue #42 concern). +/// crashes, access violations, or incorrect return values. #[test] fn test_concurrent_call_during_setup_teardown_issue_42() { let done = Arc::new(AtomicBool::new(false)); + let error_count = Arc::new(AtomicUsize::new(0)); - // Thread 1: continuously calls the function. - // If code patching causes a crash, the process will abort. + // First, install the dispatcher once so the caller thread never + // races with the initial non-atomic code patch. + { + let mut injector = InjectorPP::new(); + injector + .when_called(injectorpp::func!(fn(concurrent_target)() -> i32)) + .will_execute(injectorpp::fake!( + func_type: fn() -> i32, + returns: 99 + )); + assert_eq!(concurrent_target(), 99); + // Drop — dispatcher remains installed, TLS cleared + } + // Verify trampoline returns correct value on this thread + assert_eq!(concurrent_target(), 42); + + // Thread 1: continuously calls the function without faking. + // After dispatcher installation, this thread always goes through + // the trampoline and should see the original value (42). let d1 = done.clone(); + let e1 = error_count.clone(); let caller = thread::spawn(move || { while !d1.load(Ordering::Relaxed) { - std::hint::black_box(concurrent_target()); + let val = concurrent_target(); + if val != 42 { + e1.fetch_add(1, Ordering::Relaxed); + } } }); @@ -1040,6 +1062,14 @@ fn test_concurrent_call_during_setup_teardown_issue_42() { done.store(true, Ordering::SeqCst); caller.join().unwrap(); + + assert_eq!( + error_count.load(Ordering::SeqCst), + 0, + "Caller thread without fake should always see the original value (42)" + ); + // After all fakes are dropped, original is restored for this thread + assert_eq!(concurrent_target(), 42); } // ============================================================================