Skip to content

Commit 0ad72de

Browse files
perf: negative-cache unreachable destinations and grow startup pre-warm
1 parent 08efbc5 commit 0ad72de

2 files changed

Lines changed: 287 additions & 2 deletions

File tree

src/proxy_server.rs

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,9 +308,17 @@ impl ProxyServer {
308308
// doesn't pay a fresh TLS handshake to Google edge. Best-effort;
309309
// failures are logged and ignored. Skipped in `google_only` — there
310310
// is no fronter to warm.
311+
//
312+
// Sized to roughly match a browser's parallel-connection burst at
313+
// startup. The previous fixed `3` was fine for a single deployment
314+
// but left requests 4-10 of the opening burst paying a cold TLS
315+
// handshake each (~300ms). Scaling with deployment count gives
316+
// multi-account configs a proportionally warmer pool, capped so
317+
// single-deployment users don't hammer Google edge unnecessarily.
311318
if let Some(warm_fronter) = self.fronter.clone() {
319+
let n = warm_fronter.num_scripts().clamp(6, 16);
312320
tokio::spawn(async move {
313-
warm_fronter.warm(3).await;
321+
warm_fronter.warm(n).await;
314322
});
315323
}
316324

@@ -503,6 +511,20 @@ async fn handle_http_client(
503511

504512
if method.eq_ignore_ascii_case("CONNECT") {
505513
let (host, port) = parse_host_port(&target);
514+
// Mirror the SOCKS5 short-circuit: if the tunnel-node just failed
515+
// this (host, port) with unreachable, return 502 immediately rather
516+
// than acknowledging the CONNECT and blowing tunnel quota on a
517+
// guaranteed retry. See `TunnelMux::is_unreachable` for context.
518+
if let Some(ref mux) = tunnel_mux {
519+
if mux.is_unreachable(&host, port) {
520+
tracing::info!("CONNECT {}:{} (negative-cached, refusing)", host, port);
521+
let _ = sock
522+
.write_all(b"HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\nConnection: close\r\n\r\n")
523+
.await;
524+
let _ = sock.flush().await;
525+
return Ok(());
526+
}
527+
}
506528
sock.write_all(b"HTTP/1.1 200 Connection Established\r\n\r\n")
507529
.await?;
508530
sock.flush().await?;
@@ -600,6 +622,21 @@ async fn handle_socks5_client(
600622
return handle_socks5_udp_associate(sock, rewrite_ctx, tunnel_mux).await;
601623
}
602624

625+
// Negative-cache short-circuit: if the tunnel-node just failed to reach
626+
// this exact (host, port) with `Network is unreachable` / `No route to
627+
// host`, reply 0x04 (Host unreachable) immediately. Saves a 1.5–2s tunnel
628+
// round-trip on guaranteed-failing targets — the IPv6 probe retry loop
629+
// is the main offender on devices without IPv6.
630+
if let Some(ref mux) = tunnel_mux {
631+
if mux.is_unreachable(&host, port) {
632+
tracing::info!("SOCKS5 CONNECT -> {}:{} (negative-cached, refusing)", host, port);
633+
sock.write_all(&[0x05, 0x04, 0x00, 0x01, 0, 0, 0, 0, 0, 0])
634+
.await?;
635+
sock.flush().await?;
636+
return Ok(());
637+
}
638+
}
639+
603640
tracing::info!("SOCKS5 CONNECT -> {}:{}", host, port);
604641

605642
// Success reply with zeroed BND.

src/tunnel_client.rs

Lines changed: 249 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use std::collections::HashMap;
1414
// reason; reuse it here. `AtomicBool` works fine in std on every target.
1515
use portable_atomic::AtomicU64;
1616
use std::sync::atomic::{AtomicBool, Ordering};
17-
use std::sync::Arc;
17+
use std::sync::{Arc, Mutex};
1818
use std::time::{Duration, Instant};
1919

2020
use base64::engine::general_purpose::STANDARD as B64;
@@ -78,6 +78,19 @@ const CODE_UNSUPPORTED_OP: &str = "UNSUPPORTED_OP";
7878
/// floor, so network jitter on either side won't false-trigger.
7979
const LEGACY_DETECT_THRESHOLD: Duration = Duration::from_millis(1500);
8080

81+
/// How long to remember a `Network is unreachable` / `No route to host`
82+
/// failure for a given `(host, port)`. While cached, the proxy short-circuits
83+
/// repeat CONNECTs with an immediate "host unreachable" reply instead of
84+
/// burning a 1.5–2s tunnel batch round-trip on a target that just failed.
85+
/// Real motivator: IPv6-only probe hostnames (e.g. `ds6.probe.*`) on devices
86+
/// without IPv6 — the OS retries the probe every ~1.5s for 10s+, generating
87+
/// 5–10 wasted tunnel sessions per probe.
88+
const UNREACHABLE_CACHE_TTL: Duration = Duration::from_secs(30);
89+
90+
/// Hard cap on negative-cache size. Browsing pulls in dozens of distinct
91+
/// hosts; we don't want a runaway map. Pruned opportunistically on insert.
92+
const UNREACHABLE_CACHE_MAX: usize = 256;
93+
8194
/// Ports where the *server* speaks first (SMTP banner, SSH identification,
8295
/// POP3/IMAP greeting, FTP banner). On these, waiting for client bytes
8396
/// gains nothing and just adds handshake latency — skip the pre-read.
@@ -87,6 +100,32 @@ fn is_server_speaks_first(port: u16) -> bool {
87100
matches!(port, 21 | 22 | 25 | 80 | 110 | 143 | 587)
88101
}
89102

103+
/// Recognize the tunnel-node's connect-error strings that mean
104+
/// "this destination is fundamentally unreachable from the tunnel-node's
105+
/// network right now" — distinct from refused/reset/timeout, which can be
106+
/// transient. These come through as the inner `e` of a `TunnelResponse`
107+
/// after the tunnel-node's std::io::Error is stringified, so we match on
108+
/// substrings rather than `ErrorKind`. Linux: errno 101 (ENETUNREACH),
109+
/// errno 113 (EHOSTUNREACH). Format varies a bit across libc/Tokio
110+
/// versions, so cover both the human text and the os-error tag.
111+
fn is_unreachable_error_str(s: &str) -> bool {
112+
let lc = s.to_ascii_lowercase();
113+
lc.contains("network is unreachable")
114+
|| lc.contains("no route to host")
115+
|| lc.contains("os error 101")
116+
|| lc.contains("os error 113")
117+
}
118+
119+
/// Canonicalize a host string for use as a negative-cache key. DNS names
120+
/// are case-insensitive and may carry a trailing root-label dot, so
121+
/// `Example.COM:443`, `example.com:443`, and `example.com.:443` are all the
122+
/// same destination. IPv4 / IPv6 literals are unaffected — IPv4 has no
123+
/// letters, and `Ipv6Addr::to_string()` already emits lowercase.
124+
fn normalize_cache_host(host: &str) -> String {
125+
let trimmed = host.strip_suffix('.').unwrap_or(host);
126+
trimmed.to_ascii_lowercase()
127+
}
128+
90129
// ---------------------------------------------------------------------------
91130
// Multiplexer
92131
// ---------------------------------------------------------------------------
@@ -159,6 +198,11 @@ pub struct TunnelMux {
159198
/// Separate monotonic counter used only to trigger the summary log
160199
/// (avoids a race where two threads both see `total % 100 == 0`).
161200
preread_total_events: AtomicU64,
201+
/// Short-lived negative cache for targets the tunnel-node reported as
202+
/// unreachable (`Network is unreachable` / `No route to host`). Keyed by
203+
/// `(host, port)`, value is the expiry instant. Plain Mutex<HashMap> is
204+
/// fine: it's touched once per CONNECT (cheap) and once per failure.
205+
unreachable_cache: Mutex<HashMap<(String, u16), Instant>>,
162206
}
163207

164208
impl TunnelMux {
@@ -181,6 +225,7 @@ impl TunnelMux {
181225
preread_skip_unsupported: AtomicU64::new(0),
182226
preread_win_total_us: AtomicU64::new(0),
183227
preread_total_events: AtomicU64::new(0),
228+
unreachable_cache: Mutex::new(HashMap::new()),
184229
})
185230
}
186231

@@ -254,6 +299,71 @@ impl TunnelMux {
254299
}
255300
}
256301

302+
/// Returns true if `(host, port)` has a non-expired unreachable entry.
303+
/// The proxy front-end uses this to skip the tunnel and reply
304+
/// "host unreachable" immediately on follow-up CONNECTs.
305+
pub fn is_unreachable(&self, host: &str, port: u16) -> bool {
306+
let now = Instant::now();
307+
let mut cache = match self.unreachable_cache.lock() {
308+
Ok(g) => g,
309+
Err(p) => p.into_inner(),
310+
};
311+
let key = (normalize_cache_host(host), port);
312+
match cache.get(&key) {
313+
Some(expiry) if *expiry > now => true,
314+
Some(_) => {
315+
cache.remove(&key);
316+
false
317+
}
318+
None => false,
319+
}
320+
}
321+
322+
/// If `err` looks like a network-unreachable / no-route-to-host error
323+
/// from the tunnel-node, remember the target for `UNREACHABLE_CACHE_TTL`.
324+
/// No-op for any other error (timeouts, refused, EOF, etc.) — those can
325+
/// be transient and we don't want to lock out a host on a flaky moment.
326+
fn record_unreachable_if_match(&self, host: &str, port: u16, err: &str) {
327+
if !is_unreachable_error_str(err) {
328+
return;
329+
}
330+
let mut cache = match self.unreachable_cache.lock() {
331+
Ok(g) => g,
332+
Err(p) => p.into_inner(),
333+
};
334+
// Cap enforcement is two-stage: first drop anything already expired,
335+
// then if we're STILL at/above the cap (i.e. an unbounded burst of
336+
// unique unreachable hosts within the TTL), evict the entry that
337+
// would expire soonest. This bounds the map size at all times — a
338+
// pure `retain` on expiry alone would let the map grow unbounded
339+
// until the first entry's TTL elapses.
340+
if cache.len() >= UNREACHABLE_CACHE_MAX {
341+
let now = Instant::now();
342+
cache.retain(|_, expiry| *expiry > now);
343+
while cache.len() >= UNREACHABLE_CACHE_MAX {
344+
let victim = cache
345+
.iter()
346+
.min_by_key(|(_, expiry)| **expiry)
347+
.map(|(k, _)| k.clone());
348+
match victim {
349+
Some(k) => {
350+
cache.remove(&k);
351+
}
352+
None => break,
353+
}
354+
}
355+
}
356+
let key = (normalize_cache_host(host), port);
357+
cache.insert(key, Instant::now() + UNREACHABLE_CACHE_TTL);
358+
tracing::debug!(
359+
"negative-cached {}:{} for {:?} ({})",
360+
host,
361+
port,
362+
UNREACHABLE_CACHE_TTL,
363+
err
364+
);
365+
}
366+
257367
fn record_preread_win(&self, port: u16, elapsed: Duration) {
258368
self.preread_win.fetch_add(1, Ordering::Relaxed);
259369
self.preread_win_total_us
@@ -723,6 +833,11 @@ async fn connect_plain(host: &str, port: u16, mux: &Arc<TunnelMux>) -> std::io::
723833
Ok(Ok(resp)) => {
724834
if let Some(ref e) = resp.e {
725835
tracing::error!("tunnel connect error for {}:{}: {}", host, port, e);
836+
// Only cache here: `resp.e` is the tunnel-node's own connect()
837+
// result against the target. The outer `Ok(Err(_))` arm below
838+
// is a transport-level failure (relay → Apps Script → tunnel-
839+
// node never reached) and tells us nothing about the target.
840+
mux.record_unreachable_if_match(host, port, e);
726841
return Err(std::io::Error::new(
727842
std::io::ErrorKind::ConnectionRefused,
728843
e.clone(),
@@ -769,6 +884,9 @@ async fn connect_with_initial_data(
769884
return Ok(ConnectDataOutcome::Unsupported);
770885
}
771886
tracing::error!("tunnel connect_data error for {}:{}: {}", host, port, e);
887+
// Outer transport failure (relay/Apps Script never reached the
888+
// tunnel-node). Don't poison the destination cache from here —
889+
// see `connect_plain` for the same reasoning.
772890
return Err(std::io::Error::new(
773891
std::io::ErrorKind::ConnectionRefused,
774892
e,
@@ -794,6 +912,8 @@ async fn connect_with_initial_data(
794912

795913
if let Some(ref e) = resp.e {
796914
tracing::error!("tunnel connect_data error for {}:{}: {}", host, port, e);
915+
// `resp.e` is the tunnel-node's own connect result — cache it.
916+
mux.record_unreachable_if_match(host, port, e);
797917
return Err(std::io::Error::new(
798918
std::io::ErrorKind::ConnectionRefused,
799919
e.clone(),
@@ -1095,6 +1215,133 @@ mod tests {
10951215
)));
10961216
}
10971217

1218+
#[test]
1219+
fn unreachable_error_str_matches_expected_variants() {
1220+
assert!(is_unreachable_error_str(
1221+
"connect failed: Network is unreachable (os error 101)"
1222+
));
1223+
assert!(is_unreachable_error_str("No route to host"));
1224+
assert!(is_unreachable_error_str("os error 113"));
1225+
// Case-insensitive.
1226+
assert!(is_unreachable_error_str(
1227+
"CONNECT FAILED: NETWORK IS UNREACHABLE"
1228+
));
1229+
}
1230+
1231+
#[test]
1232+
fn unreachable_error_str_rejects_unrelated() {
1233+
assert!(!is_unreachable_error_str("connection refused"));
1234+
assert!(!is_unreachable_error_str("connect timed out"));
1235+
assert!(!is_unreachable_error_str("connection reset by peer"));
1236+
assert!(!is_unreachable_error_str(""));
1237+
}
1238+
1239+
#[test]
1240+
fn negative_cache_records_and_short_circuits() {
1241+
let (mux, _rx) = mux_for_test();
1242+
// Initially nothing is cached.
1243+
assert!(!mux.is_unreachable("ds6.probe.example", 443));
1244+
// Record a matching error.
1245+
mux.record_unreachable_if_match(
1246+
"ds6.probe.example",
1247+
443,
1248+
"connect failed: Network is unreachable (os error 101)",
1249+
);
1250+
assert!(mux.is_unreachable("ds6.probe.example", 443));
1251+
// A different port for the same host is its own entry.
1252+
assert!(!mux.is_unreachable("ds6.probe.example", 80));
1253+
}
1254+
1255+
#[test]
1256+
fn negative_cache_ignores_non_unreachable_errors() {
1257+
let (mux, _rx) = mux_for_test();
1258+
mux.record_unreachable_if_match(
1259+
"example.com",
1260+
443,
1261+
"connect failed: connection refused",
1262+
);
1263+
assert!(!mux.is_unreachable("example.com", 443));
1264+
}
1265+
1266+
#[test]
1267+
fn negative_cache_normalizes_host_keys() {
1268+
let (mux, _rx) = mux_for_test();
1269+
// Cache under one casing/format...
1270+
mux.record_unreachable_if_match(
1271+
"Example.COM.",
1272+
443,
1273+
"Network is unreachable (os error 101)",
1274+
);
1275+
// ...and look up under several equivalent forms.
1276+
assert!(mux.is_unreachable("example.com", 443));
1277+
assert!(mux.is_unreachable("EXAMPLE.com", 443));
1278+
assert!(mux.is_unreachable("example.com.", 443));
1279+
// Different host should still miss.
1280+
assert!(!mux.is_unreachable("other.com", 443));
1281+
}
1282+
1283+
/// Outer `Ok(Err(_))` from the mux channel means "the relay never
1284+
/// reached the tunnel-node" (HTTP/TLS to Apps Script failed, batch
1285+
/// timed out, etc.) — the destination wasn't even attempted. Even if
1286+
/// that error string contains "Network is unreachable" (e.g. the
1287+
/// client device's WAN was momentarily down), it must NOT poison the
1288+
/// destination cache, or every host the user touched during a
1289+
/// connectivity blip stays refused for 30s.
1290+
#[tokio::test]
1291+
async fn negative_cache_skips_outer_relay_errors() {
1292+
let (mux, mut rx) = mux_for_test();
1293+
let mux_for_task = mux.clone();
1294+
let task = tokio::spawn(async move {
1295+
connect_plain("real.target.example", 443, &mux_for_task).await
1296+
});
1297+
1298+
// Receive the Connect msg and reply with an outer Err whose string
1299+
// would otherwise match `is_unreachable_error_str`.
1300+
let msg = rx.recv().await.expect("connect msg");
1301+
let reply = match msg {
1302+
MuxMsg::Connect { reply, .. } => reply,
1303+
other => panic!("expected Connect, got {:?}", std::mem::discriminant(&other)),
1304+
};
1305+
let _ = reply.send(Err(
1306+
"relay failed: Network is unreachable (os error 101)".into(),
1307+
));
1308+
1309+
let res = task.await.expect("task");
1310+
assert!(res.is_err(), "connect_plain should surface the error");
1311+
assert!(
1312+
!mux.is_unreachable("real.target.example", 443),
1313+
"outer relay error must not negative-cache the destination"
1314+
);
1315+
}
1316+
1317+
#[test]
1318+
fn negative_cache_enforces_hard_cap_under_unique_burst() {
1319+
let (mux, _rx) = mux_for_test();
1320+
// Insert enough unique still-live entries to exceed the cap. The
1321+
// map size must never exceed UNREACHABLE_CACHE_MAX, even though
1322+
// every entry is fresh and `retain(expired)` prunes nothing.
1323+
let burst = UNREACHABLE_CACHE_MAX + 50;
1324+
for i in 0..burst {
1325+
let host = format!("h{}.example", i);
1326+
mux.record_unreachable_if_match(
1327+
&host,
1328+
443,
1329+
"connect failed: Network is unreachable (os error 101)",
1330+
);
1331+
}
1332+
let len = mux
1333+
.unreachable_cache
1334+
.lock()
1335+
.map(|g| g.len())
1336+
.unwrap_or(0);
1337+
assert!(
1338+
len <= UNREACHABLE_CACHE_MAX,
1339+
"cache size {} exceeded cap {}",
1340+
len,
1341+
UNREACHABLE_CACHE_MAX
1342+
);
1343+
}
1344+
10981345
#[test]
10991346
fn server_speaks_first_covers_common_protocols() {
11001347
for p in [21u16, 22, 25, 80, 110, 143, 587] {
@@ -1128,6 +1375,7 @@ mod tests {
11281375
preread_skip_unsupported: AtomicU64::new(0),
11291376
preread_win_total_us: AtomicU64::new(0),
11301377
preread_total_events: AtomicU64::new(0),
1378+
unreachable_cache: Mutex::new(HashMap::new()),
11311379
});
11321380
(mux, rx)
11331381
}

0 commit comments

Comments
 (0)