@@ -14,7 +14,7 @@ use std::collections::HashMap;
1414// reason; reuse it here. `AtomicBool` works fine in std on every target.
1515use portable_atomic:: AtomicU64 ;
1616use std:: sync:: atomic:: { AtomicBool , Ordering } ;
17- use std:: sync:: Arc ;
17+ use std:: sync:: { Arc , Mutex } ;
1818use std:: time:: { Duration , Instant } ;
1919
2020use base64:: engine:: general_purpose:: STANDARD as B64 ;
@@ -78,6 +78,19 @@ const CODE_UNSUPPORTED_OP: &str = "UNSUPPORTED_OP";
7878/// floor, so network jitter on either side won't false-trigger.
7979const LEGACY_DETECT_THRESHOLD : Duration = Duration :: from_millis ( 1500 ) ;
8080
81+ /// How long to remember a `Network is unreachable` / `No route to host`
82+ /// failure for a given `(host, port)`. While cached, the proxy short-circuits
83+ /// repeat CONNECTs with an immediate "host unreachable" reply instead of
84+ /// burning a 1.5–2s tunnel batch round-trip on a target that just failed.
85+ /// Real motivator: IPv6-only probe hostnames (e.g. `ds6.probe.*`) on devices
86+ /// without IPv6 — the OS retries the probe every ~1.5s for 10s+, generating
87+ /// 5–10 wasted tunnel sessions per probe.
88+ const UNREACHABLE_CACHE_TTL : Duration = Duration :: from_secs ( 30 ) ;
89+
90+ /// Hard cap on negative-cache size. Browsing pulls in dozens of distinct
91+ /// hosts; we don't want a runaway map. Pruned opportunistically on insert.
92+ const UNREACHABLE_CACHE_MAX : usize = 256 ;
93+
8194/// Ports where the *server* speaks first (SMTP banner, SSH identification,
8295/// POP3/IMAP greeting, FTP banner). On these, waiting for client bytes
8396/// gains nothing and just adds handshake latency — skip the pre-read.
@@ -87,6 +100,32 @@ fn is_server_speaks_first(port: u16) -> bool {
87100 matches ! ( port, 21 | 22 | 25 | 80 | 110 | 143 | 587 )
88101}
89102
103+ /// Recognize the tunnel-node's connect-error strings that mean
104+ /// "this destination is fundamentally unreachable from the tunnel-node's
105+ /// network right now" — distinct from refused/reset/timeout, which can be
106+ /// transient. These come through as the inner `e` of a `TunnelResponse`
107+ /// after the tunnel-node's std::io::Error is stringified, so we match on
108+ /// substrings rather than `ErrorKind`. Linux: errno 101 (ENETUNREACH),
109+ /// errno 113 (EHOSTUNREACH). Format varies a bit across libc/Tokio
110+ /// versions, so cover both the human text and the os-error tag.
111+ fn is_unreachable_error_str ( s : & str ) -> bool {
112+ let lc = s. to_ascii_lowercase ( ) ;
113+ lc. contains ( "network is unreachable" )
114+ || lc. contains ( "no route to host" )
115+ || lc. contains ( "os error 101" )
116+ || lc. contains ( "os error 113" )
117+ }
118+
119+ /// Canonicalize a host string for use as a negative-cache key. DNS names
120+ /// are case-insensitive and may carry a trailing root-label dot, so
121+ /// `Example.COM:443`, `example.com:443`, and `example.com.:443` are all the
122+ /// same destination. IPv4 / IPv6 literals are unaffected — IPv4 has no
123+ /// letters, and `Ipv6Addr::to_string()` already emits lowercase.
124+ fn normalize_cache_host ( host : & str ) -> String {
125+ let trimmed = host. strip_suffix ( '.' ) . unwrap_or ( host) ;
126+ trimmed. to_ascii_lowercase ( )
127+ }
128+
90129// ---------------------------------------------------------------------------
91130// Multiplexer
92131// ---------------------------------------------------------------------------
@@ -159,6 +198,11 @@ pub struct TunnelMux {
159198 /// Separate monotonic counter used only to trigger the summary log
160199 /// (avoids a race where two threads both see `total % 100 == 0`).
161200 preread_total_events : AtomicU64 ,
201+ /// Short-lived negative cache for targets the tunnel-node reported as
202+ /// unreachable (`Network is unreachable` / `No route to host`). Keyed by
203+ /// `(host, port)`, value is the expiry instant. Plain Mutex<HashMap> is
204+ /// fine: it's touched once per CONNECT (cheap) and once per failure.
205+ unreachable_cache : Mutex < HashMap < ( String , u16 ) , Instant > > ,
162206}
163207
164208impl TunnelMux {
@@ -181,6 +225,7 @@ impl TunnelMux {
181225 preread_skip_unsupported : AtomicU64 :: new ( 0 ) ,
182226 preread_win_total_us : AtomicU64 :: new ( 0 ) ,
183227 preread_total_events : AtomicU64 :: new ( 0 ) ,
228+ unreachable_cache : Mutex :: new ( HashMap :: new ( ) ) ,
184229 } )
185230 }
186231
@@ -254,6 +299,71 @@ impl TunnelMux {
254299 }
255300 }
256301
302+ /// Returns true if `(host, port)` has a non-expired unreachable entry.
303+ /// The proxy front-end uses this to skip the tunnel and reply
304+ /// "host unreachable" immediately on follow-up CONNECTs.
305+ pub fn is_unreachable ( & self , host : & str , port : u16 ) -> bool {
306+ let now = Instant :: now ( ) ;
307+ let mut cache = match self . unreachable_cache . lock ( ) {
308+ Ok ( g) => g,
309+ Err ( p) => p. into_inner ( ) ,
310+ } ;
311+ let key = ( normalize_cache_host ( host) , port) ;
312+ match cache. get ( & key) {
313+ Some ( expiry) if * expiry > now => true ,
314+ Some ( _) => {
315+ cache. remove ( & key) ;
316+ false
317+ }
318+ None => false ,
319+ }
320+ }
321+
322+ /// If `err` looks like a network-unreachable / no-route-to-host error
323+ /// from the tunnel-node, remember the target for `UNREACHABLE_CACHE_TTL`.
324+ /// No-op for any other error (timeouts, refused, EOF, etc.) — those can
325+ /// be transient and we don't want to lock out a host on a flaky moment.
326+ fn record_unreachable_if_match ( & self , host : & str , port : u16 , err : & str ) {
327+ if !is_unreachable_error_str ( err) {
328+ return ;
329+ }
330+ let mut cache = match self . unreachable_cache . lock ( ) {
331+ Ok ( g) => g,
332+ Err ( p) => p. into_inner ( ) ,
333+ } ;
334+ // Cap enforcement is two-stage: first drop anything already expired,
335+ // then if we're STILL at/above the cap (i.e. an unbounded burst of
336+ // unique unreachable hosts within the TTL), evict the entry that
337+ // would expire soonest. This bounds the map size at all times — a
338+ // pure `retain` on expiry alone would let the map grow unbounded
339+ // until the first entry's TTL elapses.
340+ if cache. len ( ) >= UNREACHABLE_CACHE_MAX {
341+ let now = Instant :: now ( ) ;
342+ cache. retain ( |_, expiry| * expiry > now) ;
343+ while cache. len ( ) >= UNREACHABLE_CACHE_MAX {
344+ let victim = cache
345+ . iter ( )
346+ . min_by_key ( |( _, expiry) | * * expiry)
347+ . map ( |( k, _) | k. clone ( ) ) ;
348+ match victim {
349+ Some ( k) => {
350+ cache. remove ( & k) ;
351+ }
352+ None => break ,
353+ }
354+ }
355+ }
356+ let key = ( normalize_cache_host ( host) , port) ;
357+ cache. insert ( key, Instant :: now ( ) + UNREACHABLE_CACHE_TTL ) ;
358+ tracing:: debug!(
359+ "negative-cached {}:{} for {:?} ({})" ,
360+ host,
361+ port,
362+ UNREACHABLE_CACHE_TTL ,
363+ err
364+ ) ;
365+ }
366+
257367 fn record_preread_win ( & self , port : u16 , elapsed : Duration ) {
258368 self . preread_win . fetch_add ( 1 , Ordering :: Relaxed ) ;
259369 self . preread_win_total_us
@@ -723,6 +833,11 @@ async fn connect_plain(host: &str, port: u16, mux: &Arc<TunnelMux>) -> std::io::
723833 Ok ( Ok ( resp) ) => {
724834 if let Some ( ref e) = resp. e {
725835 tracing:: error!( "tunnel connect error for {}:{}: {}" , host, port, e) ;
836+ // Only cache here: `resp.e` is the tunnel-node's own connect()
837+ // result against the target. The outer `Ok(Err(_))` arm below
838+ // is a transport-level failure (relay → Apps Script → tunnel-
839+ // node never reached) and tells us nothing about the target.
840+ mux. record_unreachable_if_match ( host, port, e) ;
726841 return Err ( std:: io:: Error :: new (
727842 std:: io:: ErrorKind :: ConnectionRefused ,
728843 e. clone ( ) ,
@@ -769,6 +884,9 @@ async fn connect_with_initial_data(
769884 return Ok ( ConnectDataOutcome :: Unsupported ) ;
770885 }
771886 tracing:: error!( "tunnel connect_data error for {}:{}: {}" , host, port, e) ;
887+ // Outer transport failure (relay/Apps Script never reached the
888+ // tunnel-node). Don't poison the destination cache from here —
889+ // see `connect_plain` for the same reasoning.
772890 return Err ( std:: io:: Error :: new (
773891 std:: io:: ErrorKind :: ConnectionRefused ,
774892 e,
@@ -794,6 +912,8 @@ async fn connect_with_initial_data(
794912
795913 if let Some ( ref e) = resp. e {
796914 tracing:: error!( "tunnel connect_data error for {}:{}: {}" , host, port, e) ;
915+ // `resp.e` is the tunnel-node's own connect result — cache it.
916+ mux. record_unreachable_if_match ( host, port, e) ;
797917 return Err ( std:: io:: Error :: new (
798918 std:: io:: ErrorKind :: ConnectionRefused ,
799919 e. clone ( ) ,
@@ -1095,6 +1215,133 @@ mod tests {
10951215 ) ) ) ;
10961216 }
10971217
1218+ #[ test]
1219+ fn unreachable_error_str_matches_expected_variants ( ) {
1220+ assert ! ( is_unreachable_error_str(
1221+ "connect failed: Network is unreachable (os error 101)"
1222+ ) ) ;
1223+ assert ! ( is_unreachable_error_str( "No route to host" ) ) ;
1224+ assert ! ( is_unreachable_error_str( "os error 113" ) ) ;
1225+ // Case-insensitive.
1226+ assert ! ( is_unreachable_error_str(
1227+ "CONNECT FAILED: NETWORK IS UNREACHABLE"
1228+ ) ) ;
1229+ }
1230+
1231+ #[ test]
1232+ fn unreachable_error_str_rejects_unrelated ( ) {
1233+ assert ! ( !is_unreachable_error_str( "connection refused" ) ) ;
1234+ assert ! ( !is_unreachable_error_str( "connect timed out" ) ) ;
1235+ assert ! ( !is_unreachable_error_str( "connection reset by peer" ) ) ;
1236+ assert ! ( !is_unreachable_error_str( "" ) ) ;
1237+ }
1238+
1239+ #[ test]
1240+ fn negative_cache_records_and_short_circuits ( ) {
1241+ let ( mux, _rx) = mux_for_test ( ) ;
1242+ // Initially nothing is cached.
1243+ assert ! ( !mux. is_unreachable( "ds6.probe.example" , 443 ) ) ;
1244+ // Record a matching error.
1245+ mux. record_unreachable_if_match (
1246+ "ds6.probe.example" ,
1247+ 443 ,
1248+ "connect failed: Network is unreachable (os error 101)" ,
1249+ ) ;
1250+ assert ! ( mux. is_unreachable( "ds6.probe.example" , 443 ) ) ;
1251+ // A different port for the same host is its own entry.
1252+ assert ! ( !mux. is_unreachable( "ds6.probe.example" , 80 ) ) ;
1253+ }
1254+
1255+ #[ test]
1256+ fn negative_cache_ignores_non_unreachable_errors ( ) {
1257+ let ( mux, _rx) = mux_for_test ( ) ;
1258+ mux. record_unreachable_if_match (
1259+ "example.com" ,
1260+ 443 ,
1261+ "connect failed: connection refused" ,
1262+ ) ;
1263+ assert ! ( !mux. is_unreachable( "example.com" , 443 ) ) ;
1264+ }
1265+
1266+ #[ test]
1267+ fn negative_cache_normalizes_host_keys ( ) {
1268+ let ( mux, _rx) = mux_for_test ( ) ;
1269+ // Cache under one casing/format...
1270+ mux. record_unreachable_if_match (
1271+ "Example.COM." ,
1272+ 443 ,
1273+ "Network is unreachable (os error 101)" ,
1274+ ) ;
1275+ // ...and look up under several equivalent forms.
1276+ assert ! ( mux. is_unreachable( "example.com" , 443 ) ) ;
1277+ assert ! ( mux. is_unreachable( "EXAMPLE.com" , 443 ) ) ;
1278+ assert ! ( mux. is_unreachable( "example.com." , 443 ) ) ;
1279+ // Different host should still miss.
1280+ assert ! ( !mux. is_unreachable( "other.com" , 443 ) ) ;
1281+ }
1282+
1283+ /// Outer `Ok(Err(_))` from the mux channel means "the relay never
1284+ /// reached the tunnel-node" (HTTP/TLS to Apps Script failed, batch
1285+ /// timed out, etc.) — the destination wasn't even attempted. Even if
1286+ /// that error string contains "Network is unreachable" (e.g. the
1287+ /// client device's WAN was momentarily down), it must NOT poison the
1288+ /// destination cache, or every host the user touched during a
1289+ /// connectivity blip stays refused for 30s.
1290+ #[ tokio:: test]
1291+ async fn negative_cache_skips_outer_relay_errors ( ) {
1292+ let ( mux, mut rx) = mux_for_test ( ) ;
1293+ let mux_for_task = mux. clone ( ) ;
1294+ let task = tokio:: spawn ( async move {
1295+ connect_plain ( "real.target.example" , 443 , & mux_for_task) . await
1296+ } ) ;
1297+
1298+ // Receive the Connect msg and reply with an outer Err whose string
1299+ // would otherwise match `is_unreachable_error_str`.
1300+ let msg = rx. recv ( ) . await . expect ( "connect msg" ) ;
1301+ let reply = match msg {
1302+ MuxMsg :: Connect { reply, .. } => reply,
1303+ other => panic ! ( "expected Connect, got {:?}" , std:: mem:: discriminant( & other) ) ,
1304+ } ;
1305+ let _ = reply. send ( Err (
1306+ "relay failed: Network is unreachable (os error 101)" . into ( ) ,
1307+ ) ) ;
1308+
1309+ let res = task. await . expect ( "task" ) ;
1310+ assert ! ( res. is_err( ) , "connect_plain should surface the error" ) ;
1311+ assert ! (
1312+ !mux. is_unreachable( "real.target.example" , 443 ) ,
1313+ "outer relay error must not negative-cache the destination"
1314+ ) ;
1315+ }
1316+
1317+ #[ test]
1318+ fn negative_cache_enforces_hard_cap_under_unique_burst ( ) {
1319+ let ( mux, _rx) = mux_for_test ( ) ;
1320+ // Insert enough unique still-live entries to exceed the cap. The
1321+ // map size must never exceed UNREACHABLE_CACHE_MAX, even though
1322+ // every entry is fresh and `retain(expired)` prunes nothing.
1323+ let burst = UNREACHABLE_CACHE_MAX + 50 ;
1324+ for i in 0 ..burst {
1325+ let host = format ! ( "h{}.example" , i) ;
1326+ mux. record_unreachable_if_match (
1327+ & host,
1328+ 443 ,
1329+ "connect failed: Network is unreachable (os error 101)" ,
1330+ ) ;
1331+ }
1332+ let len = mux
1333+ . unreachable_cache
1334+ . lock ( )
1335+ . map ( |g| g. len ( ) )
1336+ . unwrap_or ( 0 ) ;
1337+ assert ! (
1338+ len <= UNREACHABLE_CACHE_MAX ,
1339+ "cache size {} exceeded cap {}" ,
1340+ len,
1341+ UNREACHABLE_CACHE_MAX
1342+ ) ;
1343+ }
1344+
10981345 #[ test]
10991346 fn server_speaks_first_covers_common_protocols ( ) {
11001347 for p in [ 21u16 , 22 , 25 , 80 , 110 , 143 , 587 ] {
@@ -1128,6 +1375,7 @@ mod tests {
11281375 preread_skip_unsupported : AtomicU64 :: new ( 0 ) ,
11291376 preread_win_total_us : AtomicU64 :: new ( 0 ) ,
11301377 preread_total_events : AtomicU64 :: new ( 0 ) ,
1378+ unreachable_cache : Mutex :: new ( HashMap :: new ( ) ) ,
11311379 } ) ;
11321380 ( mux, rx)
11331381 }
0 commit comments