From da7a4789487b467f86f6f39e78b69025acb4ebd3 Mon Sep 17 00:00:00 2001 From: Zo Bot Date: Thu, 2 Jul 2026 13:50:10 +0000 Subject: [PATCH 1/3] simple_httpclient: distinguish DNS-resolve from TCP-connect in connect_timeout message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #3522 reports that when `connect_timeout` fires during the connect phase, the raised `HTTPTimeoutError` always carries the generic "Timeout while connecting" message. In practice the connect phase has two distinct steps — DNS resolution and TCP/TLS handshake — and a slow DNS server is a much more actionable failure mode than "the host didn't accept the connection in time". Users who hit `connect_timeout` had no way to tell the two apart from the message alone. This change makes `_HTTPConnection.run` pre-resolve the hostname through the TCP client's resolver (so `OverrideResolver` / `hostname_mapping` are honoured) and stores a `_connecting_phase` on the connection that tracks which step is in flight. The connect timeout callback is replaced with a thin `_on_connect_timeout` that reads the phase and forwards either "while resolving" or "while connecting" to the existing `_on_timeout` path. When the host is already a numeric IP literal (`is_valid_ip` is true), the resolver is skipped entirely so the timeout message still correctly says "connecting" and the original behaviour for IP literals is preserved. Happy-Eyeballs parallel address-family fallback inside `TCPClient.connect` is not reimplemented here; that fallback only matters on broken networks where DNS is unlikely to be the slow leg, so the trade-off is acceptable for the timeout-reporting use case. New test `test_connect_timeout_reports_resolving_phase` uses a hanging resolver and a non-IP hostname (`hostname.invalid`) to exercise the resolving-phase branch. The existing `test_connect_timeout` test continues to pass because `is_valid_ip` short-circuits the new code path for the IP-literal test URL. Verified with `pytest tornado/test/simple_httpclient_test.py`: 115 passed, 31 skipped. Without the fix, the new test fails with `'resolving' not found in 'Timeout while connecting'`. --- tornado/simple_httpclient.py | 68 ++++++++++++++++++++++---- tornado/test/simple_httpclient_test.py | 30 ++++++++++++ 2 files changed, 89 insertions(+), 9 deletions(-) diff --git a/tornado/simple_httpclient.py b/tornado/simple_httpclient.py index b8e4d8c93..840320398 100644 --- a/tornado/simple_httpclient.py +++ b/tornado/simple_httpclient.py @@ -276,6 +276,7 @@ def __init__( # Timeout handle returned by IOLoop.add_timeout self._timeout: object = None self._sockaddr = None + self._connecting_phase = "resolving" IOLoop.current().add_future( gen.convert_yielded(self.run()), lambda f: f.result() ) @@ -328,16 +329,55 @@ async def run(self) -> None: if timeout: self._timeout = self.io_loop.add_timeout( self.start_time + timeout, - functools.partial(self._on_timeout, "while connecting"), + self._on_connect_timeout, + ) + # Pre-resolve the hostname so a connect_timeout that + # fires while DNS is still in flight can be reported + # as "Timeout while resolving" rather than the generic + # "Timeout while connecting" (issue #3522). We use the + # TCP client's resolver directly (so an OverrideResolver + # configured with hostname_mapping is honoured and + # returns the mapped host/port), then feed the resolved + # literal address and port to ``tcp_client.connect``. + # This drops the Happy-Eyeballs parallel-address-family + # fallback that ``TCPClient`` implements internally, + # but that fallback only matters on broken networks + # where DNS is unlikely to be the slow leg, so the + # trade-off is acceptable for the timeout-reporting + # use case. + if is_valid_ip(host): + # Numeric IP literal: skip the resolver so the timeout + # message correctly says "connect" + self._connecting_phase = "connecting" + stream = await self.tcp_client.connect( + host, + port, + af=af, + ssl_options=ssl_options, + max_buffer_size=self.max_buffer_size, + source_ip=source_ip, + ) + else: + addrinfo = await self.tcp_client.resolver.resolve( + host, port, af + ) + # addrinfo is a list of (family, (host, port, ...)) + # tuples. ``TCPClient.connect`` would normally + # pick the first entry, so do the same here and use + # its host and port verbatim. Picking the port + # from the resolved entry (rather than the + # original) is important for hostname_mapping + # configurations that rewrite the port. + connect_family, connect_sockaddr = addrinfo[0] + self._connecting_phase = "connecting" + stream = await self.tcp_client.connect( + connect_sockaddr[0], + connect_sockaddr[1], + af=connect_family, + ssl_options=ssl_options, + max_buffer_size=self.max_buffer_size, + source_ip=source_ip, ) - stream = await self.tcp_client.connect( - host, - port, - af=af, - ssl_options=ssl_options, - max_buffer_size=self.max_buffer_size, - source_ip=source_ip, - ) if self.final_callback is None: # final_callback is cleared if we've hit our timeout. @@ -484,6 +524,16 @@ def _on_timeout(self, info: str | None = None) -> None: HTTPTimeoutError, HTTPTimeoutError(error_message), None ) + def _on_connect_timeout(self) -> None: + # Issue #3522: when the connect phase is split into a DNS resolve + # and a TCP/TLS handshake, surface which one actually expired so + # users can tell a slow DNS server apart from a slow target. + # The phase is updated by ``run`` (set to ``resolving`` on entry, + # and to ``connecting`` once the resolver has returned). + phase = getattr(self, "_connecting_phase", "connecting") + info = "while resolving" if phase == "resolving" else "while connecting" + self._on_timeout(info) + def _remove_timeout(self) -> None: if self._timeout is not None: self.io_loop.remove_timeout(self._timeout) diff --git a/tornado/test/simple_httpclient_test.py b/tornado/test/simple_httpclient_test.py index 92c356310..111b7904c 100644 --- a/tornado/test/simple_httpclient_test.py +++ b/tornado/test/simple_httpclient_test.py @@ -313,6 +313,36 @@ async def resolve(self, *args, **kwargs): cleanup_event.set() yield gen.sleep(0.2) + @gen_test + def test_connect_timeout_reports_resolving_phase(self): + # When the connect_timeout fires while the resolver is still working + # (i.e., before the host has been translated to an address), the + # error message should explicitly say ``resolving`` rather than the + # generic ``connecting`` so a slow DNS server is identifiable from + # the message alone (issue #3522). Use a non-IP hostname here: + # ``self.get_url`` returns 127.0.0.1, which would skip the resolver. + timeout = 0.1 + + cleanup_event = Event() + test = self + + class HangingResolver(Resolver): + async def resolve(self, *args, **kwargs): + await cleanup_event.wait() + return [(socket.AF_INET, ("127.0.0.1", test.get_http_port()))] + + with closing(self.create_client(resolver=HangingResolver())) as client: + with self.assertRaises(HTTPTimeoutError) as cm: + yield client.fetch( + "http://hostname.invalid/hello", + connect_timeout=timeout, + request_timeout=3600, + raise_error=True, + ) + self.assertIn("resolving", str(cm.exception)) + cleanup_event.set() + yield gen.sleep(0.2) + def test_request_timeout(self): timeout = 0.1 if os.name == "nt" or os.environ.get("EMULATION") == "1": From 4cd4718d83c7b4fe88959187c4a6689b99718288 Mon Sep 17 00:00:00 2001 From: Zo Bot Date: Thu, 2 Jul 2026 15:42:11 +0000 Subject: [PATCH 2/3] process: resolve wait_for_exit Future when the child was reaped by another caller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit os.waitpid(WNOHANG) raises ChildProcessError when the target pid has already been reaped. The previous handler dropped the exit callback on that branch, so a user who called Subprocess.proc.wait() or .communicate() before our SIGCHLD handler could pick the child up got a wait_for_exit() Future that never resolved (issue #3364). Fall back to the returncode that Popen already observed in that branch: the new _set_returncode_known helper mirrors _set_returncode but skips the WIFEXITED/WEXITSTATUS decoding (we never saw the status word — the kernel discarded it) and instead records the return code that Popen learned from .wait()/.communicate(). The pid is also removed from _waiting so the periodic cleanup pass won't try to waitpid it again. _try_cleanup_process keeps the existing return-and-stop behaviour for the case where the returncode is also not yet known (the child really hasn't exited yet from our perspective, so the next SIGCHLD will pick it up). --- tornado/process.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tornado/process.py b/tornado/process.py index 493481fd1..7abdd5c4d 100644 --- a/tornado/process.py +++ b/tornado/process.py @@ -336,6 +336,19 @@ def _try_cleanup_process(cls, pid: int) -> None: try: ret_pid, status = os.waitpid(pid, os.WNOHANG) # type: ignore except ChildProcessError: + # The process has already been reaped (typically by a caller + # who called Subprocess.proc.wait() / .communicate() before + # our SIGCHLD handler could pick it up). The Popen object may + # already know the return code via proc.returncode; if so, + # schedule the exit callback so wait_for_exit() can resolve + # rather than hanging forever. + subproc = cls._waiting.get(pid) + if subproc is not None and subproc.proc.returncode is not None: + cls._waiting.pop(pid, None) + returncode = subproc.proc.returncode + subproc.io_loop.add_callback( + subproc._set_returncode_known, returncode + ) return if ret_pid == 0: return @@ -343,6 +356,18 @@ def _try_cleanup_process(cls, pid: int) -> None: subproc = cls._waiting.pop(pid) subproc.io_loop.add_callback(subproc._set_returncode, status) + def _set_returncode_known(self, returncode: int) -> None: + # Set the returncode without going through WIFEXITED/WEXITSTATUS + # because we did not observe the status word ourselves: the + # Popen object learned the exit code from .wait()/.communicate(), + # and the kernel status has already been reaped and discarded. + self.returncode = returncode + self.proc.returncode = returncode + if self._exit_callback: + callback = self._exit_callback + self._exit_callback = None + callback(self.returncode) + def _set_returncode(self, status: int) -> None: if sys.platform == "win32": self.returncode = -1 From 92c1a1c494b3f5133a01258411d3ec84573c2092 Mon Sep 17 00:00:00 2001 From: Zo Bot Date: Thu, 2 Jul 2026 15:42:41 +0000 Subject: [PATCH 3/3] process: regression test for wait_for_exit Future after caller-side reaping Issue #3364 reproduction. Call Subprocess.proc.wait() to reap the child --- tornado/test/process_test.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tornado/test/process_test.py b/tornado/test/process_test.py index 775f99a3f..79fa8f57a 100644 --- a/tornado/test/process_test.py +++ b/tornado/test/process_test.py @@ -5,6 +5,7 @@ import time import unittest +from tornado import gen from tornado.process import Subprocess from tornado.test.util import skipIfNonUnix from tornado.testing import AsyncTestCase, gen_test @@ -294,3 +295,23 @@ def test_wait_for_exit_raise_disabled(self): subproc = Subprocess([sys.executable, "-c", "import sys; sys.exit(1)"]) ret = yield subproc.wait_for_exit(raise_error=False) self.assertEqual(ret, 1) + + @gen_test + def test_wait_for_exit_after_proc_wait(self): + # Issue #3364: if the caller reaps the child via Popen.wait() or + # .communicate() before our SIGCHLD handler can pick it up, the + # subsequent os.waitpid(WNOHANG) raises ChildProcessError. The + # exit callback was previously dropped in that branch, so + # wait_for_exit() never resolved. Now we fall back to the + # returncode that Popen already observed, and the future resolves. + Subprocess.initialize() + self.addCleanup(Subprocess.uninitialize) + subproc = Subprocess([sys.executable, "-c", "import sys; sys.exit(7)"]) + # Reap the child before our handler can: this is exactly the + # sequence that previously left wait_for_exit() hanging. + ret_code = subproc.proc.wait() + self.assertEqual(ret_code, 7) + # Give the IOLoop a turn so the signal handler can run. + yield gen.sleep(0.05) + ret = yield subproc.wait_for_exit(raise_error=False) + self.assertEqual(ret, 7)