diff --git a/tornado/process.py b/tornado/process.py index 493481fd1..7abdd5c4d 100644 --- a/tornado/process.py +++ b/tornado/process.py @@ -336,6 +336,19 @@ def _try_cleanup_process(cls, pid: int) -> None: try: ret_pid, status = os.waitpid(pid, os.WNOHANG) # type: ignore except ChildProcessError: + # The process has already been reaped (typically by a caller + # who called Subprocess.proc.wait() / .communicate() before + # our SIGCHLD handler could pick it up). The Popen object may + # already know the return code via proc.returncode; if so, + # schedule the exit callback so wait_for_exit() can resolve + # rather than hanging forever. + subproc = cls._waiting.get(pid) + if subproc is not None and subproc.proc.returncode is not None: + cls._waiting.pop(pid, None) + returncode = subproc.proc.returncode + subproc.io_loop.add_callback( + subproc._set_returncode_known, returncode + ) return if ret_pid == 0: return @@ -343,6 +356,18 @@ def _try_cleanup_process(cls, pid: int) -> None: subproc = cls._waiting.pop(pid) subproc.io_loop.add_callback(subproc._set_returncode, status) + def _set_returncode_known(self, returncode: int) -> None: + # Set the returncode without going through WIFEXITED/WEXITSTATUS + # because we did not observe the status word ourselves: the + # Popen object learned the exit code from .wait()/.communicate(), + # and the kernel status has already been reaped and discarded. + self.returncode = returncode + self.proc.returncode = returncode + if self._exit_callback: + callback = self._exit_callback + self._exit_callback = None + callback(self.returncode) + def _set_returncode(self, status: int) -> None: if sys.platform == "win32": self.returncode = -1 diff --git a/tornado/simple_httpclient.py b/tornado/simple_httpclient.py index b8e4d8c93..840320398 100644 --- a/tornado/simple_httpclient.py +++ b/tornado/simple_httpclient.py @@ -276,6 +276,7 @@ def __init__( # Timeout handle returned by IOLoop.add_timeout self._timeout: object = None self._sockaddr = None + self._connecting_phase = "resolving" IOLoop.current().add_future( gen.convert_yielded(self.run()), lambda f: f.result() ) @@ -328,16 +329,55 @@ async def run(self) -> None: if timeout: self._timeout = self.io_loop.add_timeout( self.start_time + timeout, - functools.partial(self._on_timeout, "while connecting"), + self._on_connect_timeout, + ) + # Pre-resolve the hostname so a connect_timeout that + # fires while DNS is still in flight can be reported + # as "Timeout while resolving" rather than the generic + # "Timeout while connecting" (issue #3522). We use the + # TCP client's resolver directly (so an OverrideResolver + # configured with hostname_mapping is honoured and + # returns the mapped host/port), then feed the resolved + # literal address and port to ``tcp_client.connect``. + # This drops the Happy-Eyeballs parallel-address-family + # fallback that ``TCPClient`` implements internally, + # but that fallback only matters on broken networks + # where DNS is unlikely to be the slow leg, so the + # trade-off is acceptable for the timeout-reporting + # use case. + if is_valid_ip(host): + # Numeric IP literal: skip the resolver so the timeout + # message correctly says "connect" + self._connecting_phase = "connecting" + stream = await self.tcp_client.connect( + host, + port, + af=af, + ssl_options=ssl_options, + max_buffer_size=self.max_buffer_size, + source_ip=source_ip, + ) + else: + addrinfo = await self.tcp_client.resolver.resolve( + host, port, af + ) + # addrinfo is a list of (family, (host, port, ...)) + # tuples. ``TCPClient.connect`` would normally + # pick the first entry, so do the same here and use + # its host and port verbatim. Picking the port + # from the resolved entry (rather than the + # original) is important for hostname_mapping + # configurations that rewrite the port. + connect_family, connect_sockaddr = addrinfo[0] + self._connecting_phase = "connecting" + stream = await self.tcp_client.connect( + connect_sockaddr[0], + connect_sockaddr[1], + af=connect_family, + ssl_options=ssl_options, + max_buffer_size=self.max_buffer_size, + source_ip=source_ip, ) - stream = await self.tcp_client.connect( - host, - port, - af=af, - ssl_options=ssl_options, - max_buffer_size=self.max_buffer_size, - source_ip=source_ip, - ) if self.final_callback is None: # final_callback is cleared if we've hit our timeout. @@ -484,6 +524,16 @@ def _on_timeout(self, info: str | None = None) -> None: HTTPTimeoutError, HTTPTimeoutError(error_message), None ) + def _on_connect_timeout(self) -> None: + # Issue #3522: when the connect phase is split into a DNS resolve + # and a TCP/TLS handshake, surface which one actually expired so + # users can tell a slow DNS server apart from a slow target. + # The phase is updated by ``run`` (set to ``resolving`` on entry, + # and to ``connecting`` once the resolver has returned). + phase = getattr(self, "_connecting_phase", "connecting") + info = "while resolving" if phase == "resolving" else "while connecting" + self._on_timeout(info) + def _remove_timeout(self) -> None: if self._timeout is not None: self.io_loop.remove_timeout(self._timeout) diff --git a/tornado/test/process_test.py b/tornado/test/process_test.py index 775f99a3f..79fa8f57a 100644 --- a/tornado/test/process_test.py +++ b/tornado/test/process_test.py @@ -5,6 +5,7 @@ import time import unittest +from tornado import gen from tornado.process import Subprocess from tornado.test.util import skipIfNonUnix from tornado.testing import AsyncTestCase, gen_test @@ -294,3 +295,23 @@ def test_wait_for_exit_raise_disabled(self): subproc = Subprocess([sys.executable, "-c", "import sys; sys.exit(1)"]) ret = yield subproc.wait_for_exit(raise_error=False) self.assertEqual(ret, 1) + + @gen_test + def test_wait_for_exit_after_proc_wait(self): + # Issue #3364: if the caller reaps the child via Popen.wait() or + # .communicate() before our SIGCHLD handler can pick it up, the + # subsequent os.waitpid(WNOHANG) raises ChildProcessError. The + # exit callback was previously dropped in that branch, so + # wait_for_exit() never resolved. Now we fall back to the + # returncode that Popen already observed, and the future resolves. + Subprocess.initialize() + self.addCleanup(Subprocess.uninitialize) + subproc = Subprocess([sys.executable, "-c", "import sys; sys.exit(7)"]) + # Reap the child before our handler can: this is exactly the + # sequence that previously left wait_for_exit() hanging. + ret_code = subproc.proc.wait() + self.assertEqual(ret_code, 7) + # Give the IOLoop a turn so the signal handler can run. + yield gen.sleep(0.05) + ret = yield subproc.wait_for_exit(raise_error=False) + self.assertEqual(ret, 7) diff --git a/tornado/test/simple_httpclient_test.py b/tornado/test/simple_httpclient_test.py index 92c356310..111b7904c 100644 --- a/tornado/test/simple_httpclient_test.py +++ b/tornado/test/simple_httpclient_test.py @@ -313,6 +313,36 @@ async def resolve(self, *args, **kwargs): cleanup_event.set() yield gen.sleep(0.2) + @gen_test + def test_connect_timeout_reports_resolving_phase(self): + # When the connect_timeout fires while the resolver is still working + # (i.e., before the host has been translated to an address), the + # error message should explicitly say ``resolving`` rather than the + # generic ``connecting`` so a slow DNS server is identifiable from + # the message alone (issue #3522). Use a non-IP hostname here: + # ``self.get_url`` returns 127.0.0.1, which would skip the resolver. + timeout = 0.1 + + cleanup_event = Event() + test = self + + class HangingResolver(Resolver): + async def resolve(self, *args, **kwargs): + await cleanup_event.wait() + return [(socket.AF_INET, ("127.0.0.1", test.get_http_port()))] + + with closing(self.create_client(resolver=HangingResolver())) as client: + with self.assertRaises(HTTPTimeoutError) as cm: + yield client.fetch( + "http://hostname.invalid/hello", + connect_timeout=timeout, + request_timeout=3600, + raise_error=True, + ) + self.assertIn("resolving", str(cm.exception)) + cleanup_event.set() + yield gen.sleep(0.2) + def test_request_timeout(self): timeout = 0.1 if os.name == "nt" or os.environ.get("EMULATION") == "1":