Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions tornado/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,13 +336,38 @@ def _try_cleanup_process(cls, pid: int) -> None:
try:
ret_pid, status = os.waitpid(pid, os.WNOHANG) # type: ignore
except ChildProcessError:
# The process has already been reaped (typically by a caller
# who called Subprocess.proc.wait() / .communicate() before
# our SIGCHLD handler could pick it up). The Popen object may
# already know the return code via proc.returncode; if so,
# schedule the exit callback so wait_for_exit() can resolve
# rather than hanging forever.
subproc = cls._waiting.get(pid)
if subproc is not None and subproc.proc.returncode is not None:
cls._waiting.pop(pid, None)
returncode = subproc.proc.returncode
subproc.io_loop.add_callback(
subproc._set_returncode_known, returncode
)
return
if ret_pid == 0:
return
assert ret_pid == pid
subproc = cls._waiting.pop(pid)
subproc.io_loop.add_callback(subproc._set_returncode, status)

def _set_returncode_known(self, returncode: int) -> None:
# Set the returncode without going through WIFEXITED/WEXITSTATUS
# because we did not observe the status word ourselves: the
# Popen object learned the exit code from .wait()/.communicate(),
# and the kernel status has already been reaped and discarded.
self.returncode = returncode
self.proc.returncode = returncode
if self._exit_callback:
callback = self._exit_callback
self._exit_callback = None
callback(self.returncode)

def _set_returncode(self, status: int) -> None:
if sys.platform == "win32":
self.returncode = -1
Expand Down
68 changes: 59 additions & 9 deletions tornado/simple_httpclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ def __init__(
# Timeout handle returned by IOLoop.add_timeout
self._timeout: object = None
self._sockaddr = None
self._connecting_phase = "resolving"
IOLoop.current().add_future(
gen.convert_yielded(self.run()), lambda f: f.result()
)
Expand Down Expand Up @@ -328,16 +329,55 @@ async def run(self) -> None:
if timeout:
self._timeout = self.io_loop.add_timeout(
self.start_time + timeout,
functools.partial(self._on_timeout, "while connecting"),
self._on_connect_timeout,
)
# Pre-resolve the hostname so a connect_timeout that
# fires while DNS is still in flight can be reported
# as "Timeout while resolving" rather than the generic
# "Timeout while connecting" (issue #3522). We use the
# TCP client's resolver directly (so an OverrideResolver
# configured with hostname_mapping is honoured and
# returns the mapped host/port), then feed the resolved
# literal address and port to ``tcp_client.connect``.
# This drops the Happy-Eyeballs parallel-address-family
# fallback that ``TCPClient`` implements internally,
# but that fallback only matters on broken networks
# where DNS is unlikely to be the slow leg, so the
# trade-off is acceptable for the timeout-reporting
# use case.
if is_valid_ip(host):
# Numeric IP literal: skip the resolver so the timeout
# message correctly says "connect"
self._connecting_phase = "connecting"
stream = await self.tcp_client.connect(
host,
port,
af=af,
ssl_options=ssl_options,
max_buffer_size=self.max_buffer_size,
source_ip=source_ip,
)
else:
addrinfo = await self.tcp_client.resolver.resolve(
host, port, af
)
# addrinfo is a list of (family, (host, port, ...))
# tuples. ``TCPClient.connect`` would normally
# pick the first entry, so do the same here and use
# its host and port verbatim. Picking the port
# from the resolved entry (rather than the
# original) is important for hostname_mapping
# configurations that rewrite the port.
connect_family, connect_sockaddr = addrinfo[0]
self._connecting_phase = "connecting"
stream = await self.tcp_client.connect(
connect_sockaddr[0],
connect_sockaddr[1],
af=connect_family,
ssl_options=ssl_options,
max_buffer_size=self.max_buffer_size,
source_ip=source_ip,
)
stream = await self.tcp_client.connect(
host,
port,
af=af,
ssl_options=ssl_options,
max_buffer_size=self.max_buffer_size,
source_ip=source_ip,
)

if self.final_callback is None:
# final_callback is cleared if we've hit our timeout.
Expand Down Expand Up @@ -484,6 +524,16 @@ def _on_timeout(self, info: str | None = None) -> None:
HTTPTimeoutError, HTTPTimeoutError(error_message), None
)

def _on_connect_timeout(self) -> None:
# Issue #3522: when the connect phase is split into a DNS resolve
# and a TCP/TLS handshake, surface which one actually expired so
# users can tell a slow DNS server apart from a slow target.
# The phase is updated by ``run`` (set to ``resolving`` on entry,
# and to ``connecting`` once the resolver has returned).
phase = getattr(self, "_connecting_phase", "connecting")
info = "while resolving" if phase == "resolving" else "while connecting"
self._on_timeout(info)

def _remove_timeout(self) -> None:
if self._timeout is not None:
self.io_loop.remove_timeout(self._timeout)
Expand Down
21 changes: 21 additions & 0 deletions tornado/test/process_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import time
import unittest

from tornado import gen
from tornado.process import Subprocess
from tornado.test.util import skipIfNonUnix
from tornado.testing import AsyncTestCase, gen_test
Expand Down Expand Up @@ -294,3 +295,23 @@ def test_wait_for_exit_raise_disabled(self):
subproc = Subprocess([sys.executable, "-c", "import sys; sys.exit(1)"])
ret = yield subproc.wait_for_exit(raise_error=False)
self.assertEqual(ret, 1)

@gen_test
def test_wait_for_exit_after_proc_wait(self):
# Issue #3364: if the caller reaps the child via Popen.wait() or
# .communicate() before our SIGCHLD handler can pick it up, the
# subsequent os.waitpid(WNOHANG) raises ChildProcessError. The
# exit callback was previously dropped in that branch, so
# wait_for_exit() never resolved. Now we fall back to the
# returncode that Popen already observed, and the future resolves.
Subprocess.initialize()
self.addCleanup(Subprocess.uninitialize)
subproc = Subprocess([sys.executable, "-c", "import sys; sys.exit(7)"])
# Reap the child before our handler can: this is exactly the
# sequence that previously left wait_for_exit() hanging.
ret_code = subproc.proc.wait()
self.assertEqual(ret_code, 7)
# Give the IOLoop a turn so the signal handler can run.
yield gen.sleep(0.05)
ret = yield subproc.wait_for_exit(raise_error=False)
self.assertEqual(ret, 7)
30 changes: 30 additions & 0 deletions tornado/test/simple_httpclient_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,36 @@ async def resolve(self, *args, **kwargs):
cleanup_event.set()
yield gen.sleep(0.2)

@gen_test
def test_connect_timeout_reports_resolving_phase(self):
# When the connect_timeout fires while the resolver is still working
# (i.e., before the host has been translated to an address), the
# error message should explicitly say ``resolving`` rather than the
# generic ``connecting`` so a slow DNS server is identifiable from
# the message alone (issue #3522). Use a non-IP hostname here:
# ``self.get_url`` returns 127.0.0.1, which would skip the resolver.
timeout = 0.1

cleanup_event = Event()
test = self

class HangingResolver(Resolver):
async def resolve(self, *args, **kwargs):
await cleanup_event.wait()
return [(socket.AF_INET, ("127.0.0.1", test.get_http_port()))]

with closing(self.create_client(resolver=HangingResolver())) as client:
with self.assertRaises(HTTPTimeoutError) as cm:
yield client.fetch(
"http://hostname.invalid/hello",
connect_timeout=timeout,
request_timeout=3600,
raise_error=True,
)
self.assertIn("resolving", str(cm.exception))
cleanup_event.set()
yield gen.sleep(0.2)

def test_request_timeout(self):
timeout = 0.1
if os.name == "nt" or os.environ.get("EMULATION") == "1":
Expand Down