Skip to content

Commit 929cd6f

Browse files
committed
tests: extend kill-after-join to remaining memory_ipc tests
The fix in the initial commit only covered test_send_buffers.py. The 63-minute CI hang confirmed that the deadlock was occurring in one of the other test files. This commit applies the same pattern, calling kill() when the child is still alive after join(timeout=...), to all remaining memory_ipc tests:
- test_errors.py (1 join)
- test_event_ipc.py (2 joins)
- test_ipc_duplicate_import.py (1 join)
- test_leaks.py (3 joins)
- test_memory_ipc.py (7 joins across 4 test classes)
- test_peer_access.py (2 joins)
- test_serialize.py (2 joins)

See issue #2004.
1 parent 036a8f5 commit 929cd6f

7 files changed

Lines changed: 142 additions & 0 deletions

File tree

cuda_core/tests/memory_ipc/test_errors.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import multiprocessing
55
import pickle
66
import re
7+
import sys
78

89
import pytest
910

@@ -43,6 +44,15 @@ def test_main(self, ipc_device, ipc_memory_resource):
4344

4445
# Wait for the child process.
4546
process.join(timeout=CHILD_TIMEOUT_SEC)
47+
if process.is_alive():
48+
print(
49+
f"[WARN] child process {process.pid} still alive after "
50+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
51+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
52+
file=sys.stderr,
53+
)
54+
process.kill()
55+
process.join()
4656
assert process.exitcode == 0
4757
finally:
4858
for mr in self._extra_mrs:

cuda_core/tests/memory_ipc/test_event_ipc.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,15 @@ def test_main(self, ipc_device, ipc_memory_resource):
6767
log("releasing stream1")
6868
latch.release()
6969
process.join(timeout=CHILD_TIMEOUT_SEC)
70+
if process.is_alive():
71+
print(
72+
f"[WARN] child process {process.pid} still alive after "
73+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
74+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
75+
file=sys.stderr,
76+
)
77+
process.kill()
78+
process.join()
7079
assert process.exitcode == 0
7180
log("done")
7281

@@ -162,6 +171,15 @@ def test_main(self, ipc_device, blocking_sync, use_options_cls, use_option_kw):
162171
assert props[5] is None
163172

164173
process.join(timeout=CHILD_TIMEOUT_SEC)
174+
if process.is_alive():
175+
print(
176+
f"[WARN] child process {process.pid} still alive after "
177+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
178+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
179+
file=sys.stderr,
180+
)
181+
process.kill()
182+
process.join()
165183
assert process.exitcode == 0
166184

167185
def child_main(self, q_in, q_out):

cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,15 @@ def test_main(self, ipc_device, ipc_memory_resource):
8484

8585
log("waiting for child")
8686
process.join(timeout=CHILD_TIMEOUT_SEC)
87+
if process.is_alive():
88+
print(
89+
f"[WARN] child process {process.pid} still alive after "
90+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
91+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
92+
file=sys.stderr,
93+
)
94+
process.kill()
95+
process.join()
8796
log(f"child exit code: {process.exitcode}")
8897
assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}"
8998
log("done")

cuda_core/tests/memory_ipc/test_leaks.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import gc
66
import multiprocessing as mp
77
import platform
8+
import sys
89

910
try:
1011
import psutil
@@ -38,6 +39,15 @@ def exec_success(obj, number=1):
3839
process = mp.Process(target=child_main, args=(obj,))
3940
process.start()
4041
process.join(timeout=CHILD_TIMEOUT_SEC)
42+
if process.is_alive():
43+
print(
44+
f"[WARN] child process {process.pid} still alive after "
45+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
46+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
47+
file=sys.stderr,
48+
)
49+
process.kill()
50+
process.join()
4151
assert process.exitcode == 0
4252

4353

@@ -54,6 +64,15 @@ def exec_launch_failure(obj, number=1):
5464
process = mp.Process(target=child_main_bad, args=(obj,))
5565
process.start()
5666
process.join(timeout=CHILD_TIMEOUT_SEC)
67+
if process.is_alive():
68+
print(
69+
f"[WARN] child process {process.pid} still alive after "
70+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
71+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
72+
file=sys.stderr,
73+
)
74+
process.kill()
75+
process.join()
5776
assert process.exitcode != 0
5877

5978

@@ -137,5 +156,14 @@ def prime():
137156
process = mp.Process()
138157
process.start()
139158
process.join(timeout=CHILD_TIMEOUT_SEC)
159+
if process.is_alive():
160+
print(
161+
f"[WARN] child process {process.pid} still alive after "
162+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
163+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
164+
file=sys.stderr,
165+
)
166+
process.kill()
167+
process.join()
140168
assert process.exitcode == 0
141169
prime_was_run = True

cuda_core/tests/memory_ipc/test_memory_ipc.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
import multiprocessing as mp
5+
import sys
56

67
import pytest
78
from helpers.buffers import PatternGen
@@ -39,6 +40,15 @@ def test_main(self, ipc_device, ipc_memory_resource):
3940

4041
# Wait for the child process.
4142
process.join(timeout=CHILD_TIMEOUT_SEC)
43+
if process.is_alive():
44+
print(
45+
f"[WARN] child process {process.pid} still alive after "
46+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
47+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
48+
file=sys.stderr,
49+
)
50+
process.kill()
51+
process.join()
4252
assert process.exitcode == 0
4353

4454
# Verify that the buffer was modified.
@@ -82,6 +92,16 @@ def test_main(self, ipc_device, ipc_memory_resource):
8292
# Wait for the child processes.
8393
p1.join(timeout=CHILD_TIMEOUT_SEC)
8494
p2.join(timeout=CHILD_TIMEOUT_SEC)
95+
for p in (p1, p2):
96+
if p.is_alive():
97+
print(
98+
f"[WARN] child process {p.pid} still alive after "
99+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
100+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
101+
file=sys.stderr,
102+
)
103+
p.kill()
104+
p.join()
85105
assert p1.exitcode == 0
86106
assert p2.exitcode == 0
87107

@@ -135,6 +155,16 @@ def test_main(self, ipc_device, ipc_memory_resource):
135155
# Wait for children.
136156
p1.join(timeout=CHILD_TIMEOUT_SEC)
137157
p2.join(timeout=CHILD_TIMEOUT_SEC)
158+
for p in (p1, p2):
159+
if p.is_alive():
160+
print(
161+
f"[WARN] child process {p.pid} still alive after "
162+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
163+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
164+
file=sys.stderr,
165+
)
166+
p.kill()
167+
p.join()
138168
assert p1.exitcode == 0
139169
assert p2.exitcode == 0
140170

@@ -185,6 +215,16 @@ def test_main(self, ipc_device, ipc_memory_resource):
185215
# Wait for children.
186216
p1.join(timeout=CHILD_TIMEOUT_SEC)
187217
p2.join(timeout=CHILD_TIMEOUT_SEC)
218+
for p in (p1, p2):
219+
if p.is_alive():
220+
print(
221+
f"[WARN] child process {p.pid} still alive after "
222+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
223+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
224+
file=sys.stderr,
225+
)
226+
p.kill()
227+
p.join()
188228
assert p1.exitcode == 0
189229
assert p2.exitcode == 0
190230

cuda_core/tests/memory_ipc/test_peer_access.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,15 @@ def test_main(self, ipc_mempool_device_x2):
3535
process = mp.Process(target=self.child_main, args=(mr,))
3636
process.start()
3737
process.join(timeout=CHILD_TIMEOUT_SEC)
38+
if process.is_alive():
39+
print(
40+
f"[WARN] child process {process.pid} still alive after "
41+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
42+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
43+
file=sys.stderr,
44+
)
45+
process.kill()
46+
process.join()
3847
assert process.exitcode == 0
3948

4049
# Verify parent's MR still has peer access set (independent state)
@@ -81,6 +90,15 @@ def test_main(self, ipc_mempool_device_x2, grant_access_in_parent):
8190
process = mp.Process(target=self.child_main, args=(mr, buffer))
8291
process.start()
8392
process.join(timeout=CHILD_TIMEOUT_SEC)
93+
if process.is_alive():
94+
print(
95+
f"[WARN] child process {process.pid} still alive after "
96+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
97+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
98+
file=sys.stderr,
99+
)
100+
process.kill()
101+
process.join()
84102
assert process.exitcode == 0
85103

86104
buffer.close()

cuda_core/tests/memory_ipc/test_serialize.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import multiprocessing as mp
55
import multiprocessing.reduction
66
import os
7+
import sys
78

89
import pytest
910
from helpers.buffers import PatternGen
@@ -46,6 +47,15 @@ def test_main(self, ipc_device, ipc_memory_resource):
4647

4748
# Wait for the child process.
4849
process.join(timeout=CHILD_TIMEOUT_SEC)
50+
if process.is_alive():
51+
print(
52+
f"[WARN] child process {process.pid} still alive after "
53+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
54+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
55+
file=sys.stderr,
56+
)
57+
process.kill()
58+
process.join()
4959
assert process.exitcode == 0
5060

5161
# Confirm buffers were modified.
@@ -103,6 +113,15 @@ def test_main(self, ipc_device, ipc_memory_resource):
103113

104114
# Wait for the child process.
105115
process.join(timeout=CHILD_TIMEOUT_SEC)
116+
if process.is_alive():
117+
print(
118+
f"[WARN] child process {process.pid} still alive after "
119+
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
120+
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
121+
file=sys.stderr,
122+
)
123+
process.kill()
124+
process.join()
106125
assert process.exitcode == 0
107126

108127
# Confirm buffer was modified.

0 commit comments

Comments
 (0)