Skip to content

Commit e033ca8

Browse files
committed
Use joblib for more robust parallel import scanning
https://joblib.readthedocs.io/en/stable/parallel.html Joblib takes care of some things for us. Relevant here: * Robust calculation of the number of available CPUs. * Sequential execution when n_jobs = 1. And likely other minor things I don't even understand.
1 parent 6b2dd86 commit e033ca8

File tree

4 files changed

+21
-22
lines changed

4 files changed

+21
-22
lines changed

CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ Changelog
55
Unreleased
66
----------
77

8+
* Use joblib for more robust parallel import scanning.
9+
810
3.8 (2025-04-11)
911
----------------
1012

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ authors = [
1616
]
1717
requires-python = ">=3.9"
1818
dependencies = [
19+
"joblib~=1.4.2",
1920
"typing-extensions>=3.10.0.0",
2021
]
2122
classifiers = [

src/grimp/application/usecases.py

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
"""
44

55
from typing import Dict, Sequence, Set, Type, Union, cast, Iterable, Collection
6-
import multiprocessing
76
import math
87

8+
import joblib # type: ignore
9+
910
from ..application.ports import caching
1011
from ..application.ports.filesystem import AbstractFileSystem
1112
from ..application.ports.graph import ImportGraph
@@ -21,7 +22,7 @@ class NotSupplied:
2122

2223

2324
# This is an arbitrary number, but setting it too low slows down our functional tests considerably.
24-
MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING = 50
25+
MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPLE_PROCESSES = 64
2526

2627

2728
def build_graph(
@@ -228,19 +229,19 @@ def _create_chunks(module_files: Collection[ModuleFile]) -> tuple[tuple[ModuleFi
228229
module_files_tuple = tuple(module_files)
229230

230231
number_of_module_files = len(module_files_tuple)
231-
n_chunks = _decide_number_of_of_processes(number_of_module_files)
232+
n_chunks = _decide_number_of_processes(number_of_module_files)
232233
chunk_size = math.ceil(number_of_module_files / n_chunks)
233234

234235
return tuple(
235236
module_files_tuple[i * chunk_size : (i + 1) * chunk_size] for i in range(n_chunks)
236237
)
237238

238239

239-
def _decide_number_of_of_processes(number_of_module_files: int) -> int:
240-
if number_of_module_files < MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING:
241-
# Don't incur the overhead of multiprocessing.
240+
def _decide_number_of_processes(number_of_module_files: int) -> int:
241+
if number_of_module_files < MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPLE_PROCESSES:
242+
# Don't incur the overhead of multiple processes.
242243
return 1
243-
return min(multiprocessing.cpu_count(), number_of_module_files)
244+
return min(joblib.cpu_count(), number_of_module_files)
244245

245246

246247
def _scan_chunks(
@@ -257,20 +258,15 @@ def _scan_chunks(
257258
)
258259

259260
number_of_processes = len(chunks)
260-
if number_of_processes == 1:
261-
# No need to spawn a process if there's only one chunk.
262-
[chunk] = chunks
263-
return _scan_chunk(import_scanner, exclude_type_checking_imports, chunk)
264-
else:
265-
with multiprocessing.Pool(number_of_processes) as pool:
266-
imports_by_module_file: Dict[ModuleFile, Set[DirectImport]] = {}
267-
import_scanning_jobs = pool.starmap(
268-
_scan_chunk,
269-
[(import_scanner, exclude_type_checking_imports, chunk) for chunk in chunks],
270-
)
271-
for chunk_imports_by_module_file in import_scanning_jobs:
272-
imports_by_module_file.update(chunk_imports_by_module_file)
273-
return imports_by_module_file
261+
import_scanning_jobs = joblib.Parallel(n_jobs=number_of_processes)(
262+
joblib.delayed(_scan_chunk)(import_scanner, exclude_type_checking_imports, chunk)
263+
for chunk in chunks
264+
)
265+
266+
imports_by_module_file = {}
267+
for chunk_imports_by_module_file in import_scanning_jobs:
268+
imports_by_module_file.update(chunk_imports_by_module_file)
269+
return imports_by_module_file
274270

275271

276272
def _scan_chunk(

tests/functional/test_build_and_use_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def test_modules():
5555
}
5656

5757

58-
@patch.object(usecases, "MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING", 0)
58+
@patch.object(usecases, "MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPLE_PROCESSES", 0)
5959
def test_modules_multiprocessing():
6060
"""
6161
This test runs relatively slowly, but it's important we cover the multiprocessing code.

0 commit comments

Comments
 (0)