From d338d3d17d703d9f5b3a594578f9b10522d9928b Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Braun Date: Sat, 9 May 2026 20:26:46 +0200 Subject: [PATCH] fix: replace pkg_resources with importlib.metadata (#1816) Drop the runtime dependency on pkg_resources from profile_report.py and utils/versions.py. pkg_resources was removed in setuptools >= 81 (Aug 2025), which made every fresh install raise ModuleNotFoundError: No module named 'pkg_resources' on import of data_profiling. importlib.metadata.version provides the same lookup and ships in the stdlib from Python 3.8 onward; pyproject.toml already pins requires-python >= 3.10, so the previous pkg_resources fallback in utils/versions.py was dead code under any supported runtime. The Pillow-version branch in ProfileReport.to_file is also hardened to tolerate non-numeric pre-release segments (e.g. "11.0.0a1") and to skip the warning rather than crash when Pillow is not installed (it's an indirect dependency, so trimmed environments may lack it). Adds tests/issues/test_issue1816.py with three guards: - data_profiling.profile_report import must not pull pkg_resources in - data_profiling.utils.versions import must not pull pkg_resources in - ProfileReport.to_file end-to-end must not re-import pkg_resources All three tests fail on origin/develop (collection error: the import chain crashes when pkg_resources is unavailable) and pass on this branch. 
Co-Authored-By: Claude Code --- src/data_profiling/profile_report.py | 42 +++++++++++++------ src/data_profiling/utils/versions.py | 14 +++---- tests/issues/test_issue1816.py | 63 ++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 19 deletions(-) create mode 100644 tests/issues/test_issue1816.py diff --git a/src/data_profiling/profile_report.py b/src/data_profiling/profile_report.py index 8d68b8775..3feec11a9 100644 --- a/src/data_profiling/profile_report.py +++ b/src/data_profiling/profile_report.py @@ -1,15 +1,20 @@ import copy import json import warnings + +# Use stdlib `importlib.metadata` instead of `pkg_resources`. The latter +# was removed in setuptools >= 81 (see +# https://setuptools.pypa.io/en/latest/history.html#v81-0-0); since +# pyproject.toml pins ``requires-python = ">=3.10"`` and +# ``importlib.metadata`` ships with the stdlib from Python 3.8 onward, +# no fallback is needed here. +from importlib.metadata import PackageNotFoundError +from importlib.metadata import version as _metadata_version from pathlib import Path from typing import Any, Optional, Union from data_profiling.utils.backend import is_pyspark_installed -with warnings.catch_warnings(): - warnings.simplefilter("ignore") - import pkg_resources - if not is_pyspark_installed(): from typing import TypeVar @@ -357,14 +362,27 @@ def to_file(self, output_file: Union[str, Path], silent: bool = True) -> None: output_file: The name or the path of the file to generate including the extension (.html, .json). 
silent: if False, opens the file in the default browser or download it in a Google Colab environment """ - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - pillow_version = pkg_resources.get_distribution("Pillow").version - version_tuple = tuple(map(int, pillow_version.split("."))) - if version_tuple < (9, 5, 0): - warnings.warn( - "Try running command: 'pip install --upgrade Pillow' to avoid ValueError" - ) + # Resolve the installed Pillow version via the stdlib metadata API. + # If Pillow is not installed (it's an indirect dependency, so this + # is unusual but possible in trimmed environments), skip the warning + # rather than crashing the whole report-write path. + try: + pillow_version = _metadata_version("Pillow") + except PackageNotFoundError: + pillow_version = None + if pillow_version is not None: + # Some Pillow pre-releases use non-numeric segments (e.g. "11.0.0a1"); + # keep only the digit characters of each segment so int() never fails. + numeric_parts = [] + for part in pillow_version.split("."): + digits = "".join(c for c in part if c.isdigit()) + if not digits: + break + numeric_parts.append(int(digits)) + if len(numeric_parts) >= 3 and tuple(numeric_parts[:3]) < (9, 5, 0): + warnings.warn( + "Try running command: 'pip install --upgrade Pillow' to avoid ValueError" + ) if not isinstance(output_file, Path): output_file = Path(str(output_file)) diff --git a/src/data_profiling/utils/versions.py b/src/data_profiling/utils/versions.py index 284f1ed07..d06bb4c1a 100644 --- a/src/data_profiling/utils/versions.py +++ b/src/data_profiling/utils/versions.py @@ -1,10 +1,10 @@ -try: - from importlib.metadata import version -except ImportError: - import pkg_resources - - def version(pkg: str) -> str: # type: ignore - return pkg_resources.get_distribution(pkg).version +# `importlib.metadata.version` is in the stdlib since Python 3.8. 
+# pyproject.toml pins `requires-python = ">=3.10,<3.14"`, so the previous +# `pkg_resources` fallback was dead code under any supported runtime. +# Removing it also drops a runtime dependency on setuptools, which now +# raises `ModuleNotFoundError: No module named 'pkg_resources'` on +# setuptools >= 81 where pkg_resources was removed. +from importlib.metadata import version def pandas_version() -> list: diff --git a/tests/issues/test_issue1816.py b/tests/issues/test_issue1816.py new file mode 100644 index 000000000..dd5f38a5d --- /dev/null +++ b/tests/issues/test_issue1816.py @@ -0,0 +1,63 @@ +""" +Test for issue 1816: +https://github.com/Data-Centric-AI-Community/fg-data-profiling/issues/1816 + +`pkg_resources` was removed in setuptools >= 81. Importing +`data_profiling.profile_report` (or its public re-exports) must not pull +`pkg_resources` in, or the import chain crashes with +``ModuleNotFoundError: No module named 'pkg_resources'`` on environments +where setuptools >= 81 has been installed (e.g. fresh CI runners). + +The two import regression guards below are designed so they would fail on +the pre-fix tree even when ``setuptools`` is present (because the import +statement would still bring `pkg_resources` into ``sys.modules``); the +third test exercises the actual code path (``ProfileReport.to_file``) that +used ``pkg_resources.get_distribution("Pillow").version``. +""" + +import importlib +import sys + +import pandas as pd + +from data_profiling import ProfileReport + + +def test_profile_report_module_does_not_import_pkg_resources(): + """The module must not pull `pkg_resources` into ``sys.modules``. + + Asserting *non-import* is the load-bearing check: setuptools 81 raises + ``ModuleNotFoundError`` from any ``import pkg_resources``, so any + transitive import from ``data_profiling.profile_report`` would crash + on those environments. 
+ """ + sys.modules.pop("pkg_resources", None) + sys.modules.pop("data_profiling.profile_report", None) + importlib.import_module("data_profiling.profile_report") + assert "pkg_resources" not in sys.modules + + +def test_versions_helper_does_not_import_pkg_resources(): + """`utils.versions` previously had a `pkg_resources` fallback branch. + + With Python 3.10+ (the project's floor), `importlib.metadata.version` + is always available, so the fallback is dead code. Removing it is + what closes the issue end-to-end. + """ + sys.modules.pop("pkg_resources", None) + sys.modules.pop("data_profiling.utils.versions", None) + importlib.import_module("data_profiling.utils.versions") + assert "pkg_resources" not in sys.modules + + +def test_to_file_runs_without_pkg_resources(test_output_dir): + """End-to-end smoke: writing a report exercises the Pillow-version + branch that previously called ``pkg_resources.get_distribution``.""" + sys.modules.pop("pkg_resources", None) + df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + profile = ProfileReport(df, minimal=True) + output_file = test_output_dir / "issue1816.html" + profile.to_file(output_file) + assert output_file.exists() + # `to_file` must not have re-imported `pkg_resources` either. + assert "pkg_resources" not in sys.modules