Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 30 additions & 12 deletions src/data_profiling/profile_report.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
import copy
import json
import warnings

# Use stdlib `importlib.metadata` instead of `pkg_resources`. The latter
# was removed in setuptools >= 81 (see
# https://setuptools.pypa.io/en/latest/history.html#v81-0-0); since
# pyproject.toml pins ``requires-python = ">=3.10"`` and
# ``importlib.metadata`` ships with the stdlib from Python 3.8 onward,
# no fallback is needed here.
from importlib.metadata import PackageNotFoundError
from importlib.metadata import version as _metadata_version
from pathlib import Path
from typing import Any, Optional, Union

from data_profiling.utils.backend import is_pyspark_installed

with warnings.catch_warnings():
warnings.simplefilter("ignore")
import pkg_resources

if not is_pyspark_installed():
from typing import TypeVar

Expand Down Expand Up @@ -357,14 +362,27 @@ def to_file(self, output_file: Union[str, Path], silent: bool = True) -> None:
output_file: The name or the path of the file to generate including the extension (.html, .json).
silent: if False, opens the file in the default browser or download it in a Google Colab environment
"""
with warnings.catch_warnings():
warnings.simplefilter("ignore")
pillow_version = pkg_resources.get_distribution("Pillow").version
version_tuple = tuple(map(int, pillow_version.split(".")))
if version_tuple < (9, 5, 0):
warnings.warn(
"Try running command: 'pip install --upgrade Pillow' to avoid ValueError"
)
# Resolve the installed Pillow version via the stdlib metadata API.
# If Pillow is not installed (it's an indirect dependency, so this
# is unusual but possible in trimmed environments), skip the warning
# rather than crashing the whole report-write path.
try:
pillow_version = _metadata_version("Pillow")
except PackageNotFoundError:
pillow_version = None
if pillow_version is not None:
# Some Pillow pre-releases use non-numeric segments (e.g. "11.0.0a1");
# take only the leading numeric components so int() never fails.
numeric_parts = []
for part in pillow_version.split("."):
digits = "".join(c for c in part if c.isdigit())
if not digits:
break
numeric_parts.append(int(digits))
if len(numeric_parts) >= 3 and tuple(numeric_parts[:3]) < (9, 5, 0):
warnings.warn(
"Try running command: 'pip install --upgrade Pillow' to avoid ValueError"
)

if not isinstance(output_file, Path):
output_file = Path(str(output_file))
Expand Down
14 changes: 7 additions & 7 deletions src/data_profiling/utils/versions.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
try:
from importlib.metadata import version
except ImportError:
import pkg_resources

def version(pkg: str) -> str: # type: ignore
return pkg_resources.get_distribution(pkg).version
# `importlib.metadata.version` is in the stdlib since Python 3.8.
# pyproject.toml pins `requires-python = ">=3.10,<3.14"`, so the previous
# `pkg_resources` fallback was dead code under any supported runtime.
# Removing it also drops a runtime dependency on setuptools, which now
# raises `ModuleNotFoundError: No module named 'pkg_resources'` on
# setuptools >= 81 where pkg_resources was removed.
from importlib.metadata import version


def pandas_version() -> list:
Expand Down
63 changes: 63 additions & 0 deletions tests/issues/test_issue1816.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""
Test for issue 1816:
https://github.com/Data-Centric-AI-Community/fg-data-profiling/issues/1816

`pkg_resources` was removed in setuptools >= 81. Importing
`data_profiling.profile_report` (or its public re-exports) must not pull
`pkg_resources` in, or the import chain crashes with
``ModuleNotFoundError: No module named 'pkg_resources'`` on environments
where setuptools >= 81 has been installed (e.g. fresh CI runners).

The two regression guards below are designed so they would fail on the
pre-fix tree even when ``setuptools`` is present (because the import
statement would still bring `pkg_resources` into ``sys.modules``), and
they exercise the actual code path (``ProfileReport.to_file``) that used
``pkg_resources.get_distribution("Pillow").version``.
"""

import importlib
import sys

import pandas as pd

from data_profiling import ProfileReport


def test_profile_report_module_does_not_import_pkg_resources():
    """Importing ``data_profiling.profile_report`` must not load `pkg_resources`.

    Asserting *non-import* is the load-bearing check: `pkg_resources` is gone
    on setuptools >= 81, so any ``import pkg_resources`` executed while the
    module is (re-)imported would raise ``ModuleNotFoundError`` there.

    The test evicts both modules from ``sys.modules`` to force a fresh import.
    Every evicted entry is put back afterwards so that the freshly imported
    duplicate of ``data_profiling.profile_report`` (and its duplicate class
    objects) does not leak into the tests that run later in the session.
    """
    target = "data_profiling.profile_report"
    package_name, _, attribute = target.rpartition(".")
    # Sentinel distinguishing "was not in sys.modules" from a stored ``None``.
    missing = object()
    evicted = {
        name: sys.modules.pop(name, missing) for name in ("pkg_resources", target)
    }
    try:
        importlib.import_module(target)
        assert "pkg_resources" not in sys.modules
    finally:
        # Restore the pre-test import state, whether the assertion held or not.
        for name, module in evicted.items():
            if module is missing:
                sys.modules.pop(name, None)
            else:
                sys.modules[name] = module
        # The import system rebinds the submodule as an attribute of its
        # parent package; point that attribute back at the original module.
        original = evicted[target]
        package = sys.modules.get(package_name)
        if original is not missing and package is not None:
            setattr(package, attribute, original)


def test_versions_helper_does_not_import_pkg_resources():
    """Importing ``data_profiling.utils.versions`` must not load `pkg_resources`.

    ``utils.versions`` previously carried a `pkg_resources` fallback branch.
    With Python 3.10+ (the project's floor) ``importlib.metadata.version`` is
    always available, so that fallback was dead code; this guard makes sure
    it does not come back.

    The modules evicted from ``sys.modules`` to force a fresh import are
    restored afterwards, so later tests keep seeing the module objects that
    were loaded before this test ran.
    """
    module_name = "data_profiling.utils.versions"
    parent_name, _, child_name = module_name.rpartition(".")
    # Sentinel distinguishing "was not in sys.modules" from a stored ``None``.
    absent = object()
    snapshot = {}
    for key in ("pkg_resources", module_name):
        snapshot[key] = sys.modules.pop(key, absent)
    try:
        importlib.import_module(module_name)
        assert "pkg_resources" not in sys.modules
    finally:
        # Undo every change made to sys.modules, even if the assertion failed.
        for key, previous in snapshot.items():
            if previous is absent:
                sys.modules.pop(key, None)
            else:
                sys.modules[key] = previous
        # Importing a submodule rebinds it on the parent package; restore the
        # attribute so it matches the sys.modules entry again.
        previous_module = snapshot[module_name]
        parent = sys.modules.get(parent_name)
        if previous_module is not absent and parent is not None:
            setattr(parent, child_name, previous_module)


def test_to_file_runs_without_pkg_resources(test_output_dir):
    """End-to-end smoke test for the report-writing path.

    Writes a minimal report to disk via ``ProfileReport.to_file`` -- the
    method whose Pillow-version check previously called
    ``pkg_resources.get_distribution`` -- and verifies that `pkg_resources`
    is still absent from ``sys.modules`` afterwards.
    """
    # Start from a state where `pkg_resources` is definitely not loaded.
    if "pkg_resources" in sys.modules:
        del sys.modules["pkg_resources"]

    columns = {"a": [1, 2, 3], "b": ["x", "y", "z"]}
    report = ProfileReport(pd.DataFrame(columns), minimal=True)

    destination = test_output_dir / "issue1816.html"
    report.to_file(destination)

    assert destination.exists()
    # `to_file` must not have re-imported `pkg_resources` either.
    assert "pkg_resources" not in sys.modules