21 changes: 21 additions & 0 deletions Dockerfile
@@ -0,0 +1,21 @@
FROM python:3.10-slim

WORKDIR /app

RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
&& rm -rf /var/lib/apt/lists/*

COPY . .

RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \
pip install --no-cache-dir . && \
pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \
pip install --no-cache-dir jupyter

EXPOSE 8888

CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]


179 changes: 98 additions & 81 deletions src/ydata_profiling/model/handler.py
@@ -1,81 +1,98 @@
"""
Auxiliary handler methods for data summary extraction
"""
from typing import Any, Callable, Dict, List, Sequence

import networkx as nx
from visions import VisionsTypeset


def compose(functions: Sequence[Callable]) -> Callable:
"""
Compose a sequence of functions.

:param functions: sequence of functions
:return: combined function applying all functions in order.
"""

def composed_function(*args) -> List[Any]:
result = args # Start with the input arguments
for func in functions:
result = func(*result) if isinstance(result, tuple) else func(result)
return result # type: ignore

return composed_function # type: ignore


class Handler:
"""A generic handler

Allows any custom mapping between data types and functions
"""

def __init__(
self,
mapping: Dict[str, List[Callable]],
typeset: VisionsTypeset,
*args,
**kwargs
):
self.mapping = mapping
self.typeset = typeset
self._complete_dag()

def _complete_dag(self) -> None:
for from_type, to_type in nx.topological_sort(
nx.line_graph(self.typeset.base_graph)
):
self.mapping[str(to_type)] = (
self.mapping[str(from_type)] + self.mapping[str(to_type)]
)

def handle(self, dtype: str, *args, **kwargs) -> dict:
"""
Returns:
object: a tuple containing the config, the dataset series and the summary extracted
"""
funcs = self.mapping.get(dtype, [])
op = compose(funcs)
summary = op(*args)[-1]
return summary


def get_render_map() -> Dict[str, Callable]:
import ydata_profiling.report.structure.variables as render_algorithms

render_map = {
"Boolean": render_algorithms.render_boolean,
"Numeric": render_algorithms.render_real,
"Complex": render_algorithms.render_complex,
"Text": render_algorithms.render_text,
"DateTime": render_algorithms.render_date,
"Categorical": render_algorithms.render_categorical,
"URL": render_algorithms.render_url,
"Path": render_algorithms.render_path,
"File": render_algorithms.render_file,
"Image": render_algorithms.render_image,
"Unsupported": render_algorithms.render_generic,
"TimeSeries": render_algorithms.render_timeseries,
}

return render_map
"""
Auxiliary handler methods for data summary extraction
"""
from typing import Any, Callable, Dict, List, Sequence, Tuple, Union

import networkx as nx
from visions import VisionsTypeset


def compose(functions: Sequence[Callable]) -> Callable:
"""
Compose a sequence of functions.

Each function in the sequence receives the result of the previous function.
Functions are expected to accept and return tuples for proper chaining.

:param functions: sequence of functions that accept and return tuples
:return: combined function applying all functions in order
"""

def composed_function(*args: Any) -> Tuple[Any, ...]:
result: Union[Tuple[Any, ...], Any] = args
for func in functions:
if isinstance(result, tuple):
result = func(*result)
else:
result = func(result)
if isinstance(result, tuple):
return result
return (result,)

return composed_function


class Handler:
"""A generic handler

Allows any custom mapping between data types and functions.
Functions are composed based on the type hierarchy defined in the typeset.
"""

def __init__(
self,
mapping: Dict[str, List[Callable]],
typeset: VisionsTypeset,
*args: Any,
**kwargs: Any
):
self.mapping = mapping
self.typeset = typeset
self._complete_dag()

def _complete_dag(self) -> None:
for from_type, to_type in nx.topological_sort(
nx.line_graph(self.typeset.base_graph)
):
from_key = str(from_type)
to_key = str(to_type)
self.mapping[to_key] = self.mapping.get(from_key, []) + self.mapping.get(
to_key, []
)

def handle(self, dtype: str, *args: Any, **kwargs: Any) -> Dict[str, Any]:
"""
Execute the handler chain for the given data type.

:param dtype: the data type to handle
:param args: arguments to pass to the handler functions
:param kwargs: keyword arguments (currently unused but reserved for extensibility)
:return: a dictionary containing the summary extracted from the data
"""
funcs = self.mapping.get(dtype, [])
op = compose(funcs)
result = op(*args)
if result:
return result[-1] if isinstance(result[-1], dict) else {}
return {}


def get_render_map() -> Dict[str, Callable]:
import ydata_profiling.report.structure.variables as render_algorithms

render_map = {
"Boolean": render_algorithms.render_boolean,
"Numeric": render_algorithms.render_real,
"Complex": render_algorithms.render_complex,
"Text": render_algorithms.render_text,
"DateTime": render_algorithms.render_date,
"Categorical": render_algorithms.render_categorical,
"URL": render_algorithms.render_url,
"Path": render_algorithms.render_path,
"File": render_algorithms.render_file,
"Image": render_algorithms.render_image,
"Unsupported": render_algorithms.render_generic,
"TimeSeries": render_algorithms.render_timeseries,
}

return render_map
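
A minimal sketch of how the refactored compose() chains summary functions, assuming ydata-profiling is importable; the add_count and add_mean helpers below are hypothetical stand-ins for the library's real summary functions. Handler.handle() builds such a chain from the mapping (completed over the typeset graph by _complete_dag) and returns the last element of the resulting tuple as the summary.

from typing import Any, Tuple

from ydata_profiling.model.handler import compose

def add_count(config: Any, series: list, summary: dict) -> Tuple[Any, list, dict]:
    # Each step receives and returns the full (config, series, summary) tuple.
    summary["n"] = len(series)
    return config, series, summary

def add_mean(config: Any, series: list, summary: dict) -> Tuple[Any, list, dict]:
    summary["mean"] = sum(series) / len(series)
    return config, series, summary

op = compose([add_count, add_mean])
result = op(None, [1, 2, 3], {})
print(result[-1])  # {'n': 3, 'mean': 2.0} -- the value handle() would return
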
3 changes: 0 additions & 3 deletions src/ydata_profiling/model/spark/missing_spark.py
@@ -56,7 +56,6 @@ def __len__(self) -> Optional[int]:
def missing_bar(config: Settings, df: DataFrame) -> str:
import pyspark.sql.functions as F

# FIXME: move to univariate
data_nan_counts = (
df.agg(
*[F.count(F.when(F.isnull(c) | F.isnan(c), c)).alias(c) for c in df.columns]
@@ -83,11 +82,9 @@ def missing_matrix(config: Settings, df: DataFrame) -> str:
def missing_heatmap(config: Settings, df: DataFrame) -> str:
df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count())

# Remove completely filled or completely empty variables.
columns = [i for i, n in enumerate(np.var(df.isnull(), axis="rows")) if n > 0]
df = df.iloc[:, columns]

# Create and mask the correlation matrix. Construct the base heatmap.
corr_mat = df.isnull().corr()
mask = np.zeros_like(corr_mat)
mask[np.triu_indices_from(mask)] = True
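
The comments removed above described the two steps missing_heatmap still performs: drop columns whose null indicator has zero variance (completely filled or completely empty), then mask the upper triangle of the missingness correlation matrix. A rough standalone sketch of those steps on a small, hypothetical pandas DataFrame:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, None, 3], "b": [1, 2, 3], "c": [None, None, 5]})

# Keep only columns that are neither completely filled nor completely empty.
columns = [i for i, n in enumerate(np.var(df.isnull(), axis="rows")) if n > 0]
df = df.iloc[:, columns]  # "b" is dropped: it has no missing values

# Correlate the null indicators and mask the redundant upper triangle.
corr_mat = df.isnull().corr()
mask = np.zeros_like(corr_mat)
mask[np.triu_indices_from(mask)] = True
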
4 changes: 2 additions & 2 deletions src/ydata_profiling/report/presentation/core/renderable.py
@@ -34,9 +34,9 @@ def classes(self) -> str:
def render(self) -> Any:
pass

def __str__(self):
def __str__(self) -> str:
return self.__class__.__name__

@classmethod
def convert_to_class(cls, obj: "Renderable", flavour_func) -> None: # noqa: ANN001
def convert_to_class(cls, obj: "Renderable", flavour_func) -> None:
obj.__class__ = cls
4 changes: 3 additions & 1 deletion src/ydata_profiling/report/presentation/flavours/flavours.py
@@ -1,6 +1,8 @@
"""
Flavours registry information
"""
from typing import Callable

from ydata_profiling.report.presentation.core import Root
from ydata_profiling.report.presentation.core.renderable import Renderable

@@ -20,7 +22,7 @@ def get_flavour_mapping(name: str) -> dict:
def apply_renderable_mapping(
mapping: dict,
structure: Renderable,
flavour_func, # noqa: ANN001
flavour_func: Callable[[Renderable], None],
) -> None:
mapping[type(structure)].convert_to_class(structure, flavour_func)
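
The flavour_func annotation added here matches how Renderable.convert_to_class (see renderable.py above) is called: a callable taking a Renderable and returning None, while the conversion itself is an in-place __class__ swap. A toy illustration of that pattern, using stand-in classes rather than the library's real Renderable subclasses:

from typing import Callable

class Widget:  # stand-in for a core Renderable
    def __init__(self, text: str) -> None:
        self.text = text

    def render(self) -> str:
        return self.text

class HTMLWidget(Widget):  # stand-in for a flavoured subclass
    def render(self) -> str:
        return f"<p>{self.text}</p>"

    @classmethod
    def convert_to_class(cls, obj: Widget, flavour_func: Callable[[Widget], None]) -> None:
        obj.__class__ = cls  # the same in-place swap Renderable.convert_to_class performs

w = Widget("hello")
HTMLWidget.convert_to_class(w, lambda obj: None)
print(w.render())  # "<p>hello</p>" -- behaviour now follows the flavoured class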
