diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 000000000..7bb15bf5d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,26 @@
+# Installs the project from the build context and serves Jupyter Notebook.
+FROM python:3.10-slim
+
+WORKDIR /app
+
+# Compiler toolchain; presumably needed by dependencies that build native
+# extensions during installation (confirm before removing).
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY . .
+
+# setuptools is pinned before installing the project and pinned again afterwards;
+# presumably the project install can replace it (confirm before simplifying).
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \
+    pip install --no-cache-dir . && \
+    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \
+    pip install --no-cache-dir jupyter
+
+EXPOSE 8888
+
+# No USER is set, so the server runs as root (hence --allow-root). It listens on
+# all interfaces so that the exposed port can be reached from outside the container.
+CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]
diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py
index 992c1840c..4ea43192a 100644
--- a/src/ydata_profiling/model/handler.py
+++ b/src/ydata_profiling/model/handler.py
@@ -1,81 +1,117 @@
-"""
-    Auxiliary handler methods for data summary extraction
-"""
-from typing import Any, Callable, Dict, List, Sequence
-
-import networkx as nx
-from visions import VisionsTypeset
-
-
-def compose(functions: Sequence[Callable]) -> Callable:
-    """
-    Compose a sequence of functions.
-
-    :param functions: sequence of functions
-    :return: combined function applying all functions in order.
-    """
-
-    def composed_function(*args) -> List[Any]:
-        result = args  # Start with the input arguments
-        for func in functions:
-            result = func(*result) if isinstance(result, tuple) else func(result)
-        return result  # type: ignore
-
-    return composed_function  # type: ignore
-
-
-class Handler:
-    """A generic handler
-
-    Allows any custom mapping between data types and functions
-    """
-
-    def __init__(
-        self,
-        mapping: Dict[str, List[Callable]],
-        typeset: VisionsTypeset,
-        *args,
-        **kwargs
-    ):
-        self.mapping = mapping
-        self.typeset = typeset
-        self._complete_dag()
-
-    def _complete_dag(self) -> None:
-        for from_type, to_type in nx.topological_sort(
-            nx.line_graph(self.typeset.base_graph)
-        ):
-            self.mapping[str(to_type)] = (
-                self.mapping[str(from_type)] + self.mapping[str(to_type)]
-            )
-
-    def handle(self, dtype: str, *args, **kwargs) -> dict:
-        """
-        Returns:
-            object: a tuple containing the config, the dataset series and the summary extracted
-        """
-        funcs = self.mapping.get(dtype, [])
-        op = compose(funcs)
-        summary = op(*args)[-1]
-        return summary
-
-
-def get_render_map() -> Dict[str, Callable]:
-    import ydata_profiling.report.structure.variables as render_algorithms
-
-    render_map = {
-        "Boolean": render_algorithms.render_boolean,
-        "Numeric": render_algorithms.render_real,
-        "Complex": render_algorithms.render_complex,
-        "Text": render_algorithms.render_text,
-        "DateTime": render_algorithms.render_date,
-        "Categorical": render_algorithms.render_categorical,
-        "URL": render_algorithms.render_url,
-        "Path": render_algorithms.render_path,
-        "File": render_algorithms.render_file,
-        "Image": render_algorithms.render_image,
-        "Unsupported": render_algorithms.render_generic,
-        "TimeSeries": render_algorithms.render_timeseries,
-    }
-
-    return render_map
+"""
+    Auxiliary handler methods for data summary extraction
+"""
+from typing import Any, Callable, Dict, List, Sequence, Tuple, Union
+
+import networkx as nx
+from visions import VisionsTypeset
+
+
+def compose(functions: Sequence[Callable]) -> Callable:
+    """
+    Compose a sequence of functions.
+
+    Each function in the sequence receives the result of the previous function.
+    Functions are expected to accept and return tuples for proper chaining.
+
+    :param functions: sequence of functions that accept and return tuples
+    :return: combined function applying all functions in order
+    """
+
+    def composed_function(*args: Any) -> Tuple[Any, ...]:
+        result: Union[Tuple[Any, ...], Any] = args
+        for func in functions:
+            if isinstance(result, tuple):
+                result = func(*result)
+            else:
+                result = func(result)
+        if isinstance(result, tuple):
+            return result
+        return (result,)
+
+    return composed_function
+
+
+class Handler:
+    """A generic handler
+
+    Allows any custom mapping between data types and functions.
+    Functions are composed based on the type hierarchy defined in the typeset.
+    """
+
+    def __init__(
+        self,
+        mapping: Dict[str, List[Callable]],
+        typeset: VisionsTypeset,
+        *args: Any,
+        **kwargs: Any
+    ):
+        """
+        Store the mapping and typeset, then complete the mapping along the graph.
+
+        :param mapping: functions per data type name; updated in place
+        :param typeset: typeset whose base_graph defines the type hierarchy
+        :param args: unused
+        :param kwargs: unused
+        """
+        self.mapping = mapping
+        self.typeset = typeset
+        self._complete_dag()
+
+    def _complete_dag(self) -> None:
+        """
+        Prepend the functions of each type to those of its successors.
+
+        Edges of the typeset's base graph are visited in topological order of the
+        line graph; types missing from the mapping count as having no functions.
+        """
+        for from_type, to_type in nx.topological_sort(
+            nx.line_graph(self.typeset.base_graph)
+        ):
+            from_key = str(from_type)
+            to_key = str(to_type)
+            self.mapping[to_key] = self.mapping.get(from_key, []) + self.mapping.get(
+                to_key, []
+            )
+
+    def handle(self, dtype: str, *args: Any, **kwargs: Any) -> Dict[str, Any]:
+        """
+        Execute the handler chain for the given data type.
+
+        :param dtype: the data type to handle
+        :param args: arguments to pass to the handler functions
+        :param kwargs: keyword arguments (currently unused but reserved for extensibility)
+        :return: a dictionary containing the summary extracted from the data
+        """
+        funcs = self.mapping.get(dtype, [])
+        op = compose(funcs)
+        result = op(*args)
+        if result:
+            return result[-1] if isinstance(result[-1], dict) else {}
+        return {}
+
+
+def get_render_map() -> Dict[str, Callable]:
+    """
+    Return the render function for each variable type name.
+
+    :return: dictionary mapping a type name to its render function
+    """
+    import ydata_profiling.report.structure.variables as render_algorithms
+
+    render_map = {
+        "Boolean": render_algorithms.render_boolean,
+        "Numeric": render_algorithms.render_real,
+        "Complex": render_algorithms.render_complex,
+        "Text": render_algorithms.render_text,
+        "DateTime": render_algorithms.render_date,
+        "Categorical": render_algorithms.render_categorical,
+        "URL": render_algorithms.render_url,
+        "Path": render_algorithms.render_path,
+        "File": render_algorithms.render_file,
+        "Image": render_algorithms.render_image,
+        "Unsupported": render_algorithms.render_generic,
+        "TimeSeries": render_algorithms.render_timeseries,
+    }
+
+    return render_map
diff --git a/src/ydata_profiling/model/spark/missing_spark.py b/src/ydata_profiling/model/spark/missing_spark.py
index deacf1b89..5ad367e6e 100644
--- a/src/ydata_profiling/model/spark/missing_spark.py
+++ b/src/ydata_profiling/model/spark/missing_spark.py
@@ -56,7 +56,6 @@ def __len__(self) -> Optional[int]:
 def missing_bar(config: Settings, df: DataFrame) -> str:
     import pyspark.sql.functions as F
 
-    # FIXME: move to univariate
     data_nan_counts = (
         df.agg(
             *[F.count(F.when(F.isnull(c) | F.isnan(c), c)).alias(c) for c in df.columns]
@@ -83,11 +82,9 @@ def missing_matrix(config: Settings, df: DataFrame) -> str:
 def missing_heatmap(config: Settings, df: DataFrame) -> str:
     df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count())
 
-    # Remove completely filled or completely empty variables.
     columns = [i for i, n in enumerate(np.var(df.isnull(), axis="rows")) if n > 0]
     df = df.iloc[:, columns]
 
-    # Create and mask the correlation matrix. Construct the base heatmap.
     corr_mat = df.isnull().corr()
     mask = np.zeros_like(corr_mat)
     mask[np.triu_indices_from(mask)] = True
diff --git a/src/ydata_profiling/report/presentation/core/renderable.py b/src/ydata_profiling/report/presentation/core/renderable.py
index 3f7f09f6c..84265c1c6 100644
--- a/src/ydata_profiling/report/presentation/core/renderable.py
+++ b/src/ydata_profiling/report/presentation/core/renderable.py
@@ -34,9 +34,9 @@ def classes(self) -> str:
     def render(self) -> Any:
         pass
 
-    def __str__(self):
+    def __str__(self) -> str:
         return self.__class__.__name__
 
     @classmethod
     def convert_to_class(cls, obj: "Renderable", flavour_func) -> None:  # noqa: ANN001
         obj.__class__ = cls
diff --git a/src/ydata_profiling/report/presentation/flavours/flavours.py b/src/ydata_profiling/report/presentation/flavours/flavours.py
index 10a5fa522..5b7551d99 100644
--- a/src/ydata_profiling/report/presentation/flavours/flavours.py
+++ b/src/ydata_profiling/report/presentation/flavours/flavours.py
@@ -1,6 +1,8 @@
 """
     Flavours registry information
 """
+from typing import Callable
+
 from ydata_profiling.report.presentation.core import Root
 from ydata_profiling.report.presentation.core.renderable import Renderable
 
@@ -20,7 +22,7 @@ def get_flavour_mapping(name: str) -> dict:
 def apply_renderable_mapping(
     mapping: dict,
     structure: Renderable,
-    flavour_func,  # noqa: ANN001
+    flavour_func: Callable[[Renderable], None],
 ) -> None:
     mapping[type(structure)].convert_to_class(structure, flavour_func)
 