diff --git a/DashAI/back/converters/hugging_face/embedding.py b/DashAI/back/converters/hugging_face/embedding.py index bac80031d..34520bf69 100644 --- a/DashAI/back/converters/hugging_face/embedding.py +++ b/DashAI/back/converters/hugging_face/embedding.py @@ -1,7 +1,4 @@ -import pyarrow as pa -import torch -from datasets import Dataset, concatenate_datasets -from transformers import AutoModel, AutoTokenizer +"""HuggingFace embedding converter with lazy-loaded dependencies.""" from DashAI.back.converters.category.advanced_preprocessing import ( AdvancedPreprocessingConverter, @@ -104,16 +101,23 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float32 as the output type for embeddings.""" + import pyarrow as pa + return Float(arrow_type=pa.float32()) def _load_model(self): """Load the embedding model and tokenizer.""" + from transformers import AutoModel, AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) self.model = AutoModel.from_pretrained(self.model_name).to(self.device) self.model.eval() def _process_batch(self, batch: DashAIDataset) -> DashAIDataset: """Process a batch of text into embeddings.""" + import torch + from datasets import Dataset, concatenate_datasets + all_column_embeddings = [] for column in batch.column_names: diff --git a/DashAI/back/converters/hugging_face/tokenizer.py b/DashAI/back/converters/hugging_face/tokenizer.py index f89c8baa6..95f3ab6ec 100644 --- a/DashAI/back/converters/hugging_face/tokenizer.py +++ b/DashAI/back/converters/hugging_face/tokenizer.py @@ -1,6 +1,3 @@ -from datasets import Dataset, concatenate_datasets -from transformers import AutoTokenizer - from DashAI.back.converters.category.advanced_preprocessing import ( AdvancedPreprocessingConverter, ) @@ -87,12 +84,16 @@ def __init__(self, **kwargs): def _load_model(self): """Load tokenizer only.""" + from transformers import AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) def _process_batch(self, batch: DashAIDataset) -> DashAIDataset: """ Tokenize a batch of text columns and store each input_id in a separate column. 
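Editor's note: the pattern applied throughout this diff is plain deferred importing — heavy libraries are imported inside the methods that need them rather than at module top level. Below is a minimal, self-contained sketch of the embedding converter's flavor of that pattern; the class name, pooling strategy, and defaults are illustrative, not DashAI's actual API.

```python
class LazyEmbedder:
    """Toy embedding helper using the deferred-import style from the diff."""

    def __init__(self, model_name: str = "bert-base-uncased", device: str = "cpu"):
        self.model_name = model_name
        self.device = device
        self.tokenizer = None
        self.model = None

    def _load_model(self):
        # transformers is imported only when the model is actually needed.
        from transformers import AutoModel, AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
        self.model.eval()

    def embed(self, texts):
        # torch is likewise deferred to call time.
        import torch

        if self.model is None:
            self._load_model()
        inputs = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt"
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Mean-pool the last hidden state into one vector per input text.
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
```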
""" + from datasets import Dataset, concatenate_datasets + all_column_tokens = [] for column in batch.column_names: diff --git a/DashAI/back/converters/hugging_face_wrapper.py b/DashAI/back/converters/hugging_face_wrapper.py index 6d1b64073..d096389d9 100644 --- a/DashAI/back/converters/hugging_face_wrapper.py +++ b/DashAI/back/converters/hugging_face_wrapper.py @@ -1,8 +1,6 @@ from abc import ABCMeta, abstractmethod from typing import Type -from datasets import concatenate_datasets - from DashAI.back.converters.base_converter import BaseConverter from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.types.dashai_data_type import DashAIDataType @@ -49,6 +47,8 @@ def fit(self, x: DashAIDataset, y: DashAIDataset = None) -> Type[BaseConverter]: def transform(self, x: DashAIDataset, y: DashAIDataset = None) -> DashAIDataset: """Transform the input data using the model.""" + from datasets import concatenate_datasets + all_results = [] # Process in batches diff --git a/DashAI/back/converters/imbalanced_learn/smoteenn_converter.py b/DashAI/back/converters/imbalanced_learn/smoteenn_converter.py index daa834314..5b710ebe9 100644 --- a/DashAI/back/converters/imbalanced_learn/smoteenn_converter.py +++ b/DashAI/back/converters/imbalanced_learn/smoteenn_converter.py @@ -1,5 +1,4 @@ from imblearn.combine import SMOTEENN -from imblearn.over_sampling import SMOTE from DashAI.back.converters.category.sampling import SamplingConverter from DashAI.back.converters.imbalanced_learn_wrapper import ImbalancedLearnWrapper @@ -61,6 +60,8 @@ class SMOTEENNConverter(SamplingConverter, ImbalancedLearnWrapper, SMOTEENN): IMAGE_PREVIEW = "smoteenn.png" def __init__(self, **kwargs): + from imblearn.over_sampling import SMOTE + self.smote = SMOTE( sampling_strategy=kwargs.get("sampling_strategy", "auto"), random_state=kwargs.get("random_state"), diff --git a/DashAI/back/converters/imbalanced_learn_wrapper.py b/DashAI/back/converters/imbalanced_learn_wrapper.py index a7972bea9..5a797d794 100644 --- a/DashAI/back/converters/imbalanced_learn_wrapper.py +++ b/DashAI/back/converters/imbalanced_learn_wrapper.py @@ -1,10 +1,6 @@ from abc import ABCMeta from typing import Type, Union -import numpy as np -import pandas as pd -import pyarrow as pa - from DashAI.back.converters.base_converter import BaseConverter from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.job.base_job import JobError @@ -20,7 +16,7 @@ class ImbalancedLearnWrapper(BaseConverter, metaclass=ABCMeta): def __init__(self, **kwargs): super().__init__(**kwargs) self.fitted = False - self._resampled_table: Union[pa.Table, None] = None + self._resampled_table = None self.original_X_column_names_: list = [] self.original_target_column_name_: str = "" @@ -44,6 +40,10 @@ def fit(self, x: DashAIDataset, y: DashAIDataset) -> Type[BaseConverter]: Fit the sampler using imbalanced-learn's fit_resample and store the combined result. """ + import numpy as np + import pandas as pd + import pyarrow as pa + if y is None or len(y) == 0: raise ValueError( "Imbalanced-learn samplers require a non-empty target dataset (y)." 
diff --git a/DashAI/back/converters/scikit_learn/additive_chi_2_sampler.py b/DashAI/back/converters/scikit_learn/additive_chi_2_sampler.py index 1c5dd14f6..ec1e90c84 100644 --- a/DashAI/back/converters/scikit_learn/additive_chi_2_sampler.py +++ b/DashAI/back/converters/scikit_learn/additive_chi_2_sampler.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.kernel_approximation import ( AdditiveChi2Sampler as AdditiveChi2SamplerOperation, ) @@ -57,4 +56,6 @@ class AdditiveChi2Sampler( def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for transformed data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/bag_of_words.py b/DashAI/back/converters/scikit_learn/bag_of_words.py index b3d4d06e4..df13bca6a 100644 --- a/DashAI/back/converters/scikit_learn/bag_of_words.py +++ b/DashAI/back/converters/scikit_learn/bag_of_words.py @@ -1,6 +1,3 @@ -import pandas as pd -from sklearn.feature_extraction.text import CountVectorizer - from DashAI.back.converters.base_converter import BaseConverter from DashAI.back.converters.category.advanced_preprocessing import ( AdvancedPreprocessingConverter, @@ -90,6 +87,8 @@ class BagOfWordsConverter(AdvancedPreprocessingConverter, BaseConverter): def __init__(self, **kwargs): super().__init__() + from sklearn.feature_extraction.text import CountVectorizer + self.vectorizer = CountVectorizer( max_features=kwargs.get("max_features", 1000), lowercase=kwargs.get("lowercase", True), @@ -111,6 +110,8 @@ def fit(self, x: DashAIDataset, y=None) -> "BagOfWordsConverter": def transform(self, x: DashAIDataset, y=None) -> DashAIDataset: """Transform text into Bag-of-Words frequency columns.""" + import pandas as pd + if not self.fitted: raise RuntimeError("The converter must be fitted before calling transform.") diff --git a/DashAI/back/converters/scikit_learn/binarizer.py b/DashAI/back/converters/scikit_learn/binarizer.py index e1b358ead..656fa492a 100644 --- a/DashAI/back/converters/scikit_learn/binarizer.py +++ b/DashAI/back/converters/scikit_learn/binarizer.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.preprocessing import Binarizer as BinarizerOperation from DashAI.back.converters.category.encoding import EncodingConverter @@ -51,4 +50,6 @@ class Binarizer(EncodingConverter, SklearnWrapper, BinarizerOperation): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Integer64 as the output type for binarized data.""" + import pyarrow as pa + return Integer(arrow_type=pa.int64()) diff --git a/DashAI/back/converters/scikit_learn/cca.py b/DashAI/back/converters/scikit_learn/cca.py index 5d3142eda..2bef04702 100644 --- a/DashAI/back/converters/scikit_learn/cca.py +++ b/DashAI/back/converters/scikit_learn/cca.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.cross_decomposition import CCA as CCAOPERATION from DashAI.back.converters.category.advanced_preprocessing import ( @@ -72,4 +71,6 @@ class CCA(AdvancedPreprocessingConverter, SklearnWrapper, CCAOPERATION): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for transformed data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/fast_ica.py b/DashAI/back/converters/scikit_learn/fast_ica.py index 79b14413c..ad4a3c99f 100644 --- a/DashAI/back/converters/scikit_learn/fast_ica.py +++ b/DashAI/back/converters/scikit_learn/fast_ica.py @@ -1,4 +1,3 @@ 
-import pyarrow as pa from sklearn.decomposition import FastICA as FastICAOperation from DashAI.back.api.utils import ( @@ -161,4 +160,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for transformed data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/generic_univariate_select.py b/DashAI/back/converters/scikit_learn/generic_univariate_select.py index 3d8475e1b..85ac2ed18 100644 --- a/DashAI/back/converters/scikit_learn/generic_univariate_select.py +++ b/DashAI/back/converters/scikit_learn/generic_univariate_select.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.feature_selection import ( GenericUnivariateSelect as GenericUnivariateSelectOperation, ) @@ -62,4 +61,6 @@ class GenericUnivariateSelect( def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for selected features.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/knn_imputer.py b/DashAI/back/converters/scikit_learn/knn_imputer.py index 5e4ef60bb..ae10787b6 100644 --- a/DashAI/back/converters/scikit_learn/knn_imputer.py +++ b/DashAI/back/converters/scikit_learn/knn_imputer.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.impute import KNNImputer as KNNImputerOperation from DashAI.back.converters.category.basic_preprocessing import ( @@ -88,4 +87,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for imputed data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/label_binarizer.py b/DashAI/back/converters/scikit_learn/label_binarizer.py index fd6949a4b..967a44687 100644 --- a/DashAI/back/converters/scikit_learn/label_binarizer.py +++ b/DashAI/back/converters/scikit_learn/label_binarizer.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.preprocessing import LabelBinarizer as LabelBinarizerOperation from DashAI.back.converters.category.encoding import EncodingConverter @@ -44,4 +43,6 @@ class LabelBinarizer(EncodingConverter, SklearnWrapper, LabelBinarizerOperation) def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Integer64 as the output type for binarized labels.""" + import pyarrow as pa + return Integer(arrow_type=pa.int64()) diff --git a/DashAI/back/converters/scikit_learn/label_encoder.py b/DashAI/back/converters/scikit_learn/label_encoder.py index db8ac862a..538cf03a5 100644 --- a/DashAI/back/converters/scikit_learn/label_encoder.py +++ b/DashAI/back/converters/scikit_learn/label_encoder.py @@ -1,8 +1,5 @@ from typing import Union -import pyarrow as pa -from sklearn.preprocessing import LabelEncoder as LabelEncoderOperation - from DashAI.back.converters.category.encoding import EncodingConverter from DashAI.back.converters.sklearn_wrapper import SklearnWrapper from DashAI.back.core.schema_fields.base_schema import BaseSchema @@ -51,6 +48,8 @@ def get_output_type(self, column_name: str = None) -> DashAIDataType: If the encoder has been fitted and has classes_, use them to create a proper categorical type. 
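The same one-line change recurs across the scikit-learn converters above: `import pyarrow as pa` moves into `get_output_type`, so importing a converter module no longer pulls pyarrow in. A stripped-down illustration follows; the `Float` wrapper here is a hypothetical stand-in for DashAI's type class.

```python
class Float:
    """Hypothetical stand-in for DashAI's Float type wrapper."""

    def __init__(self, arrow_type):
        self.arrow_type = arrow_type


def get_output_type(column_name: str = None):
    import pyarrow as pa  # deferred until the output type is actually requested

    return Float(arrow_type=pa.float64())
```

Repeated local imports are cheap because Python caches modules in `sys.modules`; after the first call the import statement is essentially a dictionary lookup.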
""" + import pyarrow as pa + if column_name and column_name in self.encoders: encoder = self.encoders[column_name] if hasattr(encoder, "classes_"): @@ -63,6 +62,8 @@ def get_output_type(self, column_name: str = None) -> DashAIDataType: def fit(self, x: DashAIDataset, y: Union[DashAIDataset, None] = None): """Fit label encoders to each column in the dataset.""" + from sklearn.preprocessing import LabelEncoder as LabelEncoderOperation + x_pandas = x.to_pandas() for col in x_pandas.columns: diff --git a/DashAI/back/converters/scikit_learn/max_abs_scaler.py b/DashAI/back/converters/scikit_learn/max_abs_scaler.py index 903af7e06..c1961e376 100644 --- a/DashAI/back/converters/scikit_learn/max_abs_scaler.py +++ b/DashAI/back/converters/scikit_learn/max_abs_scaler.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.preprocessing import MaxAbsScaler as MaxAbsScalerOperation from DashAI.back.converters.category.scaling_and_normalization import ( @@ -39,4 +38,6 @@ class MaxAbsScaler( def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for scaled data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/min_max_scaler.py b/DashAI/back/converters/scikit_learn/min_max_scaler.py index cf2a745c9..5344ae04f 100644 --- a/DashAI/back/converters/scikit_learn/min_max_scaler.py +++ b/DashAI/back/converters/scikit_learn/min_max_scaler.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.preprocessing import MinMaxScaler as MinMaxScalerOperation from DashAI.back.converters.category.scaling_and_normalization import ( @@ -74,4 +73,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for scaled data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/missing_indicator.py b/DashAI/back/converters/scikit_learn/missing_indicator.py index b73c1ce98..6d505dac2 100644 --- a/DashAI/back/converters/scikit_learn/missing_indicator.py +++ b/DashAI/back/converters/scikit_learn/missing_indicator.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.impute import MissingIndicator as MissingIndicatorOperation from DashAI.back.converters.category.basic_preprocessing import ( @@ -35,4 +34,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Integer64 as the output type for binary indicators.""" + import pyarrow as pa + return Integer(arrow_type=pa.int64()) diff --git a/DashAI/back/converters/scikit_learn/normalizer.py b/DashAI/back/converters/scikit_learn/normalizer.py index 1c57a8ac8..c0160c01d 100644 --- a/DashAI/back/converters/scikit_learn/normalizer.py +++ b/DashAI/back/converters/scikit_learn/normalizer.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.preprocessing import Normalizer as NormalizerOperation from DashAI.back.converters.category.scaling_and_normalization import ( @@ -45,4 +44,6 @@ class Normalizer(ScalingAndNormalizationConverter, SklearnWrapper, NormalizerOpe def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for normalized data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/nystroem.py b/DashAI/back/converters/scikit_learn/nystroem.py index c17e8144b..d26820ce7 100644 --- a/DashAI/back/converters/scikit_learn/nystroem.py +++ 
b/DashAI/back/converters/scikit_learn/nystroem.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.kernel_approximation import Nystroem as NystroemOperation from DashAI.back.api.utils import create_random_state, parse_string_to_dict @@ -133,4 +132,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for transformed data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/one_hot_encoder.py b/DashAI/back/converters/scikit_learn/one_hot_encoder.py index 76c36c554..d249cf5b6 100644 --- a/DashAI/back/converters/scikit_learn/one_hot_encoder.py +++ b/DashAI/back/converters/scikit_learn/one_hot_encoder.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.preprocessing import OneHotEncoder as OneHotEncoderOperation from DashAI.back.api.utils import cast_string_to_type, parse_string_to_list @@ -118,4 +117,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Integer64 as the output type for one-hot encoded data.""" + import pyarrow as pa + return Integer(arrow_type=pa.int64()) diff --git a/DashAI/back/converters/scikit_learn/ordinal_encoder.py b/DashAI/back/converters/scikit_learn/ordinal_encoder.py index 061784583..d4ec63e7b 100644 --- a/DashAI/back/converters/scikit_learn/ordinal_encoder.py +++ b/DashAI/back/converters/scikit_learn/ordinal_encoder.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.preprocessing import OrdinalEncoder as OrdinalEncoderOperation from DashAI.back.api.utils import cast_string_to_type @@ -116,6 +115,8 @@ def get_output_type(self, column_name: str = None) -> DashAIDataType: Returns Categorical type with encoded values. After fitting, categories are encoded as integers. 
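For the label-encoder converter, the diff defers `sklearn.preprocessing.LabelEncoder` into `fit`. A self-contained sketch of fitting one encoder per column, with illustrative data:

```python
def fit_label_encoders(df):
    from sklearn.preprocessing import LabelEncoder

    encoders = {}
    for col in df.columns:
        encoder = LabelEncoder()
        encoder.fit(df[col])
        encoders[col] = encoder
    return encoders


if __name__ == "__main__":
    import pandas as pd

    frame = pd.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "M", "S"]})
    fitted = fit_label_encoders(frame)
    print(fitted["color"].classes_)            # ['blue' 'red']
    print(fitted["color"].transform(["red"]))  # [1]
```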
""" + import pyarrow as pa + # Return a placeholder categorical type # The actual categories will be set by sklearn_wrapper's transform method return Categorical(values=pa.array(["0", "1"])) diff --git a/DashAI/back/converters/scikit_learn/pca.py b/DashAI/back/converters/scikit_learn/pca.py index cc6b86068..8a6554614 100644 --- a/DashAI/back/converters/scikit_learn/pca.py +++ b/DashAI/back/converters/scikit_learn/pca.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.decomposition import PCA as PCAOPERATION from DashAI.back.api.utils import create_random_state @@ -179,4 +178,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for PCA components.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/polynomial_features.py b/DashAI/back/converters/scikit_learn/polynomial_features.py index be57d5421..7686f0ade 100644 --- a/DashAI/back/converters/scikit_learn/polynomial_features.py +++ b/DashAI/back/converters/scikit_learn/polynomial_features.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.preprocessing import PolynomialFeatures as PolynomialFeaturesOperation from DashAI.back.converters.category.polynomial_kernel import PolynomialKernelConverter @@ -94,4 +93,6 @@ class PolynomialFeatures( def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for polynomial features.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/rbf_sampler.py b/DashAI/back/converters/scikit_learn/rbf_sampler.py index 891f676ce..a0c65d481 100644 --- a/DashAI/back/converters/scikit_learn/rbf_sampler.py +++ b/DashAI/back/converters/scikit_learn/rbf_sampler.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.kernel_approximation import RBFSampler as RBFSamplerOperation from DashAI.back.api.utils import create_random_state @@ -80,4 +79,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for transformed data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/select_fdr.py b/DashAI/back/converters/scikit_learn/select_fdr.py index dd40fd788..1a58205a3 100644 --- a/DashAI/back/converters/scikit_learn/select_fdr.py +++ b/DashAI/back/converters/scikit_learn/select_fdr.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.feature_selection import SelectFdr as SelectFdrOperation from DashAI.back.converters.category.feature_selection import FeatureSelectionConverter @@ -45,4 +44,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for selected features.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/select_fpr.py b/DashAI/back/converters/scikit_learn/select_fpr.py index fa0289543..e246754d3 100644 --- a/DashAI/back/converters/scikit_learn/select_fpr.py +++ b/DashAI/back/converters/scikit_learn/select_fpr.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.feature_selection import SelectFpr as SelectFprOperation from DashAI.back.converters.category.feature_selection import FeatureSelectionConverter @@ -42,4 +41,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type 
for selected features.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/select_fwe.py b/DashAI/back/converters/scikit_learn/select_fwe.py index 3e0825f5f..4b3d0aff5 100644 --- a/DashAI/back/converters/scikit_learn/select_fwe.py +++ b/DashAI/back/converters/scikit_learn/select_fwe.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.feature_selection import SelectFwe as SelectFweOperation from DashAI.back.converters.category.feature_selection import FeatureSelectionConverter @@ -42,6 +41,8 @@ class SelectFwe(FeatureSelectionConverter, SklearnWrapper, SelectFweOperation): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for selected features.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) def __init__(self, **kwargs): diff --git a/DashAI/back/converters/scikit_learn/select_k_best.py b/DashAI/back/converters/scikit_learn/select_k_best.py index f13a2d717..d15d2ba2a 100644 --- a/DashAI/back/converters/scikit_learn/select_k_best.py +++ b/DashAI/back/converters/scikit_learn/select_k_best.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.feature_selection import SelectKBest as SelectKBestOperation from DashAI.back.converters.category.feature_selection import FeatureSelectionConverter @@ -41,6 +40,8 @@ class SelectKBest(FeatureSelectionConverter, SklearnWrapper, SelectKBestOperatio def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for selected features.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) def __init__(self, **kwargs): diff --git a/DashAI/back/converters/scikit_learn/select_percentile.py b/DashAI/back/converters/scikit_learn/select_percentile.py index 99423c5d2..e9ff36787 100644 --- a/DashAI/back/converters/scikit_learn/select_percentile.py +++ b/DashAI/back/converters/scikit_learn/select_percentile.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.feature_selection import SelectPercentile as SelectPercentileOperation from DashAI.back.converters.category.feature_selection import FeatureSelectionConverter @@ -43,6 +42,8 @@ class SelectPercentile( def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for selected features.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) def __init__(self, **kwargs): diff --git a/DashAI/back/converters/scikit_learn/simple_imputer.py b/DashAI/back/converters/scikit_learn/simple_imputer.py index 9129ac932..ecb343559 100644 --- a/DashAI/back/converters/scikit_learn/simple_imputer.py +++ b/DashAI/back/converters/scikit_learn/simple_imputer.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.impute import SimpleImputer as SimpleImputerOperation from DashAI.back.converters.category.basic_preprocessing import ( @@ -101,4 +100,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for imputed data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/skewed_chi_2_sampler.py b/DashAI/back/converters/scikit_learn/skewed_chi_2_sampler.py index d104d3058..0697f2468 100644 --- a/DashAI/back/converters/scikit_learn/skewed_chi_2_sampler.py +++ b/DashAI/back/converters/scikit_learn/skewed_chi_2_sampler.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.kernel_approximation import SkewedChi2Sampler as 
SkewedChi2SamplerOperation from DashAI.back.api.utils import create_random_state @@ -89,4 +88,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for transformed data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/standard_scaler.py b/DashAI/back/converters/scikit_learn/standard_scaler.py index 90dc0c2e3..bdff073a2 100644 --- a/DashAI/back/converters/scikit_learn/standard_scaler.py +++ b/DashAI/back/converters/scikit_learn/standard_scaler.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.preprocessing import StandardScaler as StandardScalerOperation from DashAI.back.converters.category.scaling_and_normalization import ( @@ -68,6 +67,8 @@ class StandardScaler( def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for standardized data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) IMAGE_PREVIEW = "standard_scaler.png" diff --git a/DashAI/back/converters/scikit_learn/tf_idf.py b/DashAI/back/converters/scikit_learn/tf_idf.py index 4a5670a8c..5ac7d4666 100644 --- a/DashAI/back/converters/scikit_learn/tf_idf.py +++ b/DashAI/back/converters/scikit_learn/tf_idf.py @@ -1,6 +1,3 @@ -import pandas as pd -from sklearn.feature_extraction.text import TfidfVectorizer - from DashAI.back.converters.base_converter import BaseConverter from DashAI.back.converters.category.advanced_preprocessing import ( AdvancedPreprocessingConverter, @@ -94,6 +91,8 @@ class TFIDFConverter(AdvancedPreprocessingConverter, BaseConverter): def __init__(self, **kwargs): super().__init__() + from sklearn.feature_extraction.text import TfidfVectorizer + self.vectorizer = TfidfVectorizer( max_features=kwargs.get("max_features", 1000), lowercase=kwargs.get("lowercase", True), @@ -115,6 +114,8 @@ def fit(self, x: DashAIDataset, y=None) -> "TFIDFConverter": def transform(self, x: DashAIDataset, y=None) -> DashAIDataset: """Transform text into TF-IDF weighted columns.""" + import pandas as pd + if not self.fitted: raise RuntimeError("The converter must be fitted before calling transform.") diff --git a/DashAI/back/converters/scikit_learn/truncated_svd.py b/DashAI/back/converters/scikit_learn/truncated_svd.py index 58a80abdb..9227be347 100644 --- a/DashAI/back/converters/scikit_learn/truncated_svd.py +++ b/DashAI/back/converters/scikit_learn/truncated_svd.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.decomposition import TruncatedSVD as TruncatedSVDOperation from DashAI.back.api.utils import create_random_state @@ -128,4 +127,6 @@ def __init__(self, **kwargs): def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for transformed data.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) diff --git a/DashAI/back/converters/scikit_learn/variance_threshold.py b/DashAI/back/converters/scikit_learn/variance_threshold.py index b2aac4cf1..ffcd672e9 100644 --- a/DashAI/back/converters/scikit_learn/variance_threshold.py +++ b/DashAI/back/converters/scikit_learn/variance_threshold.py @@ -1,4 +1,3 @@ -import pyarrow as pa from sklearn.feature_selection import VarianceThreshold as VarianceThresholdOperation from DashAI.back.converters.category.dimensionality_reduction import ( @@ -40,6 +39,8 @@ class VarianceThreshold( def get_output_type(self, column_name: str = None) -> DashAIDataType: """Returns Float64 as the output type for selected 
features.""" + import pyarrow as pa + return Float(arrow_type=pa.float64()) IMAGE_PREVIEW = "variance_threshold.png" diff --git a/DashAI/back/converters/simple_converters/character_replacer.py b/DashAI/back/converters/simple_converters/character_replacer.py index 7477367c4..6a7340287 100644 --- a/DashAI/back/converters/simple_converters/character_replacer.py +++ b/DashAI/back/converters/simple_converters/character_replacer.py @@ -1,7 +1,5 @@ from typing import List, Union -import pyarrow as pa - from DashAI.back.converters.base_converter import BaseConverter from DashAI.back.converters.category.basic_preprocessing import ( BasicPreprocessingConverter, @@ -109,6 +107,8 @@ def transform( Replaces or removes characters in the target string columns of the dataset x. If all values in a column become numeric after replacement, converts to int. """ + import pyarrow as pa + if not self._target_columns: # if no target columns were set, return the dataset unchanged return x @@ -174,4 +174,6 @@ def get_output_type(self, column_name: str = None) -> DashAIDataType: to int. Since this is determined dynamically during transform, we return Text as default. """ + import pyarrow as pa + return Text(arrow_type=pa.string()) diff --git a/DashAI/back/converters/simple_converters/column_remover.py b/DashAI/back/converters/simple_converters/column_remover.py index 6230e7e6c..aab343803 100644 --- a/DashAI/back/converters/simple_converters/column_remover.py +++ b/DashAI/back/converters/simple_converters/column_remover.py @@ -1,5 +1,3 @@ -import pyarrow as pa - from DashAI.back.converters.base_converter import BaseConverter from DashAI.back.converters.category.basic_preprocessing import ( BasicPreprocessingConverter, @@ -65,4 +63,6 @@ def get_output_type(self, column_name: str = None) -> DashAIDataType: This converter removes columns, so it doesn't change types. Return a placeholder type. """ + import pyarrow as pa + return Text(arrow_type=pa.string()) diff --git a/DashAI/back/converters/simple_converters/nan_remover.py b/DashAI/back/converters/simple_converters/nan_remover.py index 960d73dbf..7953be80c 100644 --- a/DashAI/back/converters/simple_converters/nan_remover.py +++ b/DashAI/back/converters/simple_converters/nan_remover.py @@ -1,6 +1,3 @@ -import numpy as np -import pyarrow as pa - from DashAI.back.converters.base_converter import BaseConverter from DashAI.back.converters.category.basic_preprocessing import ( BasicPreprocessingConverter, @@ -83,6 +80,8 @@ def fit(self, x: DashAIDataset, y: DashAIDataset = None) -> "NanRemover": def _is_null_value(self, value) -> bool: """Check if a value should be treated as null.""" + import numpy as np + if value is None: return True if isinstance(value, float) and np.isnan(value): @@ -95,6 +94,8 @@ def transform(self, x: DashAIDataset, y: DashAIDataset = None) -> DashAIDataset: Remove the nan rows from the columns selected in the scope. Also handles string representations of null values like "None", "nan", etc. """ + import numpy as np + missing = [col for col in self.columns if col not in x.column_names] if missing: raise ValueError( @@ -141,6 +142,8 @@ def get_output_type(self, column_name: str = None) -> DashAIDataType: This converter removes rows with NaN, doesn't change column types. Return the original type if available. 
""" + import pyarrow as pa + if column_name and column_name in self.column_types: return self.column_types[column_name] return Text(arrow_type=pa.string()) diff --git a/DashAI/back/converters/sklearn_wrapper.py b/DashAI/back/converters/sklearn_wrapper.py index 27235db5f..ca00061ef 100644 --- a/DashAI/back/converters/sklearn_wrapper.py +++ b/DashAI/back/converters/sklearn_wrapper.py @@ -1,10 +1,6 @@ from abc import ABCMeta, abstractmethod from typing import Union -import numpy as np -import pandas as pd -import pyarrow as pa - from DashAI.back.converters.base_converter import BaseConverter from DashAI.back.dataloaders.classes.dashai_dataset import ( DashAIDataset, @@ -110,6 +106,10 @@ def transform( DashAIDataset The transformed dataset with proper DashAI types. """ + import numpy as np + import pandas as pd + import pyarrow as pa + x_pandas = x.to_pandas() if hasattr(x, "to_pandas") else x sklearn_cls = next( diff --git a/DashAI/back/dataloaders/classes/csv_dataloader.py b/DashAI/back/dataloaders/classes/csv_dataloader.py index c71f43e93..2c01bea0f 100644 --- a/DashAI/back/dataloaders/classes/csv_dataloader.py +++ b/DashAI/back/dataloaders/classes/csv_dataloader.py @@ -4,9 +4,7 @@ from itertools import islice from typing import Any, Dict -import pandas as pd from beartype import beartype -from datasets import Dataset, IterableDatasetDict, load_dataset from DashAI.back.core.schema_fields import ( bool_field, @@ -315,6 +313,8 @@ def load_data( DatasetDict A HuggingFace's Dataset with the loaded data. """ + from datasets import Dataset, IterableDatasetDict, load_dataset + clean_params = self._check_params(params) prepared_path = self.prepare_files(filepath_or_buffer, temp_path) if prepared_path[1] == "file": @@ -343,7 +343,7 @@ def load_preview( filepath_or_buffer: str, params: Dict[str, Any], n_rows: int = 100, - ) -> pd.DataFrame: + ): """ Load a preview of the CSV dataset using streaming. @@ -361,6 +361,9 @@ def load_preview( pd.DataFrame A DataFrame containing the preview rows. """ + import pandas as pd + from datasets import load_dataset + clean_params = self._check_params(params) dataset_stream = load_dataset( diff --git a/DashAI/back/dataloaders/classes/dashai_dataset.py b/DashAI/back/dataloaders/classes/dashai_dataset.py index e5100a305..320e952e6 100644 --- a/DashAI/back/dataloaders/classes/dashai_dataset.py +++ b/DashAI/back/dataloaders/classes/dashai_dataset.py @@ -1,19 +1,11 @@ """DashAI Dataset implementation.""" -import json import logging import os -from contextlib import suppress -import numpy as np -import pyarrow as pa -import pyarrow.ipc as ipc from beartype import beartype from beartype.typing import Dict, List, Literal, Optional, Tuple, Union -from datasets import ClassLabel, Dataset, DatasetDict, Value, concatenate_datasets -from datasets.table import InMemoryTable -from pandas import DataFrame -from sklearn.model_selection import train_test_split +from datasets import Dataset from DashAI.back.types.categorical import Categorical from DashAI.back.types.dashai_data_type import DashAIDataType @@ -29,7 +21,7 @@ log = logging.getLogger(__name__) -def get_arrow_table(ds: Dataset) -> pa.Table: +def get_arrow_table(ds: Dataset) -> object: """ Retrieve the underlying PyArrow table from a Hugging Face Dataset. This function abstracts away the need to access private attributes. @@ -38,7 +30,7 @@ def get_arrow_table(ds: Dataset) -> pa.Table: ds (Dataset): A Hugging Face Dataset. Returns: - pa.Table: The underlying PyArrow table. + object: The underlying PyArrow table. 
Raises: ValueError: If the arrow table cannot be retrieved. @@ -57,7 +49,7 @@ class DashAIDataset(Dataset): @beartype def __init__( self, - table: Union[pa.Table, InMemoryTable], + table: object, splits: dict = None, types: Optional[Dict[str, DashAIDataType]] = None, *args, @@ -100,12 +92,12 @@ def cast(self, *args, **kwargs) -> "DashAIDataset": return DashAIDataset(arrow_tbl, splits=self.splits, types=self._types) @property - def arrow_table(self) -> pa.Table: + def arrow_table(self) -> object: """ Provides a clean way to access the underlying PyArrow table. Returns: - pa.Table: The underlying PyArrow table. + object: The underlying PyArrow table. """ try: # Now we reference the pa.table from here (DashAIDataset) @@ -147,6 +139,8 @@ def change_columns_type(self, column_types: Dict[str, str]) -> "DashAIDataset": DashAIDataset The dataset after columns type changes. """ + from datasets import Value as HFValue + if not isinstance(column_types, dict): raise TypeError(f"types should be a dict, got {type(column_types)}") @@ -163,7 +157,7 @@ def change_columns_type(self, column_types: Dict[str, str]) -> "DashAIDataset": if column_types[column] == "Categorical": new_features[column] = encode_labels(self, column) elif column_types[column] == "Numerical": - new_features[column] = Value("float32") + new_features[column] = HFValue("float32") dataset = self.cast(new_features) return dataset @@ -529,6 +523,8 @@ def sample( Dict A dictionary with selected samples. """ + import numpy as np + if n > len(self): raise ValueError( "Number of samples must be less than or equal to the length " @@ -630,7 +626,7 @@ def select(self, *args, **kwargs) -> "DashAIDataset": @beartype -def merge_splits_with_metadata(dataset_dict: DatasetDict) -> DashAIDataset: +def merge_splits_with_metadata(dataset_dict: object) -> DashAIDataset: """ Merges the splits from a DatasetDict into a single DashAIDataset and records the original indices for each split in the metadata. @@ -644,6 +640,8 @@ def merge_splits_with_metadata(dataset_dict: DatasetDict) -> DashAIDataset: original split indices. """ + from datasets import concatenate_datasets # local import + concatenated_datasets = [] split_index = {} current_index = 0 @@ -699,6 +697,8 @@ def transform_dataset_with_schema( DashAIDataset - The updated dataset with new type information """ + import pyarrow as pa # local import + table = get_arrow_table(dataset) dai_table = {} my_schema = pa.schema([]) @@ -787,10 +787,14 @@ def save_dataset( if schema is not None: dataset = transform_dataset_with_schema(dataset, schema) + import json + + import pyarrow as pa # local import + table = get_arrow_table(dataset) data_filepath = os.path.join(path, "data.arrow") with pa.OSFile(data_filepath, "wb") as sink: - writer = ipc.new_file(sink, table.schema) + writer = pa.ipc.new_file(sink, table.schema) writer.write_table(table) writer.close() @@ -823,9 +827,13 @@ def load_dataset(dataset_path: Union[str, os.PathLike]) -> DashAIDataset: DashAIDataset: The loaded dataset with data and metadata. 
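`save_dataset` and `load_dataset` keep using Arrow IPC files but now reach them through a locally imported `pa.ipc`. A self-contained round-trip sketch (the file path and helper names are examples):

```python
def write_arrow(table, path: str) -> None:
    import pyarrow as pa

    with pa.OSFile(path, "wb") as sink:
        writer = pa.ipc.new_file(sink, table.schema)
        writer.write_table(table)
        writer.close()


def read_arrow(path: str):
    import pyarrow as pa

    with pa.OSFile(path, "rb") as source:
        return pa.ipc.open_file(source).read_all()


if __name__ == "__main__":
    import pyarrow as pa

    tbl = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    write_arrow(tbl, "data.arrow")
    print(read_arrow("data.arrow").num_rows)  # 3
```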
""" + import json + + import pyarrow as pa # local import + data_filepath = os.path.join(dataset_path, "data.arrow") with pa.OSFile(data_filepath, "rb") as source: - reader = ipc.open_file(source) + reader = pa.ipc.open_file(source) data = reader.read_all() metadata_filepath = os.path.join(dataset_path, "splits.json") if os.path.exists(metadata_filepath): @@ -845,7 +853,7 @@ def load_dataset(dataset_path: Union[str, os.PathLike]) -> DashAIDataset: def encode_labels( dataset: DashAIDataset, column_name: str, -) -> ClassLabel: +) -> object: """Encode a categorical column into numerical labels and return the ClassLabel feature. @@ -864,6 +872,8 @@ def encode_labels( if column_name not in dataset.column_names: raise ValueError(f"Column '{column_name}' does not exist in the dataset.") + from datasets import ClassLabel # local import + names = list(set(dataset[column_name])) class_label_feature = ClassLabel(names=names) return class_label_feature @@ -950,6 +960,9 @@ def split_indexes( # Generate shuffled indexes if seed is None: seed = 42 + import numpy as np + from sklearn.model_selection import train_test_split + indexes = np.arange(total_rows) stratify_labels = np.array(labels) if stratify else None @@ -1005,7 +1018,7 @@ def split_dataset( train_indexes: List = None, test_indexes: List = None, val_indexes: List = None, -) -> DatasetDict: +) -> object: """ Split the dataset in train, test and validation subsets. If indexes are not provided, it will use the split indices @@ -1032,7 +1045,11 @@ def split_dataset( ValueError Must provide all indexes or none. """ + import numpy as np + if all(idx is None for idx in [train_indexes, test_indexes, val_indexes]): + from datasets import DatasetDict + train_dataset = dataset.get_split("train") test_dataset = dataset.get_split("test") val_dataset = dataset.get_split("validation") @@ -1055,6 +1072,8 @@ def split_dataset( val_mask = np.isin(np.arange(n), val_indexes) # Get the underlying table + import pyarrow as pa # local import + table = dataset.arrow_table dataset.splits["split_indices"] = { @@ -1069,6 +1088,8 @@ def split_dataset( val_table = table.filter(pa.array(val_mask)) # Preserve types from the original dataset to maintain categorical mappings + from datasets import DatasetDict # local import + separate_dataset_dict = DatasetDict( { "train": DashAIDataset(train_table, types=dataset.types), @@ -1081,7 +1102,7 @@ def split_dataset( def to_dashai_dataset( - dataset: Union[DatasetDict, Dataset, DashAIDataset, DataFrame], + dataset: object, types: Optional[Dict[str, DashAIDataType]] = None, ) -> DashAIDataset: """ @@ -1102,19 +1123,27 @@ def to_dashai_dataset( if isinstance(dataset, DashAIDataset): return dataset - if isinstance(dataset, Dataset): + from datasets import Dataset as HFDataset # local import + + if isinstance(dataset, HFDataset): arrow_tbl = get_arrow_table(dataset) if types: types_serialized = {col: types[col].to_string() for col in types} arrow_tbl = save_types_in_arrow_metadata(arrow_tbl, types_serialized) return DashAIDataset(arrow_tbl, types=types) - if isinstance(dataset, DataFrame): - hf_dataset = Dataset.from_pandas(dataset, preserve_index=False) + try: + from pandas import DataFrame as PDDataFrame # local import + except Exception: + PDDataFrame = None + if PDDataFrame is not None and isinstance(dataset, PDDataFrame): + hf_dataset = HFDataset.from_pandas(dataset, preserve_index=False) arrow_tbl = get_arrow_table(hf_dataset) if types: types_serialized = {col: types[col].to_string() for col in types} arrow_tbl = 
save_types_in_arrow_metadata(arrow_tbl, types_serialized) return DashAIDataset(arrow_tbl, types=types) + from datasets import DatasetDict # local import + if isinstance(dataset, DatasetDict) and len(dataset) == 1: key = list(dataset.keys())[0] ds = dataset[key] @@ -1131,7 +1160,7 @@ def to_dashai_dataset( @beartype def validate_inputs_outputs( - datasetdict: Union[DatasetDict, DashAIDataset], + datasetdict: object, inputs: List[str], outputs: List[str], ) -> None: @@ -1170,9 +1199,7 @@ def validate_inputs_outputs( @beartype -def get_column_names_from_indexes( - dataset: Union[DashAIDataset, DatasetDict], indexes: List[int] -) -> List[str]: +def get_column_names_from_indexes(dataset: object, indexes: List[int]) -> List[str]: """Obtain the column labels that correspond to the provided indexes. Note: indexing starts from 1. @@ -1206,7 +1233,7 @@ def get_column_names_from_indexes( @beartype def select_columns( - dataset: Union[DatasetDict, DashAIDataset], + dataset: object, input_columns: List[str], output_columns: List[str], ) -> Tuple[DashAIDataset, DashAIDataset]: @@ -1252,6 +1279,8 @@ def get_columns_spec(dataset_path: str) -> Dict[str, Dict]: """ data_filepath = os.path.join(dataset_path, "data.arrow") + import pyarrow as pa # local import + with pa.OSFile(data_filepath, "rb") as source: reader = pa.ipc.open_file(source) schema = reader.schema @@ -1334,6 +1363,8 @@ def get_dataset_info(dataset_path: str) -> object: object Dictionary with the information of the dataset """ + import json + metadata_filepath = os.path.join(dataset_path, "splits.json") if os.path.exists(metadata_filepath): with open(metadata_filepath, "r", encoding="utf-8") as f: @@ -1404,8 +1435,10 @@ def update_dataset_splits( # I think it could be simplified since DashAITypes, but I don't want to break anything def prepare_for_model_session( dataset: DashAIDataset, splits: dict, output_columns: List[str] -) -> DatasetDict: +) -> object: """Prepare the dataset for a model session by updating the splits configuration""" + from contextlib import suppress + splitType = splits.get("splitType") if splitType == "manual" or splitType == "predefined": splits_index = splits @@ -1468,7 +1501,7 @@ def prepare_for_model_session( def modify_table( dataset: DashAIDataset, - columns: Dict[str, pa.Array], + columns: Dict[str, object], types: Optional[Dict[str, DashAIDataType]] = None, ) -> DashAIDataset: """ @@ -1487,6 +1520,8 @@ def modify_table( DashAIDataset The modified dataset with the updated column type. """ + import pyarrow as pa + original_table = dataset.arrow_table updated_columns = {} diff --git a/DashAI/back/dataloaders/classes/dataloader.py b/DashAI/back/dataloaders/classes/dataloader.py index d4d0ef484..5ee4cd619 100644 --- a/DashAI/back/dataloaders/classes/dataloader.py +++ b/DashAI/back/dataloaders/classes/dataloader.py @@ -1,14 +1,9 @@ """DashAI base class for dataloaders.""" import logging -import os -import zipfile from abc import abstractmethod from typing import Any, Dict, Final -import pandas as pd -from datasets.download.download_manager import DownloadManager - from DashAI.back.config_object import ConfigObject from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset @@ -54,7 +49,7 @@ def load_preview( filepath_or_buffer: str, params: Dict[str, Any], n_rows: int = 10, - ) -> pd.DataFrame: + ) -> any: """ Load a preview of the dataset using streaming. @@ -72,8 +67,8 @@ def load_preview( Returns ------- - pd.DataFrame - A DataFrame with the preview data. 
+ DataFrame + A pandas DataFrame with the preview data. """ raise NotImplementedError( "load_preview must be implemented by specific dataloader" @@ -93,6 +88,8 @@ def prepare_files(self, file_path: str, temp_path: str) -> str: type_path (str): Type of the path. """ + from datasets.download.download_manager import DownloadManager + if file_path.startswith("http"): file_path = DownloadManager.download_and_extract(file_path, temp_path) return (file_path, "dir") @@ -116,6 +113,9 @@ def extract_files(self, file_path: str, temp_path: str) -> str: ------- str: Path of the files extracted. """ + import os + import zipfile + files_path = os.path.join(temp_path, "files") os.makedirs(files_path, exist_ok=True) with zipfile.ZipFile(file_path, "r") as zip_ref: diff --git a/DashAI/back/dataloaders/classes/excel_dataloader.py b/DashAI/back/dataloaders/classes/excel_dataloader.py index 0e6f9ed01..357c0a2cc 100644 --- a/DashAI/back/dataloaders/classes/excel_dataloader.py +++ b/DashAI/back/dataloaders/classes/excel_dataloader.py @@ -1,13 +1,8 @@ """DashAI Excel Dataloader.""" -import glob -import shutil from typing import Any, Dict -import pandas as pd from beartype import beartype -from datasets import Dataset, DatasetDict -from datasets.builder import DatasetGenerationError from DashAI.back.core.schema_fields import ( bool_field, @@ -288,6 +283,13 @@ def load_data( DatasetDict A HuggingFace's Dataset with the loaded data. """ + import glob + import shutil + + import pandas as pd + from datasets import Dataset, DatasetDict + from datasets.builder import DatasetGenerationError + prepared_path = self.prepare_files(filepath_or_buffer, temp_path) print("path prepared", prepared_path) @@ -344,7 +346,7 @@ def load_preview( filepath_or_buffer: str, params: Dict[str, Any], n_rows: int = 10, - ) -> pd.DataFrame: + ): """ Load a preview of the Excel dataset. @@ -368,6 +370,8 @@ def load_preview( pandas_params = self._prepare_pandas_params(params) pandas_params["nrows"] = n_rows + import pandas as pd + df_preview = pd.read_excel( io=filepath_or_buffer, **pandas_params, diff --git a/DashAI/back/dataloaders/classes/json_dataloader.py b/DashAI/back/dataloaders/classes/json_dataloader.py index ea6e27413..8417fd2b3 100644 --- a/DashAI/back/dataloaders/classes/json_dataloader.py +++ b/DashAI/back/dataloaders/classes/json_dataloader.py @@ -1,12 +1,8 @@ """DashAI JSON Dataloader.""" -import shutil -from itertools import islice from typing import Any, Dict -import pandas as pd from beartype import beartype -from datasets import Dataset, IterableDatasetDict, load_dataset from DashAI.back.core.schema_fields import none_type, schema_field, string_field from DashAI.back.core.schema_fields.base_schema import BaseSchema @@ -129,6 +125,10 @@ def load_data( DatasetDict A HuggingFace's Dataset with the loaded data. """ + import shutil + + from datasets import Dataset, IterableDatasetDict, load_dataset + self._check_params(params) field = params["data_key"] prepared_path = self.prepare_files(filepath_or_buffer, temp_path) @@ -156,7 +156,7 @@ def load_preview( filepath_or_buffer: str, params: Dict[str, Any], n_rows: int = 100, - ) -> pd.DataFrame: + ): """ Load a preview of the JSON dataset using streaming. @@ -174,6 +174,11 @@ def load_preview( pd.DataFrame A DataFrame containing the preview rows. 
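The dataloader base class defers `os` and `zipfile` into `extract_files`. The standalone equivalent is straightforward; the helper name is illustrative:

```python
def extract_zip(file_path: str, temp_path: str) -> str:
    import os
    import zipfile

    files_path = os.path.join(temp_path, "files")
    os.makedirs(files_path, exist_ok=True)
    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.extractall(path=files_path)
    return files_path
```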
""" + from itertools import islice + + import pandas as pd + from datasets import load_dataset + self._check_params(params) field = params.get("data_key") diff --git a/DashAI/back/explainability/explainers/kernel_shap.py b/DashAI/back/explainability/explainers/kernel_shap.py index e006ef541..b344035df 100644 --- a/DashAI/back/explainability/explainers/kernel_shap.py +++ b/DashAI/back/explainability/explainers/kernel_shap.py @@ -1,12 +1,3 @@ -from typing import Tuple, Union - -import numpy as np -import pandas as pd -import plotly -import plotly.graph_objs as go -import shap -from datasets import DatasetDict - from DashAI.back.core.schema_fields import ( BaseSchema, bool_field, @@ -154,7 +145,7 @@ def __init__( def _sample_background_data( self, - background_data: np.array, + background_data, background_fraction: float, sampling_method: str = "shuffle", categorical_features: bool = False, @@ -186,6 +177,9 @@ def _sample_background_data( explainer. """ + # Lazy import of shap to avoid heavy imports at module load time + import shap + samplers = {"shuffle": shap.sample, "kmeans": shap.kmeans} n_background_samples = int(background_fraction * background_data.shape[0]) @@ -199,10 +193,10 @@ def _sample_background_data( def fit( self, - background_dataset: Tuple[DatasetDict, DatasetDict], - sample_background_data: str = "false", - background_fraction: Union[float, None] = None, - sampling_method: Union[str, None] = None, + background_dataset, + sample_background_data="false", + background_fraction=None, + sampling_method=None, ): """Method to train the KernelShap explainer. @@ -254,6 +248,9 @@ def fit( ) # TODO: consider the case where the predictor is not a Sklearn model + # Lazy import of shap + import shap + self.explainer = shap.KernelExplainer( model=self.model.predict, data=background_data, @@ -270,7 +267,7 @@ def fit( def explain_instance( self, - instances: DatasetDict, + instances, ): """Method for explaining the model prediciton of an instance using the Kernel Shap method. @@ -298,6 +295,9 @@ def explain_instance( predictions = self.model.predict(x_pred=dataset_dashai) # TODO: evaluate args nsamples y l1_reg + # Lazy import numpy + import numpy as np + shap_values = self.explainer.shap_values(X=X) # shap_values has size (n_instances, n_features, n_classes) @@ -321,7 +321,7 @@ def explain_instance( return explanation def _create_plot( - self, data: pd.DataFrame, base_value: float, y_pred_pbb: float, y_pred_name: str + self, data, base_value: float, y_pred_pbb: float, y_pred_name: str ): """Helper method to create the explanation plot using plotly. @@ -341,6 +341,11 @@ def _create_plot( JSON containing the information of the explanation plot to be rendered. 
""" + # Lazy imports + import numpy as np + import plotly + import plotly.graph_objs as go + x = data["shap_values"].to_numpy() y = data["label"].to_numpy() measure = np.repeat("relative", len(y)) @@ -426,6 +431,10 @@ def plot(self, explanation: list[dict]): target_names = metadata["target_names"] # Normaliza feature_names a 1D + # Lazy import heavy libs + import numpy as np + import pandas as pd + feats = np.asarray(feature_names, dtype=str).reshape(-1) plots = [] @@ -449,6 +458,7 @@ def plot(self, explanation: list[dict]): # 2) Intenta extraer del objeto shap.Explanation si aplica try: + # Lazy import of shap Explanation type only if available from shap._explanation import Explanation if isinstance(sv, Explanation): diff --git a/DashAI/back/explainability/explainers/partial_dependence.py b/DashAI/back/explainability/explainers/partial_dependence.py index 345c21820..6a64670c0 100644 --- a/DashAI/back/explainability/explainers/partial_dependence.py +++ b/DashAI/back/explainability/explainers/partial_dependence.py @@ -1,11 +1,4 @@ -from typing import List, Tuple - -import numpy as np -import pandas as pd -import plotly -import plotly.express as px -from datasets import DatasetDict -from sklearn.inspection import partial_dependence +from typing import List from DashAI.back.core.schema_fields import ( BaseSchema, @@ -129,7 +122,7 @@ def __init__( self.grid_resolution = grid_resolution self.explanation = None - def explain(self, dataset: Tuple[DatasetDict, DatasetDict]): + def explain(self, dataset): """Method to generate the explanation Parameters @@ -142,6 +135,10 @@ def explain(self, dataset: Tuple[DatasetDict, DatasetDict]): dict Dictionary with metadata and the partial dependence of each feature """ + # Lazy imports + import numpy as np + from sklearn.inspection import partial_dependence + x, y = dataset x_test = x["test"].to_pandas() @@ -184,7 +181,7 @@ def explain(self, dataset: Tuple[DatasetDict, DatasetDict]): return explanation - def _create_plot(self, data: List[pd.DataFrame]) -> List[dict]: + def _create_plot(self, data: List[object]) -> List[dict]: """Helper method to create the explanation plot using plotly. Parameters @@ -197,6 +194,10 @@ def _create_plot(self, data: List[pd.DataFrame]) -> List[dict]: list of JSON containing the information of the explanation plot to be rendered. """ + # Lazy imports + import plotly + import plotly.express as px + fig = px.line( data[0], x=data[0]["grid_values"], @@ -265,6 +266,9 @@ def plot(self, explanation: dict) -> List[dict]: list of JSONs containing the information of the explanation plot to be rendered. 
""" + # Lazy import + import pandas as pd + explanation = explanation.copy() metadata = explanation.pop("metadata") target_names = metadata["target_names"] diff --git a/DashAI/back/explainability/explainers/permutation_feature_importance.py b/DashAI/back/explainability/explainers/permutation_feature_importance.py index 99b5b9978..043a212df 100644 --- a/DashAI/back/explainability/explainers/permutation_feature_importance.py +++ b/DashAI/back/explainability/explainers/permutation_feature_importance.py @@ -1,13 +1,4 @@ -from typing import Dict, List, Tuple, Union - -import numpy as np -import pandas as pd -import plotly -import plotly.express as px -from datasets import DatasetDict -from sklearn.inspection import permutation_importance -from sklearn.metrics import accuracy_score, balanced_accuracy_score, make_scorer -from sklearn.preprocessing import LabelEncoder +from typing import Dict, List, Union from DashAI.back.core.schema_fields import ( BaseSchema, @@ -126,6 +117,9 @@ def __init__( ): super().__init__(model) + # Lazy import metrics only during initialization + from sklearn.metrics import accuracy_score, balanced_accuracy_score + metrics = { "accuracy": accuracy_score, "balanced_accuracy": balanced_accuracy_score, @@ -175,12 +169,15 @@ def _get_feature_groups(self, columns: List[str]) -> Dict[str, List[int]]: def _calculate_grouped_importance( self, - x_data: pd.DataFrame, - y: pd.DataFrame, + x_data, + y, feature_groups: Dict[str, List[int]], max_samples: int, - ) -> Dict[str, Dict[str, np.ndarray]]: + ): """Calculate permutation importance for grouped features.""" + # Lazy imports + import numpy as np + rng = np.random.RandomState(self.random_state) n_samples = min(max_samples, len(x_data)) @@ -237,8 +234,15 @@ def calc_score(y_true, y_pred_probas): return results - def explain(self, dataset: Tuple[DatasetDict, DatasetDict]): + def explain(self, dataset): """Method for calculating the importance of features in the model.""" + # Lazy imports + import numpy as np + import pandas as pd + from sklearn.inspection import permutation_importance + from sklearn.metrics import make_scorer + from sklearn.preprocessing import LabelEncoder + x, y = dataset x_test = x["test"] @@ -299,8 +303,12 @@ def patched_metric(y_true, y_pred_probas): "importances_std": np.round(pfi["importances_std"], 3).tolist(), } - def _create_plot(self, data: pd.DataFrame, n_features: int): + def _create_plot(self, data, n_features: int): """Helper method to create the explanation plot using plotly.""" + # Lazy imports + import plotly + import plotly.express as px + fig = px.bar( data.iloc[-n_features:], x=data.iloc[-n_features:]["importances_mean"], @@ -352,6 +360,9 @@ def _create_plot(self, data: pd.DataFrame, n_features: int): def plot(self, explanation: dict) -> List[dict]: """Method to create the explanation plot.""" n_features = 10 + # Lazy import + import pandas as pd + data = pd.DataFrame.from_dict(explanation) data = data.sort_values(by=["importances_mean"], ascending=True) diff --git a/DashAI/back/exploration/explorers/box_plot.py b/DashAI/back/exploration/explorers/box_plot.py index 5f7f9d939..3a2d28b5d 100644 --- a/DashAI/back/exploration/explorers/box_plot.py +++ b/DashAI/back/exploration/explorers/box_plot.py @@ -1,10 +1,7 @@ import os import pathlib -import plotly.express as px from beartype.typing import Any, Dict -from plotly.graph_objs import Figure -from plotly.io import read_json from DashAI.back.core.schema_fields import bool_field, enum_field, schema_field from DashAI.back.core.utils import 
MultilingualString @@ -89,6 +86,8 @@ def __init__(self, **kwargs) -> None: super().__init__(**kwargs) def launch_exploration(self, dataset: DashAIDataset, explorer_info: Explorer): + import plotly.express as px + _df = dataset.to_pandas() cols = [col["columnName"] for col in explorer_info.columns] @@ -121,7 +120,7 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: Figure, + result: Any, ) -> str: filename = f"{explorer_info.id}.json" path = pathlib.Path(os.path.join(save_path, filename)) @@ -134,6 +133,7 @@ def get_results( ) -> Dict[str, Any]: resultType = "plotly_json" config = {} + from plotly.io import read_json result = read_json(exploration_path) result = result.to_json() diff --git a/DashAI/back/exploration/explorers/corr_matrix.py b/DashAI/back/exploration/explorers/corr_matrix.py index 420f053d3..20e564ef7 100644 --- a/DashAI/back/exploration/explorers/corr_matrix.py +++ b/DashAI/back/exploration/explorers/corr_matrix.py @@ -2,11 +2,7 @@ import os import pathlib -import numpy as np -import pandas as pd -import plotly.express as px -import plotly.graph_objs as go -from beartype.typing import Any, Dict, Union +from beartype.typing import Any, Dict from DashAI.back.core.schema_fields import ( bool_field, @@ -125,7 +121,9 @@ def __init__(self, **kwargs) -> None: def launch_exploration( self, dataset: DashAIDataset, explorer_info: Explorer - ) -> Union[pd.DataFrame, go.Figure]: + ) -> Any: + import plotly.express as px + result = dataset.to_pandas().corr( method=self.method, min_periods=( @@ -153,8 +151,11 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: Union[pd.DataFrame, go.Figure], + result: Any, ) -> str: + import pandas as pd + import plotly.graph_objs as go + filename = f"{explorer_info.id}.json" path = pathlib.Path(os.path.join(save_path, filename)) @@ -179,5 +180,8 @@ def get_results( config = {"orient": "dict"} path = pathlib.Path(exploration_path) + import numpy as np + import pandas as pd + result = pd.read_json(path).replace({np.nan: None}).T.to_dict(orient="dict") return {"type": resultType, "data": result, "config": config} diff --git a/DashAI/back/exploration/explorers/cov_matrix.py b/DashAI/back/exploration/explorers/cov_matrix.py index 221242e45..cc7ffdeea 100644 --- a/DashAI/back/exploration/explorers/cov_matrix.py +++ b/DashAI/back/exploration/explorers/cov_matrix.py @@ -1,11 +1,7 @@ import os import pathlib -import numpy as np -import pandas as pd -import plotly.express as px -import plotly.graph_objects as go -from beartype.typing import Any, Dict, Union +from beartype.typing import Any, Dict from DashAI.back.core.schema_fields import bool_field, int_field, schema_field from DashAI.back.core.utils import MultilingualString @@ -116,7 +112,9 @@ def __init__(self, **kwargs) -> None: def launch_exploration( self, dataset: DashAIDataset, explorer_info: Explorer - ) -> Union[pd.DataFrame, go.Figure]: + ) -> Any: + import plotly.express as px + result = dataset.to_pandas().cov( min_periods=self.min_periods, ddof=self.ddof, @@ -140,8 +138,11 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: Union[pd.DataFrame, go.Figure], + result: Any, ) -> str: + import pandas as pd + import plotly.graph_objects as go + filename = f"{explorer_info.id}.json" path = pathlib.Path(os.path.join(save_path, filename)) @@ -166,5 +167,8 @@ def get_results( config = {"orient": "dict"} path = 
pathlib.Path(exploration_path) + import numpy as np + import pandas as pd + result = pd.read_json(path).replace({np.nan: None}).T.to_dict(orient="dict") return {"type": resultType, "data": result, "config": config} diff --git a/DashAI/back/exploration/explorers/density_heatmap.py b/DashAI/back/exploration/explorers/density_heatmap.py index 88d5e24b3..3543c3968 100644 --- a/DashAI/back/exploration/explorers/density_heatmap.py +++ b/DashAI/back/exploration/explorers/density_heatmap.py @@ -1,9 +1,7 @@ import os import pathlib -import plotly.express as px from beartype.typing import Any, Dict -from plotly.graph_objs import Figure from DashAI.back.core.schema_fields import int_field, none_type, schema_field from DashAI.back.core.utils import MultilingualString @@ -71,6 +69,8 @@ def __init__(self, **kwargs) -> None: super().__init__(**kwargs) def launch_exploration(self, dataset: DashAIDataset, explorer_info: Explorer): + import plotly.express as px + _df = dataset.to_pandas() columns = [col["columnName"] for col in explorer_info.columns] @@ -93,7 +93,7 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: Figure, + result: Any, ) -> str: filename = f"{explorer_info.id}.json" path = pathlib.Path(os.path.join(save_path, filename)) diff --git a/DashAI/back/exploration/explorers/describe_explorer.py b/DashAI/back/exploration/explorers/describe_explorer.py index d88ad00c0..43b7b2781 100644 --- a/DashAI/back/exploration/explorers/describe_explorer.py +++ b/DashAI/back/exploration/explorers/describe_explorer.py @@ -1,8 +1,6 @@ import os import pathlib -import numpy as np -import pandas as pd from beartype.typing import Any, Dict from DashAI.back.core.schema_fields import ( @@ -143,7 +141,7 @@ def validate_parameters(cls, params: Dict[str, Any]) -> bool: def launch_exploration( self, dataset: DashAIDataset, __explorer_info__: Explorer - ) -> pd.DataFrame: + ) -> Any: return dataset.to_pandas().describe( percentiles=self.percentiles, include=self.include, exclude=self.exclude ) @@ -153,7 +151,7 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: pd.DataFrame, + result: Any, ) -> str: filename = f"{explorer_info.id}.json" path = pathlib.Path(os.path.join(save_path, filename)) @@ -169,6 +167,9 @@ def get_results( config = {"orient": orientation} path = pathlib.Path(exploration_path) + import numpy as np + import pandas as pd + result = ( pd.read_json(path).replace({np.nan: None}).T.to_dict(orient=orientation) ) diff --git a/DashAI/back/exploration/explorers/ecdf_plot.py b/DashAI/back/exploration/explorers/ecdf_plot.py index b0968c258..d49c8a2aa 100644 --- a/DashAI/back/exploration/explorers/ecdf_plot.py +++ b/DashAI/back/exploration/explorers/ecdf_plot.py @@ -2,10 +2,7 @@ import os import pathlib -import plotly.express as px -import plotly.io as pio from beartype.typing import Any, Dict, List, Union -from plotly.graph_objs import Figure from DashAI.back.core.schema_fields import ( enum_field, @@ -151,6 +148,8 @@ def prepare_dataset( return super().prepare_dataset(loaded_dataset, columns) def launch_exploration(self, dataset: DashAIDataset, explorer_info: Explorer): + import plotly.express as px + _df = dataset.to_pandas() columns = [col["columnName"] for col in explorer_info.columns] @@ -180,7 +179,7 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: Figure, + result: Any, ) -> str: filename = f"{explorer_info.id}.json" path = 
pathlib.Path(os.path.join(save_path, filename)) @@ -193,6 +192,7 @@ def get_results( ) -> Dict[str, Any]: resultType = "plotly_json" config = {} + import plotly.io as pio result = pio.read_json(exploration_path) result = result.to_json() diff --git a/DashAI/back/exploration/explorers/histogram_plot.py b/DashAI/back/exploration/explorers/histogram_plot.py index b31ee964c..6dad42673 100644 --- a/DashAI/back/exploration/explorers/histogram_plot.py +++ b/DashAI/back/exploration/explorers/histogram_plot.py @@ -2,10 +2,7 @@ import os import pathlib -import plotly.express as px -import plotly.io as pio from beartype.typing import Any, Dict, List, Union -from plotly.graph_objs import Figure from DashAI.back.core.schema_fields import ( enum_field, @@ -161,6 +158,8 @@ def prepare_dataset( return super().prepare_dataset(loaded_dataset, columns) def launch_exploration(self, dataset: DashAIDataset, explorer_info: Explorer): + import plotly.express as px + _df = dataset.to_pandas() columns = [col["columnName"] for col in explorer_info.columns] @@ -185,7 +184,7 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: Figure, + result: Any, ) -> str: filename = f"{explorer_info.id}.json" path = pathlib.Path(os.path.join(save_path, filename)) @@ -198,6 +197,7 @@ def get_results( ) -> Dict[str, Any]: resultType = "plotly_json" config = {} + import plotly.io as pio result = pio.read_json(exploration_path) result = result.to_json() diff --git a/DashAI/back/exploration/explorers/multibox_plot.py b/DashAI/back/exploration/explorers/multibox_plot.py index cc5c6d7e4..d08f99f11 100644 --- a/DashAI/back/exploration/explorers/multibox_plot.py +++ b/DashAI/back/exploration/explorers/multibox_plot.py @@ -1,10 +1,7 @@ import os import pathlib -import plotly.graph_objects as go from beartype.typing import Any, Dict, List -from plotly.graph_objs import Figure -from plotly.io import read_json from DashAI.back.core.schema_fields import ( bool_field, @@ -132,6 +129,8 @@ def prepare_dataset( return super().prepare_dataset(loaded_dataset, columns) def launch_exploration(self, dataset: DashAIDataset, explorer_info: Explorer): + import plotly.graph_objects as go + _df = dataset.to_pandas() cols = [col["columnName"] for col in explorer_info.columns] @@ -167,7 +166,7 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: Figure, + result: Any, ) -> str: filename = f"{explorer_info.id}.pickle" path = pathlib.Path(os.path.join(save_path, filename)) @@ -181,6 +180,7 @@ def get_results( ) -> Dict[str, Any]: resultType = "plotly_json" config = {} + from plotly.io import read_json result = read_json(exploration_path) result = result.to_json() diff --git a/DashAI/back/exploration/explorers/parallel_categories.py b/DashAI/back/exploration/explorers/parallel_categories.py index df11399a8..3227e2cee 100644 --- a/DashAI/back/exploration/explorers/parallel_categories.py +++ b/DashAI/back/exploration/explorers/parallel_categories.py @@ -1,10 +1,7 @@ import os import pathlib -import plotly.express as px -import plotly.io as pio from beartype.typing import Any, Dict, List, Union -from plotly.graph_objs import Figure from DashAI.back.core.schema_fields import ( int_field, @@ -89,6 +86,8 @@ def prepare_dataset( return super().prepare_dataset(loaded_dataset, columns) def launch_exploration(self, dataset: DashAIDataset, explorer_info: Explorer): + import plotly.express as px + _df = dataset.to_pandas() columns = [col["columnName"] for col 
in explorer_info.columns] @@ -109,7 +108,7 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: Figure, + result: Any, ) -> str: filename = f"{explorer_info.id}.json" path = pathlib.Path(os.path.join(save_path, filename)) @@ -122,6 +121,7 @@ def get_results( ) -> Dict[str, Any]: resultType = "plotly_json" config = {} + import plotly.io as pio result = pio.read_json(exploration_path) result = result.to_json() diff --git a/DashAI/back/exploration/explorers/parallel_cordinates.py b/DashAI/back/exploration/explorers/parallel_cordinates.py index 905fad6ae..c48345372 100644 --- a/DashAI/back/exploration/explorers/parallel_cordinates.py +++ b/DashAI/back/exploration/explorers/parallel_cordinates.py @@ -1,10 +1,7 @@ import os import pathlib -import plotly.express as px -import plotly.io as pio from beartype.typing import Any, Dict, List, Union -from plotly.graph_objs import Figure from DashAI.back.core.schema_fields import ( int_field, @@ -88,6 +85,8 @@ def prepare_dataset( return super().prepare_dataset(loaded_dataset, columns) def launch_exploration(self, dataset: DashAIDataset, explorer_info: Explorer): + import plotly.express as px + _df = dataset.to_pandas() columns = [col["columnName"] for col in explorer_info.columns] @@ -108,7 +107,7 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: Figure, + result: Any, ) -> str: filename = f"{explorer_info.id}.json" path = pathlib.Path(os.path.join(save_path, filename)) @@ -121,6 +120,7 @@ def get_results( ) -> Dict[str, Any]: resultType = "plotly_json" config = {} + import plotly.io as pio result = pio.read_json(exploration_path) result = result.to_json() diff --git a/DashAI/back/exploration/explorers/row_explorer.py b/DashAI/back/exploration/explorers/row_explorer.py index cbeff6d98..ec1c09298 100644 --- a/DashAI/back/exploration/explorers/row_explorer.py +++ b/DashAI/back/exploration/explorers/row_explorer.py @@ -1,8 +1,6 @@ import os import pathlib -import numpy as np -import pandas as pd from beartype.typing import Any, Dict from DashAI.back.core.schema_fields import bool_field, int_field, schema_field @@ -111,7 +109,7 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: pd.DataFrame, + result: Any, ) -> str: filename = f"{explorer_info.id}.json" path = pathlib.Path(os.path.join(save_path, filename)) @@ -127,5 +125,8 @@ def get_results( config = {"orient": orientation} path = pathlib.Path(exploration_path) + import numpy as np + import pandas as pd + result = pd.read_json(path).replace({np.nan: None}).to_dict(orient=orientation) return {"type": resultType, "data": result, "config": config} diff --git a/DashAI/back/exploration/explorers/scatter_matrix.py b/DashAI/back/exploration/explorers/scatter_matrix.py index 5d45c79a7..1a3801777 100644 --- a/DashAI/back/exploration/explorers/scatter_matrix.py +++ b/DashAI/back/exploration/explorers/scatter_matrix.py @@ -1,10 +1,7 @@ import os import pathlib -import plotly.express as px -import plotly.io as pio from beartype.typing import Any, Dict, List -from plotly.graph_objs import Figure from DashAI.back.core.schema_fields import ( int_field, @@ -122,6 +119,8 @@ def prepare_dataset( return super().prepare_dataset(loaded_dataset, columns) def launch_exploration(self, dataset: DashAIDataset, explorer_info: Explorer): + import plotly.express as px + _df = dataset.to_pandas() dimensions = [col["columnName"] for col in 
explorer_info.columns] @@ -146,7 +145,7 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: Figure, + result: Any, ) -> str: filename = f"{explorer_info.id}.json" path = pathlib.Path(os.path.join(save_path, filename)) @@ -159,6 +158,7 @@ def get_results( ) -> Dict[str, Any]: resultType = "plotly_json" config = {} + import plotly.io as pio result = pio.read_json(exploration_path) result = result.to_json() diff --git a/DashAI/back/exploration/explorers/scatter_plot.py b/DashAI/back/exploration/explorers/scatter_plot.py index 7632f5581..af91adab8 100644 --- a/DashAI/back/exploration/explorers/scatter_plot.py +++ b/DashAI/back/exploration/explorers/scatter_plot.py @@ -1,10 +1,7 @@ import os import pathlib -import plotly.express as px from beartype.typing import Any, Dict, List -from plotly.graph_objs import Figure -from plotly.io import read_json from DashAI.back.core.schema_fields import ( int_field, @@ -135,6 +132,8 @@ def prepare_dataset( return super().prepare_dataset(loaded_dataset, columns) def launch_exploration(self, dataset: DashAIDataset, explorer_info: Explorer): + import plotly.express as px + _df = dataset.to_pandas() cols = [col["columnName"] for col in explorer_info.columns] @@ -162,7 +161,7 @@ def save_notebook( __notebook_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: Figure, + result: Any, ) -> str: filename = f"{explorer_info.id}.pickle" path = pathlib.Path(os.path.join(save_path, filename)) @@ -175,6 +174,7 @@ def get_results( ) -> Dict[str, Any]: resultType = "plotly_json" config = {} + from plotly.io import read_json result = read_json(exploration_path) result = result.to_json() diff --git a/DashAI/back/exploration/explorers/wordcloud.py b/DashAI/back/exploration/explorers/wordcloud.py index 6b9bedbd7..939f17c3d 100644 --- a/DashAI/back/exploration/explorers/wordcloud.py +++ b/DashAI/back/exploration/explorers/wordcloud.py @@ -3,8 +3,6 @@ import pathlib from beartype.typing import Any, Dict -from PIL.Image import Image -from wordcloud import STOPWORDS, WordCloud from DashAI.back.core.schema_fields import ( int_field, @@ -84,6 +82,8 @@ def __init__(self, **kwargs) -> None: super().__init__(**kwargs) def launch_exploration(self, dataset: DashAIDataset, explorer_info: Explorer): + from wordcloud import STOPWORDS, WordCloud + _df = dataset.to_pandas() cols = [col["columnName"] for col in explorer_info.columns] @@ -107,7 +107,7 @@ def save_notebook( __exploration_info__: Notebook, explorer_info: Explorer, save_path: pathlib.Path, - result: Image, + result: Any, ) -> str: filename = f"{explorer_info.id}.png" path = pathlib.Path(os.path.join(save_path, filename)) diff --git a/DashAI/back/job/converter_job.py b/DashAI/back/job/converter_job.py index 0f8d2d7c7..4ec703705 100644 --- a/DashAI/back/job/converter_job.py +++ b/DashAI/back/job/converter_job.py @@ -1,5 +1,4 @@ import logging -from pathlib import Path from typing import List from kink import inject @@ -88,6 +87,7 @@ def _rebuild_dataset_with_transformed_columns( } ) + # Use existing modify_table (imported at module level) modified_dataset = modify_table(base, updated_arrays, types=updated_types) modified_dataset = modified_dataset.select_columns(new_columns_order) @@ -176,6 +176,8 @@ def run( ) -> None: from kink import di + # (Lazy imports removed to avoid duplicate and unused import warnings) + session_factory = di["session_factory"] component_registry = di["component_registry"] @@ -258,18 +260,6 @@ def instantiate_converters(
raise JobError(f"Cannot load dataset from {dataset_path}") from e try: - # Get the absolute path to the converters directory - current_file = Path(__file__) - project_root = ( - current_file.parent.parent.parent - ) # Go up three levels to reach project root - converters_base_path = project_root / "back" / "converters" - - if not converters_base_path.exists(): - raise JobError( - f"Converters directory not found at {converters_base_path}" - ) - # Get stored converter configurations converters_stored_info = { converter_list.converter: converter_list.parameters diff --git a/DashAI/back/job/dataset_job.py b/DashAI/back/job/dataset_job.py index c31101c1e..bb8ea519c 100644 --- a/DashAI/back/job/dataset_job.py +++ b/DashAI/back/job/dataset_job.py @@ -1,4 +1,3 @@ -import gc import json import logging import os @@ -6,7 +5,7 @@ import uuid from pathlib import Path -from kink import inject +from kink import di, inject from sqlalchemy import exc from sqlalchemy.orm import sessionmaker @@ -19,7 +18,6 @@ ) from DashAI.back.dependencies.database.models import Dataset, Notebook from DashAI.back.job.base_job import BaseJob, JobError -from DashAI.back.types.inf.type_inference import infer_types log = logging.getLogger(__name__) @@ -98,7 +96,9 @@ def get_job_name(self) -> str: def run( self, ) -> None: - from kink import di + import gc + + from DashAI.back.types.inf.type_inference import infer_types component_registry = di["component_registry"] session_factory = di["session_factory"] diff --git a/DashAI/back/job/explorer_job.py b/DashAI/back/job/explorer_job.py index ad655d409..1e42d6e79 100644 --- a/DashAI/back/job/explorer_job.py +++ b/DashAI/back/job/explorer_job.py @@ -8,10 +8,7 @@ from sqlalchemy.orm import sessionmaker from DashAI.back.dataloaders.classes.dashai_dataset import load_dataset -from DashAI.back.dependencies.database.models import ( - Explorer, - Notebook, -) +from DashAI.back.dependencies.database.models import Explorer, Notebook from DashAI.back.exploration.base_explorer import BaseExplorer from DashAI.back.job.base_job import BaseJob, JobError diff --git a/DashAI/back/job/generative_job.py b/DashAI/back/job/generative_job.py index 7e8fa4534..43d887a52 100644 --- a/DashAI/back/job/generative_job.py +++ b/DashAI/back/job/generative_job.py @@ -108,6 +108,7 @@ def run( component_registry = di["component_registry"] session_factory = di["session_factory"] config = di["config"] + # (Lazy imports removed to avoid duplicate and unused imports warnings) model = None generative_process = None with session_factory() as db: diff --git a/DashAI/back/job/predict_job.py b/DashAI/back/job/predict_job.py index 33612314f..b7014f35f 100644 --- a/DashAI/back/job/predict_job.py +++ b/DashAI/back/job/predict_job.py @@ -6,7 +6,7 @@ import numpy as np from fastapi import status from fastapi.exceptions import HTTPException -from kink import inject +from kink import di, inject from sqlalchemy import exc from sqlalchemy.orm import sessionmaker @@ -71,8 +71,6 @@ def get_job_name(self) -> str: dataset_id = self.kwargs.get("dataset_id") if prediction_id: - from kink import di - session_factory = di["session_factory"] try: @@ -90,8 +88,6 @@ def get_job_name(self) -> str: def run( self, ) -> List[Any]: - from kink import di - component_registry = di["component_registry"] session_factory = di["session_factory"] config = di["config"] diff --git a/DashAI/back/metrics/classification/accuracy.py b/DashAI/back/metrics/classification/accuracy.py index 54236b192..a5dbe4e7e 100644 --- 
a/DashAI/back/metrics/classification/accuracy.py +++ b/DashAI/back/metrics/classification/accuracy.py @@ -1,8 +1,5 @@ """DashAI accuracy classification metric implementation.""" -import numpy as np -from sklearn.metrics import accuracy_score - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.classification_metric import ( ClassificationMetric, @@ -19,7 +16,7 @@ class Accuracy(ClassificationMetric): ) @staticmethod - def score(true_labels: DashAIDataset, probs_pred_labels: np.ndarray) -> float: + def score(true_labels: DashAIDataset, probs_pred_labels) -> float: """Calculate the accuracy between true labels and predicted labels. Parameters @@ -36,5 +33,7 @@ def score(true_labels: DashAIDataset, probs_pred_labels: np.ndarray) -> float: float Accuracy score between true labels and predicted labels """ + from sklearn.metrics import accuracy_score + true_labels, pred_labels = prepare_to_metric(true_labels, probs_pred_labels) return accuracy_score(true_labels, pred_labels) diff --git a/DashAI/back/metrics/classification/cohen_kappa.py b/DashAI/back/metrics/classification/cohen_kappa.py index 9a4b8bb2f..2a3e4071d 100644 --- a/DashAI/back/metrics/classification/cohen_kappa.py +++ b/DashAI/back/metrics/classification/cohen_kappa.py @@ -1,8 +1,5 @@ """DashAI Cohen Kappa classification metric implementation.""" -import numpy as np -from sklearn.metrics import cohen_kappa_score - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.classification_metric import ( ClassificationMetric, @@ -19,9 +16,7 @@ class CohenKappa(ClassificationMetric): ) @staticmethod - def score( - true_labels: DashAIDataset, probs_pred_labels: np.ndarray, multiclass=None - ) -> float: + def score(true_labels: DashAIDataset, probs_pred_labels, multiclass=None) -> float: """Calculate Cohen Kappa score between true labels and predicted labels. Parameters @@ -41,6 +36,8 @@ def score( float Cohen Kappa score between true labels and predicted labels """ + from sklearn.metrics import cohen_kappa_score + true_labels, pred_labels = prepare_to_metric(true_labels, probs_pred_labels) return cohen_kappa_score(true_labels, pred_labels) diff --git a/DashAI/back/metrics/classification/f1.py b/DashAI/back/metrics/classification/f1.py index e60617038..3b56ee349 100644 --- a/DashAI/back/metrics/classification/f1.py +++ b/DashAI/back/metrics/classification/f1.py @@ -1,8 +1,5 @@ """DashAI F1 clasification metric implementation.""" -import numpy as np -from sklearn.metrics import f1_score - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.classification_metric import ( ClassificationMetric, @@ -19,9 +16,7 @@ class F1(ClassificationMetric): ) @staticmethod - def score( - true_labels: DashAIDataset, probs_pred_labels: np.ndarray, multiclass=None - ) -> float: + def score(true_labels: DashAIDataset, probs_pred_labels, multiclass=None) -> float: """Calculate f1 score between true labels and predicted labels. 
Parameters @@ -46,6 +41,8 @@ def score( if multiclass is None: multiclass = ClassificationMetric.is_multiclass(true_labels) + from sklearn.metrics import f1_score + if multiclass: return f1_score(true_labels, pred_labels, average="macro") else: diff --git a/DashAI/back/metrics/classification/hamming_distance.py b/DashAI/back/metrics/classification/hamming_distance.py index 76d938c7c..d71aa1c6e 100644 --- a/DashAI/back/metrics/classification/hamming_distance.py +++ b/DashAI/back/metrics/classification/hamming_distance.py @@ -1,8 +1,5 @@ """DashAI Hamming Distance implementation.""" -import numpy as np -from sklearn.metrics import hamming_loss - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.classification_metric import ( ClassificationMetric, @@ -21,9 +18,7 @@ class HammingDistance(ClassificationMetric): ) @staticmethod - def score( - true_labels: DashAIDataset, probs_pred_labels: np.ndarray, multiclass=None - ) -> float: + def score(true_labels: DashAIDataset, probs_pred_labels, multiclass=None) -> float: """Calculate Hamming Distance between true labels and predicted labels. Parameters @@ -43,5 +38,7 @@ def score( float Hamming Distance between true labels and predicted labels """ + from sklearn.metrics import hamming_loss + true_labels, pred_labels = prepare_to_metric(true_labels, probs_pred_labels) return hamming_loss(true_labels, pred_labels) diff --git a/DashAI/back/metrics/classification/log_loss.py b/DashAI/back/metrics/classification/log_loss.py index 49e216463..b77914570 100644 --- a/DashAI/back/metrics/classification/log_loss.py +++ b/DashAI/back/metrics/classification/log_loss.py @@ -1,8 +1,5 @@ """DashAI log loss implementation.""" -import numpy as np -from sklearn.metrics import log_loss - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.classification_metric import ( ClassificationMetric, @@ -23,9 +20,7 @@ class LogLoss(ClassificationMetric): MAXIMIZE: bool = False @staticmethod - def score( - true_labels: DashAIDataset, probs_pred_labels: np.ndarray, multiclass=None - ) -> float: + def score(true_labels: DashAIDataset, probs_pred_labels, multiclass=None) -> float: """Calculate Log Loss score between true labels and predicted labels. Parameters @@ -45,6 +40,8 @@ def score( float Log Loss score between true labels and predicted labels """ + from sklearn.metrics import log_loss + true_labels, _ = prepare_to_metric(true_labels, probs_pred_labels) return log_loss(true_labels, probs_pred_labels) diff --git a/DashAI/back/metrics/classification/precision.py b/DashAI/back/metrics/classification/precision.py index 8517375d5..20c5dc8cb 100644 --- a/DashAI/back/metrics/classification/precision.py +++ b/DashAI/back/metrics/classification/precision.py @@ -1,8 +1,5 @@ """DashAI precision classification metric implementation.""" -import numpy as np -from sklearn.metrics import precision_score - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.classification_metric import ( ClassificationMetric, @@ -19,9 +16,7 @@ class Precision(ClassificationMetric): ) @staticmethod - def score( - true_labels: DashAIDataset, probs_pred_labels: np.ndarray, multiclass=None - ) -> float: + def score(true_labels: DashAIDataset, probs_pred_labels, multiclass=None) -> float: """Calculate precision between true labels and predicted labels. 
Parameters @@ -47,6 +42,8 @@ def score( if multiclass is None: multiclass = ClassificationMetric.is_multiclass(true_labels) + from sklearn.metrics import precision_score + if multiclass: return precision_score(true_labels, pred_labels, average="macro") else: diff --git a/DashAI/back/metrics/classification/recall.py b/DashAI/back/metrics/classification/recall.py index 194c0a8a8..541c217cf 100644 --- a/DashAI/back/metrics/classification/recall.py +++ b/DashAI/back/metrics/classification/recall.py @@ -1,8 +1,5 @@ """DashAI recall classification metric implementation.""" -import numpy as np -from sklearn.metrics import recall_score - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.classification_metric import ( ClassificationMetric, @@ -19,9 +16,7 @@ class Recall(ClassificationMetric): ) @staticmethod - def score( - true_labels: DashAIDataset, probs_pred_labels: np.ndarray, multiclass=None - ) -> float: + def score(true_labels: DashAIDataset, probs_pred_labels, multiclass=None) -> float: """Calculate recall between true labels and predicted labels. Parameters @@ -47,6 +42,8 @@ def score( if multiclass is None: multiclass = ClassificationMetric.is_multiclass(true_labels) + from sklearn.metrics import recall_score + if multiclass: return recall_score(true_labels, pred_labels, average="macro") else: diff --git a/DashAI/back/metrics/classification/roc_auc.py b/DashAI/back/metrics/classification/roc_auc.py index 431265ee0..a87d35ab3 100644 --- a/DashAI/back/metrics/classification/roc_auc.py +++ b/DashAI/back/metrics/classification/roc_auc.py @@ -1,8 +1,5 @@ """DashAI RoC AUC classification metric implementation.""" -import numpy as np -from sklearn.metrics import roc_auc_score - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.classification_metric import ( ClassificationMetric, @@ -21,9 +18,7 @@ class ROCAUC(ClassificationMetric): ) @staticmethod - def score( - true_labels: DashAIDataset, probs_pred_labels: np.ndarray, multiclass=None - ) -> float: + def score(true_labels: DashAIDataset, probs_pred_labels, multiclass=None) -> float: """Calculate RoC AUC score between true labels and predicted labels. Parameters @@ -48,6 +43,8 @@ def score( if multiclass is None: multiclass = ClassificationMetric.is_multiclass(true_labels) + from sklearn.metrics import roc_auc_score + if multiclass: return roc_auc_score(true_labels, probs_pred_labels, multi_class="ovr") else: diff --git a/DashAI/back/metrics/regression/explained_variance.py b/DashAI/back/metrics/regression/explained_variance.py index cbe4dafac..1eca3c6fe 100644 --- a/DashAI/back/metrics/regression/explained_variance.py +++ b/DashAI/back/metrics/regression/explained_variance.py @@ -1,8 +1,5 @@ """DashAI Explained Variance regression metric implementation.""" -import numpy as np -from sklearn.metrics import explained_variance_score - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.regression_metric import RegressionMetric, prepare_to_metric @@ -18,7 +15,7 @@ class ExplainedVariance(RegressionMetric): ) @staticmethod - def score(true_values: DashAIDataset, predicted_values: np.ndarray) -> float: + def score(true_values: DashAIDataset, predicted_values) -> float: """Calculate the Explained Variance between true values and predicted values. 
Parameters @@ -34,5 +31,7 @@ def score(true_values: DashAIDataset, predicted_values: np.ndarray) -> float: float Explained Variance score between true values and predicted values """ + from sklearn.metrics import explained_variance_score + true_values, pred_values = prepare_to_metric(true_values, predicted_values) return explained_variance_score(true_values, pred_values) diff --git a/DashAI/back/metrics/regression/mae.py b/DashAI/back/metrics/regression/mae.py index 0cf1d643f..e18fd1b4a 100644 --- a/DashAI/back/metrics/regression/mae.py +++ b/DashAI/back/metrics/regression/mae.py @@ -1,8 +1,5 @@ """DashAI MAE regression metric implementation.""" -import numpy as np -from sklearn.metrics import mean_absolute_error - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.regression_metric import RegressionMetric, prepare_to_metric @@ -16,7 +13,7 @@ class MAE(RegressionMetric): ) @staticmethod - def score(true_values: DashAIDataset, pred_values: np.ndarray) -> float: + def score(true_values: DashAIDataset, pred_values) -> float: """Calculate the MAE between true values and predicted values. Parameters @@ -32,5 +29,7 @@ def score(true_values: DashAIDataset, pred_values: np.ndarray) -> float: float MAE score between true values and predicted values """ + from sklearn.metrics import mean_absolute_error + true_values, pred_values = prepare_to_metric(true_values, pred_values) return mean_absolute_error(true_values, pred_values) diff --git a/DashAI/back/metrics/regression/median_absolute_error.py b/DashAI/back/metrics/regression/median_absolute_error.py index 086e2218b..633a8f470 100644 --- a/DashAI/back/metrics/regression/median_absolute_error.py +++ b/DashAI/back/metrics/regression/median_absolute_error.py @@ -1,8 +1,5 @@ """Median Absolute Error metric for regression tasks.""" -import numpy as np -from sklearn.metrics import median_absolute_error - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.regression_metric import RegressionMetric, prepare_to_metric @@ -20,7 +17,7 @@ class MedianAbsoluteError(RegressionMetric): ) @staticmethod - def score(true_values: DashAIDataset, predicted_values: np.ndarray) -> float: + def score(true_values: DashAIDataset, predicted_values) -> float: """Calculate the Median Absolute Error between true values and predicted values. Parameters @@ -36,5 +33,7 @@ def score(true_values: DashAIDataset, predicted_values: np.ndarray) -> float: float Median Absolute Error score between true values and predicted values """ + from sklearn.metrics import median_absolute_error + true_values, pred_values = prepare_to_metric(true_values, predicted_values) return median_absolute_error(true_values, pred_values) diff --git a/DashAI/back/metrics/regression/mse.py b/DashAI/back/metrics/regression/mse.py index 7c3f1e34b..4569b536c 100644 --- a/DashAI/back/metrics/regression/mse.py +++ b/DashAI/back/metrics/regression/mse.py @@ -1,8 +1,5 @@ """DashAI MSE regression metric implementation.""" -import numpy as np -from sklearn.metrics import mean_squared_error - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.regression_metric import RegressionMetric, prepare_to_metric @@ -19,7 +16,7 @@ class MSE(RegressionMetric): ) @staticmethod - def score(true_values: DashAIDataset, predicted_values: np.ndarray) -> float: + def score(true_values: DashAIDataset, predicted_values) -> float: """Calculate the MSE between true values and predicted values. 
Parameters @@ -35,5 +32,7 @@ def score(true_values: DashAIDataset, predicted_values: np.ndarray) -> float: float MSE score between true values and predicted values """ + from sklearn.metrics import mean_squared_error + true_values, pred_values = prepare_to_metric(true_values, predicted_values) return mean_squared_error(true_values, pred_values) diff --git a/DashAI/back/metrics/regression/r2.py b/DashAI/back/metrics/regression/r2.py index d03c50480..de9573452 100644 --- a/DashAI/back/metrics/regression/r2.py +++ b/DashAI/back/metrics/regression/r2.py @@ -1,8 +1,5 @@ """DashAI R2 score implementation.""" -import numpy as np -from sklearn.metrics import r2_score - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.regression_metric import RegressionMetric, prepare_to_metric @@ -19,7 +16,7 @@ class R2(RegressionMetric): ) @staticmethod - def score(true_values: DashAIDataset, pred_values: np.ndarray) -> float: + def score(true_values: DashAIDataset, pred_values) -> float: """Calculate the R2 score between true values and predicted values. Parameters @@ -35,5 +32,7 @@ def score(true_values: DashAIDataset, pred_values: np.ndarray) -> float: float R2 score between true values and predicted values """ + from sklearn.metrics import r2_score + true_values, pred_values = prepare_to_metric(true_values, pred_values) return r2_score(true_values, pred_values) diff --git a/DashAI/back/metrics/regression/rmse.py b/DashAI/back/metrics/regression/rmse.py index 816926ce3..b61669de9 100644 --- a/DashAI/back/metrics/regression/rmse.py +++ b/DashAI/back/metrics/regression/rmse.py @@ -1,8 +1,5 @@ """DashAI RMSE regression metric implementation.""" -import numpy as np -from sklearn.metrics import root_mean_squared_error - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.regression_metric import RegressionMetric, prepare_to_metric @@ -16,7 +13,7 @@ class RMSE(RegressionMetric): ) @staticmethod - def score(true_values: DashAIDataset, predicted_values: np.ndarray) -> float: + def score(true_values: DashAIDataset, predicted_values) -> float: """Calculate the RMSE between true values and predicted values. Parameters @@ -32,5 +29,7 @@ def score(true_values: DashAIDataset, predicted_values: np.ndarray) -> float: float RMSE score between true values and predicted values """ + from sklearn.metrics import root_mean_squared_error + true_values, pred_values = prepare_to_metric(true_values, predicted_values) return root_mean_squared_error(true_values, pred_values) diff --git a/DashAI/back/metrics/translation/bleu.py b/DashAI/back/metrics/translation/bleu.py index b80b8c7ac..5e8bfddcf 100644 --- a/DashAI/back/metrics/translation/bleu.py +++ b/DashAI/back/metrics/translation/bleu.py @@ -1,7 +1,6 @@ """BLEU (bilingual evaluation understudy) metric implementation for DashAI.""" import evaluate -import numpy as np from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.base_metric import prepare_to_metric @@ -27,7 +26,7 @@ class Bleu(TranslationMetric): ) @staticmethod - def score(source_sentences: DashAIDataset, target_sentences: np.ndarray): + def score(source_sentences: DashAIDataset, target_sentences): """Calculate the BLEU score between source and target sentences. 
Parameters diff --git a/DashAI/back/metrics/translation/chrf.py b/DashAI/back/metrics/translation/chrf.py index 59bdd7f55..ad2a88e57 100644 --- a/DashAI/back/metrics/translation/chrf.py +++ b/DashAI/back/metrics/translation/chrf.py @@ -1,8 +1,5 @@ """DashAI CHRF metric implementation for translation tasks.""" -import numpy as np -from torchmetrics.text.chrf import CHRFScore - from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.translation_metric import TranslationMetric, prepare_to_metric @@ -30,7 +27,7 @@ class Chrf(TranslationMetric): ) @staticmethod - def score(source_sentences: DashAIDataset, target_sentences: np.ndarray): + def score(source_sentences: DashAIDataset, target_sentences): """Calculate the CHRF score between source and target sentences. Parameters @@ -45,6 +42,8 @@ def score(source_sentences: DashAIDataset, target_sentences: np.ndarray): float The calculated CHRF score ranging between 0 and 1. """ + from torchmetrics.text.chrf import CHRFScore + chrf_metric = CHRFScore() source_sentences, target_sentences = prepare_to_metric( source_sentences, target_sentences diff --git a/DashAI/back/metrics/translation/ter.py b/DashAI/back/metrics/translation/ter.py index f088b188f..a347376a1 100644 --- a/DashAI/back/metrics/translation/ter.py +++ b/DashAI/back/metrics/translation/ter.py @@ -1,7 +1,6 @@ """TER (Translation Edit Rate) metric implementation for DashAI.""" import evaluate -import numpy as np from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.metrics.base_metric import prepare_to_metric @@ -26,7 +25,7 @@ class Ter(TranslationMetric): ) @staticmethod - def score(source_sentences: DashAIDataset, target_sentences: np.ndarray): + def score(source_sentences: DashAIDataset, target_sentences): """Calculate the TER score between source and target sentences. 
Parameters diff --git a/DashAI/back/models/hugging_face/distilbert_transformer.py b/DashAI/back/models/hugging_face/distilbert_transformer.py index 1d5447b6c..e3f76e5e9 100644 --- a/DashAI/back/models/hugging_face/distilbert_transformer.py +++ b/DashAI/back/models/hugging_face/distilbert_transformer.py @@ -4,17 +4,7 @@ from pathlib import Path from typing import Any, Union -import torch from sklearn.exceptions import NotFittedError -from torch.utils.data import DataLoader -from transformers import ( - AutoConfig, - AutoModelForSequenceClassification, - AutoTokenizer, - DataCollatorWithPadding, - Trainer, - TrainingArguments, -) from DashAI.back.core.schema_fields import ( BaseSchema, @@ -215,6 +205,8 @@ def __init__(self, model=None, **kwargs): kwargs = self.validate_and_transform(kwargs) + from transformers import AutoTokenizer + self.model_name = "distilbert-base-uncased" self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) @@ -243,6 +235,8 @@ def __init__(self, model=None, **kwargs): if self.num_labels > 1: self.model.config.problem_type = "single_label_classification" else: + from transformers import AutoConfig, AutoModelForSequenceClassification + model_config = AutoConfig.from_pretrained(self.model_name) if self.num_labels is not None: model_config.num_labels = self.num_labels @@ -257,6 +251,15 @@ def __init__(self, model=None, **kwargs): self.encodings = {} # Store encodings for categorical columns def train(self, x_train, y_train, x_validation, y_validation): + import torch + from transformers import ( + AutoConfig, + AutoModelForSequenceClassification, + DataCollatorWithPadding, + Trainer, + TrainingArguments, + ) + output_column_name = y_train.column_names[0] if self.num_labels is None: @@ -354,6 +357,9 @@ def predict(self, x_pred: DashAIDataset): pred_dataset = self.prepare_dataset(x_pred) + from torch.utils.data import DataLoader + from transformers import DataCollatorWithPadding + data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) text_columns = [col for col in x_pred.column_names if col != "label"] if len(text_columns) != 1: @@ -439,6 +445,8 @@ def tokenize_data(self, dataset: DashAIDataset) -> DashAIDataset: ) def save(self, filename: Union[str, Path]) -> None: + from transformers import AutoConfig + self.model.save_pretrained(filename) config = AutoConfig.from_pretrained(filename) config.custom_params = { @@ -455,6 +463,8 @@ def save(self, filename: Union[str, Path]) -> None: @classmethod def load(cls, filename: Union[str, Path]) -> Any: + from transformers import AutoConfig, AutoModelForSequenceClassification + config = AutoConfig.from_pretrained(filename) custom_params = getattr(config, "custom_params", {}) diff --git a/DashAI/back/models/hugging_face/opus_mt_en_es_transformer.py b/DashAI/back/models/hugging_face/opus_mt_en_es_transformer.py index da47309a0..7765cfce5 100644 --- a/DashAI/back/models/hugging_face/opus_mt_en_es_transformer.py +++ b/DashAI/back/models/hugging_face/opus_mt_en_es_transformer.py @@ -5,13 +5,6 @@ from typing import List, Optional, Union from sklearn.exceptions import NotFittedError -from transformers import ( - AutoConfig, - AutoModelForSeq2SeqLM, - AutoTokenizer, - Seq2SeqTrainer, - Seq2SeqTrainingArguments, -) from DashAI.back.core.schema_fields import ( BaseSchema, @@ -197,6 +190,8 @@ def __init__(self, model=None, **kwargs): associated tokenizer. 
""" kwargs = self.validate_and_transform(kwargs) + from transformers import AutoTokenizer + self.model_name = "Helsinki-NLP/opus-mt-en-es" self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) if model is None: @@ -211,11 +206,12 @@ def __init__(self, model=None, **kwargs): self.log_validation_every_n_steps = kwargs.pop( "log_validation_every_n_steps", None ) - self.model = ( - model - if model is not None - else AutoModelForSeq2SeqLM.from_pretrained(self.model_name) - ) + if model is None: + from transformers import AutoModelForSeq2SeqLM + + self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name) + else: + self.model = model self.num_train_epochs = kwargs.get("num_train_epochs", 2) self.fitted = model is not None @@ -279,6 +275,8 @@ def train( dataset = self.tokenize_data(x_train, y_train) dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) + from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments + training_args = Seq2SeqTrainingArguments( output_dir="DashAI/back/user_models/temp_checkpoints_opus-mt-en-es", save_steps=1, @@ -378,6 +376,7 @@ def prepare_dataset( def save(self, filename: Union[str, Path]) -> None: self.model.save_pretrained(filename) + from transformers import AutoConfig config = AutoConfig.from_pretrained(filename) @@ -394,6 +393,8 @@ def save(self, filename: Union[str, Path]) -> None: @classmethod def load(cls, filename: Union[str, Path]): + from transformers import AutoConfig, AutoModelForSeq2SeqLM + model = AutoModelForSeq2SeqLM.from_pretrained(filename) config = AutoConfig.from_pretrained(filename) diff --git a/DashAI/back/models/hugging_face/qwen_model.py b/DashAI/back/models/hugging_face/qwen_model.py index 315dfc0a9..77e889b68 100644 --- a/DashAI/back/models/hugging_face/qwen_model.py +++ b/DashAI/back/models/hugging_face/qwen_model.py @@ -1,10 +1,5 @@ from typing import List -try: - from llama_cpp import Llama -except ImportError: - Llama = None - from DashAI.back.core.schema_fields import ( BaseSchema, enum_field, @@ -128,10 +123,12 @@ class QwenModel(TextToTextGenerationTaskModel): ) def __init__(self, **kwargs): - if Llama is None: + try: + from llama_cpp import Llama + except ImportError as e: raise RuntimeError( "llama-cpp-python is not installed. Please install it to use QwenModel." 
- ) + ) from e kwargs = self.validate_and_transform(kwargs) self.model_name = kwargs.get("model_name", "Qwen/Qwen2.5-1.5B-Instruct-GGUF") diff --git a/DashAI/back/models/hugging_face/stable_diffusion_v1_depth_controlnet.py b/DashAI/back/models/hugging_face/stable_diffusion_v1_depth_controlnet.py index 8d7abe2e3..4c5e02224 100644 --- a/DashAI/back/models/hugging_face/stable_diffusion_v1_depth_controlnet.py +++ b/DashAI/back/models/hugging_face/stable_diffusion_v1_depth_controlnet.py @@ -1,15 +1,5 @@ from typing import Any, List, Tuple -import numpy as np -import torch -from diffusers import ( - AutoencoderKL, - ControlNetModel, - StableDiffusionXLControlNetPipeline, -) -from PIL import Image -from transformers import DPTFeatureExtractor, DPTForDepthEstimation - from DashAI.back.core.schema_fields import ( enum_field, float_field, @@ -72,6 +62,11 @@ class StableDiffusionXLV1ControlNetSchema(BaseSchema): def get_depth_map(image, device): + import numpy as np + import torch + from PIL import Image + from transformers import DPTFeatureExtractor, DPTForDepthEstimation + depth_estimator = DPTForDepthEstimation.from_pretrained( "Intel/dpt-hybrid-midas" ).to(device) @@ -119,6 +114,13 @@ class StableDiffusionXLV1ControlNet(BaseControlNetModel): def __init__(self, **kwargs: Any): """Initialize the generative model.""" + import torch + from diffusers import ( + AutoencoderKL, + ControlNetModel, + StableDiffusionXLControlNetPipeline, + ) + kwargs = self.validate_and_transform(kwargs) use_gpu = DEVICE_TO_IDX.get(kwargs.get("device")) >= 0 self.device = ( @@ -151,12 +153,12 @@ def __init__(self, **kwargs: Any): self.pipe.enable_model_cpu_offload() - def generate(self, input: Tuple[Image.Image, str]) -> List[Any]: + def generate(self, input: Tuple[Any, str]) -> List[Any]: """Generate output from a generative model. 
Parameters ---------- - input : Tuple[Image.Image, str] + input : Tuple[Any, str] Input data to be generated Returns diff --git a/DashAI/back/models/hugging_face/stable_diffusion_v2_model.py b/DashAI/back/models/hugging_face/stable_diffusion_v2_model.py index fa0894624..c691fe93d 100644 --- a/DashAI/back/models/hugging_face/stable_diffusion_v2_model.py +++ b/DashAI/back/models/hugging_face/stable_diffusion_v2_model.py @@ -1,8 +1,5 @@ from typing import Any, List, Optional -import torch -from diffusers import DiffusionPipeline - from DashAI.back.core.schema_fields import ( enum_field, float_field, @@ -155,6 +152,9 @@ class StableDiffusionV2Model(TextToImageGenerationTaskModel): def __init__(self, **kwargs): """Initialize the model.""" + import torch + from diffusers import DiffusionPipeline + kwargs = self.validate_and_transform(kwargs) use_gpu = DEVICE_TO_IDX.get(kwargs.get("device")) >= 0 self.device = ( @@ -189,6 +189,8 @@ def generate(self, input: str) -> List[Any]: Generated output images in a list """ + import torch + generator = None if self.seed is not None and self.seed > 0: generator = torch.Generator(device=self.device).manual_seed(self.seed) diff --git a/DashAI/back/models/hugging_face/stable_diffusion_v3_model.py b/DashAI/back/models/hugging_face/stable_diffusion_v3_model.py index caa9535b2..d23406100 100644 --- a/DashAI/back/models/hugging_face/stable_diffusion_v3_model.py +++ b/DashAI/back/models/hugging_face/stable_diffusion_v3_model.py @@ -1,9 +1,5 @@ from typing import Any, List, Optional -import torch -from diffusers import DiffusionPipeline -from huggingface_hub import login - from DashAI.back.core.schema_fields import ( enum_field, float_field, @@ -166,6 +162,11 @@ class StableDiffusionV3Model(TextToImageGenerationTaskModel): def __init__(self, **kwargs): """Initialize the model.""" + + import torch + from diffusers import DiffusionPipeline + from huggingface_hub import login + kwargs = self.validate_and_transform(kwargs) use_gpu = DEVICE_TO_IDX.get(kwargs.get("device")) >= 0 self.device = ( @@ -218,6 +219,8 @@ def generate(self, input: str) -> List[Any]: Generated output images in a list """ + import torch + generator = None if self.seed is not None and self.seed > 0: generator = torch.Generator(device=self.device).manual_seed(self.seed) diff --git a/DashAI/back/models/scikit_learn/bow_text_classification_model.py b/DashAI/back/models/scikit_learn/bow_text_classification_model.py index 7b1ed03e9..54d9a826e 100644 --- a/DashAI/back/models/scikit_learn/bow_text_classification_model.py +++ b/DashAI/back/models/scikit_learn/bow_text_classification_model.py @@ -1,12 +1,6 @@ from pathlib import Path from typing import Optional, Union -import joblib -import numpy as np -import pyarrow as pa -from datasets import Dataset -from sklearn.feature_extraction.text import CountVectorizer - from DashAI.back.core.schema_fields import ( BaseSchema, component_field, @@ -14,10 +8,7 @@ schema_field, ) from DashAI.back.core.utils import MultilingualString -from DashAI.back.dataloaders.classes.dashai_dataset import ( - DashAIDataset, - to_dashai_dataset, -) +from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.models.text_classification_model import TextClassificationModel from DashAI.back.types.categorical import Categorical from DashAI.back.types.value_types import Float @@ -128,6 +119,9 @@ def __init__(self, **kwargs) -> None: - ngram_max_n: Maximum n-gram value. 
""" + # Lazy import of CountVectorizer + from sklearn.feature_extraction.text import CountVectorizer + self.classifier = kwargs["tabular_classifier"] self.vectorizer = CountVectorizer( ngram_range=(kwargs["ngram_min_n"], kwargs["ngram_max_n"]) @@ -161,6 +155,9 @@ def get_vectorizer(self, input_column: str, output_column: Optional[str] = None) """ def _vectorize(example) -> dict: + # Lazy import of numpy + import numpy as np + vectorized_sentence = self.vectorizer.transform( [example[input_column]] ).toarray() @@ -173,35 +170,47 @@ def _vectorize(example) -> dict: def train( self, - x: Dataset, - y: Dataset, - x_validation: Dataset = None, - y_validation: Dataset = None, + x, + y, + x_validation=None, + y_validation=None, ): input_column = x.column_names[0] self.vectorizer.fit(x[input_column]) tokenizer_func = self.get_vectorizer(input_column) tokenized_dataset = x.map(tokenizer_func, remove_columns=x.column_names) + # Lazy import of converter + from DashAI.back.dataloaders.classes.dashai_dataset import to_dashai_dataset + tokenized_dataset = to_dashai_dataset(tokenized_dataset) self.classifier.train(tokenized_dataset, y) - def predict(self, x: Dataset): + def predict(self, x): input_column = x.column_names[0] tokenizer_func = self.get_vectorizer(input_column) tokenized_dataset = x.map(tokenizer_func, remove_columns=x.column_names) + # Lazy import of converter + from DashAI.back.dataloaders.classes.dashai_dataset import to_dashai_dataset + tokenized_dataset = to_dashai_dataset(tokenized_dataset) return self.classifier.predict(tokenized_dataset) def save(self, filename: Union[str, Path]) -> None: """Save the model in the specified path.""" + # Lazy import of joblib + import joblib + joblib.dump(self, filename) @staticmethod def load(filename: Union[str, Path]) -> None: """Load the model of the specified path.""" + # Lazy import of joblib + import joblib + model = joblib.load(filename) return model @@ -238,6 +247,11 @@ def prepare_dataset(self, dataset: DashAIDataset, is_fit=False): tokenizer_func = self.get_vectorizer(input_column) dataset = dataset.map(tokenizer_func, remove_columns=input_column) + # Lazy import converters and pyarrow + import pyarrow as pa + + from DashAI.back.dataloaders.classes.dashai_dataset import to_dashai_dataset + dataset = to_dashai_dataset(dataset) dataset.types = { diff --git a/DashAI/back/models/scikit_learn/mlp_regression.py b/DashAI/back/models/scikit_learn/mlp_regression.py index 129088799..0a342cc58 100644 --- a/DashAI/back/models/scikit_learn/mlp_regression.py +++ b/DashAI/back/models/scikit_learn/mlp_regression.py @@ -1,6 +1,4 @@ import numpy as np -import torch -import torch.nn as nn from DashAI.back.core.enums.metrics import LevelEnum, SplitEnum from DashAI.back.core.schema_fields import ( @@ -176,25 +174,6 @@ class MLPRegressorSchema(BaseSchema): ) # type: ignore -class MLP(nn.Module): - def __init__(self, input_dim, hidden_size, activation_name): - super().__init__() - activations = { - "relu": nn.ReLU(), - "tanh": nn.Tanh(), - "sigmoid": nn.Sigmoid(), - "identity": nn.Identity(), - } - self.model = nn.Sequential( - nn.Linear(input_dim, hidden_size), - activations.get(activation_name, nn.ReLU()), - nn.Linear(hidden_size, 1), - ) - - def forward(self, x): - return self.model(x) - - class MLPRegression(RegressionModel): SCHEMA = MLPRegressorSchema DISPLAY_NAME: str = MultilingualString( @@ -209,6 +188,28 @@ class MLPRegression(RegressionModel): ICON: str = "Psychology" def __init__(self, **kwargs) -> None: + import torch.nn as nn + + class 
MLP(nn.Module): + def __init__(self, input_dim, hidden_size, activation_name): + super().__init__() + activations = { + "relu": nn.ReLU(), + "tanh": nn.Tanh(), + "sigmoid": nn.Sigmoid(), + "identity": nn.Identity(), + } + self.model = nn.Sequential( + nn.Linear(input_dim, hidden_size), + activations.get(activation_name, nn.ReLU()), + nn.Linear(hidden_size, 1), + ) + + def forward(self, x): + return self.model(x) + + self.mlp = MLP + self.params = kwargs self.device = ( f"cuda:{DEVICE_TO_IDX.get(kwargs.get('device'))}" @@ -224,6 +225,8 @@ def train( x_validation: DashAIDataset = None, y_validation: DashAIDataset = None, ) -> "MLPRegression": + import torch + # 1. Prepare Data x_values = self.prepare_dataset(x_train, is_fit=True).to_pandas().values y_values = self.prepare_output(y_train, is_fit=True).to_pandas().values @@ -234,7 +237,7 @@ def train( ) # 2. Init Model & Optimizer - self.model = MLP( + self.model = self.mlp( input_dim=X_tensor.shape[1], hidden_size=self.params.get("hidden_size", 100), activation_name=self.params.get("activation", "relu"), @@ -243,7 +246,7 @@ def train( optimizer = torch.optim.Adam( self.model.parameters(), lr=self.params.get("learning_rate", 0.001) ) - criterion = nn.MSELoss() + criterion = torch.nn.MSELoss() # 3. Training Loop using Epochs total_epochs = self.params.get("epochs", 3) @@ -332,6 +335,8 @@ def train( return self def predict(self, x: DashAIDataset) -> np.ndarray: + import torch + self.model.eval() x_proc = self.prepare_dataset(x, is_fit=False).to_pandas().values x_tensor = torch.tensor(x_proc, dtype=torch.float32).to(self.device) @@ -339,6 +344,8 @@ def predict(self, x: DashAIDataset) -> np.ndarray: return self.model(x_tensor).cpu().numpy().flatten() def save(self, filename: str) -> None: + import torch + torch.save( { "state": self.model.state_dict(), @@ -350,11 +357,13 @@ def save(self, filename: str) -> None: @staticmethod def load(filename: str) -> "MLPRegression": + import torch + data = torch.load(filename) instance = MLPRegression(**data["params"]) # Rebuild the model architecture using saved input_dim - instance.model = MLP( + instance.model = instance.mlp( input_dim=data["input_dim"], hidden_size=instance.params.get("hidden_size", 5), activation_name=instance.params.get("activation", "relu"), diff --git a/DashAI/back/models/scikit_learn/sklearn_like_model.py b/DashAI/back/models/scikit_learn/sklearn_like_model.py index 94b8e3f24..57231cd6b 100644 --- a/DashAI/back/models/scikit_learn/sklearn_like_model.py +++ b/DashAI/back/models/scikit_learn/sklearn_like_model.py @@ -1,8 +1,5 @@ from enum import Enum -from typing import List, Optional - -import joblib -from sklearn.preprocessing import OneHotEncoder +from typing import Any, List, Optional from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset from DashAI.back.dataloaders.classes.dashai_dataset_utils import ( @@ -41,17 +38,21 @@ def __init__(self, *args, **kwargs): """Initialize the SklearnLikeModel.""" super().__init__(*args, **kwargs) self.encodings = {} - self.one_hot_encoder: Optional[OneHotEncoder] = None + self.one_hot_encoder: Optional[Any] = None self.categorical_columns: List[str] = [] self.output_encodings = {} def save(self, filename: str) -> None: """Save the model in the specified path.""" + import joblib + joblib.dump(self, filename) @staticmethod def load(filename: str) -> None: """Load the model of the specified path.""" + import joblib + model = joblib.load(filename) return model diff --git a/DashAI/back/optimizers/base_optimizer.py 
b/DashAI/back/optimizers/base_optimizer.py index 8e5cefd53..7390c791b 100644 --- a/DashAI/back/optimizers/base_optimizer.py +++ b/DashAI/back/optimizers/base_optimizer.py @@ -4,12 +4,6 @@ from abc import ABCMeta, abstractmethod from typing import Final -import numpy as np -import optuna -import plotly -import plotly.graph_objects as go -from optuna.importance import FanovaImportanceEvaluator - from DashAI.back.config_object import ConfigObject log = logging.getLogger(__name__) @@ -95,6 +89,11 @@ def history_objective_plot(self, trials, goal_metric): ------- fig (json): json with the plot data """ + # Lazy imports + import numpy as np + import plotly + import plotly.graph_objects as go + x = list(range(1, len(trials) + 1)) y = [trial["value"] for trial in trials] cumulative = ( @@ -151,6 +150,10 @@ def slice_plot(self, trials, goal_metric): ------- fig (json): json with the plot data """ + # Lazy imports + import plotly + import plotly.graph_objects as go + param_names = list(trials[0]["params"].keys()) traces = [] @@ -219,6 +222,10 @@ def contour_plot(self, trials, goal_metric): ------- fig (json): json with the plot data """ + # Lazy imports + import plotly + import plotly.graph_objects as go + param_names = list(trials[0]["params"].keys()) traces = [] scatter_traces = [] @@ -313,6 +320,12 @@ def importance_plot(self, trials, goal_metric): ------- fig (json): json with the plot data """ + # Lazy imports + import optuna + import plotly + import plotly.graph_objects as go + from optuna.importance import FanovaImportanceEvaluator + distributions = {} for _, param, (low, high), dtype in self.parameters: if dtype == "integer": diff --git a/DashAI/back/types/utils.py b/DashAI/back/types/utils.py index 1457807a8..a48c0e840 100644 --- a/DashAI/back/types/utils.py +++ b/DashAI/back/types/utils.py @@ -1,14 +1,10 @@ import json import re -from typing import Any, Dict, Union - -import pandas as pd -import pyarrow as pa -from pyarrow.lib import Schema +from typing import Any, Dict from DashAI.back.types.categorical import Categorical from DashAI.back.types.dashai_data_type import DashAIDataType -from DashAI.back.types.value_types import ( # Boolean, +from DashAI.back.types.value_types import ( Binary, DashAIValue, Date, @@ -21,42 +17,48 @@ Timestamp, ) -dtype_arrow_map = { - "int8": pa.int8(), - "int16": pa.int16(), - "int32": pa.int32(), - "int64": pa.int64(), - "uint8": pa.uint8(), - "uint16": pa.uint16(), - "uint32": pa.uint32(), - "uint64": pa.uint64(), - "float16": pa.float16(), - "float32": pa.float32(), - "float64": pa.float64(), - "string": pa.string(), - "large_string": pa.large_string(), - "bool": pa.bool_(), - "time32(s)": pa.time32("s"), - "time32(ms)": pa.time32("ms"), - "time64(us)": pa.time64("us"), - "time64(ns)": pa.time64("ns"), - "timestamp(s)": pa.timestamp("s"), - "timestamp(ms)": pa.timestamp("ms"), - "timestamp(us)": pa.timestamp("us"), - "timestamp(ns)": pa.timestamp("ns"), - "duration(s)": pa.duration("s"), - "duration(ms)": pa.duration("ms"), - "duration(us)": pa.duration("us"), - "duration(ns)": pa.duration("ns"), - "date32": pa.date32(), - "date64": pa.date64(), - "decimal128(8, 0)": pa.decimal128(8, 0), - "decimal128(16, 0)": pa.decimal128(16, 0), - "decimal256(38, 0)": pa.decimal256(38, 0), - "decimal256(38, 10)": pa.decimal256(38, 10), - "binary": pa.binary(), - "large_binary": pa.large_binary(), -} + +def _get_dtype_arrow_map() -> Dict[str, Any]: + """Create dtype to pyarrow DataType mapping lazily.""" + import pyarrow as pa # local import + + return { + "int8": 
pa.int8(), + "int16": pa.int16(), + "int32": pa.int32(), + "int64": pa.int64(), + "uint8": pa.uint8(), + "uint16": pa.uint16(), + "uint32": pa.uint32(), + "uint64": pa.uint64(), + "float16": pa.float16(), + "float32": pa.float32(), + "float64": pa.float64(), + "string": pa.string(), + "large_string": pa.large_string(), + "bool": pa.bool_(), + "time32(s)": pa.time32("s"), + "time32(ms)": pa.time32("ms"), + "time64(us)": pa.time64("us"), + "time64(ns)": pa.time64("ns"), + "timestamp(s)": pa.timestamp("s"), + "timestamp(ms)": pa.timestamp("ms"), + "timestamp(us)": pa.timestamp("us"), + "timestamp(ns)": pa.timestamp("ns"), + "duration(s)": pa.duration("s"), + "duration(ms)": pa.duration("ms"), + "duration(us)": pa.duration("us"), + "duration(ns)": pa.duration("ns"), + "date32": pa.date32(), + "date64": pa.date64(), + "decimal128(8, 0)": pa.decimal128(8, 0), + "decimal128(16, 0)": pa.decimal128(16, 0), + "decimal256(38, 0)": pa.decimal256(38, 0), + "decimal256(38, 10)": pa.decimal256(38, 10), + "binary": pa.binary(), + "large_binary": pa.large_binary(), + } + PTYPE_TO_DASHAI = { "integer": {"type": "Integer", "dtype": "int64"}, @@ -90,6 +92,8 @@ def arrow_to_dashai_types(arrow_type, format: str = None) -> DashAIValue: """Convert an Arrow type to a DashAI value.""" + import pyarrow as pa # local import + if format is not None: if arrow_type == "Date": return Date(arrow_type=pa.string(), format=format) @@ -139,14 +143,12 @@ def arrow_to_dashai_schema(arrow_tbl): return schema -def to_arrow_types(dashai_type) -> pa.DataType: - """Convert a DashAI type to an Arrow type.""" - return dtype_arrow_map.get(dashai_type) +def to_arrow_types(dashai_type) -> Any: + """Convert a DashAI type to an Arrow type lazily.""" + return _get_dtype_arrow_map().get(dashai_type) -def save_types_in_arrow_metadata( - pa_table: pa.Table, datatypes: Dict[str, Dict] -) -> pa.Table: +def save_types_in_arrow_metadata(pa_table: Any, datatypes: Dict[str, Dict]) -> Any: """ Save DashAI types in Arrow metadata. This doesn't modify the Arrow schema, but adds metadata to the table. @@ -177,7 +179,7 @@ def save_types_in_arrow_metadata( def get_types_from_arrow_metadata( - pa_table: Union[pa.Table, Schema], + pa_table: Any, ) -> Dict[str, DashAIDataType]: """ Get DashAI types from Arrow metadata. @@ -197,6 +199,7 @@ def get_types_from_arrow_metadata( ValueError If the metadata does not contain DashAI types. """ + from pyarrow.lib import Schema if isinstance(pa_table, Schema): metadata = pa_table.metadata or {} @@ -220,7 +223,8 @@ def get_types_from_arrow_metadata( # Future implementation for images, modify as needed else: dtype = info.get("dtype") - dashai_types[column] = arrow_to_dashai_types(dtype_arrow_map[dtype]) + dtype_map = _get_dtype_arrow_map() + dashai_types[column] = arrow_to_dashai_types(dtype_map[dtype]) except KeyError as e: # If the key is not found, we can log it or handle it as needed print(f"KeyError: dtype {e} not found in dtype_arrow_map") @@ -232,7 +236,7 @@ def get_types_from_arrow_metadata( # Both Date and Time conversion functions are in the case # if DashAI decides to use pyarrow dates and times instead of strings. # Both should be modified accordingly to function properly. -def pyarrow_date_conversion(column: pa.Array, format: str = "%Y-%m-%d") -> pa.Array: +def pyarrow_date_conversion(column: Any, format: str = "%Y-%m-%d") -> Any: """ Convert a PyArrow array of date strings to a PyArrow date32 array. 
@@ -248,6 +252,9 @@ def pyarrow_date_conversion(column: pa.Array, format: str = "%Y-%m-%d") -> pa.Ar
         A PyArrow array of date32 values.
     """
+    import pandas as pd  # local import
+    import pyarrow as pa  # local import
+
     str_dates = column.to_pylist()

     try:
@@ -261,7 +268,7 @@ def pyarrow_date_conversion(column: pa.Array, format: str = "%Y-%m-%d") -> pa.Ar

     return pa.array(parsed_dates, type=pa.date32())


-def pyarrow_time_conversion(column: pa.Array, format: str = "%H:%M:%S") -> pa.Array:
+def pyarrow_time_conversion(column: Any, format: str = "%H:%M:%S") -> Any:
     """
     Convert a PyArrow array of time strings to a PyArrow time64 array.
@@ -278,6 +285,9 @@ def pyarrow_time_conversion(column: pa.Array, format: str = "%H:%M:%S") -> pa.Ar
         A PyArrow array of time32 values.
     """
+    import pandas as pd  # local import
+    import pyarrow as pa  # local import
+
     str_times = column.to_pylist()

     try:
@@ -328,10 +338,12 @@ def is_image_path(value: Any) -> bool:
 # This function should be improved to detect complex situations
 # Like "1.234,56" or "1,234.56"
 # So it doesn't overwrite already good floats
-def comma_float_to_float(array: pa.Array) -> pa.Array:
+def comma_float_to_float(array: Any) -> Any:
     """Convert a PyArrow array of float strings with commas to a PyArrow float64 array."""  # noqa: E501
     # Remove commas and convert to float
     try:
+        import pyarrow as pa  # local import
+
         if pa.types.is_floating(array.type):
             return array
         else:
diff --git a/DashAI/back/types/value_types.py b/DashAI/back/types/value_types.py
index 44e5901ed..8cfad9235 100644
--- a/DashAI/back/types/value_types.py
+++ b/DashAI/back/types/value_types.py
@@ -1,8 +1,6 @@
 from dataclasses import dataclass
 from typing import Optional

-import pyarrow as pa
-
 from DashAI.back.types.dashai_value import DashAIValue

@@ -23,7 +21,9 @@ class Integer(DashAIValue):
     unsigned: bool = False
     dtype: str = "int64"

-    def __init__(self, arrow_type: pa.DataType):
+    def __init__(self, arrow_type: object):
+        import pyarrow as pa  # local import
+
         if not pa.types.is_integer(arrow_type):
             raise ValueError(f"Arrow type {arrow_type} is not an integer type.")
         if pa.types.is_unsigned_integer(arrow_type):
@@ -51,7 +51,9 @@ class Float(DashAIValue):
     size: int = 64
     dtype: str = "float64"

-    def __init__(self, arrow_type: pa.DataType):
+    def __init__(self, arrow_type: object):
+        import pyarrow as pa  # local import
+
         if not pa.types.is_floating(arrow_type):
             raise ValueError(f"Arrow type {arrow_type} is not a float type.")
         if pa.types.is_float16(arrow_type):
@@ -85,7 +87,9 @@ class Text(DashAIValue):
     large: bool = False
     dtype: str = "string"

-    def __init__(self, arrow_type: pa.DataType):
+    def __init__(self, arrow_type: object):
+        import pyarrow as pa  # local import
+
         if not (pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type)):
             raise ValueError(f"Arrow type {arrow_type} is not a string type.")
         self.dtype = str(arrow_type)
@@ -114,7 +118,7 @@ class Time(DashAIValue):
     format: str = "HH:mm:ss"
     dtype: str = "string"

-    def __init__(self, arrow_type: pa.DataType, format: Optional[str]):
+    def __init__(self, arrow_type: object, format: Optional[str]):
         self.format = format if format else "HH:mm:ss"
         self.dtype = str(arrow_type)
@@ -137,7 +141,7 @@ class Timestamp(DashAIValue):
     format: str = "YYYY-MM-DD HH:mm:ss"
     dtype: str = "string"

-    def __init__(self, arrow_type: pa.DataType, format: Optional[str] = None):
+    def __init__(self, arrow_type: object, format: Optional[str] = None):
         self.format = format if format else "YYYY-MM-DD HH:mm:ss"
         self.dtype = str(arrow_type)
@@ -158,7 +162,9 @@ class Duration(DashAIValue):
     unit: str = "ms"
     dtype: str = "duration(ms)"

-    def __init__(self, arrow_type: pa.DataType):
+    def __init__(self, arrow_type: object):
+        import pyarrow as pa  # local import
+
         if not pa.types.is_duration(arrow_type):
             raise ValueError(f"Arrow type {arrow_type} is not a duration type.")
         self.dtype = str(arrow_type)
@@ -189,7 +195,9 @@ class Decimal(DashAIValue):
     scale: int = 0
     dtype: str = "decimal128(8, 0)"

-    def __init__(self, arrow_type: pa.DataType):
+    def __init__(self, arrow_type: object):
+        import pyarrow as pa  # local import
+
         if not pa.types.is_decimal(arrow_type):
             raise ValueError(f"Arrow type {arrow_type} is not a decimal type.")
         self.dtype = str(arrow_type)
@@ -224,7 +232,7 @@ class Date(DashAIValue):
     format: str = "YYYY-MM-DD"
     dtype: str = "string"

-    def __init__(self, arrow_type: pa.DataType, format: Optional[str]):
+    def __init__(self, arrow_type: object, format: Optional[str]):
         self.format = format if format else "YYYY-MM-DD"
         self.dtype = str(arrow_type)
@@ -245,7 +253,9 @@ class Binary(DashAIValue):

     dtype: str = "binary"

-    def __init__(self, arrow_type: pa.DataType):
+    def __init__(self, arrow_type: object):
+        import pyarrow as pa  # local import
+
         if not (pa.types.is_binary(arrow_type) or pa.types.is_large_binary(arrow_type)):
             raise ValueError(f"Arrow type {arrow_type} is not a binary type.")
         self.dtype = str(arrow_type)