From e19a8ffa98f806cf19a3304dfc66dec7098a5c04 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 14:21:36 +0200 Subject: [PATCH 01/16] Added TODOs to the new file structure for stacking (clean) --- autoPyTorch/ensemble/ensemble_builder.py | 338 ---------- .../ensemble/ensemble_builder_manager.py | 358 ++++++++++ autoPyTorch/ensemble/stacking_ensemble.py | 292 +++++++++ .../ensemble/stacking_ensemble_builder.py | 619 ++++++++++++++++++ autoPyTorch/evaluation/stacking_evaluator.py | 359 ++++++++++ autoPyTorch/optimizer/smbo.py | 2 +- test/test_ensemble/ensemble_utils.py | 2 +- test/test_ensemble/test_ensemble.py | 2 +- 8 files changed, 1631 insertions(+), 341 deletions(-) create mode 100644 autoPyTorch/ensemble/ensemble_builder_manager.py create mode 100644 autoPyTorch/ensemble/stacking_ensemble.py create mode 100644 autoPyTorch/ensemble/stacking_ensemble_builder.py create mode 100644 autoPyTorch/evaluation/stacking_evaluator.py diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index 35a281235..311069e50 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -1,4 +1,3 @@ -# -*- encoding: utf-8 -*- import glob import gzip import logging @@ -15,8 +14,6 @@ import zlib from typing import Dict, List, Optional, Set, Tuple, Union -import dask.distributed - import numpy as np import pandas as pd @@ -25,9 +22,6 @@ from sklearn.utils.validation import check_random_state -from smac.callbacks import IncorporateRunResultCallback -from smac.optimizer.smbo import SMBO -from smac.runhistory.runhistory import RunInfo, RunValue from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import BINARY @@ -44,338 +38,6 @@ MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' -class EnsembleBuilderManager(IncorporateRunResultCallback): - def __init__( - self, - start_time: float, - time_left_for_ensembles: float, - backend: Backend, - dataset_name: str, - task_type: int, - output_type: int, - metrics: List[autoPyTorchMetric], - opt_metric: str, - ensemble_size: int, - ensemble_nbest: int, - max_models_on_disc: Union[float, int], - seed: int, - precision: int, - max_iterations: Optional[int], - read_at_most: int, - ensemble_memory_limit: Optional[int], - random_state: int, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - pynisher_context: str = 'fork', - ): - """ SMAC callback to handle ensemble building - Args: - start_time: int - the time when this job was started, to account for any latency in job allocation - time_left_for_ensemble: int - How much time is left for the task. Job should finish within this allocated time - backend: util.backend.Backend - backend to write and read files - dataset_name: str - name of dataset - task_type: int - what type of output is expected. If Binary, we need to argmax the one hot encoding. - metrics: List[autoPyTorchMetric], - A set of metrics that will be used to get performance estimates - opt_metric: str - name of the optimization metrics - ensemble_size: int - maximal size of ensemble (passed to ensemble_selection) - ensemble_nbest: int/float - if int: consider only the n best prediction - if float: consider only this fraction of the best models - Both wrt to validation predictions - If performance_range_threshold > 0, might return less models - max_models_on_disc: Union[float, int] - Defines the maximum number of models that are kept in the disc. 
- If int, it must be greater or equal than 1, and dictates the max number of - models to keep. - If float, it will be interpreted as the max megabytes allowed of disc space. That - is, if the number of ensemble candidates require more disc space than this float - value, the worst models will be deleted to keep within this budget. - Models and predictions of the worst-performing models will be deleted then. - If None, the feature is disabled. - It defines an upper bound on the models that can be used in the ensemble. - seed: int - random seed - max_iterations: int - maximal number of iterations to run this script - (default None --> deactivated) - precision (int): [16,32,64,128] - precision of floats to read the predictions - memory_limit: Optional[int] - memory limit in mb. If ``None``, no memory limit is enforced. - read_at_most: int - read at most n new prediction files in each iteration - logger_port: int - port in where to publish a msg - pynisher_context: str - The multiprocessing context for pynisher. One of spawn/fork/forkserver. - - Returns: - List[Tuple[int, float, float, float]]: - A list with the performance history of this ensemble, of the form - [[pandas_timestamp, train_performance, val_performance, test_performance], ...] - """ - self.start_time = start_time - self.time_left_for_ensembles = time_left_for_ensembles - self.backend = backend - self.dataset_name = dataset_name - self.task_type = task_type - self.output_type = output_type - self.metrics = metrics - self.opt_metric = opt_metric - self.ensemble_size = ensemble_size - self.ensemble_nbest = ensemble_nbest - self.max_models_on_disc: Union[float, int] = max_models_on_disc - self.seed = seed - self.precision = precision - self.max_iterations = max_iterations - self.read_at_most = read_at_most - self.ensemble_memory_limit = ensemble_memory_limit - self.random_state = random_state - self.logger_port = logger_port - self.pynisher_context = pynisher_context - - # Store something similar to SMAC's runhistory - self.history: List[Dict[str, float]] = [] - - # We only submit new ensembles when there is not an active ensemble job - self.futures: List[dask.Future] = [] - - # The last criteria is the number of iterations - self.iteration = 0 - - # Keep track of when we started to know when we need to finish! 
- self.start_time = time.time() - - def __call__( - self, - smbo: 'SMBO', - run_info: RunInfo, - result: RunValue, - time_left: float, - ) -> None: - self.build_ensemble(smbo.tae_runner.client) - - def build_ensemble( - self, - dask_client: dask.distributed.Client, - unit_test: bool = False - ) -> None: - - # The second criteria is elapsed time - elapsed_time = time.time() - self.start_time - - logger = get_named_client_logger( - name='EnsembleBuilder', - port=self.logger_port, - ) - - # First test for termination conditions - if self.time_left_for_ensembles < elapsed_time: - logger.info( - "Terminate ensemble building as not time is left (run for {}s)".format( - elapsed_time - ), - ) - return - if self.max_iterations is not None and self.max_iterations <= self.iteration: - logger.info( - "Terminate ensemble building because of max iterations: {} of {}".format( - self.max_iterations, - self.iteration - ) - ) - return - - if len(self.futures) != 0: - if self.futures[0].done(): - result = self.futures.pop().result() - if result: - ensemble_history, self.ensemble_nbest, _, _ = result - logger.debug("iteration={} @ elapsed_time={} has history={}".format( - self.iteration, - elapsed_time, - ensemble_history, - )) - self.history.extend(ensemble_history) - - # Only submit new jobs if the previous ensemble job finished - if len(self.futures) == 0: - - # Add the result of the run - # On the next while iteration, no references to - # ensemble builder object, so it should be garbage collected to - # save memory while waiting for resources - # Also, notice how ensemble nbest is returned, so we don't waste - # iterations testing if the deterministic predictions size can - # be fitted in memory - try: - # Submit a Dask job from this job, to properly - # see it in the dask diagnostic dashboard - # Notice that the forked ensemble_builder_process will - # wait for the below function to be done - self.futures.append(dask_client.submit( - fit_and_return_ensemble, - backend=self.backend, - dataset_name=self.dataset_name, - task_type=self.task_type, - output_type=self.output_type, - metrics=self.metrics, - opt_metric=self.opt_metric, - ensemble_size=self.ensemble_size, - ensemble_nbest=self.ensemble_nbest, - max_models_on_disc=self.max_models_on_disc, - seed=self.seed, - precision=self.precision, - memory_limit=self.ensemble_memory_limit, - read_at_most=self.read_at_most, - random_state=self.seed, - end_at=self.start_time + self.time_left_for_ensembles, - iteration=self.iteration, - return_predictions=False, - priority=100, - pynisher_context=self.pynisher_context, - logger_port=self.logger_port, - unit_test=unit_test, - )) - - logger.info( - "{}/{} Started Ensemble builder job at {} for iteration {}.".format( - # Log the client to make sure we - # remain connected to the scheduler - self.futures[0], - dask_client, - time.strftime("%Y.%m.%d-%H.%M.%S"), - self.iteration, - ), - ) - self.iteration += 1 - except Exception as e: - exception_traceback = traceback.format_exc() - error_message = repr(e) - logger.critical(exception_traceback) - logger.critical(error_message) - - -def fit_and_return_ensemble( - backend: Backend, - dataset_name: str, - task_type: int, - output_type: int, - metrics: List[autoPyTorchMetric], - opt_metric: str, - ensemble_size: int, - ensemble_nbest: int, - max_models_on_disc: Union[float, int], - seed: int, - precision: int, - memory_limit: Optional[int], - read_at_most: int, - random_state: int, - end_at: float, - iteration: int, - return_predictions: bool, - pynisher_context: str, - 
logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - unit_test: bool = False, -) -> Tuple[ - List[Dict[str, float]], - int, - Optional[np.ndarray], - Optional[np.ndarray], -]: - """ - A short function to fit and create an ensemble. It is just a wrapper to easily send - a request to dask to create an ensemble and clean the memory when finished - Parameters - ---------- - backend: util.backend.Backend - backend to write and read files - dataset_name: str - name of dataset - metrics: List[autoPyTorchMetric], - A set of metrics that will be used to get performance estimates - opt_metric: - Name of the metric to optimize - task_type: int - type of output expected in the ground truth - ensemble_size: int - maximal size of ensemble (passed to ensemble.ensemble_selection) - ensemble_nbest: int/float - if int: consider only the n best prediction - if float: consider only this fraction of the best models - Both wrt to validation predictions - If performance_range_threshold > 0, might return less models - max_models_on_disc: int - Defines the maximum number of models that are kept in the disc. - If int, it must be greater or equal than 1, and dictates the max number of - models to keep. - If float, it will be interpreted as the max megabytes allowed of disc space. That - is, if the number of ensemble candidates require more disc space than this float - value, the worst models will be deleted to keep within this budget. - Models and predictions of the worst-performing models will be deleted then. - If None, the feature is disabled. - It defines an upper bound on the models that can be used in the ensemble. - seed: int - random seed - precision (int): [16,32,64,128] - precision of floats to read the predictions - memory_limit: Optional[int] - memory limit in mb. If ``None``, no memory limit is enforced. - read_at_most: int - read at most n new prediction files in each iteration - end_at: float - At what time the job must finish. Needs to be the endtime and not the time left - because we do not know when dask schedules the job. - iteration: int - The current iteration - pynisher_context: str - Context to use for multiprocessing, can be either fork, spawn or forkserver. - logger_port: int - The port where the logging server is listening to. - unit_test: bool - Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError. - Having this is very bad coding style, but I did not find a way to make - unittest.mock work through the pynisher with all spawn contexts. If you know a - better solution, please let us know by opening an issue. - Returns - ------- - List[Tuple[int, float, float, float]] - A list with the performance history of this ensemble, of the form - [[pandas_timestamp, train_performance, val_performance, test_performance], ...] 
- """ - result = EnsembleBuilder( - backend=backend, - dataset_name=dataset_name, - task_type=task_type, - output_type=output_type, - metrics=metrics, - opt_metric=opt_metric, - ensemble_size=ensemble_size, - ensemble_nbest=ensemble_nbest, - max_models_on_disc=max_models_on_disc, - seed=seed, - precision=precision, - memory_limit=memory_limit, - read_at_most=read_at_most, - random_state=random_state, - logger_port=logger_port, - unit_test=unit_test, - ).run( - end_at=end_at, - iteration=iteration, - return_predictions=return_predictions, - pynisher_context=pynisher_context, - ) - return result - - class EnsembleBuilder(object): def __init__( self, diff --git a/autoPyTorch/ensemble/ensemble_builder_manager.py b/autoPyTorch/ensemble/ensemble_builder_manager.py new file mode 100644 index 000000000..845992064 --- /dev/null +++ b/autoPyTorch/ensemble/ensemble_builder_manager.py @@ -0,0 +1,358 @@ +# -*- encoding: utf-8 -*- +import logging +import logging.handlers +import time +import traceback +from typing import Dict, List, Optional, Tuple, Union + +import dask.distributed + +import numpy as np + +import pandas as pd + + + +from smac.callbacks import IncorporateRunResultCallback +from smac.optimizer.smbo import SMBO +from smac.runhistory.runhistory import RunInfo, RunValue + +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants import BINARY +from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.utils.logging_ import get_named_client_logger + + +class EnsembleBuilderManager(IncorporateRunResultCallback): + def __init__( + self, + start_time: float, + time_left_for_ensembles: float, + backend: Backend, + dataset_name: str, + task_type: int, + output_type: int, + metrics: List[autoPyTorchMetric], + opt_metric: str, + ensemble_size: int, + ensemble_nbest: int, + max_models_on_disc: Union[float, int], + seed: int, + precision: int, + max_iterations: Optional[int], + read_at_most: int, + ensemble_memory_limit: Optional[int], + random_state: int, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + pynisher_context: str = 'fork', + ): + """ SMAC callback to handle ensemble building + Args: + start_time: int + the time when this job was started, to account for any latency in job allocation + time_left_for_ensemble: int + How much time is left for the task. Job should finish within this allocated time + backend: util.backend.Backend + backend to write and read files + dataset_name: str + name of dataset + task_type: int + what type of output is expected. If Binary, we need to argmax the one hot encoding. + metrics: List[autoPyTorchMetric], + A set of metrics that will be used to get performance estimates + opt_metric: str + name of the optimization metrics + ensemble_size: int + maximal size of ensemble (passed to ensemble_selection) + ensemble_nbest: int/float + if int: consider only the n best prediction + if float: consider only this fraction of the best models + Both wrt to validation predictions + If performance_range_threshold > 0, might return less models + max_models_on_disc: Union[float, int] + Defines the maximum number of models that are kept in the disc. + If int, it must be greater or equal than 1, and dictates the max number of + models to keep. + If float, it will be interpreted as the max megabytes allowed of disc space. 
That + is, if the number of ensemble candidates require more disc space than this float + value, the worst models will be deleted to keep within this budget. + Models and predictions of the worst-performing models will be deleted then. + If None, the feature is disabled. + It defines an upper bound on the models that can be used in the ensemble. + seed: int + random seed + max_iterations: int + maximal number of iterations to run this script + (default None --> deactivated) + precision (int): [16,32,64,128] + precision of floats to read the predictions + memory_limit: Optional[int] + memory limit in mb. If ``None``, no memory limit is enforced. + read_at_most: int + read at most n new prediction files in each iteration + logger_port: int + port in where to publish a msg + pynisher_context: str + The multiprocessing context for pynisher. One of spawn/fork/forkserver. + + Returns: + List[Tuple[int, float, float, float]]: + A list with the performance history of this ensemble, of the form + [[pandas_timestamp, train_performance, val_performance, test_performance], ...] + """ + self.start_time = start_time + self.time_left_for_ensembles = time_left_for_ensembles + self.backend = backend + self.dataset_name = dataset_name + self.task_type = task_type + self.output_type = output_type + self.metrics = metrics + self.opt_metric = opt_metric + self.ensemble_size = ensemble_size + self.ensemble_nbest = ensemble_nbest + self.max_models_on_disc: Union[float, int] = max_models_on_disc + self.seed = seed + self.precision = precision + self.max_iterations = max_iterations + self.read_at_most = read_at_most + self.ensemble_memory_limit = ensemble_memory_limit + self.random_state = random_state + self.logger_port = logger_port + self.pynisher_context = pynisher_context + + # Store something similar to SMAC's runhistory + self.history: List[Dict[str, float]] = [] + + # We only submit new ensembles when there is not an active ensemble job + self.futures: List[dask.Future] = [] + + # The last criteria is the number of iterations + self.iteration = 0 + + # Keep track of when we started to know when we need to finish! 
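+        # (Note: the line below re-reads the clock, so it effectively
+        # overwrites the ``start_time`` value stored from the constructor
+        # argument above.)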
+ self.start_time = time.time() + + def __call__( + self, + smbo: 'SMBO', + run_info: RunInfo, + result: RunValue, + time_left: float, + ) -> None: + self.build_ensemble(smbo.tae_runner.client) + + def build_ensemble( + self, + dask_client: dask.distributed.Client, + unit_test: bool = False + ) -> None: + + # The second criteria is elapsed time + elapsed_time = time.time() - self.start_time + + logger = get_named_client_logger( + name='EnsembleBuilder', + port=self.logger_port, + ) + + # First test for termination conditions + if self.time_left_for_ensembles < elapsed_time: + logger.info( + "Terminate ensemble building as not time is left (run for {}s)".format( + elapsed_time + ), + ) + return + if self.max_iterations is not None and self.max_iterations <= self.iteration: + logger.info( + "Terminate ensemble building because of max iterations: {} of {}".format( + self.max_iterations, + self.iteration + ) + ) + return + + if len(self.futures) != 0: + if self.futures[0].done(): + result = self.futures.pop().result() + if result: + ensemble_history, self.ensemble_nbest, _, _ = result + logger.debug("iteration={} @ elapsed_time={} has history={}".format( + self.iteration, + elapsed_time, + ensemble_history, + )) + self.history.extend(ensemble_history) + + # Only submit new jobs if the previous ensemble job finished + if len(self.futures) == 0: + + # Add the result of the run + # On the next while iteration, no references to + # ensemble builder object, so it should be garbage collected to + # save memory while waiting for resources + # Also, notice how ensemble nbest is returned, so we don't waste + # iterations testing if the deterministic predictions size can + # be fitted in memory + try: + # Submit a Dask job from this job, to properly + # see it in the dask diagnostic dashboard + # Notice that the forked ensemble_builder_process will + # wait for the below function to be done + self.futures.append(dask_client.submit( + fit_and_return_ensemble, + backend=self.backend, + dataset_name=self.dataset_name, + task_type=self.task_type, + output_type=self.output_type, + metrics=self.metrics, + opt_metric=self.opt_metric, + ensemble_size=self.ensemble_size, + ensemble_nbest=self.ensemble_nbest, + max_models_on_disc=self.max_models_on_disc, + seed=self.seed, + precision=self.precision, + memory_limit=self.ensemble_memory_limit, + read_at_most=self.read_at_most, + random_state=self.seed, + end_at=self.start_time + self.time_left_for_ensembles, + iteration=self.iteration, + return_predictions=False, + priority=100, + pynisher_context=self.pynisher_context, + logger_port=self.logger_port, + unit_test=unit_test, + )) + + logger.info( + "{}/{} Started Ensemble builder job at {} for iteration {}.".format( + # Log the client to make sure we + # remain connected to the scheduler + self.futures[0], + dask_client, + time.strftime("%Y.%m.%d-%H.%M.%S"), + self.iteration, + ), + ) + self.iteration += 1 + except Exception as e: + exception_traceback = traceback.format_exc() + error_message = repr(e) + logger.critical(exception_traceback) + logger.critical(error_message) + + +def fit_and_return_ensemble( + backend: Backend, + dataset_name: str, + task_type: int, + output_type: int, + metrics: List[autoPyTorchMetric], + opt_metric: str, + ensemble_size: int, + ensemble_nbest: int, + max_models_on_disc: Union[float, int], + seed: int, + precision: int, + memory_limit: Optional[int], + read_at_most: int, + random_state: int, + end_at: float, + iteration: int, + return_predictions: bool, + pynisher_context: str, + 
logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + unit_test: bool = False, +) -> Tuple[ + List[Dict[str, float]], + int, + Optional[np.ndarray], + Optional[np.ndarray], +]: + """ + A short function to fit and create an ensemble. It is just a wrapper to easily send + a request to dask to create an ensemble and clean the memory when finished + Parameters + ---------- + backend: util.backend.Backend + backend to write and read files + dataset_name: str + name of dataset + metrics: List[autoPyTorchMetric], + A set of metrics that will be used to get performance estimates + opt_metric: + Name of the metric to optimize + task_type: int + type of output expected in the ground truth + ensemble_size: int + maximal size of ensemble (passed to ensemble.ensemble_selection) + ensemble_nbest: int/float + if int: consider only the n best prediction + if float: consider only this fraction of the best models + Both wrt to validation predictions + If performance_range_threshold > 0, might return less models + max_models_on_disc: int + Defines the maximum number of models that are kept in the disc. + If int, it must be greater or equal than 1, and dictates the max number of + models to keep. + If float, it will be interpreted as the max megabytes allowed of disc space. That + is, if the number of ensemble candidates require more disc space than this float + value, the worst models will be deleted to keep within this budget. + Models and predictions of the worst-performing models will be deleted then. + If None, the feature is disabled. + It defines an upper bound on the models that can be used in the ensemble. + seed: int + random seed + precision (int): [16,32,64,128] + precision of floats to read the predictions + memory_limit: Optional[int] + memory limit in mb. If ``None``, no memory limit is enforced. + read_at_most: int + read at most n new prediction files in each iteration + end_at: float + At what time the job must finish. Needs to be the endtime and not the time left + because we do not know when dask schedules the job. + iteration: int + The current iteration + pynisher_context: str + Context to use for multiprocessing, can be either fork, spawn or forkserver. + logger_port: int + The port where the logging server is listening to. + unit_test: bool + Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError. + Having this is very bad coding style, but I did not find a way to make + unittest.mock work through the pynisher with all spawn contexts. If you know a + better solution, please let us know by opening an issue. + Returns + ------- + List[Tuple[int, float, float, float]] + A list with the performance history of this ensemble, of the form + [[pandas_timestamp, train_performance, val_performance, test_performance], ...] 
+ """ + result = EnsembleBuilder( + backend=backend, + dataset_name=dataset_name, + task_type=task_type, + output_type=output_type, + metrics=metrics, + opt_metric=opt_metric, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, + seed=seed, + precision=precision, + memory_limit=memory_limit, + read_at_most=read_at_most, + random_state=random_state, + logger_port=logger_port, + unit_test=unit_test, + ).run( + end_at=end_at, + iteration=iteration, + return_predictions=return_predictions, + pynisher_context=pynisher_context, + ) + return result + + diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/stacking_ensemble.py new file mode 100644 index 000000000..425d2d8ba --- /dev/null +++ b/autoPyTorch/ensemble/stacking_ensemble.py @@ -0,0 +1,292 @@ +from collections import Counter +from typing import Any, Dict, List, Tuple, Union + +import numpy as np + +from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble +from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss + + +# TODO: for now we can use this and pass this to stacking evaluator. +# TODO: This can be achieved by using `backend.load_ensemble` +# TODO: it loads the last stored ensemble. So we have access to it. +# TODO: the ensemble is a pickled file containing the fitted ensemble of this class. +# TODO: Think of functionality of the functions in this class adjusted for stacking. +class StackingEnsemble(AbstractEnsemble): + def __init__( + self, + ensemble_size: int, + metric: autoPyTorchMetric, + task_type: int, + random_state: np.random.RandomState, + ) -> None: + self.ensemble_size = ensemble_size + self.metric = metric + self.random_state = random_state + self.task_type = task_type + + def __getstate__(self) -> Dict[str, Any]: + # Cannot serialize a metric if + # it is user defined. + # That is, if doing pickle dump + # the metric won't be the same as the + # one in __main__. we don't use the metric + # in the EnsembleSelection so this should + # be fine + self.metric = None # type: ignore + return self.__dict__ + + def fit( + self, + predictions: List[np.ndarray], + labels: np.ndarray, + identifiers: List[Tuple[int, int, float]], + ) -> AbstractEnsemble: + """ + Builds a ensemble given the individual models out of fold predictions. + Fundamentally, defines a set of weights on how to perform a soft-voting + aggregation of the models in the given identifiers. + + Args: + predictions (List[np.ndarray]): + A list of individual model predictions of shape (n_datapoints, n_targets) + corresponding to the OutOfFold estimate of the ground truth + labels (np.ndarray): + The ground truth targets of shape (n_datapoints, n_targets) + identifiers: List[Tuple[int, int, float]] + A list of model identifiers, each with the form + (seed, number of run, budget) + + Returns: + A copy of self + """ + self.ensemble_size = int(self.ensemble_size) + if self.ensemble_size < 1: + raise ValueError('Ensemble size cannot be less than one!') + + self._fit(predictions, labels) + self._calculate_weights() + self.identifiers_ = identifiers + return self + + # TODO: fit a stacked ensemble. + def _fit( + self, + predictions: List[np.ndarray], + labels: np.ndarray, + ) -> None: + """ + Fast version of Rich Caruana's ensemble selection method. 
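+        (Copied from ``EnsembleSelection`` as a placeholder; per the TODO
+        above, this is expected to be replaced by an actual stacking fit.)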
+ + For more details, please check the paper + "Ensemble Selection from Library of Models" by R Caruana (2004) + + Args: + predictions (List[np.ndarray]): + A list of individual model predictions of shape (n_datapoints, n_targets) + corresponding to the OutOfFold estimate of the ground truth + identifiers (List[Tuple[int, int, float]]): + A list of model identifiers, each with the form + (seed, number of run, budget) + """ + self.num_input_models_ = len(predictions) + + ensemble: List[np.ndarray] = [] + trajectory = [] + order = [] + + ensemble_size = self.ensemble_size + + weighted_ensemble_prediction = np.zeros( + predictions[0].shape, + dtype=np.float64, + ) + fant_ensemble_prediction = np.zeros( + weighted_ensemble_prediction.shape, + dtype=np.float64, + ) + for i in range(ensemble_size): + losses = np.zeros( + (len(predictions)), + dtype=np.float64, + ) + s = len(ensemble) + if s > 0: + np.add( + weighted_ensemble_prediction, + ensemble[-1], + out=weighted_ensemble_prediction, + ) + + # Memory-efficient averaging! + for j, pred in enumerate(predictions): + # fant_ensemble_prediction is the prediction of the current ensemble + # and should be ([predictions[selected_prev_iterations] + predictions[j])/(s+1) + # We overwrite the contents of fant_ensemble_prediction + # directly with weighted_ensemble_prediction + new_prediction and then scale for avg + np.add( + weighted_ensemble_prediction, + pred, + out=fant_ensemble_prediction + ) + np.multiply( + fant_ensemble_prediction, + (1. / float(s + 1)), + out=fant_ensemble_prediction + ) + + # Calculate loss is versatile and can return a dict of slosses + losses[j] = calculate_loss( + metrics=[self.metric], + target=labels, + prediction=fant_ensemble_prediction, + task_type=self.task_type, + )[self.metric.name] + + all_best = np.argwhere(losses == np.nanmin(losses)).flatten() + best = self.random_state.choice(all_best) + ensemble.append(predictions[best]) + trajectory.append(losses[best]) + order.append(best) + + # Handle special case + if len(predictions) == 1: + break + + self.indices_: List[int] = order + self.trajectory_: List[float] = trajectory + self.train_loss_: float = trajectory[-1] + + # TODO: return 1 for models in layer 0, 2 for next and so on + # TODO: 0 for models that are not in stack + def _calculate_weights(self) -> None: + """ + Calculates the contribution each of the individual models + should have, in the final ensemble soft voting. It does so by + a frequency counting scheme. In particular, how many times a model + was used during hill climbing optimization. + """ + ensemble_members = Counter(self.indices_).most_common() + weights = np.zeros( + (self.num_input_models_,), + dtype=np.float64, + ) + for ensemble_member in ensemble_members: + weight = float(ensemble_member[1]) / self.ensemble_size + weights[ensemble_member[0]] = weight + + if np.sum(weights) < 1: + weights = weights / np.sum(weights) + + self.weights_ = weights + + # TODO: Adjust this to use weights and make + def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: + """ + Given a list of predictions from the individual model, this method + aggregates the predictions using a soft voting scheme with the weights + found during training. + + Args: + predictions (List[np.ndarray]): + A list of predictions from the individual base models. 
+ + Returns: + average (np.ndarray): Soft voting predictions of ensemble models, using + the weights found during ensemble selection (self._weights) + """ + + average = np.zeros_like(predictions[0], dtype=np.float64) + tmp_predictions = np.empty_like(predictions[0], dtype=np.float64) + + # if predictions.shape[0] == len(self.weights_), + # predictions include those of zero-weight models. + if len(predictions) == len(self.weights_): + for pred, weight in zip(predictions, self.weights_): + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) + + # if prediction model.shape[0] == len(non_null_weights), + # predictions do not include those of zero-weight models. + elif len(predictions) == np.count_nonzero(self.weights_): + non_null_weights = [w for w in self.weights_ if w > 0] + for pred, weight in zip(predictions, non_null_weights): + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) + + # If none of the above applies, then something must have gone wrong. + else: + raise ValueError("The dimensions of ensemble predictions" + " and ensemble weights do not match!") + del tmp_predictions + return average + + def __str__(self) -> str: + return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ + '\n\tWeights: %s\n\tIdentifiers: %s' % \ + (' '.join(['%d: %5f' % (idx, performance) + for idx, performance in enumerate(self.trajectory_)]), + self.indices_, self.weights_, + ' '.join([str(identifier) for idx, identifier in + enumerate(self.identifiers_) + if self.weights_[idx] > 0])) + + + def get_models_with_weights( + self, + models: Dict[Any, BasePipeline] + ) -> List[Tuple[float, BasePipeline]]: + """ + Handy function to tag the provided input models with a given weight. + + Args: + models (List[Tuple[float, BasePipeline]]): + A dictionary that maps a model's name to it's actual python object. + + Returns: + output (List[Tuple[float, BasePipeline]]): + each model with the related weight, sorted by ascending + performance. Notice that ensemble selection solves a minimization + problem. + """ + output = [] + for i, weight in enumerate(self.weights_): + if weight > 0.0: + identifier = self.identifiers_[i] + model = models[identifier] + output.append((weight, model)) + + output.sort(reverse=True, key=lambda t: t[0]) + + return output + + def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: + """ + After training of ensemble selection, not all models will be used. + Some of them will have zero weight. This procedure filters this models + out. 
+ + Returns: + output (List[Tuple[int, int, float]]): + The models actually used by ensemble selection + """ + output = [] + + for i, weight in enumerate(self.weights_): + identifier = self.identifiers_[i] + if weight > 0.0: + output.append(identifier) + + return output + + def get_validation_performance(self) -> float: + """ + Returns the best optimization performance seen during hill climbing + + Returns: + (float): + best ensemble training performance + """ + return self.trajectory_[-1] diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py new file mode 100644 index 000000000..39a96bbf6 --- /dev/null +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -0,0 +1,619 @@ +import glob +import logging +import logging.handlers +import math +import os +import pickle +import time +import traceback +import zlib +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np + +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants import BINARY +from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble +from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder +from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss +from autoPyTorch.utils.logging_ import get_named_client_logger + +Y_ENSEMBLE = 0 +Y_TEST = 1 + +MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' + + +# TODO: think of what functions are needed to support stacking +# TODO: make functions to support stacking. +class StackingEnsembleBuilder(EnsembleBuilder): + def __init__( + self, + backend: Backend, + dataset_name: str, + task_type: int, + output_type: int, + metrics: List[autoPyTorchMetric], + opt_metric: str, + ensemble_size: int = 10, + ensemble_nbest: int = 100, + max_models_on_disc: Union[float, int] = 100, + performance_range_threshold: float = 0, + seed: int = 1, + precision: int = 32, + memory_limit: Optional[int] = 1024, + read_at_most: int = 5, + random_state: Optional[Union[int, np.random.RandomState]] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + unit_test: bool = False, + ): + """ + Constructor + Parameters + ---------- + backend: util.backend.Backend + backend to write and read files + dataset_name: str + name of dataset + task_type: int + type of ML task + metrics: List[autoPyTorchMetric], + name of metric to score predictions + opt_metric: str + name of the metric to optimize + ensemble_size: int + maximal size of ensemble (passed to ensemble.ensemble_selection) + ensemble_nbest: int/float + if int: consider only the n best prediction + if float: consider only this fraction of the best models + Both wrt to validation predictions + If performance_range_threshold > 0, might return less models + max_models_on_disc: Union[float, int] + Defines the maximum number of models that are kept in the disc. + If int, it must be greater or equal than 1, and dictates the max number of + models to keep. + If float, it will be interpreted as the max megabytes allowed of disc space. That + is, if the number of ensemble candidates require more disc space than this float + value, the worst models will be deleted to keep within this budget. + Models and predictions of the worst-performing models will be deleted then. + If None, the feature is disabled. 
+ It defines an upper bound on the models that can be used in the ensemble. + performance_range_threshold: float + Keep only models that are better than: + dummy + (best - dummy)*performance_range_threshold + E.g dummy=2, best=4, thresh=0.5 --> only consider models with score > 3 + Will at most return the minimum between ensemble_nbest models, + and max_models_on_disc. Might return less + seed: int + random seed + precision: [16,32,64,128] + precision of floats to read the predictions + memory_limit: Optional[int] + memory limit in mb. If ``None``, no memory limit is enforced. + read_at_most: int + read at most n new prediction files in each iteration + logger_port: int + port that receives logging records + unit_test: bool + Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError. + Having this is very bad coding style, but I did not find a way to make + unittest.mock work through the pynisher with all spawn contexts. If you know a + better solution, please let us know by opening an issue. + """ + + super(StackingEnsembleBuilder, self).__init__( + backend=backend, dataset_name=dataset_name, task_type=task_type, + output_type=output_type, metrics=metrics, opt_metric=opt_metric, + ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, + performance_range_threshold=performance_range_threshold, + seed=seed, precision=precision, memory_limit=memory_limit, + read_at_most=read_at_most, random_state=random_state, + logger_port=logger_port, unit_test=unit_test) + + # TODO: This is the main wrapper to the EnsembleSelection class which fits + # TODO: the ensemble + def main( + self, time_left: float, iteration: int, return_predictions: bool, + ) -> Tuple[ + List[Dict[str, float]], + int, + Optional[np.ndarray], + Optional[np.ndarray], + ]: + """ + This is the main function of the ensemble builder process and can be considered + a wrapper over the ensemble selection method implemented y EnsembleSelection class. + + This method is going to be called multiple times by the main process, to + build and ensemble, in case the SMAC process produced new models and to provide + anytime results. + + On this regard, this method mainly: + 1- select from all the individual models that smac created, the N-best candidates + (this in the scenario that N > ensemble_nbest argument to this class). This is + done based on a score calculated via the metrics argument. + 2- This pre-selected candidates are provided to the ensemble selection method + and if a ensemble is found under the provided memory/time constraints, a new + ensemble is proposed. + 3- Because this process will be called multiple times, it performs checks to make + sure a new ensenmble is only proposed if new predictions are available, as well + as making sure we do not run out of resources (like disk space) + + Args: + time_left (float): + How much time is left for the ensemble builder process + iteration (int): + Which is the current iteration + return_predictions (bool): + Whether we want to return the predictions of the current model or not + + Returns: + ensemble_history (Dict): + A snapshot of both test and optimization performance. For debugging. + ensemble_nbest (int): + The user provides a direction on how many models to use in ensemble selection. + This number can be reduced internally if the memory requirements force it. + train_predictions (np.ndarray): + The optimization prediction from the current ensemble. 
+ test_predictions (np.ndarray): + The train prediction from the current ensemble. + """ + + # Pynisher jobs inside dask 'forget' + # the logger configuration. So we have to set it up + # accordingly + self.logger = get_named_client_logger( + name='EnsembleBuilder', + port=self.logger_port, + ) + + self.start_time = time.time() + train_pred, test_pred = None, None + + used_time = time.time() - self.start_time + self.logger.debug( + 'Starting iteration %d, time left: %f', + iteration, + time_left - used_time, + ) + + # populates self.read_preds and self.read_losses + if not self.compute_loss_per_model(): + if return_predictions: + return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred + else: + return self.ensemble_history, self.ensemble_nbest, None, None + + # Only the models with the n_best predictions are candidates + # to be in the ensemble + candidate_models = self.get_n_best_preds() + if not candidate_models: # no candidates yet + if return_predictions: + return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred + else: + return self.ensemble_history, self.ensemble_nbest, None, None + + # populates predictions in self.read_preds + # reduces selected models if file reading failed + n_sel_test = self.get_test_preds(selected_keys=candidate_models) + + # If any of n_sel_* is not empty and overlaps with candidate_models, + # then ensure candidate_models AND n_sel_test are sorted the same + candidate_models_set = set(candidate_models) + if candidate_models_set.intersection(n_sel_test): + candidate_models = sorted(list(candidate_models_set.intersection( + n_sel_test))) + n_sel_test = candidate_models + else: + # This has to be the case + n_sel_test = [] + + if os.environ.get('ENSEMBLE_KEEP_ALL_CANDIDATES'): + for candidate in candidate_models: + self._has_been_candidate.add(candidate) + + # train ensemble + ensemble = self.fit_ensemble(selected_keys=candidate_models) + + # Save the ensemble for later use in the main module! + if ensemble is not None and self.SAVE2DISC: + self.backend.save_ensemble(ensemble, iteration, self.seed) + + # Delete files of non-candidate models - can only be done after fitting the ensemble and + # saving it to disc so we do not accidentally delete models in the previous ensemble + if self.max_resident_models is not None: + self._delete_excess_models(selected_keys=candidate_models) + + # Save the read losses status for the next iteration + with open(self.ensemble_loss_file, "wb") as memory: + pickle.dump(self.read_losses, memory) + + if ensemble is not None: + train_pred = self.predict(set_="train", + ensemble=ensemble, + selected_keys=candidate_models, + n_preds=len(candidate_models), + index_run=iteration) + # TODO if predictions fails, build the model again during the + # next iteration! 
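+            # Test predictions use only the keys for which test prediction
+            # files could actually be read from disc (``n_sel_test``).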
+ test_pred = self.predict(set_="test", + ensemble=ensemble, + selected_keys=n_sel_test, + n_preds=len(candidate_models), + index_run=iteration) + + # Add a score to run history to see ensemble progress + self._add_ensemble_trajectory( + train_pred, + test_pred + ) + + # The loaded predictions and the hash can only be saved after the ensemble has been + # built, because the hash is computed during the construction of the ensemble + with open(self.ensemble_memory_file, "wb") as memory: + pickle.dump((self.read_preds, self.last_hash), memory) + + if return_predictions: + return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred + else: + return self.ensemble_history, self.ensemble_nbest, None, None + + def get_disk_consumption(self, pred_path: str) -> float: + """ + gets the cost of a model being on disc + """ + + match = self.model_fn_re.search(pred_path) + if not match: + raise ValueError("Invalid path format %s" % pred_path) + _seed = int(match.group(1)) + _num_run = int(match.group(2)) + _budget = float(match.group(3)) + + stored_files_for_run = os.listdir( + self.backend.get_numrun_directory(_seed, _num_run, _budget)) + stored_files_for_run = [ + os.path.join(self.backend.get_numrun_directory(_seed, _num_run, _budget), file_name) + for file_name in stored_files_for_run] + this_model_cost = sum([os.path.getsize(path) for path in stored_files_for_run]) + + # get the megabytes + return round(this_model_cost / math.pow(1024, 2), 2) + + # TODO: change this function, to compute loss according to Lavesque et al. + # TODO: this will help us in choosing the model with the lowest ensemble error. + def compute_loss_per_model(self) -> bool: + """ + Compute the loss of the predictions on ensemble building data set; + populates self.read_preds and self.read_losses + """ + + self.logger.debug("Read ensemble data set predictions") + + if self.y_true_ensemble is None: + try: + self.y_true_ensemble = self.backend.load_targets_ensemble() + except FileNotFoundError: + self.logger.debug( + "Could not find true targets on ensemble data set: %s", + traceback.format_exc(), + ) + return False + + pred_path = os.path.join( + glob.escape(self.backend.get_runs_directory()), + '%d_*_*' % self.seed, + 'predictions_ensemble_%s_*_*.npy*' % self.seed, + ) + y_ens_files = glob.glob(pred_path) + y_ens_files = [y_ens_file for y_ens_file in y_ens_files + if y_ens_file.endswith('.npy') or y_ens_file.endswith('.npy.gz')] + self.y_ens_files = y_ens_files + # no validation predictions so far -- no files + if len(self.y_ens_files) == 0: + self.logger.debug("Found no prediction files on ensemble data set:" + " %s" % pred_path) + return False + + # First sort files chronologically + to_read = [] + for y_ens_fn in self.y_ens_files: + match = self.model_fn_re.search(y_ens_fn) + if match is None: + raise ValueError(f"Could not interpret file {y_ens_fn} " + "Something went wrong while scoring predictions") + _seed = int(match.group(1)) + _num_run = int(match.group(2)) + _budget = float(match.group(3)) + + to_read.append([y_ens_fn, match, _seed, _num_run, _budget]) + + n_read_files = 0 + # Now read file wrt to num_run + # Mypy assumes sorted returns an object because of the lambda. 
Can't get to recognize the list + # as a returning list, so as a work-around we skip next line + for y_ens_fn, match, _seed, _num_run, _budget in sorted(to_read, key=lambda x: x[3]): # type: ignore + if self.read_at_most and n_read_files >= self.read_at_most: + # limit the number of files that will be read + # to limit memory consumption + break + + if not y_ens_fn.endswith(".npy") and not y_ens_fn.endswith(".npy.gz"): + self.logger.info('Error loading file (not .npy or .npy.gz): %s', y_ens_fn) + continue + + if not self.read_losses.get(y_ens_fn): + self.read_losses[y_ens_fn] = { + "ens_loss": np.inf, + "mtime_ens": 0, + "mtime_test": 0, + "seed": _seed, + "num_run": _num_run, + "budget": _budget, + "disc_space_cost_mb": None, + # Lazy keys so far: + # 0 - not loaded + # 1 - loaded and in memory + # 2 - loaded but dropped again + # 3 - deleted from disk due to space constraints + "loaded": 0 + } + if not self.read_preds.get(y_ens_fn): + self.read_preds[y_ens_fn] = { + Y_ENSEMBLE: None, + Y_TEST: None, + } + + if self.read_losses[y_ens_fn]["mtime_ens"] == os.path.getmtime(y_ens_fn): + # same time stamp; nothing changed; + continue + + # actually read the predictions and compute their respective loss + try: + y_ensemble = self._read_np_fn(y_ens_fn) + losses = calculate_loss( + metrics=self.metrics, + target=self.y_true_ensemble, + prediction=y_ensemble, + task_type=self.task_type, + ) + + if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): + self.logger.debug( + 'Changing ensemble loss for file %s from %f to %f ' + 'because file modification time changed? %f - %f', + y_ens_fn, + self.read_losses[y_ens_fn]["ens_loss"], + losses[self.opt_metric], + self.read_losses[y_ens_fn]["mtime_ens"], + os.path.getmtime(y_ens_fn), + ) + + self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] + + # It is not needed to create the object here + # To save memory, we just compute the loss. + self.read_losses[y_ens_fn]["mtime_ens"] = os.path.getmtime(y_ens_fn) + self.read_losses[y_ens_fn]["loaded"] = 2 + self.read_losses[y_ens_fn]["disc_space_cost_mb"] = self.get_disk_consumption( + y_ens_fn + ) + + n_read_files += 1 + + except Exception: + self.logger.warning( + 'Error loading %s: %s', + y_ens_fn, + traceback.format_exc(), + ) + self.read_losses[y_ens_fn]["ens_loss"] = np.inf + + self.logger.debug( + 'Done reading %d new prediction files. 
Loaded %d predictions in ' + 'total.', + n_read_files, + np.sum([pred["loaded"] > 0 for pred in self.read_losses.values()]) + ) + return True + + def get_test_preds(self, selected_keys: List[str]) -> List[str]: + """ + test predictions from disc + and store them in self.read_preds + Parameters + --------- + selected_keys: list + list of selected keys of self.read_preds + Return + ------ + success_keys: + all keys in selected keys for which we could read the valid and + test predictions + """ + success_keys_test = [] + + for k in selected_keys: + test_fn = glob.glob( + os.path.join( + glob.escape(self.backend.get_runs_directory()), + '%d_%d_%s' % ( + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"], + ), + 'predictions_test_%d_%d_%s.npy*' % ( + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"] + ) + ) + ) + test_fn = [tfn for tfn in test_fn if tfn.endswith('.npy') or tfn.endswith('.npy.gz')] + + if len(test_fn) == 0: + # self.logger.debug("Not found test prediction file (although " + # "ensemble predictions available):%s" % + # test_fn) + pass + else: + if ( + self.read_losses[k]["mtime_test"] == os.path.getmtime(test_fn[0]) + and k in self.read_preds + and self.read_preds[k][Y_TEST] is not None + ): + success_keys_test.append(k) + continue + try: + y_test = self._read_np_fn(test_fn[0]) + self.read_preds[k][Y_TEST] = y_test + success_keys_test.append(k) + self.read_losses[k]["mtime_test"] = os.path.getmtime(test_fn[0]) + except Exception: + self.logger.warning('Error loading %s: %s', + test_fn, traceback.format_exc()) + + return success_keys_test + + def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: + """ + fit ensemble + + Parameters + --------- + selected_keys: list + list of selected keys of self.read_losses + + Returns + ------- + ensemble: EnsembleSelection + trained Ensemble + """ + + if self.unit_test: + raise MemoryError() + + predictions_train = [self.read_preds[k][Y_ENSEMBLE] for k in selected_keys] + include_num_runs = [ + ( + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"], + ) + for k in selected_keys] + + # check hash if ensemble training data changed + current_hash = "".join([ + str(zlib.adler32(predictions_train[i].data.tobytes())) + for i in range(len(predictions_train)) + ]) + if self.last_hash == current_hash: + self.logger.debug( + "No new model predictions selected -- skip ensemble building " + "-- current performance: %f", + self.validation_performance_, + ) + + return None + self.last_hash = current_hash + + opt_metric = [m for m in self.metrics if m.name == self.opt_metric][0] + if not opt_metric: + raise ValueError(f"Cannot optimize for {self.opt_metric} in {self.metrics} " + "as more than one unique optimization metric was found.") + + ensemble = EnsembleSelection( + ensemble_size=self.ensemble_size, + metric=opt_metric, + random_state=self.random_state, + task_type=self.task_type, + ) + + try: + self.logger.debug( + "Fitting the ensemble on %d models.", + len(predictions_train), + ) + start_time = time.time() + ensemble.fit(predictions_train, self.y_true_ensemble, + include_num_runs) + end_time = time.time() + self.logger.debug( + "Fitting the ensemble took %.2f seconds.", + end_time - start_time, + ) + self.logger.info(str(ensemble)) + self.validation_performance_ = min( + self.validation_performance_, + ensemble.get_validation_performance(), + ) + + except ValueError: + self.logger.error('Caught 
ValueError: %s', traceback.format_exc()) + return None + except IndexError: + self.logger.error('Caught IndexError: %s' + traceback.format_exc()) + return None + finally: + # Explicitly free memory + del predictions_train + + return ensemble + + def predict(self, set_: str, + ensemble: AbstractEnsemble, + selected_keys: list, + n_preds: int, + index_run: int) -> np.ndarray: + """ + save preditions on ensemble, validation and test data on disc + Parameters + ---------- + set_: ["test"] + data split name + ensemble: EnsembleSelection + trained Ensemble + selected_keys: list + list of selected keys of self.read_losses + n_preds: int + number of prediction models used for ensemble building + same number of predictions on valid and test are necessary + index_run: int + n-th time that ensemble predictions are written to disc + Return + ------ + y: np.ndarray + """ + self.logger.debug("Predicting the %s set with the ensemble!", set_) + + if set_ == 'test': + pred_set = Y_TEST + else: + pred_set = Y_ENSEMBLE + predictions = [self.read_preds[k][pred_set] for k in selected_keys] + + if n_preds == len(predictions): + y = ensemble.predict(predictions) + if self.output_type == BINARY: + y = y[:, 1] + if self.SAVE2DISC: + self.backend.save_predictions_as_txt( + predictions=y, + subset=set_, + idx=index_run, + prefix=self.dataset_name, + precision=8, + ) + return y + else: + self.logger.info( + "Found inconsistent number of predictions and models (%d vs " + "%d) for subset %s", + len(predictions), + n_preds, + set_, + ) + return None \ No newline at end of file diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py new file mode 100644 index 000000000..bf842ecb9 --- /dev/null +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -0,0 +1,359 @@ +from multiprocessing.queues import Queue +from typing import Any, Dict, List, Optional, Tuple, Union + +from ConfigSpace.configuration_space import Configuration + +import numpy as np + +from sklearn.base import BaseEstimator + +from smac.tae import StatusType + +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants import ( + CLASSIFICATION_TASKS, + MULTICLASSMULTIOUTPUT, +) +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes +from autoPyTorch.evaluation.abstract_evaluator import ( + AbstractEvaluator, + fit_and_suppress_warnings +) +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.utils.common import dict_repr, subsampler +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + +__all__ = ['StackingEvaluator', 'eval_function'] + + +def _get_y_array(y: np.ndarray, task_type: int) -> np.ndarray: + if task_type in CLASSIFICATION_TASKS and task_type != \ + MULTICLASSMULTIOUTPUT: + return y.ravel() + else: + return y + + +class StackingEvaluator(AbstractEvaluator): + """ + This class builds a pipeline using the provided configuration. + A pipeline implementing the provided configuration is fitted + using the datamanager object retrieved from disc, via the backend. + After the pipeline is fitted, it is save to disc and the performance estimate + is communicated to the main process via a Queue. + + Attributes: + backend (Backend): + An object to interface with the disk storage. 
In particular, allows to + access the train and test datasets + queue (Queue): + Each worker available will instantiate an evaluator, and after completion, + it will return the evaluation result via a multiprocessing queue + metric (autoPyTorchMetric): + A scorer object that is able to evaluate how good a pipeline was fit. It + is a wrapper on top of the actual score method (a wrapper on top of scikit + lean accuracy for example) that formats the predictions accordingly. + budget: (float): + The amount of epochs/time a configuration is allowed to run. + budget_type (str): + The budget type, which can be epochs or time + pipeline_config (Optional[Dict[str, Any]]): + Defines the content of the pipeline being evaluated. For example, it + contains pipeline specific settings like logging name, or whether or not + to use tensorboard. + configuration (Union[int, str, Configuration]): + Determines the pipeline to be constructed. A dummy estimator is created for + integer configurations, a traditional machine learning pipeline is created + for string based configuration, and NAS is performed when a configuration + object is passed. + seed (int): + A integer that allows for reproducibility of results + output_y_hat_optimization (bool): + Whether this worker should output the target predictions, so that they are + stored on disk. Fundamentally, the resampling strategy might shuffle the + Y_train targets, so we store the split in order to re-use them for ensemble + selection. + num_run (Optional[int]): + An identifier of the current configuration being fit. This number is unique per + configuration. + include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Union[bool, List[str]]): + By default, the model, it's predictions and other metadata is stored on disk + for each finished configuration. This argument allows the user to skip + saving certain file type, for example the model, from being written to disk. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. It is the equivalent of + kwargs for the pipeline steps. + logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + all_supported_metrics (bool): + Whether all supported metric should be calculated for every configuration. 
+ search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + An object used to fine tune the hyperparameter search space of the pipeline + """ + def __init__(self, backend: Backend, queue: Queue, + metric: autoPyTorchMetric, + budget: float, + configuration: Union[int, str, Configuration], + budget_type: str = None, + pipeline_config: Optional[Dict[str, Any]] = None, + seed: int = 1, + output_y_hat_optimization: bool = True, + num_run: Optional[int] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + disable_file_output: Union[bool, List] = False, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + keep_models: Optional[bool] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None: + super().__init__( + backend=backend, + queue=queue, + configuration=configuration, + metric=metric, + seed=seed, + output_y_hat_optimization=output_y_hat_optimization, + num_run=num_run, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates + ) + + # TODO: we cant store the ensemble pipelines with this class as it is initialised for every TAE (target algorithm evaluation). + # TODO: Therefore we will have to store pipelines using datamanager and load them, see if we only need predictions. + # TODO: but we will need the whole pipeline as we would like to predict with different dataset, like val or something + + self.splits = self.datamanager.splits + if self.splits is None: + raise AttributeError("Must have called create_splits on {}".format(self.datamanager.__class__.__name__)) + self.num_folds: int = len(self.splits) + self.Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN + self.pipelines: List[Optional[BaseEstimator]] = [None] * self.num_folds + self.indices: List[Optional[Tuple[Union[np.ndarray, List], Union[np.ndarray, List]]]] = [None] * self.num_folds + + self.logger.debug("Search space updates :{}".format(self.search_space_updates)) + self.keep_models = keep_models + + def fit_predict_and_loss(self) -> None: + """Fit, predict and compute the loss for cross-validation and + holdout""" + assert self.splits is not None, "Can't fit pipeline in {} is datamanager.splits is None" \ + .format(self.__class__.__name__) + additional_run_info: Optional[Dict] = None + split_id = 0 + self.logger.info("Starting fit {}".format(split_id)) + + pipeline = self._get_pipeline() + + train_split, test_split = self.splits[split_id] + self.Y_optimization = self.y_train[test_split] + self.Y_actual_train = self.y_train[train_split] + y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, + train_indices=train_split, + test_indices=test_split, + add_pipeline_to_self=True) + train_loss = self._loss(self.y_train[train_split], y_train_pred) + loss = self._loss(self.y_train[test_split], y_opt_pred) + + additional_run_info = pipeline.get_additional_run_info() if hasattr( + pipeline, 'get_additional_run_info') else {} + + status = StatusType.SUCCESS + + self.logger.debug("In train evaluator.fit_predict_and_loss, num_run: {} loss:{}," + " status: {},\nadditional run info:\n{}".format(self.num_run, + 
loss, + dict_repr(additional_run_info), + status)) + self.finish_up( + loss=loss, + train_loss=train_loss, + opt_pred=y_opt_pred, + valid_pred=y_valid_pred, + test_pred=y_test_pred, + additional_run_info=additional_run_info, + file_output=True, + status=status, + ) + + + def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Union[np.ndarray, List], + test_indices: Union[np.ndarray, List], + add_pipeline_to_self: bool + ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: + self.indices[fold] = ((train_indices, test_indices)) + + # See autoPyTorch/pipeline/components/base_component.py::autoPyTorchComponent for more details + # about fit_dictionary + X = {'train_indices': train_indices, + 'val_indices': test_indices, + 'split_id': fold, + 'num_run': self.num_run, + **self.fit_dictionary} # fit dictionary + y = None + fit_and_suppress_warnings(self.logger, pipeline, X, y) + self.logger.info("Model fitted, now predicting") + ( + Y_train_pred, + Y_opt_pred, + Y_valid_pred, + Y_test_pred + ) = self._predict( + pipeline, + train_indices=train_indices, + test_indices=test_indices, + ) + + if add_pipeline_to_self: + self.pipeline = pipeline + else: + self.pipelines[fold] = pipeline + + return Y_train_pred, Y_opt_pred, Y_valid_pred, Y_test_pred + + def _predict(self, pipeline: BaseEstimator, + test_indices: Union[np.ndarray, List], + train_indices: Union[np.ndarray, List] + ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: + # TODO: load ensemble members and predict using the whole ensemble. + + train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, + self.y_train[train_indices]) + + opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, + self.y_train[train_indices]) + + if self.X_valid is not None: + valid_pred = self.predict_function(self.X_valid, pipeline, + self.y_valid) + else: + valid_pred = None + + if self.X_test is not None: + test_pred = self.predict_function(self.X_test, pipeline, + self.y_train[train_indices]) + else: + test_pred = None + + return train_pred, opt_pred, valid_pred, test_pred + + +# create closure for evaluating an algorithm +def eval_function( + backend: Backend, + queue: Queue, + metric: autoPyTorchMetric, + budget: float, + config: Optional[Configuration], + seed: int, + num_run: int, + include: Optional[Dict[str, Any]], + exclude: Optional[Dict[str, Any]], + disable_file_output: Union[bool, List], + output_y_hat_optimization: bool, + pipeline_config: Optional[Dict[str, Any]] = None, + budget_type: str = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + instance: str = None, +) -> None: + """ + This closure allows the communication between the ExecuteTaFuncWithQueue and the + pipeline trainer (TrainEvaluator). + + Fundamentally, smac calls the ExecuteTaFuncWithQueue.run() method, which internally + builds a TrainEvaluator. The TrainEvaluator builds a pipeline, stores the output files + to disc via the backend, and puts the performance result of the run in the queue. + + + Attributes: + backend (Backend): + An object to interface with the disk storage. 
In particular, allows to + access the train and test datasets + queue (Queue): + Each worker available will instantiate an evaluator, and after completion, + it will return the evaluation result via a multiprocessing queue + metric (autoPyTorchMetric): + A scorer object that is able to evaluate how good a pipeline was fit. It + is a wrapper on top of the actual score method (a wrapper on top of scikit + lean accuracy for example) that formats the predictions accordingly. + budget: (float): + The amount of epochs/time a configuration is allowed to run. + budget_type (str): + The budget type, which can be epochs or time + pipeline_config (Optional[Dict[str, Any]]): + Defines the content of the pipeline being evaluated. For example, it + contains pipeline specific settings like logging name, or whether or not + to use tensorboard. + config (Union[int, str, Configuration]): + Determines the pipeline to be constructed. + seed (int): + A integer that allows for reproducibility of results + output_y_hat_optimization (bool): + Whether this worker should output the target predictions, so that they are + stored on disk. Fundamentally, the resampling strategy might shuffle the + Y_train targets, so we store the split in order to re-use them for ensemble + selection. + num_run (Optional[int]): + An identifier of the current configuration being fit. This number is unique per + configuration. + include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Union[bool, List[str]]): + By default, the model, it's predictions and other metadata is stored on disk + for each finished configuration. This argument allows the user to skip + saving certain file type, for example the model, from being written to disk. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. It is the equivalent of + kwargs for the pipeline steps. + logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + instance (str): + An instance on which to evaluate the current pipeline. By default we work + with a single instance, being the provided X_train, y_train of a single dataset. + This instance is a compatibility argument for SMAC, that is capable of working + with multiple datasets at the same time. 
+ """ + evaluator = StackingEvaluator( + backend=backend, + queue=queue, + metric=metric, + configuration=config, + seed=seed, + num_run=num_run, + output_y_hat_optimization=output_y_hat_optimization, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates + ) + evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index d790237b7..b6242e379 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -24,7 +24,7 @@ HoldoutValTypes, NoResamplingStrategyTypes ) -from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager +from autoPyTorch.ensemble.ensemble_builder_manager import EnsembleBuilderManager from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash from autoPyTorch.optimizer.utils import read_return_initial_configurations from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric diff --git a/test/test_ensemble/ensemble_utils.py b/test/test_ensemble/ensemble_utils.py index 7b0ab7fb8..addfdd762 100644 --- a/test/test_ensemble/ensemble_utils.py +++ b/test/test_ensemble/ensemble_utils.py @@ -4,7 +4,7 @@ import numpy as np -from autoPyTorch.ensemble.ensemble_builder import ( +from autoPyTorch.ensemble.ensemble_builder_manager import ( AbstractEnsemble, EnsembleBuilder, ) diff --git a/test/test_ensemble/test_ensemble.py b/test/test_ensemble/test_ensemble.py index 402659e08..d8463ab86 100644 --- a/test/test_ensemble/test_ensemble.py +++ b/test/test_ensemble/test_ensemble.py @@ -18,7 +18,7 @@ from smac.runhistory.runhistory import RunHistory, RunKey, RunValue from autoPyTorch.constants import BINARY, MULTICLASS, TABULAR_CLASSIFICATION -from autoPyTorch.ensemble.ensemble_builder import ( +from autoPyTorch.ensemble.ensemble_builder_manager import ( EnsembleBuilder, EnsembleBuilderManager, Y_ENSEMBLE, From 16b10f0f2c8ec5cfb3b05687e03ead65e117ebd3 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 14:30:36 +0200 Subject: [PATCH 02/16] WIP: done changes in stackingensemblebuilder, todo: stackingensemble --- autoPyTorch/api/base_task.py | 5 +- autoPyTorch/ensemble/ensemble_builder.py | 167 +------------- autoPyTorch/ensemble/stacking_ensemble.py | 140 +++-------- .../ensemble/stacking_ensemble_builder.py | 218 ++++++++++++++---- autoPyTorch/evaluation/stacking_evaluator.py | 131 ++++++++++- .../components/training/metrics/metrics.py | 8 +- test/test_ensemble/test_ensemble.py | 4 +- 7 files changed, 353 insertions(+), 320 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index c7c99d5e1..514af72d2 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1872,6 +1872,9 @@ def _init_ensemble_builder( # builder in the provide dask client required_dataset_properties = {'task_type': self.task_type, 'output_type': self.dataset.output_type} + metrics = get_metrics( + dataset_properties=required_dataset_properties, names=[optimize_metric]) + self._logger.info(f"metrics are {metrics}") proc_ensemble = EnsembleBuilderManager( start_time=time.time(), time_left_for_ensembles=time_left_for_ensembles, @@ -1879,7 +1882,7 @@ def _init_ensemble_builder( dataset_name=str(self.dataset.dataset_name), output_type=STRING_TO_OUTPUT_TYPES[self.dataset.output_type], 
task_type=STRING_TO_TASK_TYPES[self.task_type], - metrics=[self._metric] if self._metric is not None else get_metrics( + metrics=get_metrics( dataset_properties=required_dataset_properties, names=[optimize_metric]), opt_metric=optimize_metric, ensemble_size=ensemble_size, diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index 311069e50..87ec9a2b9 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -38,6 +38,7 @@ MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' +# TODO: store class EnsembleBuilder(object): def __init__( self, @@ -200,6 +201,8 @@ def __init__( # from dask, it builds this object from scratch) # we save the state of this dictionary to memory # and read it if available + # TODO: ensemble_read_preds comes from self.predict, + # see line #513 self.ensemble_memory_file = os.path.join( self.backend.internals_directory, 'ensemble_read_preds.pkl' @@ -646,6 +649,7 @@ def compute_loss_per_model(self) -> bool: os.path.getmtime(y_ens_fn), ) + self.logger.debug(f"keys in losses {losses.keys()}") self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] # It is not needed to create the object here @@ -674,167 +678,7 @@ def compute_loss_per_model(self) -> bool: ) return True - def get_n_best_preds(self) -> List[str]: - """ - get best n predictions (i.e., keys of self.read_losses) - according to the loss on the "ensemble set" - n: self.ensemble_nbest - - Side effects: - ->Define the n-best models to use in ensemble - ->Only the best models are loaded - ->Any model that is not best is candidate to deletion - if max models in disc is exceeded. - """ - - sorted_keys = self._get_list_of_sorted_preds() - - # number of models available - num_keys = len(sorted_keys) - # remove all that are at most as good as random - # note: dummy model must have run_id=1 (there is no run_id=0) - dummy_losses = list(filter(lambda x: x[2] == 1, sorted_keys)) - # Leave this here for when we enable dummy classifier/scorer - if len(dummy_losses) > 0: - # number of dummy models - num_dummy = len(dummy_losses) - dummy_loss = dummy_losses[0] - self.logger.debug("Use %f as dummy loss" % dummy_loss[1]) - sorted_keys = list(filter(lambda x: x[1] < dummy_loss[1], sorted_keys)) - - # remove Dummy Classifier - sorted_keys = list(filter(lambda x: x[2] > 1, sorted_keys)) - if len(sorted_keys) == 0: - # no model left; try to use dummy loss (num_run==0) - # log warning when there are other models but not better than dummy model - if num_keys > num_dummy: - self.logger.warning("No models better than random - using Dummy Score!" - "Number of models besides current dummy model: %d. " - "Number of dummy models: %d", - num_keys - 1, - num_dummy) - sorted_keys = [ - (k, v["ens_loss"], v["num_run"]) for k, v in self.read_losses.items() - if v["seed"] == self.seed and v["num_run"] == 1 - ] - # reload predictions if losses changed over time and a model is - # considered to be in the top models again! - if not isinstance(self.ensemble_nbest, numbers.Integral): - # Transform to number of models to keep. 
Keep at least one - keep_nbest = max(1, min(len(sorted_keys), - int(len(sorted_keys) * self.ensemble_nbest))) - self.logger.debug( - "Library pruning: using only top %f percent of the models for ensemble " - "(%d out of %d)", - self.ensemble_nbest * 100, keep_nbest, len(sorted_keys) - ) - else: - # Keep only at most ensemble_nbest - keep_nbest = min(self.ensemble_nbest, len(sorted_keys)) - self.logger.debug("Library Pruning: using for ensemble only " - " %d (out of %d) models" % (keep_nbest, len(sorted_keys))) - - # If max_models_on_disc is None, do nothing - # One can only read at most max_models_on_disc models - if self.max_models_on_disc is not None: - if not isinstance(self.max_models_on_disc, numbers.Integral): - consumption = [ - [ - v["ens_loss"], - v["disc_space_cost_mb"], - ] for v in self.read_losses.values() if v["disc_space_cost_mb"] is not None - ] - max_consumption = max(c[1] for c in consumption) - - # We are pessimistic with the consumption limit indicated by - # max_models_on_disc by 1 model. Such model is assumed to spend - # max_consumption megabytes - if (sum(c[1] for c in consumption) + max_consumption) > self.max_models_on_disc: - - # just leave the best -- smaller is better! - # This list is in descending order, to preserve the best models - sorted_cum_consumption = np.cumsum([ - c[1] for c in list(sorted(consumption)) - ]) + max_consumption - max_models = np.argmax(sorted_cum_consumption > self.max_models_on_disc) - - # Make sure that at least 1 model survives - self.max_resident_models = max(1, max_models) - self.logger.warning( - "Limiting num of models via float max_models_on_disc={}" - " as accumulated={} worst={} num_models={}".format( - self.max_models_on_disc, - (sum(c[1] for c in consumption) + max_consumption), - max_consumption, - self.max_resident_models - ) - ) - else: - self.max_resident_models = None - else: - self.max_resident_models = self.max_models_on_disc - - if self.max_resident_models is not None and keep_nbest > self.max_resident_models: - self.logger.debug( - "Restricting the number of models to %d instead of %d due to argument " - "max_models_on_disc", - self.max_resident_models, keep_nbest, - ) - keep_nbest = self.max_resident_models - - # consider performance_range_threshold - if self.performance_range_threshold > 0: - best_loss = sorted_keys[0][1] - worst_loss = dummy_loss[1] - worst_loss -= (worst_loss - best_loss) * self.performance_range_threshold - if sorted_keys[keep_nbest - 1][1] > worst_loss: - # We can further reduce number of models - # since worst model is worse than thresh - for i in range(0, keep_nbest): - # Look at most at keep_nbest models, - # but always keep at least one model - current_loss = sorted_keys[i][1] - if current_loss >= worst_loss: - self.logger.debug("Dynamic Performance range: " - "Further reduce from %d to %d models", - keep_nbest, max(1, i)) - keep_nbest = max(1, i) - break - ensemble_n_best = keep_nbest - - # reduce to keys - reduced_sorted_keys = list(map(lambda x: x[0], sorted_keys)) - - # remove loaded predictions for non-winning models - for k in reduced_sorted_keys[ensemble_n_best:]: - if k in self.read_preds: - self.read_preds[k][Y_ENSEMBLE] = None - self.read_preds[k][Y_TEST] = None - if self.read_losses[k]['loaded'] == 1: - self.logger.debug( - 'Dropping model %s (%d,%d) with loss %f.', - k, - self.read_losses[k]['seed'], - self.read_losses[k]['num_run'], - self.read_losses[k]['ens_loss'], - ) - self.read_losses[k]['loaded'] = 2 - - # Load the predictions for the winning - for k in 
reduced_sorted_keys[:ensemble_n_best]: - if ( - ( - k not in self.read_preds or self.read_preds[k][Y_ENSEMBLE] is None - ) - and self.read_losses[k]['loaded'] != 3 - ): - self.read_preds[k][Y_ENSEMBLE] = self._read_np_fn(k) - # No need to load test here because they are loaded - # only if the model ends up in the ensemble - self.read_losses[k]['loaded'] = 1 - - # return best scored keys of self.read_losses - return reduced_sorted_keys[:ensemble_n_best] + def get_test_preds(self, selected_keys: List[str]) -> List[str]: """ @@ -1104,6 +948,7 @@ def _get_list_of_sorted_preds(self) -> List[Tuple[str, float, int]]: # We want small num_run first key=lambda x: (x[1], x[2]), )) + self.logger.debug(f"Selected keys: {sorted_keys}") return sorted_keys def _delete_excess_models(self, selected_keys: List[str]) -> None: diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/stacking_ensemble.py index 425d2d8ba..4d6987eb1 100644 --- a/autoPyTorch/ensemble/stacking_ensemble.py +++ b/autoPyTorch/ensemble/stacking_ensemble.py @@ -2,6 +2,7 @@ from typing import Any, Dict, List, Tuple, Union import numpy as np +from sklearn.base import BaseEstimator from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble from autoPyTorch.pipeline.base_pipeline import BasePipeline @@ -21,6 +22,11 @@ def __init__( metric: autoPyTorchMetric, task_type: int, random_state: np.random.RandomState, + # should be with something like numrun_seed_budget. + ensemble_identifiers = None, + best_model_identifier = None, + ensemble_slot_j: int = None, + read_preds = None, ) -> None: self.ensemble_size = ensemble_size self.metric = metric @@ -40,9 +46,11 @@ def __getstate__(self) -> Dict[str, Any]: def fit( self, - predictions: List[np.ndarray], + predictions_ensemble: List[np.ndarray], + best_model_predictions: np.ndarray, labels: np.ndarray, - identifiers: List[Tuple[int, int, float]], + ensemble_identifiers: List[Tuple[int, int, float]], + best_model_identifier: Tuple[int, int, float] ) -> AbstractEnsemble: """ Builds a ensemble given the individual models out of fold predictions. @@ -62,13 +70,7 @@ def fit( Returns: A copy of self """ - self.ensemble_size = int(self.ensemble_size) - if self.ensemble_size < 1: - raise ValueError('Ensemble size cannot be less than one!') - self._fit(predictions, labels) - self._calculate_weights() - self.identifiers_ = identifiers return self # TODO: fit a stacked ensemble. @@ -78,10 +80,10 @@ def _fit( labels: np.ndarray, ) -> None: """ - Fast version of Rich Caruana's ensemble selection method. + Implemenation of Lévesque et al. For more details, please check the paper - "Ensemble Selection from Library of Models" by R Caruana (2004) + "Bayesian hyperparameter optimization for ensemble learning" by Lévesque (2004) Args: predictions (List[np.ndarray]): @@ -103,57 +105,21 @@ def _fit( predictions[0].shape, dtype=np.float64, ) - fant_ensemble_prediction = np.zeros( - weighted_ensemble_prediction.shape, - dtype=np.float64, - ) - for i in range(ensemble_size): - losses = np.zeros( - (len(predictions)), - dtype=np.float64, - ) - s = len(ensemble) - if s > 0: - np.add( - weighted_ensemble_prediction, - ensemble[-1], - out=weighted_ensemble_prediction, - ) - - # Memory-efficient averaging! 
- for j, pred in enumerate(predictions): - # fant_ensemble_prediction is the prediction of the current ensemble - # and should be ([predictions[selected_prev_iterations] + predictions[j])/(s+1) - # We overwrite the contents of fant_ensemble_prediction - # directly with weighted_ensemble_prediction + new_prediction and then scale for avg - np.add( - weighted_ensemble_prediction, - pred, - out=fant_ensemble_prediction - ) - np.multiply( - fant_ensemble_prediction, - (1. / float(s + 1)), - out=fant_ensemble_prediction - ) - - # Calculate loss is versatile and can return a dict of slosses - losses[j] = calculate_loss( - metrics=[self.metric], - target=labels, - prediction=fant_ensemble_prediction, - task_type=self.task_type, - )[self.metric.name] - - all_best = np.argwhere(losses == np.nanmin(losses)).flatten() - best = self.random_state.choice(all_best) - ensemble.append(predictions[best]) - trajectory.append(losses[best]) - order.append(best) - - # Handle special case - if len(predictions) == 1: - break + + # Calculate loss is versatile and can return a dict of slosses + # losses[j] = calculate_loss( + # metrics=[self.metric], + # target=labels, + # prediction=fant_ensemble_prediction, + # task_type=self.task_type, + # )[self.metric.name] + + # all_best = np.argwhere(losses == np.nanmin(losses)).flatten() + # best = self.random_state.choice(all_best) + # ensemble.append(predictions[best]) + # trajectory.append(losses[best]) + # order.append(best) + self.indices_: List[int] = order self.trajectory_: List[float] = trajectory @@ -174,12 +140,9 @@ def _calculate_weights(self) -> None: dtype=np.float64, ) for ensemble_member in ensemble_members: - weight = float(ensemble_member[1]) / self.ensemble_size + weight = 1 weights[ensemble_member[0]] = weight - if np.sum(weights) < 1: - weights = weights / np.sum(weights) - self.weights_ = weights # TODO: Adjust this to use weights and make @@ -201,16 +164,9 @@ def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarra average = np.zeros_like(predictions[0], dtype=np.float64) tmp_predictions = np.empty_like(predictions[0], dtype=np.float64) - # if predictions.shape[0] == len(self.weights_), - # predictions include those of zero-weight models. - if len(predictions) == len(self.weights_): - for pred, weight in zip(predictions, self.weights_): - np.multiply(pred, weight, out=tmp_predictions) - np.add(average, tmp_predictions, out=average) - # if prediction model.shape[0] == len(non_null_weights), # predictions do not include those of zero-weight models. - elif len(predictions) == np.count_nonzero(self.weights_): + if len(predictions) == np.count_nonzero(self.weights_): non_null_weights = [w for w in self.weights_ if w > 0] for pred, weight in zip(predictions, non_null_weights): np.multiply(pred, weight, out=tmp_predictions) @@ -233,35 +189,6 @@ def __str__(self) -> str: enumerate(self.identifiers_) if self.weights_[idx] > 0])) - - def get_models_with_weights( - self, - models: Dict[Any, BasePipeline] - ) -> List[Tuple[float, BasePipeline]]: - """ - Handy function to tag the provided input models with a given weight. - - Args: - models (List[Tuple[float, BasePipeline]]): - A dictionary that maps a model's name to it's actual python object. - - Returns: - output (List[Tuple[float, BasePipeline]]): - each model with the related weight, sorted by ascending - performance. Notice that ensemble selection solves a minimization - problem. 
- """ - output = [] - for i, weight in enumerate(self.weights_): - if weight > 0.0: - identifier = self.identifiers_[i] - model = models[identifier] - output.append((weight, model)) - - output.sort(reverse=True, key=lambda t: t[0]) - - return output - def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: """ After training of ensemble selection, not all models will be used. @@ -290,3 +217,12 @@ def get_validation_performance(self) -> float: best ensemble training performance """ return self.trajectory_[-1] + + def predict_with_current_pipeline( + self, + pipeline_predictions: np.ndarray, + ) -> None: + # TODO: predict with ensemble by replacing model at j = self.iteration mod m, + # where m is ensemble_size. + # returns None + pass diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index 39a96bbf6..e3f54a818 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -2,20 +2,23 @@ import logging import logging.handlers import math +from mmap import MADV_NOHUGEPAGE import os import pickle +import re import time import traceback import zlib from typing import Dict, List, Optional, Tuple, Union import numpy as np +from numpy.random.mtrand import seed from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import BINARY from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder -from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection +from autoPyTorch.ensemble.stacking_ensemble import StackingEnsemble from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss from autoPyTorch.utils.logging_ import get_named_client_logger @@ -180,8 +183,17 @@ def main( time_left - used_time, ) - # populates self.read_preds and self.read_losses - if not self.compute_loss_per_model(): + ensemble_identifiers = None + # Get ensemble_identifiers from previous iteration. + ensemble_dir = self.backend.get_ensemble_dir() + if len(os.listdir(ensemble_dir)) >= 1: + old_ensemble = self.backend.load_ensemble(self.seed) + ensemble_identifiers = old_ensemble.ensemble_identifiers + + self.ensemble_slot_j = np.mod(iteration, self.ensemble_size) + + # populates self.read_preds and self.read_losses with individual model predictions and ensemble loss. 
+ if not self.compute_ensemble_loss_per_model(ensemble_identifiers=ensemble_identifiers): if return_predictions: return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred else: @@ -196,7 +208,7 @@ def main( else: return self.ensemble_history, self.ensemble_nbest, None, None - # populates predictions in self.read_preds + # populates test predictions in self.read_preds # reduces selected models if file reading failed n_sel_test = self.get_test_preds(selected_keys=candidate_models) @@ -215,8 +227,19 @@ def main( for candidate in candidate_models: self._has_been_candidate.add(candidate) + # as candidate models is sorted in `get_n_best_preds` + best_model_identifier = candidate_models[0] + + # initialise ensemble_identifier with best_model_identifier + if ensemble_identifiers == None: + ensemble_identifiers = [best_model_identifier] + # train ensemble - ensemble = self.fit_ensemble(selected_keys=candidate_models) + ensemble = self.fit_ensemble( + selected_keys=candidate_models, + ensemble_identifiers=ensemble_identifiers, + best_model_identifier=best_model_identifier + ) # Save the ensemble for later use in the main module! if ensemble is not None and self.SAVE2DISC: @@ -261,31 +284,11 @@ def main( else: return self.ensemble_history, self.ensemble_nbest, None, None - def get_disk_consumption(self, pred_path: str) -> float: - """ - gets the cost of a model being on disc - """ - - match = self.model_fn_re.search(pred_path) - if not match: - raise ValueError("Invalid path format %s" % pred_path) - _seed = int(match.group(1)) - _num_run = int(match.group(2)) - _budget = float(match.group(3)) - - stored_files_for_run = os.listdir( - self.backend.get_numrun_directory(_seed, _num_run, _budget)) - stored_files_for_run = [ - os.path.join(self.backend.get_numrun_directory(_seed, _num_run, _budget), file_name) - for file_name in stored_files_for_run] - this_model_cost = sum([os.path.getsize(path) for path in stored_files_for_run]) - - # get the megabytes - return round(this_model_cost / math.pow(1024, 2), 2) - # TODO: change this function, to compute loss according to Lavesque et al. # TODO: this will help us in choosing the model with the lowest ensemble error. 
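# Illustrative sketch (hypothetical function, not part of this patch): the TODO above asks
# for a per-model loss that already accounts for the rest of the ensemble. Assuming the
# current members' out-of-fold predictions are available, the candidate is substituted into
# the cycled slot and the averaged prediction is scored; a plain MSE stands in here for
# autoPyTorch's calculate_loss with the configured metric.
import numpy as np
from typing import List, Optional

def ensemble_loss_with_candidate(
    member_preds: List[Optional[np.ndarray]],
    candidate_pred: np.ndarray,
    slot_j: int,
    y_true: np.ndarray,
) -> float:
    preds = list(member_preds)
    preds[slot_j] = candidate_pred                # candidate takes over slot j
    filled = [p for p in preds if p is not None]
    avg = np.mean(filled, axis=0)                 # soft-vote average of the stacked members
    return float(np.mean((avg - y_true) ** 2))    # placeholder loss; smaller is better

# The candidate whose substitution yields the smallest such loss would become the
# best_model_identifier for this iteration.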
- def compute_loss_per_model(self) -> bool: + # TODO: predictions on ensemble set will be available in read_preds to be used for + # TODO: passing to stacking_ensemble_builder.predict() + def compute_ensemble_loss_per_model(self, ensemble_identifiers) -> bool: """ Compute the loss of the predictions on ensemble building data set; populates self.read_preds and self.read_losses @@ -374,12 +377,10 @@ def compute_loss_per_model(self) -> bool: # actually read the predictions and compute their respective loss try: y_ensemble = self._read_np_fn(y_ens_fn) - losses = calculate_loss( - metrics=self.metrics, - target=self.y_true_ensemble, - prediction=y_ensemble, - task_type=self.task_type, - ) + losses = self.get_ensemble_loss( + ensemble_identifiers=ensemble_identifiers, + model_predictions=y_ensemble + ) if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): self.logger.debug( @@ -478,7 +479,12 @@ def get_test_preds(self, selected_keys: List[str]) -> List[str]: return success_keys_test - def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: + def fit_ensemble( + self, + selected_keys: List[str], + best_model_identifier, + ensemble_identifiers = None + ) -> Optional[StackingEnsemble]: """ fit ensemble @@ -489,21 +495,29 @@ def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: Returns ------- - ensemble: EnsembleSelection + ensemble: StackingEnsemble trained Ensemble """ if self.unit_test: raise MemoryError() - predictions_train = [self.read_preds[k][Y_ENSEMBLE] for k in selected_keys] - include_num_runs = [ + predictions_train = [self.read_preds[k][Y_ENSEMBLE] for k in ensemble_identifiers] + best_model_predictions = self.read_preds[best_model_identifier][Y_ENSEMBLE] + + ensemble_num_runs = [ ( self.read_losses[k]["seed"], self.read_losses[k]["num_run"], self.read_losses[k]["budget"], ) - for k in selected_keys] + for k in ensemble_identifiers] + + best_model_num_run = ( + self.read_losses[best_model_identifier]["seed"], + self.read_losses[best_model_identifier]["num_run"], + self.read_losses[best_model_identifier]["budget"], + ) # check hash if ensemble training data changed current_hash = "".join([ @@ -525,7 +539,7 @@ def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: raise ValueError(f"Cannot optimize for {self.opt_metric} in {self.metrics} " "as more than one unique optimization metric was found.") - ensemble = EnsembleSelection( + ensemble = StackingEnsemble( ensemble_size=self.ensemble_size, metric=opt_metric, random_state=self.random_state, @@ -538,8 +552,12 @@ def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: len(predictions_train), ) start_time = time.time() - ensemble.fit(predictions_train, self.y_true_ensemble, - include_num_runs) + ensemble.fit( + predictions_train, + best_model_predictions, + self.y_true_ensemble, + ensemble_num_runs, + best_model_num_run) end_time = time.time() self.logger.debug( "Fitting the ensemble took %.2f seconds.", @@ -616,4 +634,118 @@ def predict(self, set_: str, n_preds, set_, ) - return None \ No newline at end of file + return None + + def get_n_best_preds(self) -> List[str]: + """ + get best n predictions (i.e., keys of self.read_losses) + according to the loss on the "ensemble set" + n: all models + + Side effects: + ->Define the n-best models to use in ensemble + ->Only the best models are loaded + ->Any model that is not best is candidate to deletion + if max models in disc is exceeded. 
+ """ + + sorted_keys = self._get_list_of_sorted_preds() + + # number of models available + num_keys = len(sorted_keys) + # remove all that are at most as good as random + # note: dummy model must have run_id=1 (there is no run_id=0) + dummy_losses = list(filter(lambda x: x[2] == 1, sorted_keys)) + # Leave this here for when we enable dummy classifier/scorer + if len(dummy_losses) > 0: + # number of dummy models + num_dummy = len(dummy_losses) + dummy_loss = dummy_losses[0] + self.logger.debug("Use %f as dummy loss" % dummy_loss[1]) + sorted_keys = list(filter(lambda x: x[1] < dummy_loss[1], sorted_keys)) + + # remove Dummy Classifier + sorted_keys = list(filter(lambda x: x[2] > 1, sorted_keys)) + if len(sorted_keys) == 0: + # no model left; try to use dummy loss (num_run==0) + # log warning when there are other models but not better than dummy model + if num_keys > num_dummy: + self.logger.warning("No models better than random - using Dummy Score!" + "Number of models besides current dummy model: %d. " + "Number of dummy models: %d", + num_keys - 1, + num_dummy) + sorted_keys = [ + (k, v["ens_loss"], v["num_run"]) for k, v in self.read_losses.items() + if v["seed"] == self.seed and v["num_run"] == 1 + ] + + # reduce to keys + reduced_sorted_keys = list(map(lambda x: x[0], sorted_keys)) + + # Load the predictions for the winning + for k in reduced_sorted_keys: + if ( + ( + k not in self.read_preds or self.read_preds[k][Y_ENSEMBLE] is None + ) + and self.read_losses[k]['loaded'] != 3 + ): + self.read_preds[k][Y_ENSEMBLE] = self._read_np_fn(k) + # No need to load test here because they are loaded + # only if the model ends up in the ensemble + self.read_losses[k]['loaded'] = 1 + # return best scored keys of self.read_losses + return reduced_sorted_keys + + def get_ensemble_loss(self, ensemble_identifiers: List[str], model_predictions: np.ndarray): + """ + Gets the loss of the ensemble given slot j and predictions for new model at slot j + set is ensemble + Args: + ensemble_identifiers ([type]): [description] + model_predictions ([type]): [description] + """ + + + if ensemble_identifiers is None: + loss = calculate_loss( + metrics=[self.metric], + target=self.y_true_ensemble, + prediction=model_predictions, + task_type=self.task_type, + ) + else: + weighted_ensemble_prediction = np.zeros( + model_predictions.shape, + dtype=np.float64, + ) + fant_ensemble_prediction = np.zeros( + weighted_ensemble_prediction.shape, + dtype=np.float64, + ) + + + for i, identifier in enumerate(ensemble_identifiers): + if self.read_preds[identifier][Y_ENSEMBLE] == None: + # y ensemble read_preds is loaded in get_n_best_preds. If there is no value for this that means its a new model at this iteration. + raise ValueError("check here to resolve starting condition") + predictions = self.read_preds[identifier][Y_ENSEMBLE] if i != self.ensemble_slot_j else model_predictions + + np.add( + weighted_ensemble_prediction, + predictions, + out=fant_ensemble_prediction + ) + np.multiply( + fant_ensemble_prediction, + (1. 
/ float(self.ensemble_size)), + out=fant_ensemble_prediction + ) + loss = calculate_loss( + metrics=[self.metric], + target=self.y_true_ensemble, + prediction=fant_ensemble_prediction, + task_type=self.task_type, + ) + return loss \ No newline at end of file diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index bf842ecb9..3e1da6e5a 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -1,4 +1,6 @@ from multiprocessing.queues import Queue +import os +import time from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration @@ -19,6 +21,7 @@ AbstractEvaluator, fit_and_suppress_warnings ) +from autoPyTorch.ensemble.stacking_ensemble import StackingEnsemble from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.common import dict_repr, subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -138,9 +141,6 @@ def __init__(self, backend: Backend, queue: Queue, # TODO: Therefore we will have to store pipelines using datamanager and load them, see if we only need predictions. # TODO: but we will need the whole pipeline as we would like to predict with different dataset, like val or something - self.splits = self.datamanager.splits - if self.splits is None: - raise AttributeError("Must have called create_splits on {}".format(self.datamanager.__class__.__name__)) self.num_folds: int = len(self.splits) self.Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN @@ -150,6 +150,114 @@ def __init__(self, backend: Backend, queue: Queue, self.logger.debug("Search space updates :{}".format(self.search_space_updates)) self.keep_models = keep_models + def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], + valid_pred: Optional[np.ndarray], + test_pred: Optional[np.ndarray], additional_run_info: Optional[Dict], + file_output: bool, status: StatusType, + ensemble_pred: Optional[np.ndarray], + ) -> Optional[Tuple[float, float, int, Dict]]: + """This function does everything necessary after the fitting is done: + * predicting + * saving the necessary files + We use it as the signal handler so we can recycle the code for the + normal usecase and when the runsolver kills us here :)""" + + self.duration = time.time() - self.starttime + + if file_output: + loss_, additional_run_info_ = self.file_output( + None, valid_pred, test_pred, + ) + else: + loss_ = None + additional_run_info_ = {} + + validation_loss, test_loss = self.calculate_auxiliary_losses( + valid_pred, test_pred + ) + + if loss_ is not None: + return self.duration, loss_, self.seed, additional_run_info_ + + cost = loss[self.metric.name] + + additional_run_info = ( + {} if additional_run_info is None else additional_run_info + ) + for metric_name, value in loss.items(): + additional_run_info[metric_name] = value + additional_run_info['duration'] = self.duration + additional_run_info['num_run'] = self.num_run + if train_loss is not None: + additional_run_info['train_loss'] = train_loss + if validation_loss is not None: + additional_run_info['validation_loss'] = validation_loss + if test_loss is not None: + additional_run_info['test_loss'] = test_loss + + rval_dict = {'loss': cost, + 'additional_run_info': additional_run_info, + 'status': status} + + self.queue.put(rval_dict) + return 
None + + def file_output( + self, + Y_optimization_pred: np.ndarray, + Y_valid_pred: np.ndarray, + Y_test_pred: np.ndarray, + ) -> Tuple[Optional[float], Dict]: + + # Abort if predictions contain NaNs + for y, s in [ + [Y_valid_pred, 'validation'], + [Y_test_pred, 'test'] + ]: + if y is not None and not np.all(np.isfinite(y)): + return ( + 1.0, + { + 'error': + 'Model predictions for %s set contains NaNs.' % s + }, + ) + + # Abort if we don't want to output anything. + if hasattr(self, 'disable_file_output'): + if self.disable_file_output: + return None, {} + else: + self.disabled_file_outputs = [] + + if hasattr(self, 'pipeline') and self.pipeline is not None: + if 'pipeline' not in self.disabled_file_outputs: + pipeline = self.pipeline + else: + pipeline = None + else: + pipeline = None + + self.logger.debug("Saving model {}_{}_{} to disk".format(self.seed, self.num_run, self.budget)) + self.backend.save_numrun_to_dir( + seed=int(self.seed), + idx=int(self.num_run), + budget=float(self.budget), + model=pipeline, + cv_model=None, + ensemble_predictions=None, + valid_predictions=( + Y_valid_pred if 'y_valid' not in + self.disabled_file_outputs else None + ), + test_predictions=( + Y_test_pred if 'y_test' not in + self.disabled_file_outputs else None + ), + ) + + return None, {} + def fit_predict_and_loss(self) -> None: """Fit, predict and compute the loss for cross-validation and holdout""" @@ -232,18 +340,23 @@ def _predict(self, pipeline: BaseEstimator, train_indices: Union[np.ndarray, List] ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: # TODO: load ensemble members and predict using the whole ensemble. - + # TODO: we need some function to pass this pipeline to the last stored ensemble replace + # TODO: model j, where j = ensemble.iteration mod m. then we need to predict + # TODO: Also, we will pass the predictions from this pipeline as that is what is needed + # TODO: to create the ensemble. 
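# Illustrative sketch (hypothetical helper, not part of this patch): the TODOs above
# describe the intended flow -- the freshly fitted pipeline's out-of-fold predictions are
# plugged into the last persisted StackingEnsemble so that the evaluator can report the
# ensemble loss instead of the single-model loss. Note that the code below calls
# old_ensemble.predict_with_current_model(), while the method added to StackingEnsemble
# in this patch series is predict_with_current_pipeline(pipeline_predictions), which
# expects the new pipeline's predictions as an argument.
import os
import numpy as np

def ensemble_opt_predictions(backend, seed: int, pipeline_opt_pred: np.ndarray):
    # Returns the stacked-ensemble predictions with the current pipeline in its slot,
    # or None when no ensemble has been persisted yet (first iteration).
    ensemble_dir = backend.get_ensemble_dir()
    if not os.path.isdir(ensemble_dir) or len(os.listdir(ensemble_dir)) == 0:
        return None
    old_ensemble = backend.load_ensemble(seed)
    return old_ensemble.predict_with_current_pipeline(pipeline_opt_pred)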
train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, self.y_train[train_indices]) - opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, + pipeline_opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, self.y_train[train_indices]) - if self.X_valid is not None: - valid_pred = self.predict_function(self.X_valid, pipeline, - self.y_valid) + ensemble_dir = self.backend.get_ensemble_dir() + if len(os.listdir(ensemble_dir)) >= 1: + old_ensemble = self.backend.load_ensemble(self.seed) + assert isinstance(old_ensemble, StackingEnsemble) + ensemble_opt_pred = old_ensemble.predict_with_current_model() else: - valid_pred = None + ensemble_opt_pred = None if self.X_test is not None: test_pred = self.predict_function(self.X_test, pipeline, diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index 0d82b9622..2d32dece0 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -46,7 +46,11 @@ sklearn.metrics.balanced_accuracy_score) f1 = make_metric('f1', sklearn.metrics.f1_score) - +zero_one_loss = make_metric('zero_one_loss', + sklearn.metrics.zero_one_loss, + optimum=0, + greater_is_better=False, + worst_possible_result=MAXINT) # Score functions that need decision values roc_auc = make_metric('roc_auc', sklearn.metrics.roc_auc_score, needs_threshold=True) average_precision = make_metric('average_precision', @@ -73,7 +77,7 @@ CLASSIFICATION_METRICS = dict() for scorer in [accuracy, balanced_accuracy, roc_auc, average_precision, - log_loss]: + log_loss, zero_one_loss]: CLASSIFICATION_METRICS[scorer.name] = scorer for name, metric in [('precision', sklearn.metrics.precision_score), diff --git a/test/test_ensemble/test_ensemble.py b/test/test_ensemble/test_ensemble.py index d8463ab86..6a2650313 100644 --- a/test/test_ensemble/test_ensemble.py +++ b/test/test_ensemble/test_ensemble.py @@ -18,9 +18,9 @@ from smac.runhistory.runhistory import RunHistory, RunKey, RunValue from autoPyTorch.constants import BINARY, MULTICLASS, TABULAR_CLASSIFICATION -from autoPyTorch.ensemble.ensemble_builder_manager import ( +from autoPyTorch.ensemble.ensemble_builder_manager import EnsembleBuilderManager +from autoPyTorch.ensemble.ensemble_builder import ( EnsembleBuilder, - EnsembleBuilderManager, Y_ENSEMBLE, Y_TEST, ) From 7ed9693051c8f499dba25c65c66bc16f150d5b61 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 14:34:13 +0200 Subject: [PATCH 03/16] revert deletion of get_n_best_preds (clean) --- autoPyTorch/ensemble/ensemble_builder.py | 161 ++++++++++++++++++++++- 1 file changed, 160 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index 87ec9a2b9..ea2b77c97 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -678,7 +678,166 @@ def compute_loss_per_model(self) -> bool: ) return True - + def get_n_best_preds(self) -> List[str]: + """ + get best n predictions (i.e., keys of self.read_losses) + according to the loss on the "ensemble set" + n: self.ensemble_nbest + Side effects: + ->Define the n-best models to use in ensemble + ->Only the best models are loaded + ->Any model that is not best is candidate to deletion + if max models in disc is exceeded. 
+ """ + + sorted_keys = self._get_list_of_sorted_preds() + + # number of models available + num_keys = len(sorted_keys) + # remove all that are at most as good as random + # note: dummy model must have run_id=1 (there is no run_id=0) + dummy_losses = list(filter(lambda x: x[2] == 1, sorted_keys)) + # Leave this here for when we enable dummy classifier/scorer + if len(dummy_losses) > 0: + # number of dummy models + num_dummy = len(dummy_losses) + dummy_loss = dummy_losses[0] + self.logger.debug("Use %f as dummy loss" % dummy_loss[1]) + sorted_keys = list(filter(lambda x: x[1] < dummy_loss[1], sorted_keys)) + + # remove Dummy Classifier + sorted_keys = list(filter(lambda x: x[2] > 1, sorted_keys)) + if len(sorted_keys) == 0: + # no model left; try to use dummy loss (num_run==0) + # log warning when there are other models but not better than dummy model + if num_keys > num_dummy: + self.logger.warning("No models better than random - using Dummy Score!" + "Number of models besides current dummy model: %d. " + "Number of dummy models: %d", + num_keys - 1, + num_dummy) + sorted_keys = [ + (k, v["ens_loss"], v["num_run"]) for k, v in self.read_losses.items() + if v["seed"] == self.seed and v["num_run"] == 1 + ] + # reload predictions if losses changed over time and a model is + # considered to be in the top models again! + if not isinstance(self.ensemble_nbest, numbers.Integral): + # Transform to number of models to keep. Keep at least one + keep_nbest = max(1, min(len(sorted_keys), + int(len(sorted_keys) * self.ensemble_nbest))) + self.logger.debug( + "Library pruning: using only top %f percent of the models for ensemble " + "(%d out of %d)", + self.ensemble_nbest * 100, keep_nbest, len(sorted_keys) + ) + else: + # Keep only at most ensemble_nbest + keep_nbest = min(self.ensemble_nbest, len(sorted_keys)) + self.logger.debug("Library Pruning: using for ensemble only " + " %d (out of %d) models" % (keep_nbest, len(sorted_keys))) + + # If max_models_on_disc is None, do nothing + # One can only read at most max_models_on_disc models + if self.max_models_on_disc is not None: + if not isinstance(self.max_models_on_disc, numbers.Integral): + consumption = [ + [ + v["ens_loss"], + v["disc_space_cost_mb"], + ] for v in self.read_losses.values() if v["disc_space_cost_mb"] is not None + ] + max_consumption = max(c[1] for c in consumption) + + # We are pessimistic with the consumption limit indicated by + # max_models_on_disc by 1 model. Such model is assumed to spend + # max_consumption megabytes + if (sum(c[1] for c in consumption) + max_consumption) > self.max_models_on_disc: + + # just leave the best -- smaller is better! 
+ # This list is in descending order, to preserve the best models + sorted_cum_consumption = np.cumsum([ + c[1] for c in list(sorted(consumption)) + ]) + max_consumption + max_models = np.argmax(sorted_cum_consumption > self.max_models_on_disc) + + # Make sure that at least 1 model survives + self.max_resident_models = max(1, max_models) + self.logger.warning( + "Limiting num of models via float max_models_on_disc={}" + " as accumulated={} worst={} num_models={}".format( + self.max_models_on_disc, + (sum(c[1] for c in consumption) + max_consumption), + max_consumption, + self.max_resident_models + ) + ) + else: + self.max_resident_models = None + else: + self.max_resident_models = self.max_models_on_disc + + if self.max_resident_models is not None and keep_nbest > self.max_resident_models: + self.logger.debug( + "Restricting the number of models to %d instead of %d due to argument " + "max_models_on_disc", + self.max_resident_models, keep_nbest, + ) + keep_nbest = self.max_resident_models + + # consider performance_range_threshold + if self.performance_range_threshold > 0: + best_loss = sorted_keys[0][1] + worst_loss = dummy_loss[1] + worst_loss -= (worst_loss - best_loss) * self.performance_range_threshold + if sorted_keys[keep_nbest - 1][1] > worst_loss: + # We can further reduce number of models + # since worst model is worse than thresh + for i in range(0, keep_nbest): + # Look at most at keep_nbest models, + # but always keep at least one model + current_loss = sorted_keys[i][1] + if current_loss >= worst_loss: + self.logger.debug("Dynamic Performance range: " + "Further reduce from %d to %d models", + keep_nbest, max(1, i)) + keep_nbest = max(1, i) + break + ensemble_n_best = keep_nbest + + # reduce to keys + reduced_sorted_keys = list(map(lambda x: x[0], sorted_keys)) + + # remove loaded predictions for non-winning models + for k in reduced_sorted_keys[ensemble_n_best:]: + if k in self.read_preds: + self.read_preds[k][Y_ENSEMBLE] = None + self.read_preds[k][Y_TEST] = None + if self.read_losses[k]['loaded'] == 1: + self.logger.debug( + 'Dropping model %s (%d,%d) with loss %f.', + k, + self.read_losses[k]['seed'], + self.read_losses[k]['num_run'], + self.read_losses[k]['ens_loss'], + ) + self.read_losses[k]['loaded'] = 2 + + # Load the predictions for the winning + for k in reduced_sorted_keys[:ensemble_n_best]: + if ( + ( + k not in self.read_preds or self.read_preds[k][Y_ENSEMBLE] is None + ) + and self.read_losses[k]['loaded'] != 3 + ): + self.read_preds[k][Y_ENSEMBLE] = self._read_np_fn(k) + # No need to load test here because they are loaded + # only if the model ends up in the ensemble + self.read_losses[k]['loaded'] = 1 + + # return best scored keys of self.read_losses + return reduced_sorted_keys[:ensemble_n_best] def get_test_preds(self, selected_keys: List[str]) -> List[str]: """ From 056e08857781eb7cf0e027c0bf7fa950e3b97f47 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 14:37:22 +0200 Subject: [PATCH 04/16] made changes to ensemble building to solve persistency issue of ensemble (clean) --- autoPyTorch/ensemble/stacking_ensemble.py | 136 ++++++------ .../ensemble/stacking_ensemble_builder.py | 205 +++++++----------- 2 files changed, 156 insertions(+), 185 deletions(-) diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/stacking_ensemble.py index 4d6987eb1..f0621c29b 100644 --- a/autoPyTorch/ensemble/stacking_ensemble.py +++ b/autoPyTorch/ensemble/stacking_ensemble.py @@ -22,16 +22,13 @@ def __init__( metric: autoPyTorchMetric, 
task_type: int, random_state: np.random.RandomState, - # should be with something like numrun_seed_budget. - ensemble_identifiers = None, - best_model_identifier = None, - ensemble_slot_j: int = None, - read_preds = None, + ensemble_slot_j: int ) -> None: self.ensemble_size = ensemble_size self.metric = metric self.random_state = random_state self.task_type = task_type + self.ensemble_slot_j = ensemble_slot_j def __getstate__(self) -> Dict[str, Any]: # Cannot serialize a metric if @@ -50,7 +47,7 @@ def fit( best_model_predictions: np.ndarray, labels: np.ndarray, ensemble_identifiers: List[Tuple[int, int, float]], - best_model_identifier: Tuple[int, int, float] + best_model_identifier: Tuple[int, int, float], ) -> AbstractEnsemble: """ Builds a ensemble given the individual models out of fold predictions. @@ -70,7 +67,11 @@ def fit( Returns: A copy of self """ - + predictions_ensemble[self.ensemble_slot_j] = best_model_predictions + ensemble_identifiers[self.ensemble_slot_j] = best_model_identifier + self._fit(predictions_ensemble, labels) + self.identifiers_ = ensemble_identifiers + self._calculate_weights() return self # TODO: fit a stacked ensemble. @@ -93,37 +94,43 @@ def _fit( A list of model identifiers, each with the form (seed, number of run, budget) """ - self.num_input_models_ = len(predictions) - - ensemble: List[np.ndarray] = [] - trajectory = [] - order = [] - - ensemble_size = self.ensemble_size weighted_ensemble_prediction = np.zeros( predictions[0].shape, dtype=np.float64, ) - # Calculate loss is versatile and can return a dict of slosses - # losses[j] = calculate_loss( - # metrics=[self.metric], - # target=labels, - # prediction=fant_ensemble_prediction, - # task_type=self.task_type, - # )[self.metric.name] + fant_ensemble_prediction = np.zeros( + weighted_ensemble_prediction.shape, + dtype=np.float64, + ) - # all_best = np.argwhere(losses == np.nanmin(losses)).flatten() - # best = self.random_state.choice(all_best) - # ensemble.append(predictions[best]) - # trajectory.append(losses[best]) - # order.append(best) + nonnull_predictions = [pred for pred in predictions if pred is not None] + size = len(nonnull_predictions) + for pred in nonnull_predictions: + np.add( + weighted_ensemble_prediction, + pred, + out=fant_ensemble_prediction + ) + np.multiply( + fant_ensemble_prediction, + (1. / float(size)), + out=fant_ensemble_prediction + ) + + # Calculate loss is versatile and can return a dict of slosses + loss = calculate_loss( + metrics=[self.metric], + target=labels, + prediction=fant_ensemble_prediction, + task_type=self.task_type, + )[self.metric.name] + # store list of preds for later use + self.ensemble_predictions = predictions - self.indices_: List[int] = order - self.trajectory_: List[float] = trajectory - self.train_loss_: float = trajectory[-1] + self.train_loss_: float = loss # TODO: return 1 for models in layer 0, 2 for next and so on # TODO: 0 for models that are not in stack @@ -134,19 +141,22 @@ def _calculate_weights(self) -> None: a frequency counting scheme. In particular, how many times a model was used during hill climbing optimization. """ - ensemble_members = Counter(self.indices_).most_common() weights = np.zeros( - (self.num_input_models_,), + self.ensemble_size, dtype=np.float64, ) - for ensemble_member in ensemble_members: - weight = 1 - weights[ensemble_member[0]] = weight + current_size = len([id for id in self.identifiers_ if id is not None]) + for i, identifier in enumerate(self.identifiers_): + if identifier is not None: + weights[i] = (1. 
/ float(current_size)) self.weights_ = weights # TODO: Adjust this to use weights and make - def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: + def predict(self, predictions: List[np.ndarray]) -> np.ndarray: + return self._predict(predictions, self.weights_) + + def _predict(self, predictions, weights): """ Given a list of predictions from the individual model, this method aggregates the predictions using a soft voting scheme with the weights @@ -158,7 +168,7 @@ def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarra Returns: average (np.ndarray): Soft voting predictions of ensemble models, using - the weights found during ensemble selection (self._weights) + the weights """ average = np.zeros_like(predictions[0], dtype=np.float64) @@ -166,8 +176,8 @@ def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarra # if prediction model.shape[0] == len(non_null_weights), # predictions do not include those of zero-weight models. - if len(predictions) == np.count_nonzero(self.weights_): - non_null_weights = [w for w in self.weights_ if w > 0] + if len(predictions) == np.count_nonzero(weights): + non_null_weights = [w for w in weights if w > 0] for pred, weight in zip(predictions, non_null_weights): np.multiply(pred, weight, out=tmp_predictions) np.add(average, tmp_predictions, out=average) @@ -179,15 +189,15 @@ def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarra del tmp_predictions return average - def __str__(self) -> str: - return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ - '\n\tWeights: %s\n\tIdentifiers: %s' % \ - (' '.join(['%d: %5f' % (idx, performance) - for idx, performance in enumerate(self.trajectory_)]), - self.indices_, self.weights_, - ' '.join([str(identifier) for idx, identifier in - enumerate(self.identifiers_) - if self.weights_[idx] > 0])) + # def __str__(self) -> str: + # return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ + # '\n\tWeights: %s\n\tIdentifiers: %s' % \ + # (' '.join(['%d: %5f' % (idx, performance) + # for idx, performance in enumerate(self.trajectory_)]), + # self.indices_, self.weights_, + # ' '.join([str(identifier) for idx, identifier in + # enumerate(self.identifiers_) + # if self.weights_[idx] > 0])) def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: """ @@ -199,14 +209,7 @@ def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: output (List[Tuple[int, int, float]]): The models actually used by ensemble selection """ - output = [] - - for i, weight in enumerate(self.weights_): - identifier = self.identifiers_[i] - if weight > 0.0: - output.append(identifier) - - return output + return self.identifiers_ def get_validation_performance(self) -> float: """ @@ -216,13 +219,24 @@ def get_validation_performance(self) -> float: (float): best ensemble training performance """ - return self.trajectory_[-1] + return self.train_loss_ def predict_with_current_pipeline( self, pipeline_predictions: np.ndarray, - ) -> None: - # TODO: predict with ensemble by replacing model at j = self.iteration mod m, - # where m is ensemble_size. - # returns None - pass + ) -> np.ndarray: + """ + predict with ensemble by replacing model at j = self.iteration mod m, + where m is ensemble_size. 
+ returns ensemble predictions + """ + predictions = self.ensemble_predictions.copy() + if predictions[self.ensemble_slot_j] is None: + total_predictions = len([pred for pred in predictions if pred is not None]) + total_predictions += 1 + weights = [1/total_predictions for pred in predictions if pred is not None] + else: + weights = self.weights_ + + predictions[self.ensemble_slot_j] = pipeline_predictions + return self._predict(predictions, weights) diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index e3f54a818..836f7884c 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -1,8 +1,6 @@ import glob import logging import logging.handlers -import math -from mmap import MADV_NOHUGEPAGE import os import pickle import re @@ -12,7 +10,6 @@ from typing import Dict, List, Optional, Tuple, Union import numpy as np -from numpy.random.mtrand import seed from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import BINARY @@ -20,7 +17,7 @@ from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder from autoPyTorch.ensemble.stacking_ensemble import StackingEnsemble from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric -from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score from autoPyTorch.utils.logging_ import get_named_client_logger Y_ENSEMBLE = 0 @@ -115,6 +112,11 @@ def __init__( seed=seed, precision=precision, memory_limit=memory_limit, read_at_most=read_at_most, random_state=random_state, logger_port=logger_port, unit_test=unit_test) + # we still need to store ensemble identifiers as this class is not persistant + # we can do this by either storing and reading them in this class + # or passing them via the ensemble builder manager which has persistency with the futures stored. + self.ensemble_identifiers: Optional[List[Optional[str]]] = None + # TODO: This is the main wrapper to the EnsembleSelection class which fits # TODO: the ensemble @@ -183,17 +185,10 @@ def main( time_left - used_time, ) - ensemble_identifiers = None - # Get ensemble_identifiers from previous iteration. - ensemble_dir = self.backend.get_ensemble_dir() - if len(os.listdir(ensemble_dir)) >= 1: - old_ensemble = self.backend.load_ensemble(self.seed) - ensemble_identifiers = old_ensemble.ensemble_identifiers - self.ensemble_slot_j = np.mod(iteration, self.ensemble_size) - + self.ensemble_identifiers = self._load_ensemble_identifiers() # populates self.read_preds and self.read_losses with individual model predictions and ensemble loss. 
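# --- Editor's illustrative aside (not part of the patch) ----------------------
# A minimal, self-contained sketch of the per-slot update this builder performs:
# the candidate evaluated at `iteration` only competes for slot
# j = iteration mod ensemble_size; its out-of-fold predictions are substituted
# into that slot while all other slots keep their stored predictions, and the
# loss of the equal-weight average is used to rank the candidate. The function
# name and `loss_fn` are illustrative assumptions, not symbols from the repository.
import numpy as np
from typing import Callable, List, Optional


def fantasized_slot_loss(
    slot_predictions: List[Optional[np.ndarray]],   # one entry per slot, None while a slot is still empty
    candidate_predictions: np.ndarray,               # out-of-fold predictions of the model under evaluation
    slot_j: int,                                     # the slot this iteration may replace
    labels: np.ndarray,
    loss_fn: Callable[[np.ndarray, np.ndarray], float],
) -> float:
    slots = list(slot_predictions)
    slots[slot_j] = candidate_predictions
    filled = [p for p in slots if p is not None]
    average = np.mean(np.stack(filled), axis=0)      # equal-weight soft vote over the filled slots
    return loss_fn(labels, average)
# -------------------------------------------------------------------------------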
- if not self.compute_ensemble_loss_per_model(ensemble_identifiers=ensemble_identifiers): + if not self.compute_ensemble_loss_per_model(): if return_predictions: return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred else: @@ -201,7 +196,7 @@ def main( # Only the models with the n_best predictions are candidates # to be in the ensemble - candidate_models = self.get_n_best_preds() + candidate_models = self.get_candidate_preds() if not candidate_models: # no candidates yet if return_predictions: return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred @@ -230,21 +225,15 @@ def main( # as candidate models is sorted in `get_n_best_preds` best_model_identifier = candidate_models[0] - # initialise ensemble_identifier with best_model_identifier - if ensemble_identifiers == None: - ensemble_identifiers = [best_model_identifier] - # train ensemble ensemble = self.fit_ensemble( - selected_keys=candidate_models, - ensemble_identifiers=ensemble_identifiers, best_model_identifier=best_model_identifier ) # Save the ensemble for later use in the main module! if ensemble is not None and self.SAVE2DISC: self.backend.save_ensemble(ensemble, iteration, self.seed) - + self._save_ensemble_identifiers(ensemble_identifiers=ensemble.identifiers_) # Delete files of non-candidate models - can only be done after fitting the ensemble and # saving it to disc so we do not accidentally delete models in the previous ensemble if self.max_resident_models is not None: @@ -257,15 +246,15 @@ def main( if ensemble is not None: train_pred = self.predict(set_="train", ensemble=ensemble, - selected_keys=candidate_models, - n_preds=len(candidate_models), + selected_keys=ensemble.identifiers_, + n_preds=len(ensemble.identifiers_), index_run=iteration) # TODO if predictions fails, build the model again during the # next iteration! test_pred = self.predict(set_="test", ensemble=ensemble, - selected_keys=n_sel_test, - n_preds=len(candidate_models), + selected_keys=ensemble.identifiers_, + n_preds=len(ensemble.identifiers_), index_run=iteration) # Add a score to run history to see ensemble progress @@ -288,7 +277,7 @@ def main( # TODO: this will help us in choosing the model with the lowest ensemble error. 
# TODO: predictions on ensemble set will be available in read_preds to be used for # TODO: passing to stacking_ensemble_builder.predict() - def compute_ensemble_loss_per_model(self, ensemble_identifiers) -> bool: + def compute_ensemble_loss_per_model(self) -> bool: """ Compute the loss of the predictions on ensemble building data set; populates self.read_preds and self.read_losses @@ -377,8 +366,7 @@ def compute_ensemble_loss_per_model(self, ensemble_identifiers) -> bool: # actually read the predictions and compute their respective loss try: y_ensemble = self._read_np_fn(y_ens_fn) - losses = self.get_ensemble_loss( - ensemble_identifiers=ensemble_identifiers, + losses = self.get_ensemble_loss_with_model( model_predictions=y_ensemble ) @@ -421,69 +409,9 @@ def compute_ensemble_loss_per_model(self, ensemble_identifiers) -> bool: ) return True - def get_test_preds(self, selected_keys: List[str]) -> List[str]: - """ - test predictions from disc - and store them in self.read_preds - Parameters - --------- - selected_keys: list - list of selected keys of self.read_preds - Return - ------ - success_keys: - all keys in selected keys for which we could read the valid and - test predictions - """ - success_keys_test = [] - - for k in selected_keys: - test_fn = glob.glob( - os.path.join( - glob.escape(self.backend.get_runs_directory()), - '%d_%d_%s' % ( - self.read_losses[k]["seed"], - self.read_losses[k]["num_run"], - self.read_losses[k]["budget"], - ), - 'predictions_test_%d_%d_%s.npy*' % ( - self.read_losses[k]["seed"], - self.read_losses[k]["num_run"], - self.read_losses[k]["budget"] - ) - ) - ) - test_fn = [tfn for tfn in test_fn if tfn.endswith('.npy') or tfn.endswith('.npy.gz')] - - if len(test_fn) == 0: - # self.logger.debug("Not found test prediction file (although " - # "ensemble predictions available):%s" % - # test_fn) - pass - else: - if ( - self.read_losses[k]["mtime_test"] == os.path.getmtime(test_fn[0]) - and k in self.read_preds - and self.read_preds[k][Y_TEST] is not None - ): - success_keys_test.append(k) - continue - try: - y_test = self._read_np_fn(test_fn[0]) - self.read_preds[k][Y_TEST] = y_test - success_keys_test.append(k) - self.read_losses[k]["mtime_test"] = os.path.getmtime(test_fn[0]) - except Exception: - self.logger.warning('Error loading %s: %s', - test_fn, traceback.format_exc()) - - return success_keys_test - def fit_ensemble( self, - selected_keys: List[str], - best_model_identifier, - ensemble_identifiers = None + best_model_identifier: str, ) -> Optional[StackingEnsemble]: """ fit ensemble @@ -499,10 +427,12 @@ def fit_ensemble( trained Ensemble """ + assert self.ensemble_identifiers is not None + if self.unit_test: raise MemoryError() - predictions_train = [self.read_preds[k][Y_ENSEMBLE] for k in ensemble_identifiers] + predictions_train = [self.read_preds[k][Y_ENSEMBLE] if k is not None else None for k in self.ensemble_identifiers] best_model_predictions = self.read_preds[best_model_identifier][Y_ENSEMBLE] ensemble_num_runs = [ @@ -511,7 +441,8 @@ def fit_ensemble( self.read_losses[k]["num_run"], self.read_losses[k]["budget"], ) - for k in ensemble_identifiers] + if k is not None else None + for k in self.ensemble_identifiers] best_model_num_run = ( self.read_losses[best_model_identifier]["seed"], @@ -544,6 +475,7 @@ def fit_ensemble( metric=opt_metric, random_state=self.random_state, task_type=self.task_type, + ensemble_slot_j=self.ensemble_slot_j ) try: @@ -557,7 +489,9 @@ def fit_ensemble( best_model_predictions, self.y_true_ensemble, ensemble_num_runs, - 
best_model_num_run) + best_model_num_run + ) + end_time = time.time() self.logger.debug( "Fitting the ensemble took %.2f seconds.", @@ -611,7 +545,8 @@ def predict(self, set_: str, pred_set = Y_TEST else: pred_set = Y_ENSEMBLE - predictions = [self.read_preds[k][pred_set] for k in selected_keys] + + predictions = [self.read_preds[k][pred_set] for k in selected_keys if k is not None] if n_preds == len(predictions): y = ensemble.predict(predictions) @@ -636,11 +571,11 @@ def predict(self, set_: str, ) return None - def get_n_best_preds(self) -> List[str]: + def get_candidate_preds(self) -> List[str]: """ - get best n predictions (i.e., keys of self.read_losses) + gets predictions better than dummy score + (i.e., keys of self.read_losses) according to the loss on the "ensemble set" - n: all models Side effects: ->Define the n-best models to use in ensemble @@ -698,35 +633,30 @@ def get_n_best_preds(self) -> List[str]: # return best scored keys of self.read_losses return reduced_sorted_keys - def get_ensemble_loss(self, ensemble_identifiers: List[str], model_predictions: np.ndarray): + def get_ensemble_loss_with_model(self, model_predictions: np.ndarray): """ Gets the loss of the ensemble given slot j and predictions for new model at slot j set is ensemble Args: - ensemble_identifiers ([type]): [description] model_predictions ([type]): [description] """ + weighted_ensemble_prediction = np.zeros( + model_predictions.shape, + dtype=np.float64, + ) + fant_ensemble_prediction = np.zeros( + weighted_ensemble_prediction.shape, + dtype=np.float64, + ) - if ensemble_identifiers is None: - loss = calculate_loss( - metrics=[self.metric], - target=self.y_true_ensemble, - prediction=model_predictions, - task_type=self.task_type, - ) - else: - weighted_ensemble_prediction = np.zeros( - model_predictions.shape, - dtype=np.float64, - ) - fant_ensemble_prediction = np.zeros( - weighted_ensemble_prediction.shape, - dtype=np.float64, - ) - - - for i, identifier in enumerate(ensemble_identifiers): + for i, identifier in enumerate(self.ensemble_identifiers): + if identifier is None: + if i == self.ensemble_slot_j: + predictions = model_predictions + else: + continue + else: if self.read_preds[identifier][Y_ENSEMBLE] == None: # y ensemble read_preds is loaded in get_n_best_preds. If there is no value for this that means its a new model at this iteration. raise ValueError("check here to resolve starting condition") @@ -742,10 +672,37 @@ def get_ensemble_loss(self, ensemble_identifiers: List[str], model_predictions: (1. 
/ float(self.ensemble_size)), out=fant_ensemble_prediction ) - loss = calculate_loss( - metrics=[self.metric], - target=self.y_true_ensemble, - prediction=fant_ensemble_prediction, - task_type=self.task_type, - ) - return loss \ No newline at end of file + loss = calculate_loss( + metrics=[self.metric], + target=self.y_true_ensemble, + prediction=fant_ensemble_prediction, + task_type=self.task_type, + ) + return loss + + def _get_ensemble_identifiers_filename(self): + return os.path.join(self.backend.temporary_directory, 'ensemble_identifiers.pkl') + + def _save_ensemble_identifiers(self, ensemble_identifiers: List[Optional[str]]) -> None: + with open(self._get_ensemble_identifiers_filename(), "wb") as file: + pickle.dump(ensemble_identifiers, file=file) + + def _load_ensemble_identifiers(self) -> List[Optional[str]]: + if os.path.exists(self._get_ensemble_identifiers_filename()): + with open(self._get_ensemble_identifiers_filename(), "rb") as file: + identifiers = pickle.load(file) + else: + identifiers = [None]*self.ensemble_size + return identifiers + + def _get_identifiers_from_num_runs(self, num_runs, subset='ensemble') -> List[Optional[str]]: + identifiers: List[Optional[str]] = [] + for num_run in num_runs: + identifier = None + if num_run is not None: + seed, idx, budget = num_run + identifier = self.backend.get_prediction_filename(subset, seed, idx, budget) + + identifiers.append(identifier) + return identifiers + From 8fc9b7e1347357ce4846c8996529aa71fccdd9cc Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 14:48:57 +0200 Subject: [PATCH 05/16] make stacking evaluator changes, not tested yet (clean) --- autoPyTorch/evaluation/stacking_evaluator.py | 83 ++++++++++++++------ 1 file changed, 61 insertions(+), 22 deletions(-) diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index 3e1da6e5a..154db1546 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -152,9 +152,11 @@ def __init__(self, backend: Backend, queue: Queue, def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], valid_pred: Optional[np.ndarray], - test_pred: Optional[np.ndarray], additional_run_info: Optional[Dict], - file_output: bool, status: StatusType, - ensemble_pred: Optional[np.ndarray], + test_pred: Optional[np.ndarray], + pipeline_opt_pred: np.ndarray, + ensemble_opt_pred: np.ndarray, + additional_run_info: Optional[Dict], + file_output: bool, status: StatusType, ) -> Optional[Tuple[float, float, int, Dict]]: """This function does everything necessary after the fitting is done: * predicting @@ -166,7 +168,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], if file_output: loss_, additional_run_info_ = self.file_output( - None, valid_pred, test_pred, + ensemble_opt_pred, valid_pred, test_pred ) else: loss_ = None @@ -176,6 +178,10 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], valid_pred, test_pred ) + pipeline_loss, _ = self.calculate_auxiliary_losses( + pipeline_opt_pred, None + ) + if loss_ is not None: return self.duration, loss_, self.seed, additional_run_info_ @@ -188,6 +194,8 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], additional_run_info[metric_name] = value additional_run_info['duration'] = self.duration additional_run_info['num_run'] = self.num_run + if pipeline_loss is not None: + additional_run_info['pipeline_loss'] = pipeline_loss if train_loss is not None: 
additional_run_info['train_loss'] = train_loss if validation_loss is not None: @@ -209,8 +217,22 @@ def file_output( Y_test_pred: np.ndarray, ) -> Tuple[Optional[float], Dict]: + # Abort in case of shape misalignment + if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]: + return ( + 1.0, + { + 'error': + "Targets %s and prediction %s don't have " + "the same length. Probably training didn't " + "finish" % (self.Y_optimization.shape, Y_optimization_pred.shape) + }, + ) + # Abort if predictions contain NaNs for y, s in [ + # Y_train_pred deleted here. Fix unittest accordingly. + [Y_optimization_pred, 'optimization'], [Y_valid_pred, 'validation'], [Y_test_pred, 'test'] ]: @@ -230,6 +252,11 @@ def file_output( else: self.disabled_file_outputs = [] + # This file can be written independently of the others down bellow + if 'y_optimization' not in self.disabled_file_outputs: + if self.output_y_hat_optimization: + self.backend.save_targets_ensemble(self.Y_optimization) + if hasattr(self, 'pipeline') and self.pipeline is not None: if 'pipeline' not in self.disabled_file_outputs: pipeline = self.pipeline @@ -245,7 +272,10 @@ def file_output( budget=float(self.budget), model=pipeline, cv_model=None, - ensemble_predictions=None, + ensemble_predictions=( + Y_optimization_pred if 'y_optimization' not in + self.disabled_file_outputs else None + ), valid_predictions=( Y_valid_pred if 'y_valid' not in self.disabled_file_outputs else None @@ -272,12 +302,19 @@ def fit_predict_and_loss(self) -> None: train_split, test_split = self.splits[split_id] self.Y_optimization = self.y_train[test_split] self.Y_actual_train = self.y_train[train_split] - y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, - train_indices=train_split, - test_indices=test_split, - add_pipeline_to_self=True) + ( + y_train_pred, + y_pipeline_opt_pred, + y_ensemble_opt_pred, + y_valid_pred, + y_test_pred + ) = self._fit_and_predict(pipeline, split_id, + train_indices=train_split, + test_indices=test_split, + add_pipeline_to_self=True) + train_loss = self._loss(self.y_train[train_split], y_train_pred) - loss = self._loss(self.y_train[test_split], y_opt_pred) + loss = self._loss(self.y_train[test_split], y_ensemble_opt_pred) additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} @@ -292,12 +329,13 @@ def fit_predict_and_loss(self) -> None: self.finish_up( loss=loss, train_loss=train_loss, - opt_pred=y_opt_pred, + ensemble_opt_pred=y_ensemble_opt_pred, valid_pred=y_valid_pred, test_pred=y_test_pred, additional_run_info=additional_run_info, file_output=True, status=status, + pipeline_opt_pred=y_pipeline_opt_pred ) @@ -317,12 +355,7 @@ def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Un y = None fit_and_suppress_warnings(self.logger, pipeline, X, y) self.logger.info("Model fitted, now predicting") - ( - Y_train_pred, - Y_opt_pred, - Y_valid_pred, - Y_test_pred - ) = self._predict( + Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred = self._predict( pipeline, train_indices=train_indices, test_indices=test_indices, @@ -333,12 +366,12 @@ def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Un else: self.pipelines[fold] = pipeline - return Y_train_pred, Y_opt_pred, Y_valid_pred, Y_test_pred + return Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred def _predict(self, pipeline: BaseEstimator, test_indices: 
Union[np.ndarray, List], train_indices: Union[np.ndarray, List] - ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: # TODO: load ensemble members and predict using the whole ensemble. # TODO: we need some function to pass this pipeline to the last stored ensemble replace # TODO: model j, where j = ensemble.iteration mod m. then we need to predict @@ -354,9 +387,15 @@ def _predict(self, pipeline: BaseEstimator, if len(os.listdir(ensemble_dir)) >= 1: old_ensemble = self.backend.load_ensemble(self.seed) assert isinstance(old_ensemble, StackingEnsemble) - ensemble_opt_pred = old_ensemble.predict_with_current_model() + ensemble_opt_pred = old_ensemble.predict_with_current_model(pipeline_opt_pred) + else: + ensemble_opt_pred = pipeline_opt_pred.copy() + + if self.X_valid is not None: + valid_pred = self.predict_function(self.X_valid, pipeline, + self.y_valid) else: - ensemble_opt_pred = None + valid_pred = None if self.X_test is not None: test_pred = self.predict_function(self.X_test, pipeline, @@ -364,7 +403,7 @@ def _predict(self, pipeline: BaseEstimator, else: test_pred = None - return train_pred, opt_pred, valid_pred, test_pred + return train_pred, pipeline_opt_pred, ensemble_opt_pred, valid_pred, test_pred # create closure for evaluating an algorithm From 8b931dfda19595cf9b34c364f83d981517c1501d Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 14:52:45 +0200 Subject: [PATCH 06/16] cleanup of stacking evaluator (clean) --- autoPyTorch/evaluation/stacking_evaluator.py | 28 +++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index 154db1546..eeca7b6e4 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -114,7 +114,6 @@ def __init__(self, backend: Backend, queue: Queue, disable_file_output: Union[bool, List] = False, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, - keep_models: Optional[bool] = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None: super().__init__( @@ -141,14 +140,7 @@ def __init__(self, backend: Backend, queue: Queue, # TODO: Therefore we will have to store pipelines using datamanager and load them, see if we only need predictions. 
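# --- Editor's illustrative aside (not part of the patch) ----------------------
# Schematic of how the stacking evaluator scores a configuration: the freshly
# fitted pipeline predicts on the out-of-fold split, the previously persisted
# StackingEnsemble substitutes those predictions into its current slot via
# predict_with_current_pipeline, and the loss reported to SMAC is that of the
# updated ensemble rather than of the single pipeline. `load_previous_ensemble`
# and `loss_fn` are placeholders, not repository symbols.
def score_candidate(pipeline, X_opt, y_opt, load_previous_ensemble, loss_fn):
    pipeline_opt_pred = pipeline.predict_proba(X_opt)

    previous_ensemble = load_previous_ensemble()      # returns None before the first ensemble is saved
    if previous_ensemble is not None:
        ensemble_opt_pred = previous_ensemble.predict_with_current_pipeline(pipeline_opt_pred)
    else:
        ensemble_opt_pred = pipeline_opt_pred.copy()  # ensemble degenerates to the single model

    pipeline_loss = loss_fn(y_opt, pipeline_opt_pred)     # logged as additional run info
    ensemble_loss = loss_fn(y_opt, ensemble_opt_pred)     # the cost SMAC actually optimises
    return ensemble_loss, pipeline_loss, pipeline_opt_pred
# -------------------------------------------------------------------------------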
# TODO: but we will need the whole pipeline as we would like to predict with different dataset, like val or something - self.num_folds: int = len(self.splits) - self.Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds - self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN - self.pipelines: List[Optional[BaseEstimator]] = [None] * self.num_folds - self.indices: List[Optional[Tuple[Union[np.ndarray, List], Union[np.ndarray, List]]]] = [None] * self.num_folds - self.logger.debug("Search space updates :{}".format(self.search_space_updates)) - self.keep_models = keep_models def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], valid_pred: Optional[np.ndarray], @@ -310,8 +302,7 @@ def fit_predict_and_loss(self) -> None: y_test_pred ) = self._fit_and_predict(pipeline, split_id, train_indices=train_split, - test_indices=test_split, - add_pipeline_to_self=True) + test_indices=test_split) train_loss = self._loss(self.y_train[train_split], y_train_pred) loss = self._loss(self.y_train[test_split], y_ensemble_opt_pred) @@ -339,11 +330,13 @@ def fit_predict_and_loss(self) -> None: ) - def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Union[np.ndarray, List], - test_indices: Union[np.ndarray, List], - add_pipeline_to_self: bool - ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: - self.indices[fold] = ((train_indices, test_indices)) + def _fit_and_predict( + self, + pipeline: BaseEstimator, + fold: int, + train_indices: Union[np.ndarray, List], + test_indices: Union[np.ndarray, List], + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: # See autoPyTorch/pipeline/components/base_component.py::autoPyTorchComponent for more details # about fit_dictionary @@ -361,10 +354,7 @@ def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Un test_indices=test_indices, ) - if add_pipeline_to_self: - self.pipeline = pipeline - else: - self.pipelines[fold] = pipeline + self.pipeline = pipeline return Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred From f39e482be5ff25818435fadbb88dd0123fea5047 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 15:00:53 +0200 Subject: [PATCH 07/16] added arg to pass ensemble_method from api (clean) --- autoPyTorch/api/base_task.py | 10 +++++++++- autoPyTorch/api/tabular_classification.py | 3 +++ autoPyTorch/api/tabular_regression.py | 3 +++ autoPyTorch/ensemble/ensemble_builder_manager.py | 8 +++++++- autoPyTorch/ensemble/utils.py | 16 ++++++++++++++++ 5 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 autoPyTorch/ensemble/utils.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 514af72d2..cb00e9d73 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -49,6 +49,7 @@ ) from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager from autoPyTorch.ensemble.singlebest_ensemble import SingleBest +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash from autoPyTorch.evaluation.utils import DisableFileOutputParameters @@ -171,6 +172,7 @@ def __init__( logging_config: Optional[Dict] = None, ensemble_size: int = 50, ensemble_nbest: int = 50, + ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, 
max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, @@ -193,6 +195,7 @@ def __init__( self.n_threads = n_threads self.ensemble_size = ensemble_size self.ensemble_nbest = ensemble_nbest + self.ensemble_method = ensemble_method self.max_models_on_disc = max_models_on_disc self.logging_config: Optional[Dict] = logging_config self.include_components: Optional[Dict] = include_components @@ -1249,7 +1252,8 @@ def _search( ensemble_size=self.ensemble_size, ensemble_nbest=self.ensemble_nbest, precision=precision, - optimize_metric=self.opt_metric + optimize_metric=self.opt_metric, + ensemble_method=self.ensemble_method ) self._stopwatch.stop_task(ensemble_task_name) @@ -1705,6 +1709,7 @@ def fit_ensemble( precision: Optional[int] = None, ensemble_nbest: int = 50, ensemble_size: int = 50, + ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, load_models: bool = True, time_for_task: int = 100, func_eval_time_limit_secs: int = 50, @@ -1815,6 +1820,7 @@ def fit_ensemble( precision=precision, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + ensemble_method=ensemble_method, ) manager.build_ensemble(self._dask_client) @@ -1834,6 +1840,7 @@ def _init_ensemble_builder( self, time_left_for_ensembles: float, optimize_metric: str, + ensemble_method: int, ensemble_nbest: int, ensemble_size: int, precision: int = 32, @@ -1887,6 +1894,7 @@ def _init_ensemble_builder( opt_metric=optimize_metric, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + ensemble_method=ensemble_method, max_models_on_disc=self.max_models_on_disc, seed=self.seed, max_iterations=None, diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 3d80a0338..da1cf293b 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -22,6 +22,7 @@ ) from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.evaluation.utils import DisableFileOutputParameters +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -87,6 +88,7 @@ def __init__( logging_config: Optional[Dict] = None, ensemble_size: int = 50, ensemble_nbest: int = 50, + ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, @@ -106,6 +108,7 @@ def __init__( logging_config=logging_config, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + ensemble_method=ensemble_method, max_models_on_disc=max_models_on_disc, temporary_directory=temporary_directory, output_directory=output_directory, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 073b4d77c..c9f21e453 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -22,6 +22,7 @@ ) from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.evaluation.utils import DisableFileOutputParameters +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -87,6 +88,7 @@ def __init__( logging_config: Optional[Dict] = None, 
ensemble_size: int = 50, ensemble_nbest: int = 50, + ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, @@ -106,6 +108,7 @@ def __init__( logging_config=logging_config, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + ensemble_method=ensemble_method, max_models_on_disc=max_models_on_disc, temporary_directory=temporary_directory, output_directory=output_directory, diff --git a/autoPyTorch/ensemble/ensemble_builder_manager.py b/autoPyTorch/ensemble/ensemble_builder_manager.py index 845992064..06f8e696c 100644 --- a/autoPyTorch/ensemble/ensemble_builder_manager.py +++ b/autoPyTorch/ensemble/ensemble_builder_manager.py @@ -20,6 +20,7 @@ from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import BINARY from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder +from autoPyTorch.ensemble.utils import get_ensemble_builder_class from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.logging_ import get_named_client_logger @@ -37,6 +38,7 @@ def __init__( opt_metric: str, ensemble_size: int, ensemble_nbest: int, + ensemble_method: int, max_models_on_disc: Union[float, int], seed: int, precision: int, @@ -111,6 +113,7 @@ def __init__( self.opt_metric = opt_metric self.ensemble_size = ensemble_size self.ensemble_nbest = ensemble_nbest + self.ensemble_method = ensemble_method self.max_models_on_disc: Union[float, int] = max_models_on_disc self.seed = seed self.precision = precision @@ -210,6 +213,7 @@ def build_ensemble( opt_metric=self.opt_metric, ensemble_size=self.ensemble_size, ensemble_nbest=self.ensemble_nbest, + ensemble_method=self.ensemble_method, max_models_on_disc=self.max_models_on_disc, seed=self.seed, precision=self.precision, @@ -252,6 +256,7 @@ def fit_and_return_ensemble( opt_metric: str, ensemble_size: int, ensemble_nbest: int, + ensemble_method: int, max_models_on_disc: Union[float, int], seed: int, precision: int, @@ -330,7 +335,8 @@ def fit_and_return_ensemble( A list with the performance history of this ensemble, of the form [[pandas_timestamp, train_performance, val_performance, test_performance], ...] 
""" - result = EnsembleBuilder( + ensemble_builder = get_ensemble_builder_class(ensemble_method) + result = ensemble_builder( backend=backend, dataset_name=dataset_name, task_type=task_type, diff --git a/autoPyTorch/ensemble/utils.py b/autoPyTorch/ensemble/utils.py new file mode 100644 index 000000000..705d17e24 --- /dev/null +++ b/autoPyTorch/ensemble/utils.py @@ -0,0 +1,16 @@ +from enum import IntEnum + +from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder +from autoPyTorch.ensemble.stacking_ensemble_builder import StackingEnsembleBuilder + + +class EnsembleSelectionTypes(IntEnum): + ensemble_selection = 1 + stacking_ensemble = 2 + + +def get_ensemble_builder_class(ensemble_method: int): + if ensemble_method == EnsembleSelectionTypes.ensemble_selection: + return EnsembleBuilder + elif ensemble_method == EnsembleSelectionTypes.stacking_ensemble: + return StackingEnsembleBuilder From b6eb0a601d0dd82a41d63e989de26ef4781c16dd Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 17:19:34 +0200 Subject: [PATCH 08/16] working version of levesque et al (clean) --- autoPyTorch/api/base_task.py | 23 ++++-- autoPyTorch/ensemble/stacking_ensemble.py | 42 ++++++++-- .../ensemble/stacking_ensemble_builder.py | 50 ++++++------ autoPyTorch/evaluation/stacking_evaluator.py | 5 +- autoPyTorch/evaluation/tae.py | 11 ++- autoPyTorch/optimizer/smbo.py | 4 + .../20_basics/example_stacking_ensemble.py | 76 +++++++++++++++++++ 7 files changed, 167 insertions(+), 44 deletions(-) create mode 100644 examples/20_basics/example_stacking_ensemble.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index cb00e9d73..d4d734e6f 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -761,7 +761,8 @@ def _do_dummy_prediction(self) -> None: stats=stats, memory_limit=memory_limit, disable_file_output=self._disable_file_output, - all_supported_metrics=self._all_supported_metrics + all_supported_metrics=self._all_supported_metrics, + ensemble_method=self.ensemble_method ) status, _, _, additional_info = ta.run(num_run, cutoff=self._time_for_task) @@ -1290,6 +1291,7 @@ def _search( min_budget=min_budget, max_budget=max_budget, ensemble_callback=proc_ensemble, + ensemble_method=self.ensemble_method, logger_port=self._logger_port, # We do not increase the num_run here, this is something # smac does internally @@ -1820,7 +1822,6 @@ def fit_ensemble( precision=precision, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, - ensemble_method=ensemble_method, ) manager.build_ensemble(self._dask_client) @@ -1990,7 +1991,8 @@ def predict( def score( self, y_pred: np.ndarray, - y_test: Union[np.ndarray, pd.DataFrame] + y_test: Union[np.ndarray, pd.DataFrame], + metric: Optional[str] = None ) -> Dict[str, float]: """Calculate the score on the test set. Calculate the evaluation measure on the test set. @@ -2005,15 +2007,22 @@ def score( Dict[str, float]: Value of the evaluation metric calculated on the test set. """ - if self._metric is None: - raise ValueError("No metric found. 
Either fit/search has not been called yet " - "or AutoPyTorch failed to infer a metric from the dataset ") + if metric is not None: + required_dataset_properties = {'task_type': self.task_type, + 'output_type': self.dataset.output_type} + metric = get_metrics( + dataset_properties=required_dataset_properties, + names=[metric] + )[0] + else: + metric = self._metric + if self.task_type is None: raise ValueError("AutoPytorch failed to infer a task type from the dataset " "Please check the log file for related errors. ") return calculate_score(target=y_test, prediction=y_pred, task_type=STRING_TO_TASK_TYPES[self.task_type], - metrics=[self._metric]) + metrics=[metric]) def __getstate__(self) -> Dict[str, Any]: # Cannot serialize a client! diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/stacking_ensemble.py index f0621c29b..913a3024f 100644 --- a/autoPyTorch/ensemble/stacking_ensemble.py +++ b/autoPyTorch/ensemble/stacking_ensemble.py @@ -1,5 +1,6 @@ from collections import Counter from typing import Any, Dict, List, Tuple, Union +import warnings import numpy as np from sklearn.base import BaseEstimator @@ -176,16 +177,17 @@ def _predict(self, predictions, weights): # if prediction model.shape[0] == len(non_null_weights), # predictions do not include those of zero-weight models. - if len(predictions) == np.count_nonzero(weights): - non_null_weights = [w for w in weights if w > 0] - for pred, weight in zip(predictions, non_null_weights): - np.multiply(pred, weight, out=tmp_predictions) - np.add(average, tmp_predictions, out=average) + if len([pred for pred in predictions if pred is not None]) == np.count_nonzero(weights): + for pred, weight in zip(predictions, weights): + if pred is not None: + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) # If none of the above applies, then something must have gone wrong. else: - raise ValueError("The dimensions of ensemble predictions" - " and ensemble weights do not match!") + raise ValueError(f"{len(predictions)}, {self.weights_}\n" + f"The dimensions of non null ensemble predictions" + f" and ensemble weights do not match!") del tmp_predictions return average @@ -240,3 +242,29 @@ def predict_with_current_pipeline( predictions[self.ensemble_slot_j] = pipeline_predictions return self._predict(predictions, weights) + + def get_models_with_weights( + self, + models: Dict[Any, BasePipeline] + ) -> List[Tuple[float, BasePipeline]]: + """ + Handy function to tag the provided input models with a given weight. + Args: + models (List[Tuple[float, BasePipeline]]): + A dictionary that maps a model's name to it's actual python object. + Returns: + output (List[Tuple[float, BasePipeline]]): + each model with the related weight, sorted by ascending + performance. Notice that ensemble selection solves a minimization + problem. 
+ """ + output = [] + for i, weight in enumerate(self.weights_): + if weight > 0.0: + identifier = self.identifiers_[i] + model = models[identifier] + output.append((weight, model)) + + output.sort(reverse=True, key=lambda t: t[0]) + + return output \ No newline at end of file diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index 836f7884c..ad6136e26 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -6,6 +6,7 @@ import re import time import traceback +import warnings import zlib from typing import Dict, List, Optional, Tuple, Union @@ -186,6 +187,7 @@ def main( ) self.ensemble_slot_j = np.mod(iteration, self.ensemble_size) + self.logger.debug(f"Iteration for ensemble building:{iteration}") self.ensemble_identifiers = self._load_ensemble_identifiers() # populates self.read_preds and self.read_losses with individual model predictions and ensemble loss. if not self.compute_ensemble_loss_per_model(): @@ -233,7 +235,11 @@ def main( # Save the ensemble for later use in the main module! if ensemble is not None and self.SAVE2DISC: self.backend.save_ensemble(ensemble, iteration, self.seed) - self._save_ensemble_identifiers(ensemble_identifiers=ensemble.identifiers_) + ensemble_identifiers=self._get_identifiers_from_num_runs(ensemble.identifiers_) + self.logger.debug(f"ensemble_identifiers being saved are {ensemble_identifiers}") + self._save_ensemble_identifiers( + ensemble_identifiers=ensemble_identifiers + ) # Delete files of non-candidate models - can only be done after fitting the ensemble and # saving it to disc so we do not accidentally delete models in the previous ensemble if self.max_resident_models is not None: @@ -246,15 +252,15 @@ def main( if ensemble is not None: train_pred = self.predict(set_="train", ensemble=ensemble, - selected_keys=ensemble.identifiers_, - n_preds=len(ensemble.identifiers_), + selected_keys=ensemble_identifiers, + n_preds=len(ensemble_identifiers), index_run=iteration) # TODO if predictions fails, build the model again during the # next iteration! 
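# --- Editor's illustrative aside (not part of the patch) ----------------------
# The builder process is short-lived, so the slot -> prediction-file mapping is
# pickled into the backend's temporary directory and reloaded at the next
# iteration (see _save_ensemble_identifiers / _load_ensemble_identifiers defined
# earlier in this patch series). A minimal sketch of that round trip; the file
# name mirrors the patch, while the helper names and `tmp_dir` are assumptions.
import os
import pickle
from typing import List, Optional


def save_identifiers(tmp_dir: str, identifiers: List[Optional[str]]) -> None:
    with open(os.path.join(tmp_dir, "ensemble_identifiers.pkl"), "wb") as fh:
        pickle.dump(identifiers, fh)


def load_identifiers(tmp_dir: str, ensemble_size: int) -> List[Optional[str]]:
    path = os.path.join(tmp_dir, "ensemble_identifiers.pkl")
    if not os.path.exists(path):
        return [None] * ensemble_size                 # first iteration: every slot is still empty
    with open(path, "rb") as fh:
        return pickle.load(fh)
# -------------------------------------------------------------------------------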
test_pred = self.predict(set_="test", ensemble=ensemble, - selected_keys=ensemble.identifiers_, - n_preds=len(ensemble.identifiers_), + selected_keys=ensemble_identifiers, + n_preds=len(ensemble_identifiers), index_run=iteration) # Add a score to run history to see ensemble progress @@ -450,21 +456,6 @@ def fit_ensemble( self.read_losses[best_model_identifier]["budget"], ) - # check hash if ensemble training data changed - current_hash = "".join([ - str(zlib.adler32(predictions_train[i].data.tobytes())) - for i in range(len(predictions_train)) - ]) - if self.last_hash == current_hash: - self.logger.debug( - "No new model predictions selected -- skip ensemble building " - "-- current performance: %f", - self.validation_performance_, - ) - - return None - self.last_hash = current_hash - opt_metric = [m for m in self.metrics if m.name == self.opt_metric][0] if not opt_metric: raise ValueError(f"Cannot optimize for {self.opt_metric} in {self.metrics} " @@ -483,6 +474,7 @@ def fit_ensemble( "Fitting the ensemble on %d models.", len(predictions_train), ) + self.logger.debug(f"predictions sent to ensemble: {predictions_train}") start_time = time.time() ensemble.fit( predictions_train, @@ -497,6 +489,7 @@ def fit_ensemble( "Fitting the ensemble took %.2f seconds.", end_time - start_time, ) + self.logger.debug(f"weights = {ensemble.weights_}") self.logger.info(str(ensemble)) self.validation_performance_ = min( self.validation_performance_, @@ -546,7 +539,9 @@ def predict(self, set_: str, else: pred_set = Y_ENSEMBLE - predictions = [self.read_preds[k][pred_set] for k in selected_keys if k is not None] + self.logger.debug(f"selected_keys with {set_} for predict are {selected_keys}") + predictions = [self.read_preds[k][pred_set] if k is not None else None for k in selected_keys] + self.logger.debug(f"predictions with {set_} for predict are {len(predictions)}") if n_preds == len(predictions): y = ensemble.predict(predictions) @@ -562,6 +557,7 @@ def predict(self, set_: str, ) return y else: + warnings.warn("this is not true so this is the problem") self.logger.info( "Found inconsistent number of predictions and models (%d vs " "%d) for subset %s", @@ -657,9 +653,9 @@ def get_ensemble_loss_with_model(self, model_predictions: np.ndarray): else: continue else: - if self.read_preds[identifier][Y_ENSEMBLE] == None: + if self.read_preds[identifier][Y_ENSEMBLE] is None: # y ensemble read_preds is loaded in get_n_best_preds. If there is no value for this that means its a new model at this iteration. 
- raise ValueError("check here to resolve starting condition") + raise ValueError(f"check here to resolve starting condition, {self.read_preds[identifier]}") predictions = self.read_preds[identifier][Y_ENSEMBLE] if i != self.ensemble_slot_j else model_predictions np.add( @@ -673,7 +669,7 @@ def get_ensemble_loss_with_model(self, model_predictions: np.ndarray): out=fant_ensemble_prediction ) loss = calculate_loss( - metrics=[self.metric], + metrics=self.metrics, target=self.y_true_ensemble, prediction=fant_ensemble_prediction, task_type=self.task_type, @@ -701,8 +697,10 @@ def _get_identifiers_from_num_runs(self, num_runs, subset='ensemble') -> List[Op identifier = None if num_run is not None: seed, idx, budget = num_run - identifier = self.backend.get_prediction_filename(subset, seed, idx, budget) - + identifier = os.path.join( + self.backend.get_numrun_directory(seed, idx, budget), + self.backend.get_prediction_filename(subset, seed, idx, budget) + ) identifiers.append(identifier) return identifiers diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index eeca7b6e4..11401fe2b 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -195,6 +195,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], if test_loss is not None: additional_run_info['test_loss'] = test_loss + additional_run_info['opt_loss'] = loss rval_dict = {'loss': cost, 'additional_run_info': additional_run_info, 'status': status} @@ -374,10 +375,10 @@ def _predict(self, pipeline: BaseEstimator, self.y_train[train_indices]) ensemble_dir = self.backend.get_ensemble_dir() - if len(os.listdir(ensemble_dir)) >= 1: + if os.path.exists(ensemble_dir) and len(os.listdir(ensemble_dir)) >= 1: old_ensemble = self.backend.load_ensemble(self.seed) assert isinstance(old_ensemble, StackingEnsemble) - ensemble_opt_pred = old_ensemble.predict_with_current_model(pipeline_opt_pred) + ensemble_opt_pred = old_ensemble.predict_with_current_pipeline(pipeline_opt_pred) else: ensemble_opt_pred = pipeline_opt_pred.copy() diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 17830ee94..c756d5e8e 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -21,6 +21,8 @@ from smac.stats.stats import Stats from smac.tae import StatusType, TAEAbortException from smac.tae.execute_func import AbstractTAFunc +from autoPyTorch.ensemble import ensemble_selection +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.datasets.resampling_strategy import ( @@ -28,6 +30,7 @@ HoldoutValTypes, NoResamplingStrategyTypes ) +import autoPyTorch.evaluation.stacking_evaluator from autoPyTorch.evaluation.test_evaluator import eval_test_function from autoPyTorch.evaluation.train_evaluator import eval_train_function from autoPyTorch.evaluation.utils import ( @@ -127,7 +130,8 @@ def __init__( ta: Optional[Callable] = None, logger_port: int = None, all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + ensemble_method = None ): self.backend = backend @@ -146,7 +150,10 @@ def __init__( self.resampling_strategy_args = dm.resampling_strategy_args if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): - eval_function = eval_train_function + if ensemble_method is 
None or ensemble_method == EnsembleSelectionTypes.ensemble_selection: + eval_function = eval_train_function + elif ensemble_method == EnsembleSelectionTypes.stacking_ensemble: + eval_function = autoPyTorch.evaluation.stacking_evaluator.eval_function self.output_y_hat_optimization = output_y_hat_optimization elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): eval_function = eval_test_function diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index b6242e379..945ff880d 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -25,6 +25,7 @@ NoResamplingStrategyTypes ) from autoPyTorch.ensemble.ensemble_builder_manager import EnsembleBuilderManager +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash from autoPyTorch.optimizer.utils import read_return_initial_configurations from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric @@ -115,6 +116,7 @@ def __init__(self, pynisher_context: str = 'spawn', min_budget: int = 5, max_budget: int = 50, + ensemble_method: int = EnsembleSelectionTypes.ensemble_selection ): """ Interface to SMAC. This method calls the SMAC optimize method, and allows @@ -228,6 +230,7 @@ def __init__(self, self.pynisher_context = pynisher_context self.min_budget = min_budget self.max_budget = max_budget + self.ensemble_method = ensemble_method self.ensemble_callback = ensemble_callback @@ -292,6 +295,7 @@ def run_smbo(self, func: Optional[Callable] = None pipeline_config=self.pipeline_config, search_space_updates=self.search_space_updates, pynisher_context=self.pynisher_context, + ensemble_method=self.ensemble_method ) ta = ExecuteTaFuncWithQueue self.logger.info("Finish creating Target Algorithm (TA) function") diff --git a/examples/20_basics/example_stacking_ensemble.py b/examples/20_basics/example_stacking_ensemble.py new file mode 100644 index 000000000..4ceefda8d --- /dev/null +++ b/examples/20_basics/example_stacking_ensemble.py @@ -0,0 +1,76 @@ +""" +====================== +Tabular Classification +====================== +The following example shows how to fit a sample classification model +with AutoPyTorch +""" +import os +import tempfile as tmp +import warnings + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import sklearn.datasets +import sklearn.model_selection + +from autoPyTorch.api.tabular_classification import TabularClassificationTask +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes + +############################################################################ +# Data Loading +# ============ +X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X[:200], + y[:200], + random_state=1, +) + +############################################################################ +# Build and fit a classifier +# ========================== +api = TabularClassificationTask( + # To maintain logs of the run, you can uncomment the + # Following lines + temporary_directory='./tmp/autoPyTorch_example_tmp_02', + output_directory='./tmp/autoPyTorch_example_out_02', + delete_tmp_folder_after_terminate=False, + 
delete_output_folder_after_terminate=False, + seed=42, + ensemble_method=EnsembleSelectionTypes.stacking_ensemble, + ensemble_size=5 +) + +############################################################################ +# Search for an ensemble of machine learning algorithms +# ===================================================== +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + dataset_name='Australian', + optimize_metric='zero_one_loss', + total_walltime_limit=300, + func_eval_time_limit_secs=50, + enable_traditional_pipeline=False +) + +############################################################################ +# Print the final ensemble performance +# ==================================== +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test, metric='accuracy') +print(score) +# Print the final ensemble built by AutoPyTorch +print(api.show_models()) + +# Print statistics from search +# print(api.sprint_statistics()) \ No newline at end of file From 3681c8f0b1d21e21668545c9b6721bf186ae374b Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 17:21:53 +0200 Subject: [PATCH 09/16] create callback and new smbo --- .../ensemble/ensemble_builder_manager.py | 1 - autoPyTorch/optimizer/run_history_callback.py | 293 ++++++++++++++++++ autoPyTorch/optimizer/utils.py | 128 +++++++- 3 files changed, 420 insertions(+), 2 deletions(-) create mode 100644 autoPyTorch/optimizer/run_history_callback.py diff --git a/autoPyTorch/ensemble/ensemble_builder_manager.py b/autoPyTorch/ensemble/ensemble_builder_manager.py index 06f8e696c..7c0786bb9 100644 --- a/autoPyTorch/ensemble/ensemble_builder_manager.py +++ b/autoPyTorch/ensemble/ensemble_builder_manager.py @@ -19,7 +19,6 @@ from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import BINARY -from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder from autoPyTorch.ensemble.utils import get_ensemble_builder_class from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.logging_ import get_named_client_logger diff --git a/autoPyTorch/optimizer/run_history_callback.py b/autoPyTorch/optimizer/run_history_callback.py new file mode 100644 index 000000000..02d0616ba --- /dev/null +++ b/autoPyTorch/optimizer/run_history_callback.py @@ -0,0 +1,293 @@ +from json import dump, load +import json +import logging +import os +import pickle +import re +import time +import traceback +from typing import List, Union, Dict, Tuple, Optional + +import dask.distributed +from distributed.utils import Any +from numpy.random.mtrand import seed + +from smac.optimizer.smbo import SMBO +from smac.runhistory.runhistory import RunInfo, RunKey +from torch.utils import data +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes + +from autoPyTorch.optimizer.utils import AdjustRunHistoryCallback +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.automl_common.common.utils.logging_ import get_named_client_logger + + +MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' + +class RunHistoryUpdaterManager(AdjustRunHistoryCallback): + def __init__( + self, + backend: Backend, + random_state: int, + dataset_name: str, + resampling_strategy: Union[ + HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes + ], + resampling_strategy_args: Dict[str, Any], + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + ): + """ + 
SMAC callback to update run history + Args: + backend: util.backend.Backend + backend to write and read files + logger_port: int + port in where to publish a msg + + Returns: + List[Tuple[int, float, float, float]]: + A list with the performance history of this ensemble, of the form + [[pandas_timestamp, train_performance, val_performance, test_performance], ...] + """ + + self.backend = backend + + self.random_state = random_state + self.logger_port = logger_port + + # We only submit new ensembles when there is not an active ensemble job + self.futures: List[dask.Future] = [] + + # The last criteria is the number of iterations + self.iteration = 0 + + # Keep track of when we started to know when we need to finish! + self.start_time = time.time() + self.dataset_name = dataset_name + self.resampling_strategy = resampling_strategy + self.resampling_strategy_args = resampling_strategy_args + + def __call__( + self, + smbo: 'SMBO', + ) -> None: + self.adjust_run_history(smbo.tae_runner.client) + + def adjust_run_history( + self, + dask_client: dask.distributed.Client, + unit_test: bool = False + ) -> None: + + # The second criteria is elapsed time + elapsed_time = time.time() - self.start_time + + logger = get_named_client_logger( + name='EnsembleBuilder', + port=self.logger_port, + ) + + if len(self.futures) != 0: + if self.futures[0].done(): + result = self.futures.pop().result() + if result: + ensemble_history, self.ensemble_nbest, _, _ = result + logger.debug("iteration={} @ elapsed_time={} has history={}".format( + self.iteration, + elapsed_time, + ensemble_history, + )) + + # Only submit new jobs if the previous ensemble job finished + if len(self.futures) == 0: + + # Add the result of the run + # On the next while iteration, no references to + # ensemble builder object, so it should be garbage collected to + # save memory while waiting for resources + # Also, notice how ensemble nbest is returned, so we don't waste + # iterations testing if the deterministic predictions size can + # be fitted in memory + try: + # Submit a Dask job from this job, to properly + # see it in the dask diagnostic dashboard + # Notice that the forked ensemble_builder_process will + # wait for the below function to be done + self.futures.append( + dask_client.submit( + return_run_info_cost, + backend=self.backend, + dataset_name=self.dataset_name, + iteration=self.iteration, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + logger_port=self.logger_port, + priority=100 + ) + ) + + logger.info( + "{}/{} Started Ensemble builder job at {} for iteration {}.".format( + # Log the client to make sure we + # remain connected to the scheduler + self.futures[0], + dask_client, + time.strftime("%Y.%m.%d-%H.%M.%S"), + self.iteration, + ), + ) + self.iteration += 1 + except Exception as e: + exception_traceback = traceback.format_exc() + error_message = repr(e) + logger.critical(exception_traceback) + logger.critical(error_message) + + +def return_run_info_cost( + backend: Backend, + dataset_name: str, + resampling_strategy: Union[ + HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes + ], + resampling_strategy_args: Dict[str, Any], + iteration: int, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, +) -> Optional[List[Tuple[RunKey, float]]]: + """ + A short function to fit and create an ensemble. 
It is just a wrapper to easily send + a request to dask to create an ensemble and clean the memory when finished + Parameters + ---------- + backend: util.backend.Backend + backend to write and read files + dataset_name: str + name of dataset + metrics: List[autoPyTorchMetric], + A set of metrics that will be used to get performance estimates + opt_metric: + Name of the metric to optimize + task_type: int + type of output expected in the ground truth + ensemble_size: int + maximal size of ensemble (passed to ensemble.ensemble_selection) + ensemble_nbest: int/float + if int: consider only the n best prediction + if float: consider only this fraction of the best models + Both wrt to validation predictions + If performance_range_threshold > 0, might return less models + max_models_on_disc: int + Defines the maximum number of models that are kept in the disc. + If int, it must be greater or equal than 1, and dictates the max number of + models to keep. + If float, it will be interpreted as the max megabytes allowed of disc space. That + is, if the number of ensemble candidates require more disc space than this float + value, the worst models will be deleted to keep within this budget. + Models and predictions of the worst-performing models will be deleted then. + If None, the feature is disabled. + It defines an upper bound on the models that can be used in the ensemble. + seed: int + random seed + precision (int): [16,32,64,128] + precision of floats to read the predictions + memory_limit: Optional[int] + memory limit in mb. If ``None``, no memory limit is enforced. + read_at_most: int + read at most n new prediction files in each iteration + end_at: float + At what time the job must finish. Needs to be the endtime and not the time left + because we do not know when dask schedules the job. + iteration: int + The current iteration + pynisher_context: str + Context to use for multiprocessing, can be either fork, spawn or forkserver. + logger_port: int + The port where the logging server is listening to. + unit_test: bool + Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError. + Having this is very bad coding style, but I did not find a way to make + unittest.mock work through the pynisher with all spawn contexts. If you know a + better solution, please let us know by opening an issue. + Returns + ------- + List[Tuple[int, float, float, float]] + A list with the performance history of this ensemble, of the form + [[pandas_timestamp, train_performance, val_performance, test_performance], ...] + """ + result = RunHistoryUpdater( + backend=backend, + dataset_name=dataset_name, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + logger_port=logger_port, + ).run( + iteration=iteration, + ) + return result + + +class RunHistoryUpdater: + def __init__( + self, + backend: Backend, + dataset_name: str, + resampling_strategy: Union[ + HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes + ], + resampling_strategy_args: Dict[str, Any], + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + ): + """ + SMAC callback to update run history + Args: + backend: util.backend.Backend + backend to write and read files + logger_port: int + port in where to publish a msg + + Returns: + List[Tuple[int, float, float, float]]: + A list with the performance history of this ensemble, of the form + [[pandas_timestamp, train_performance, val_performance, test_performance], ...] 
+ """ + + self.model_fn_re = re.compile(MODEL_FN_RE) + self.logger_port = logger_port + self.logger = get_named_client_logger( + name='RunHistoryUpdater', + port=self.logger_port, + ) + self.ensemble_loss_file = os.path.join(backend.internals_directory, 'ensemble_read_losses.pkl') + if isinstance(resampling_strategy, CrossValTypes): + num_splits = resampling_strategy_args['num_splits'] + self.instances = [[json.dumps({'task_id': dataset_name, + 'fold': fold_number})] + for fold_number in range(num_splits)] + else: + self.instances = [[json.dumps({'task_id': dataset_name})]] + + def run(self, iteration: int) -> Optional[List[Tuple[RunKey, float]]]: + results: List[Tuple[RunInfo, float]] = [] + if os.path.exists(self.ensemble_loss_file): + try: + with (open(self.ensemble_loss_file, "rb")) as memory: + read_losses = pickle.load(memory) + except Exception as e: + self.logger.debug(f"Could not read losses at iteration: {iteration} with exception {e}") + return + else: + for k in read_losses.keys(): + match = self.model_fn_re.search(k) + if match is None or read_losses[k]["loaded"] != 1: + continue + else: + _num_run = int(match.group(2)) + _budget = float(match.group(3)) + run_key = RunKey( + seed=0, # 0 is hardcoded for the runhistory coming from smac + config_id=_num_run, + budget=_budget, + instance_id=self.instances[-1] + ) + results.append((run_key, read_losses[k]["ens_loss"])) + return results diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index 6fb9d5024..37c6795fc 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -1,10 +1,25 @@ import json import os import warnings -from typing import Any, Dict, List +from typing import Any, Dict, List, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +import numpy as np + +from smac.optimizer.smbo import SMBO +from smac.scenario.scenario import Scenario +from smac.stats.stats import Stats +from smac.initial_design.initial_design import InitialDesign +from smac.runhistory.runhistory import RunHistory, RunInfo, RunValue +from smac.runhistory.runhistory2epm import AbstractRunHistory2EPM +from smac.intensification.abstract_racer import AbstractRacer +from smac.epm.rf_with_instances import RandomForestWithInstances +from smac.optimizer.ei_optimization import AbstractAcquisitionFunction, AcquisitionFunctionMaximizer +from smac.tae import FirstRunCrashedException, StatusType, TAEAbortException +from smac.tae.base import BaseRunner +from smac.optimizer.random_configuration_chooser import RandomConfigurationChooser, ChooserNoCoolDown + def read_return_initial_configurations( config_space: ConfigurationSpace, @@ -31,3 +46,114 @@ def read_return_initial_configurations( f"Therefore, it can't be used as an initial " f"configuration as it does not match the current config space. 
") return initial_configurations + +class AdjustRunHistoryCallback: + """ + Allows manipulating run history for custom needs + """ + def __call__(self, smbo: 'SMBO') -> RunHistory: + pass + +class autoPyTorchSMBO(SMBO): + def __init__(self, + scenario: Scenario, + stats: Stats, + initial_design: InitialDesign, + runhistory: RunHistory, + runhistory2epm: AbstractRunHistory2EPM, + intensifier: AbstractRacer, + num_run: int, + model: RandomForestWithInstances, + acq_optimizer: AcquisitionFunctionMaximizer, + acquisition_func: AbstractAcquisitionFunction, + rng: np.random.RandomState, + tae_runner: BaseRunner, + restore_incumbent: Configuration = None, + random_configuration_chooser: Union[RandomConfigurationChooser] = ChooserNoCoolDown(2.0), + predict_x_best: bool = True, + min_samples_model: int = 1): + super().__init__( + scenario, + stats, + initial_design, + runhistory, + runhistory2epm, + intensifier, + num_run, + model, + acq_optimizer, + acquisition_func, + rng, + tae_runner, + restore_incumbent, + random_configuration_chooser, + predict_x_best, + min_samples_model, + ) + self._callbacks.update({'_adjust_run_history': list()}) + self._callback_to_key.update({AdjustRunHistoryCallback: '_adjust_run_history'}) + + def _incorporate_run_results(self, run_info: RunInfo, result: RunValue, time_left: float) -> None: + # update SMAC stats + self.stats.ta_time_used += float(result.time) + self.stats.finished_ta_runs += 1 + + self.logger.debug( + "Return: Status: %r, cost: %f, time: %f, additional: %s" % ( + result.status, result.cost, result.time, str(result.additional_info) + ) + ) + + self.runhistory.add( + config=run_info.config, + cost=result.cost, + time=result.time, + status=result.status, + instance_id=run_info.instance, + seed=run_info.seed, + budget=run_info.budget, + starttime=result.starttime, + endtime=result.endtime, + force_update=True, + additional_info=result.additional_info, + ) + self.stats.n_configs = len(self.runhistory.config_ids) + + if result.status == StatusType.ABORT: + raise TAEAbortException("Target algorithm status ABORT - SMAC will " + "exit. The last incumbent can be found " + "in the trajectory-file.") + elif result.status == StatusType.STOP: + self._stop = True + return + + if self.scenario.abort_on_first_run_crash: # type: ignore[attr-defined] # noqa F821 + if self.stats.finished_ta_runs == 1 and result.status == StatusType.CRASHED: + raise FirstRunCrashedException( + "First run crashed, abort. Please check your setup -- we assume that your default " + "configuration does not crashes. (To deactivate this exception, use the SMAC scenario option " + "'abort_on_first_run_crash'). 
Additional run info: %s" % result.additional_info + ) + for callback in self._callbacks['_incorporate_run_results']: + response = callback(smbo=self, run_info=run_info, result=result, time_left=time_left) + # If a callback returns False, the optimization loop should be interrupted + # the other callbacks are still being called + if response is False: + self.logger.debug("An IncorporateRunResultCallback returned False, requesting abort.") + self._stop = True + + for callback in self._callbacks['_adjust_run_history']: + result = callback(smbo=self) + # Update the intensifier with the result of the runs + self.incumbent, inc_perf = self.intensifier.process_results( + run_info=run_info, + incumbent=self.incumbent, + run_history=self.runhistory, + time_bound=max(self._min_time, time_left), + result=result, + ) + + if self.scenario.save_instantly: # type: ignore[attr-defined] # noqa F821 + self.save() + + return From adca8d2e3aed868bc937270c1d8feeec1ab7df9b Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 17:39:16 +0200 Subject: [PATCH 10/16] Fixed bugs in running paper code, should be fine now (clean) --- autoPyTorch/api/base_task.py | 38 +++- autoPyTorch/api/tabular_classification.py | 4 + .../ensemble/stacking_ensemble_builder.py | 163 ++++++------------ autoPyTorch/evaluation/stacking_evaluator.py | 4 +- autoPyTorch/optimizer/run_history_callback.py | 9 +- autoPyTorch/optimizer/smbo.py | 16 +- autoPyTorch/optimizer/utils.py | 13 +- .../20_basics/example_stacking_ensemble.py | 13 +- 8 files changed, 135 insertions(+), 125 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index d4d734e6f..3366c6bad 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -27,6 +27,7 @@ import pandas as pd +from smac.optimizer.smbo import SMBO from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue from smac.stats.stats import Stats from smac.tae import StatusType @@ -53,6 +54,7 @@ from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash from autoPyTorch.evaluation.utils import DisableFileOutputParameters +from autoPyTorch.optimizer.run_history_callback import RunHistoryUpdaterManager from autoPyTorch.optimizer.smbo import AutoMLSMBO from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners @@ -974,7 +976,8 @@ def _search( disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, - dask_client: Optional[dask.distributed.Client] = None + dask_client: Optional[dask.distributed.Client] = None, + smbo_class: Optional[SMBO] = None ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. 
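To make the new hook's data flow easier to follow: `autoPyTorchSMBO` keeps an extra `_adjust_run_history` callback list, and after each evaluated configuration the registered callbacks return `(run_key, cost)` pairs that overwrite costs already stored in SMAC's run history (the final form of this contract appears in a later commit of this series, where the callback takes no arguments and returns the pairs). The following is a minimal, self-contained sketch of that pattern only; `MiniSMBO`, `EnsembleLossCallback` and the dict-based run history are illustrative stand-ins, not autoPyTorch or SMAC APIs.

```python
from typing import Dict, List, Optional, Tuple


class AdjustRunHistoryCallback:
    """Illustrative stand-in for the callback base class added in this patch."""

    def __call__(self) -> Optional[List[Tuple[str, float]]]:
        raise NotImplementedError


class MiniSMBO:
    """Toy optimisation loop: after each run it fires the '_adjust_run_history'
    callbacks and overwrites costs in a dict-based run history with the
    (run_key, cost) pairs they return."""

    def __init__(self) -> None:
        self.runhistory: Dict[str, float] = {"run_0": 0.40, "run_1": 0.35}
        self._callbacks: Dict[str, List[AdjustRunHistoryCallback]] = {"_adjust_run_history": []}

    def register_callback(self, callback: AdjustRunHistoryCallback) -> None:
        # SMAC routes callbacks to the right list via _callback_to_key; a type check is enough here
        if isinstance(callback, AdjustRunHistoryCallback):
            self._callbacks["_adjust_run_history"].append(callback)

    def incorporate_run_result(self) -> None:
        for callback in self._callbacks["_adjust_run_history"]:
            response = callback()
            if response is None:
                continue
            for run_key, cost in response:
                # only update runs that already exist, mirroring the guard in the patch above
                if run_key in self.runhistory:
                    self.runhistory[run_key] = cost


class EnsembleLossCallback(AdjustRunHistoryCallback):
    """Pretends the ensemble builder found run_1 far more useful inside the stack."""

    def __call__(self) -> Optional[List[Tuple[str, float]]]:
        return [("run_1", 0.10)]


if __name__ == "__main__":
    opt = MiniSMBO()
    opt.register_callback(EnsembleLossCallback())
    opt.incorporate_run_result()
    print(opt.runhistory)  # {'run_0': 0.4, 'run_1': 0.1}
```

The real implementation replaces the stored `RunValue` object rather than a bare float, but the control flow is the same.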
@@ -1215,7 +1218,7 @@ def _search( # ============> Run dummy predictions # We only want to run dummy predictions in case we want to build an ensemble - if self.ensemble_size > 0: + if self.ensemble_size > 0 and self.ensemble_method != EnsembleSelectionTypes.stacking_ensemble: dummy_task_name = 'runDummy' self._stopwatch.start_task(dummy_task_name) self._do_dummy_prediction() @@ -1248,7 +1251,6 @@ def _search( else: self._logger.info("Starting ensemble") ensemble_task_name = 'ensemble' - self._stopwatch.start_task(ensemble_task_name) proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles, ensemble_size=self.ensemble_size, ensemble_nbest=self.ensemble_nbest, @@ -1256,7 +1258,12 @@ def _search( optimize_metric=self.opt_metric, ensemble_method=self.ensemble_method ) - self._stopwatch.stop_task(ensemble_task_name) + proc_runhistory_updater = None + if ( + self.ensemble_method == EnsembleSelectionTypes.stacking_ensemble + and smbo_class is not None + ): + proc_runhistory_updater = self._init_result_history_updater() # ==> Run SMAC smac_task_name: str = 'runSMAC' @@ -1299,6 +1306,8 @@ def _search( search_space_updates=self.search_space_updates, portfolio_selection=portfolio_selection, pynisher_context=self._multiprocessing_context, + smbo_class = smbo_class, + other_callbacks=[proc_runhistory_updater] if proc_runhistory_updater is not None else None ) try: run_history, self._results_manager.trajectory, budget_type = \ @@ -1934,6 +1943,27 @@ def _collect_results_ensemble( pd.DataFrame(self.ensemble_performance_history).to_json( os.path.join(self._backend.internals_directory, 'ensemble_history.json')) + def _init_result_history_updater(self): + if self.dataset is None: + raise ValueError("runhistory updater can only be initialised after or during `search()`. " + "Please call the `search()` method of {}.".format(self.__class__.__name__)) + + self._logger.info("Starting Runhistory updater") + runhistory_task_name = 'runhistory_updater' + self._stopwatch.start_task(runhistory_task_name) + + proc_runhistory_updater = RunHistoryUpdaterManager( + backend=self._backend, + dataset_name=self.dataset_name, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + logger_port=self._logger_port + ) + + self._stopwatch.stop_task(runhistory_task_name) + + return proc_runhistory_updater + def predict( self, X_test: np.ndarray, diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index da1cf293b..5641c1005 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -4,6 +4,8 @@ import pandas as pd +from smac.optimizer.smbo import SMBO + from autoPyTorch.api.base_task import BaseTask from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import ( @@ -256,6 +258,7 @@ def search( load_models: bool = True, portfolio_selection: Optional[str] = None, dataset_compression: Union[Mapping[str, Any], bool] = False, + smbo_class: Optional[SMBO] = None ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. 
@@ -455,6 +458,7 @@ def search( disable_file_output=disable_file_output, load_models=load_models, portfolio_selection=portfolio_selection, + smbo_class=smbo_class ) def predict( diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index ad6136e26..9b324af8b 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -117,7 +117,7 @@ def __init__( # we can do this by either storing and reading them in this class # or passing them via the ensemble builder manager which has persistency with the futures stored. self.ensemble_identifiers: Optional[List[Optional[str]]] = None - + self.read_losses = {} # TODO: This is the main wrapper to the EnsembleSelection class which fits # TODO: the ensemble @@ -207,18 +207,9 @@ def main( # populates test predictions in self.read_preds # reduces selected models if file reading failed - n_sel_test = self.get_test_preds(selected_keys=candidate_models) - - # If any of n_sel_* is not empty and overlaps with candidate_models, - # then ensure candidate_models AND n_sel_test are sorted the same - candidate_models_set = set(candidate_models) - if candidate_models_set.intersection(n_sel_test): - candidate_models = sorted(list(candidate_models_set.intersection( - n_sel_test))) - n_sel_test = candidate_models - else: - # This has to be the case - n_sel_test = [] + candidate_models = self.get_test_preds(selected_keys=candidate_models) + + self.logger.debug(f"n_sel_test: {candidate_models}") if os.environ.get('ENSEMBLE_KEEP_ALL_CANDIDATES'): for candidate in candidate_models: @@ -227,6 +218,8 @@ def main( # as candidate models is sorted in `get_n_best_preds` best_model_identifier = candidate_models[0] + self.logger.debug(f"for iteration {iteration}, best_model_identifier: {best_model_identifier} \n candidate_models: \n{candidate_models}") + # train ensemble ensemble = self.fit_ensemble( best_model_identifier=best_model_identifier @@ -334,6 +327,7 @@ def compute_ensemble_loss_per_model(self) -> bool: # Mypy assumes sorted returns an object because of the lambda. 
Can't get to recognize the list # as a returning list, so as a work-around we skip next line for y_ens_fn, match, _seed, _num_run, _budget in sorted(to_read, key=lambda x: x[3]): # type: ignore + self.logger.debug(f"This is for model {y_ens_fn}") if self.read_at_most and n_read_files >= self.read_at_most: # limit the number of files that will be read # to limit memory consumption @@ -343,22 +337,21 @@ def compute_ensemble_loss_per_model(self) -> bool: self.logger.info('Error loading file (not .npy or .npy.gz): %s', y_ens_fn) continue - if not self.read_losses.get(y_ens_fn): - self.read_losses[y_ens_fn] = { - "ens_loss": np.inf, - "mtime_ens": 0, - "mtime_test": 0, - "seed": _seed, - "num_run": _num_run, - "budget": _budget, - "disc_space_cost_mb": None, - # Lazy keys so far: - # 0 - not loaded - # 1 - loaded and in memory - # 2 - loaded but dropped again - # 3 - deleted from disk due to space constraints - "loaded": 0 - } + self.read_losses[y_ens_fn] = { + "ens_loss": np.inf, + "mtime_ens": 0, + "mtime_test": 0, + "seed": _seed, + "num_run": _num_run, + "budget": _budget, + "disc_space_cost_mb": None, + # Lazy keys so far: + # 0 - not loaded + # 1 - loaded and in memory + # 2 - loaded but dropped again + # 3 - deleted from disk due to space constraints + "loaded": 0 + } if not self.read_preds.get(y_ens_fn): self.read_preds[y_ens_fn] = { Y_ENSEMBLE: None, @@ -371,20 +364,12 @@ def compute_ensemble_loss_per_model(self) -> bool: # actually read the predictions and compute their respective loss try: + ensemble_idenitfiers = self.ensemble_identifiers.copy() + ensemble_idenitfiers[self.ensemble_slot_j] = y_ens_fn y_ensemble = self._read_np_fn(y_ens_fn) losses = self.get_ensemble_loss_with_model( - model_predictions=y_ensemble - ) - - if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): - self.logger.debug( - 'Changing ensemble loss for file %s from %f to %f ' - 'because file modification time changed? 
%f - %f', - y_ens_fn, - self.read_losses[y_ens_fn]["ens_loss"], - losses[self.opt_metric], - self.read_losses[y_ens_fn]["mtime_ens"], - os.path.getmtime(y_ens_fn), + model_predictions=y_ensemble, + ensemble_identifiers=ensemble_idenitfiers ) self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] @@ -470,11 +455,11 @@ def fit_ensemble( ) try: - self.logger.debug( - "Fitting the ensemble on %d models.", - len(predictions_train), - ) - self.logger.debug(f"predictions sent to ensemble: {predictions_train}") + # self.logger.debug( + # "Fitting the ensemble on %d models.", + # len(predictions_train), + # ) + # self.logger.debug(f"predictions sent to ensemble: {predictions_train}") start_time = time.time() ensemble.fit( predictions_train, @@ -489,7 +474,7 @@ def fit_ensemble( "Fitting the ensemble took %.2f seconds.", end_time - start_time, ) - self.logger.debug(f"weights = {ensemble.weights_}") + # self.logger.debug(f"weights = {ensemble.weights_}") self.logger.info(str(ensemble)) self.validation_performance_ = min( self.validation_performance_, @@ -539,9 +524,7 @@ def predict(self, set_: str, else: pred_set = Y_ENSEMBLE - self.logger.debug(f"selected_keys with {set_} for predict are {selected_keys}") predictions = [self.read_preds[k][pred_set] if k is not None else None for k in selected_keys] - self.logger.debug(f"predictions with {set_} for predict are {len(predictions)}") if n_preds == len(predictions): y = ensemble.predict(predictions) @@ -582,35 +565,6 @@ def get_candidate_preds(self) -> List[str]: sorted_keys = self._get_list_of_sorted_preds() - # number of models available - num_keys = len(sorted_keys) - # remove all that are at most as good as random - # note: dummy model must have run_id=1 (there is no run_id=0) - dummy_losses = list(filter(lambda x: x[2] == 1, sorted_keys)) - # Leave this here for when we enable dummy classifier/scorer - if len(dummy_losses) > 0: - # number of dummy models - num_dummy = len(dummy_losses) - dummy_loss = dummy_losses[0] - self.logger.debug("Use %f as dummy loss" % dummy_loss[1]) - sorted_keys = list(filter(lambda x: x[1] < dummy_loss[1], sorted_keys)) - - # remove Dummy Classifier - sorted_keys = list(filter(lambda x: x[2] > 1, sorted_keys)) - if len(sorted_keys) == 0: - # no model left; try to use dummy loss (num_run==0) - # log warning when there are other models but not better than dummy model - if num_keys > num_dummy: - self.logger.warning("No models better than random - using Dummy Score!" - "Number of models besides current dummy model: %d. 
" - "Number of dummy models: %d", - num_keys - 1, - num_dummy) - sorted_keys = [ - (k, v["ens_loss"], v["num_run"]) for k, v in self.read_losses.items() - if v["seed"] == self.seed and v["num_run"] == 1 - ] - # reduce to keys reduced_sorted_keys = list(map(lambda x: x[0], sorted_keys)) @@ -629,7 +583,10 @@ def get_candidate_preds(self) -> List[str]: # return best scored keys of self.read_losses return reduced_sorted_keys - def get_ensemble_loss_with_model(self, model_predictions: np.ndarray): + def get_ensemble_loss_with_model(self, + model_predictions: np.ndarray, + ensemble_identifiers: List[str] + ): """ Gets the loss of the ensemble given slot j and predictions for new model at slot j set is ensemble @@ -637,41 +594,33 @@ def get_ensemble_loss_with_model(self, model_predictions: np.ndarray): model_predictions ([type]): [description] """ - weighted_ensemble_prediction = np.zeros( - model_predictions.shape, - dtype=np.float64, - ) - fant_ensemble_prediction = np.zeros( - weighted_ensemble_prediction.shape, - dtype=np.float64, - ) + # self.logger.debug(f"in ensemble_loss predictions for current are \n{model_predictions}") + self.logger.debug(f"in ensemble_loss ensemble_identifiers: {ensemble_identifiers}") + + average_predictions = np.zeros_like(model_predictions, dtype=np.float64) + tmp_predictions = np.empty_like(model_predictions, dtype=np.float64) + nonnull_identifiers = len([identifier for identifier in ensemble_identifiers if identifier is not None]) - for i, identifier in enumerate(self.ensemble_identifiers): - if identifier is None: - if i == self.ensemble_slot_j: + self.logger.debug(f"non null identifiers : {nonnull_identifiers}") + weight = 1. / float(nonnull_identifiers) + # if prediction model.shape[0] == len(non_null_weights), + # predictions do not include those of zero-weight models. + for identifier in ensemble_identifiers: + if identifier is not None: + if self.read_preds[identifier][Y_ENSEMBLE] is None: predictions = model_predictions else: - continue + predictions = self.read_preds[identifier][Y_ENSEMBLE] else: - if self.read_preds[identifier][Y_ENSEMBLE] is None: - # y ensemble read_preds is loaded in get_n_best_preds. If there is no value for this that means its a new model at this iteration. - raise ValueError(f"check here to resolve starting condition, {self.read_preds[identifier]}") - predictions = self.read_preds[identifier][Y_ENSEMBLE] if i != self.ensemble_slot_j else model_predictions - - np.add( - weighted_ensemble_prediction, - predictions, - out=fant_ensemble_prediction - ) - np.multiply( - fant_ensemble_prediction, - (1. 
/ float(self.ensemble_size)), - out=fant_ensemble_prediction - ) + break + + np.multiply(predictions, weight, out=tmp_predictions) + np.add(average_predictions, tmp_predictions, out=average_predictions) + loss = calculate_loss( metrics=self.metrics, target=self.y_true_ensemble, - prediction=fant_ensemble_prediction, + prediction=average_predictions, task_type=self.task_type, ) return loss diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index 11401fe2b..d01c846b0 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -11,6 +11,7 @@ from smac.tae import StatusType +from autoPyTorch import ensemble from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import ( CLASSIFICATION_TASKS, @@ -160,7 +161,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], if file_output: loss_, additional_run_info_ = self.file_output( - ensemble_opt_pred, valid_pred, test_pred + pipeline_opt_pred, valid_pred, test_pred ) else: loss_ = None @@ -382,6 +383,7 @@ def _predict(self, pipeline: BaseEstimator, else: ensemble_opt_pred = pipeline_opt_pred.copy() + self.logger.debug(f"for model {self.seed}_{self.num_run}_{self.budget} ensemble_predictions are {ensemble_opt_pred}") if self.X_valid is not None: valid_pred = self.predict_function(self.X_valid, pipeline, self.y_valid) diff --git a/autoPyTorch/optimizer/run_history_callback.py b/autoPyTorch/optimizer/run_history_callback.py index 02d0616ba..6b020e174 100644 --- a/autoPyTorch/optimizer/run_history_callback.py +++ b/autoPyTorch/optimizer/run_history_callback.py @@ -28,7 +28,6 @@ class RunHistoryUpdaterManager(AdjustRunHistoryCallback): def __init__( self, backend: Backend, - random_state: int, dataset_name: str, resampling_strategy: Union[ HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes @@ -52,7 +51,6 @@ def __init__( self.backend = backend - self.random_state = random_state self.logger_port = logger_port # We only submit new ensembles when there is not an active ensemble job @@ -91,11 +89,11 @@ def adjust_run_history( if self.futures[0].done(): result = self.futures.pop().result() if result: - ensemble_history, self.ensemble_nbest, _, _ = result - logger.debug("iteration={} @ elapsed_time={} has history={}".format( + response = result + logger.debug("iteration={} @ elapsed_time={} has response={}".format( self.iteration, elapsed_time, - ensemble_history, + response, )) # Only submit new jobs if the previous ensemble job finished @@ -267,6 +265,7 @@ def __init__( self.instances = [[json.dumps({'task_id': dataset_name})]] def run(self, iteration: int) -> Optional[List[Tuple[RunKey, float]]]: + self.logger.info(f"Starting iteration {iteration} of run history updater") results: List[Tuple[RunInfo, float]] = [] if os.path.exists(self.ensemble_loss_file): try: diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 945ff880d..1238e608d 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -10,6 +10,7 @@ from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.hyperband import Hyperband +from smac.optimizer.smbo import SMBO from smac.runhistory.runhistory import RunHistory from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost from smac.scenario.scenario import Scenario @@ -43,6 +44,7 @@ def get_smac_object( initial_budget: int, max_budget: int, dask_client: Optional[dask.distributed.Client], + smbo_class: 
Optional[SMBO] = None, initial_configurations: Optional[List[Configuration]] = None, ) -> SMAC4AC: """ @@ -80,6 +82,7 @@ def get_smac_object( 'eta': 3, 'min_chall': 1, 'instance_order': 'shuffle_once'}, dask_client=dask_client, n_jobs=n_jobs, + smbo_class=smbo_class ) @@ -116,7 +119,9 @@ def __init__(self, pynisher_context: str = 'spawn', min_budget: int = 5, max_budget: int = 50, - ensemble_method: int = EnsembleSelectionTypes.ensemble_selection + ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, + other_callbacks: Optional[List] = None, + smbo_class: Optional[SMBO] = None ): """ Interface to SMAC. This method calls the SMAC optimize method, and allows @@ -234,6 +239,9 @@ def __init__(self, self.ensemble_callback = ensemble_callback + self.other_callbacks = other_callbacks + self.smbo_class = smbo_class + self.search_space_updates = search_space_updates if logger_port is None: @@ -362,11 +370,15 @@ def run_smbo(self, func: Optional[Callable] = None initial_budget=self.min_budget, max_budget=self.max_budget, dask_client=self.dask_client, - initial_configurations=self.initial_configurations) + initial_configurations=self.initial_configurations, + smbo_class=self.smbo_class) if self.ensemble_callback is not None: smac.register_callback(self.ensemble_callback) + if self.other_callbacks is not None: + for callback in self.other_callbacks: + smac.register_callback(callback) self.logger.info("initialised SMBO, running SMBO.optimize()") smac.optimize() diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index 37c6795fc..b15cf2580 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -20,6 +20,7 @@ from smac.tae.base import BaseRunner from smac.optimizer.random_configuration_chooser import RandomConfigurationChooser, ChooserNoCoolDown +from autoPyTorch.utils.common import dict_repr def read_return_initial_configurations( config_space: ConfigurationSpace, @@ -134,6 +135,7 @@ def _incorporate_run_results(self, run_info: RunInfo, result: RunValue, time_lef "configuration does not crashes. (To deactivate this exception, use the SMAC scenario option " "'abort_on_first_run_crash'). 
Additional run info: %s" % result.additional_info ) + self.logger.debug(f"\nbefore ensemble, result: {result}, \nrunhistory: {self.runhistory.data}") for callback in self._callbacks['_incorporate_run_results']: response = callback(smbo=self, run_info=run_info, result=result, time_left=time_left) # If a callback returns False, the optimization loop should be interrupted @@ -143,7 +145,16 @@ def _incorporate_run_results(self, run_info: RunInfo, result: RunValue, time_lef self._stop = True for callback in self._callbacks['_adjust_run_history']: - result = callback(smbo=self) + response = callback(smbo=self) + if response is not None: + for run_key, cost in response: + run_value = self.runhistory.data.get(run_key, None) + if run_value is not None: + run_value.cost = cost + self.epm_chooser.runhistory = self.runhistory + + self.logger.debug(f"\nafter runhistory updater, result: {result}, \nrunhistory: {dict_repr(self.runhistory.data)}") + # Update the intensifier with the result of the runs self.incumbent, inc_perf = self.intensifier.process_results( run_info=run_info, diff --git a/examples/20_basics/example_stacking_ensemble.py b/examples/20_basics/example_stacking_ensemble.py index 4ceefda8d..e3d7c308a 100644 --- a/examples/20_basics/example_stacking_ensemble.py +++ b/examples/20_basics/example_stacking_ensemble.py @@ -22,14 +22,15 @@ from autoPyTorch.api.tabular_classification import TabularClassificationTask from autoPyTorch.ensemble.utils import EnsembleSelectionTypes +from autoPyTorch.optimizer.utils import autoPyTorchSMBO ############################################################################ # Data Loading # ============ X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X[:200], - y[:200], + X, + y, random_state=1, ) @@ -57,10 +58,12 @@ X_test=X_test.copy(), y_test=y_test.copy(), dataset_name='Australian', - optimize_metric='zero_one_loss', - total_walltime_limit=300, + optimize_metric='accuracy', + total_walltime_limit=1000, func_eval_time_limit_secs=50, - enable_traditional_pipeline=False + enable_traditional_pipeline=False, + smbo_class=autoPyTorchSMBO, + all_supported_metrics=False ) ############################################################################ From 4f8289c678eace59b7c00ca2f5dbb2fe4d9c57dd Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 17:52:07 +0200 Subject: [PATCH 11/16] removed finished TODO comments and fix run_history_updater (clean) --- autoPyTorch/api/base_task.py | 13 ++- autoPyTorch/ensemble/stacking_ensemble.py | 17 +--- .../ensemble/stacking_ensemble_builder.py | 21 ++--- autoPyTorch/evaluation/stacking_evaluator.py | 11 +-- autoPyTorch/optimizer/run_history_callback.py | 91 +++++++------------ autoPyTorch/optimizer/utils.py | 15 ++- 6 files changed, 63 insertions(+), 105 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 3366c6bad..b4ba01c99 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1250,7 +1250,6 @@ def _search( self._logger.info("Not starting ensemble builder as ensemble size is 0") else: self._logger.info("Starting ensemble") - ensemble_task_name = 'ensemble' proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles, ensemble_size=self.ensemble_size, ensemble_nbest=self.ensemble_nbest, @@ -1258,12 +1257,15 @@ def _search( optimize_metric=self.opt_metric, ensemble_method=self.ensemble_method ) + + 
smac_initial_num_run = self._backend.get_next_num_run(peek=True) + proc_runhistory_updater = None if ( self.ensemble_method == EnsembleSelectionTypes.stacking_ensemble and smbo_class is not None ): - proc_runhistory_updater = self._init_result_history_updater() + proc_runhistory_updater = self._init_result_history_updater(initial_num_run=smac_initial_num_run) # ==> Run SMAC smac_task_name: str = 'runSMAC' @@ -1302,7 +1304,7 @@ def _search( logger_port=self._logger_port, # We do not increase the num_run here, this is something # smac does internally - start_num_run=self._backend.get_next_num_run(peek=True), + start_num_run=smac_initial_num_run, search_space_updates=self.search_space_updates, portfolio_selection=portfolio_selection, pynisher_context=self._multiprocessing_context, @@ -1943,7 +1945,7 @@ def _collect_results_ensemble( pd.DataFrame(self.ensemble_performance_history).to_json( os.path.join(self._backend.internals_directory, 'ensemble_history.json')) - def _init_result_history_updater(self): + def _init_result_history_updater(self, initial_num_run: int) -> RunHistoryUpdaterManager: if self.dataset is None: raise ValueError("runhistory updater can only be initialised after or during `search()`. " "Please call the `search()` method of {}.".format(self.__class__.__name__)) @@ -1957,7 +1959,8 @@ def _init_result_history_updater(self): dataset_name=self.dataset_name, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - logger_port=self._logger_port + logger_port=self._logger_port, + initial_num_run=initial_num_run ) self._stopwatch.stop_task(runhistory_task_name) diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/stacking_ensemble.py index 913a3024f..a0acc9015 100644 --- a/autoPyTorch/ensemble/stacking_ensemble.py +++ b/autoPyTorch/ensemble/stacking_ensemble.py @@ -11,10 +11,6 @@ from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss -# TODO: for now we can use this and pass this to stacking evaluator. -# TODO: This can be achieved by using `backend.load_ensemble` -# TODO: it loads the last stored ensemble. So we have access to it. -# TODO: the ensemble is a pickled file containing the fitted ensemble of this class. # TODO: Think of functionality of the functions in this class adjusted for stacking. 
class StackingEnsemble(AbstractEnsemble): def __init__( @@ -153,7 +149,6 @@ def _calculate_weights(self) -> None: self.weights_ = weights - # TODO: Adjust this to use weights and make def predict(self, predictions: List[np.ndarray]) -> np.ndarray: return self._predict(predictions, self.weights_) @@ -191,15 +186,9 @@ def _predict(self, predictions, weights): del tmp_predictions return average - # def __str__(self) -> str: - # return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ - # '\n\tWeights: %s\n\tIdentifiers: %s' % \ - # (' '.join(['%d: %5f' % (idx, performance) - # for idx, performance in enumerate(self.trajectory_)]), - # self.indices_, self.weights_, - # ' '.join([str(identifier) for idx, identifier in - # enumerate(self.identifiers_) - # if self.weights_[idx] > 0])) + def __str__(self) -> str: + return f"Ensemble Selection:\n\tWeights: {self.weights_}\ + \n\tIdentifiers: {' '.join([str(identifier) for idx, identifier in enumerate(self.identifiers_) if self.weights_[idx] > 0])}" def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: """ diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index 9b324af8b..4aa96440b 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -27,7 +27,6 @@ MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' -# TODO: think of what functions are needed to support stacking # TODO: make functions to support stacking. class StackingEnsembleBuilder(EnsembleBuilder): def __init__( @@ -119,8 +118,7 @@ def __init__( self.ensemble_identifiers: Optional[List[Optional[str]]] = None self.read_losses = {} - # TODO: This is the main wrapper to the EnsembleSelection class which fits - # TODO: the ensemble + # This is the main wrapper to the EnsembleSelection class which fits the ensemble def main( self, time_left: float, iteration: int, return_predictions: bool, ) -> Tuple[ @@ -209,7 +207,7 @@ def main( # reduces selected models if file reading failed candidate_models = self.get_test_preds(selected_keys=candidate_models) - self.logger.debug(f"n_sel_test: {candidate_models}") + # self.logger.debug(f"n_sel_test: {candidate_models}") if os.environ.get('ENSEMBLE_KEEP_ALL_CANDIDATES'): for candidate in candidate_models: @@ -218,7 +216,7 @@ def main( # as candidate models is sorted in `get_n_best_preds` best_model_identifier = candidate_models[0] - self.logger.debug(f"for iteration {iteration}, best_model_identifier: {best_model_identifier} \n candidate_models: \n{candidate_models}") + # self.logger.debug(f"for iteration {iteration}, best_model_identifier: {best_model_identifier} \n candidate_models: \n{candidate_models}") # train ensemble ensemble = self.fit_ensemble( @@ -229,7 +227,7 @@ def main( if ensemble is not None and self.SAVE2DISC: self.backend.save_ensemble(ensemble, iteration, self.seed) ensemble_identifiers=self._get_identifiers_from_num_runs(ensemble.identifiers_) - self.logger.debug(f"ensemble_identifiers being saved are {ensemble_identifiers}") + # self.logger.debug(f"ensemble_identifiers being saved are {ensemble_identifiers}") self._save_ensemble_identifiers( ensemble_identifiers=ensemble_identifiers ) @@ -272,10 +270,6 @@ def main( else: return self.ensemble_history, self.ensemble_nbest, None, None - # TODO: change this function, to compute loss according to Lavesque et al. - # TODO: this will help us in choosing the model with the lowest ensemble error. 
- # TODO: predictions on ensemble set will be available in read_preds to be used for - # TODO: passing to stacking_ensemble_builder.predict() def compute_ensemble_loss_per_model(self) -> bool: """ Compute the loss of the predictions on ensemble building data set; @@ -327,7 +321,6 @@ def compute_ensemble_loss_per_model(self) -> bool: # Mypy assumes sorted returns an object because of the lambda. Can't get to recognize the list # as a returning list, so as a work-around we skip next line for y_ens_fn, match, _seed, _num_run, _budget in sorted(to_read, key=lambda x: x[3]): # type: ignore - self.logger.debug(f"This is for model {y_ens_fn}") if self.read_at_most and n_read_files >= self.read_at_most: # limit the number of files that will be read # to limit memory consumption @@ -595,13 +588,13 @@ def get_ensemble_loss_with_model(self, """ # self.logger.debug(f"in ensemble_loss predictions for current are \n{model_predictions}") - self.logger.debug(f"in ensemble_loss ensemble_identifiers: {ensemble_identifiers}") + # self.logger.debug(f"in ensemble_loss ensemble_identifiers: {ensemble_identifiers}") average_predictions = np.zeros_like(model_predictions, dtype=np.float64) tmp_predictions = np.empty_like(model_predictions, dtype=np.float64) nonnull_identifiers = len([identifier for identifier in ensemble_identifiers if identifier is not None]) - self.logger.debug(f"non null identifiers : {nonnull_identifiers}") + # self.logger.debug(f"non null identifiers : {nonnull_identifiers}") weight = 1. / float(nonnull_identifiers) # if prediction model.shape[0] == len(non_null_weights), # predictions do not include those of zero-weight models. @@ -626,7 +619,7 @@ def get_ensemble_loss_with_model(self, return loss def _get_ensemble_identifiers_filename(self): - return os.path.join(self.backend.temporary_directory, 'ensemble_identifiers.pkl') + return os.path.join(self.backend.internals_directory, 'ensemble_identifiers.pkl') def _save_ensemble_identifiers(self, ensemble_identifiers: List[Optional[str]]) -> None: with open(self._get_ensemble_identifiers_filename(), "wb") as file: diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index d01c846b0..15efac5b9 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -137,10 +137,6 @@ def __init__(self, backend: Backend, queue: Queue, search_space_updates=search_space_updates ) - # TODO: we cant store the ensemble pipelines with this class as it is initialised for every TAE (target algorithm evaluation). - # TODO: Therefore we will have to store pipelines using datamanager and load them, see if we only need predictions. - # TODO: but we will need the whole pipeline as we would like to predict with different dataset, like val or something - self.logger.debug("Search space updates :{}".format(self.search_space_updates)) def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], @@ -364,11 +360,6 @@ def _predict(self, pipeline: BaseEstimator, test_indices: Union[np.ndarray, List], train_indices: Union[np.ndarray, List] ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: - # TODO: load ensemble members and predict using the whole ensemble. - # TODO: we need some function to pass this pipeline to the last stored ensemble replace - # TODO: model j, where j = ensemble.iteration mod m. 
then we need to predict - # TODO: Also, we will pass the predictions from this pipeline as that is what is needed - # TODO: to create the ensemble. train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, self.y_train[train_indices]) @@ -383,7 +374,7 @@ def _predict(self, pipeline: BaseEstimator, else: ensemble_opt_pred = pipeline_opt_pred.copy() - self.logger.debug(f"for model {self.seed}_{self.num_run}_{self.budget} ensemble_predictions are {ensemble_opt_pred}") + # self.logger.debug(f"for model {self.seed}_{self.num_run}_{self.budget} ensemble_predictions are {ensemble_opt_pred}") if self.X_valid is not None: valid_pred = self.predict_function(self.X_valid, pipeline, self.y_valid) diff --git a/autoPyTorch/optimizer/run_history_callback.py b/autoPyTorch/optimizer/run_history_callback.py index 6b020e174..376478813 100644 --- a/autoPyTorch/optimizer/run_history_callback.py +++ b/autoPyTorch/optimizer/run_history_callback.py @@ -10,7 +10,9 @@ import dask.distributed from distributed.utils import Any + from numpy.random.mtrand import seed +import numpy as np from smac.optimizer.smbo import SMBO from smac.runhistory.runhistory import RunInfo, RunKey @@ -28,6 +30,7 @@ class RunHistoryUpdaterManager(AdjustRunHistoryCallback): def __init__( self, backend: Backend, + initial_num_run: int, dataset_name: str, resampling_strategy: Union[ HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes @@ -59,6 +62,7 @@ def __init__( # The last criteria is the number of iterations self.iteration = 0 + self.initial_num_run = initial_num_run # Keep track of when we started to know when we need to finish! self.start_time = time.time() self.dataset_name = dataset_name @@ -67,16 +71,12 @@ def __init__( def __call__( self, - smbo: 'SMBO', - ) -> None: - self.adjust_run_history(smbo.tae_runner.client) + ) -> Optional[List[Tuple[RunKey, float]]]: + return self.adjust_run_history() def adjust_run_history( self, - dask_client: dask.distributed.Client, - unit_test: bool = False - ) -> None: - + ) -> Optional[List[Tuple[RunKey, float]]]: # The second criteria is elapsed time elapsed_time = time.time() - self.start_time @@ -85,65 +85,35 @@ def adjust_run_history( port=self.logger_port, ) - if len(self.futures) != 0: - if self.futures[0].done(): - result = self.futures.pop().result() - if result: - response = result - logger.debug("iteration={} @ elapsed_time={} has response={}".format( - self.iteration, - elapsed_time, - response, - )) + logger.info( + "Started Ensemble builder job at {} for iteration {}.".format( + # Log the client to make sure we + # remain connected to the scheduler + time.strftime("%Y.%m.%d-%H.%M.%S"), + self.iteration, + )) - # Only submit new jobs if the previous ensemble job finished - if len(self.futures) == 0: - - # Add the result of the run - # On the next while iteration, no references to - # ensemble builder object, so it should be garbage collected to - # save memory while waiting for resources - # Also, notice how ensemble nbest is returned, so we don't waste - # iterations testing if the deterministic predictions size can - # be fitted in memory - try: - # Submit a Dask job from this job, to properly - # see it in the dask diagnostic dashboard - # Notice that the forked ensemble_builder_process will - # wait for the below function to be done - self.futures.append( - dask_client.submit( - return_run_info_cost, + response = return_run_info_cost( backend=self.backend, dataset_name=self.dataset_name, iteration=self.iteration, 
resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, logger_port=self.logger_port, - priority=100 - ) - ) - - logger.info( - "{}/{} Started Ensemble builder job at {} for iteration {}.".format( - # Log the client to make sure we - # remain connected to the scheduler - self.futures[0], - dask_client, - time.strftime("%Y.%m.%d-%H.%M.%S"), - self.iteration, - ), + initial_num_run=self.initial_num_run ) - self.iteration += 1 - except Exception as e: - exception_traceback = traceback.format_exc() - error_message = repr(e) - logger.critical(exception_traceback) - logger.critical(error_message) + logger.debug("iteration={} @ elapsed_time={} has response={}".format( + self.iteration, + elapsed_time, + response, + )) + self.iteration += 1 + return response def return_run_info_cost( backend: Backend, + initial_num_run: int, dataset_name: str, resampling_strategy: Union[ HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes @@ -218,6 +188,7 @@ def return_run_info_cost( resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, logger_port=logger_port, + initial_num_run=initial_num_run ).run( iteration=iteration, ) @@ -228,6 +199,7 @@ class RunHistoryUpdater: def __init__( self, backend: Backend, + initial_num_run: int, dataset_name: str, resampling_strategy: Union[ HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes @@ -250,6 +222,7 @@ def __init__( """ self.model_fn_re = re.compile(MODEL_FN_RE) + self.initial_num_run = initial_num_run self.logger_port = logger_port self.logger = get_named_client_logger( name='RunHistoryUpdater', @@ -266,27 +239,27 @@ def __init__( def run(self, iteration: int) -> Optional[List[Tuple[RunKey, float]]]: self.logger.info(f"Starting iteration {iteration} of run history updater") - results: List[Tuple[RunInfo, float]] = [] + results: List[Tuple[RunKey, float]] = [] if os.path.exists(self.ensemble_loss_file): try: with (open(self.ensemble_loss_file, "rb")) as memory: read_losses = pickle.load(memory) except Exception as e: self.logger.debug(f"Could not read losses at iteration: {iteration} with exception {e}") - return + return None else: for k in read_losses.keys(): match = self.model_fn_re.search(k) - if match is None or read_losses[k]["loaded"] != 1: + if match is None or not np.isfinite(read_losses[k]["ens_loss"]): continue else: _num_run = int(match.group(2)) _budget = float(match.group(3)) run_key = RunKey( seed=0, # 0 is hardcoded for the runhistory coming from smac - config_id=_num_run, + config_id=_num_run - self.initial_num_run, budget=_budget, - instance_id=self.instances[-1] + instance_id=self.instances[-1][-1] ) results.append((run_key, read_losses[k]["ens_loss"])) return results diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index b15cf2580..c44252021 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -145,15 +145,24 @@ def _incorporate_run_results(self, run_info: RunInfo, result: RunValue, time_lef self._stop = True for callback in self._callbacks['_adjust_run_history']: - response = callback(smbo=self) + response = callback() if response is not None: for run_key, cost in response: run_value = self.runhistory.data.get(run_key, None) if run_value is not None: - run_value.cost = cost + self.logger.debug(f"updated run_key: {run_key} with cost: {cost}") + updated_run_value = RunValue( + cost, + run_value.time, + run_value.status, + run_value.starttime, + run_value.endtime, + run_value.additional_info + ) + 
self.runhistory.data[run_key] = updated_run_value self.epm_chooser.runhistory = self.runhistory - self.logger.debug(f"\nafter runhistory updater, result: {result}, \nrunhistory: {dict_repr(self.runhistory.data)}") + # self.logger.debug(f"\nafter runhistory updater, result: {result}, \nrunhistory: {dict_repr(self.runhistory.data)}") # Update the intensifier with the result of the runs self.incumbent, inc_perf = self.intensifier.process_results( From 8a7e897fbf061eb1677ff69bfee6b8d2e1ab00d5 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 18:01:14 +0200 Subject: [PATCH 12/16] add possibility of normalised margin loss (clean) --- autoPyTorch/ensemble/stacking_ensemble.py | 18 ++++++++---- .../ensemble/stacking_ensemble_builder.py | 23 +++++++++++++-- autoPyTorch/evaluation/stacking_evaluator.py | 28 +++++++++++-------- 3 files changed, 51 insertions(+), 18 deletions(-) diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/stacking_ensemble.py index a0acc9015..40ca5bc98 100644 --- a/autoPyTorch/ensemble/stacking_ensemble.py +++ b/autoPyTorch/ensemble/stacking_ensemble.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import warnings import numpy as np @@ -74,7 +74,7 @@ def fit( # TODO: fit a stacked ensemble. def _fit( self, - predictions: List[np.ndarray], + predictions: List[Optional[np.ndarray]], labels: np.ndarray, ) -> None: """ @@ -125,7 +125,7 @@ def _fit( )[self.metric.name] # store list of preds for later use - self.ensemble_predictions = predictions + self.ensemble_predictions_ = predictions self.train_loss_: float = loss @@ -221,17 +221,25 @@ def predict_with_current_pipeline( where m is ensemble_size. 
returns ensemble predictions """ - predictions = self.ensemble_predictions.copy() + predictions = self.ensemble_predictions_.copy() if predictions[self.ensemble_slot_j] is None: total_predictions = len([pred for pred in predictions if pred is not None]) total_predictions += 1 - weights = [1/total_predictions for pred in predictions if pred is not None] + weights: np.ndarray = np.ndarray([1/total_predictions if pred is not None else 0 for pred in predictions]) else: weights = self.weights_ predictions[self.ensemble_slot_j] = pipeline_predictions return self._predict(predictions, weights) + def get_ensemble_predictions_with_current_pipeline( + self, + pipeline_predictions: np.ndarray + ) -> List[Optional[np.ndarray]]: + predictions = self.ensemble_predictions_.copy() + predictions[self.ensemble_slot_j] = pipeline_predictions + return predictions + def get_models_with_weights( self, models: Dict[Any, BasePipeline] diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index 4aa96440b..17689657d 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -20,6 +20,8 @@ from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score from autoPyTorch.utils.logging_ import get_named_client_logger +from autoPyTorch.metrics import zero_one_loss + Y_ENSEMBLE = 0 Y_TEST = 1 @@ -27,6 +29,19 @@ MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' +def calculate_nomalised_margin_loss(ensemble_predictions, y_true) -> float: + nonnull_preds = 0 + margin: float = 0 + for pred in ensemble_predictions: + if pred is not None: + nonnull_preds += 1 + margin += (1 - 2*zero_one_loss(y_true, pred)) + + margin /= nonnull_preds + + return pow((1-margin), 2)/4 + + # TODO: make functions to support stacking. class StackingEnsembleBuilder(EnsembleBuilder): def __init__( @@ -185,8 +200,9 @@ def main( ) self.ensemble_slot_j = np.mod(iteration, self.ensemble_size) - self.logger.debug(f"Iteration for ensemble building:{iteration}") self.ensemble_identifiers = self._load_ensemble_identifiers() + self.logger.debug(f"Iteration for ensemble building:{iteration}, " + f"current model to be updated: {self.ensemble_identifiers[self.ensemble_slot_j]} at slot : {self.ensemble_slot_j}") # populates self.read_preds and self.read_losses with individual model predictions and ensemble loss. if not self.compute_ensemble_loss_per_model(): if return_predictions: @@ -366,7 +382,7 @@ def compute_ensemble_loss_per_model(self) -> bool: ) self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] - + # self.read_losses[y_ens_fn]["ens_loss"] = losses["ensemble_opt_loss"] # It is not needed to create the object here # To save memory, we just compute the loss. self.read_losses[y_ens_fn]["mtime_ens"] = os.path.getmtime(y_ens_fn) @@ -598,6 +614,7 @@ def get_ensemble_loss_with_model(self, weight = 1. / float(nonnull_identifiers) # if prediction model.shape[0] == len(non_null_weights), # predictions do not include those of zero-weight models. 
+ ensemble_predictions = list() for identifier in ensemble_identifiers: if identifier is not None: if self.read_preds[identifier][Y_ENSEMBLE] is None: @@ -607,6 +624,7 @@ def get_ensemble_loss_with_model(self, else: break + ensemble_predictions.append(predictions) np.multiply(predictions, weight, out=tmp_predictions) np.add(average_predictions, tmp_predictions, out=average_predictions) @@ -616,6 +634,7 @@ def get_ensemble_loss_with_model(self, prediction=average_predictions, task_type=self.task_type, ) + # loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble) return loss def _get_ensemble_identifiers_filename(self): diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index 15efac5b9..e3d77534e 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -11,13 +11,12 @@ from smac.tae import StatusType -from autoPyTorch import ensemble from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import ( CLASSIFICATION_TASKS, MULTICLASSMULTIOUTPUT, ) -from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes +from autoPyTorch.datasets.ensemble.stacking_ensemble_builder import calculate_nomalised_margin_loss from autoPyTorch.evaluation.abstract_evaluator import ( AbstractEvaluator, fit_and_suppress_warnings @@ -175,6 +174,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], return self.duration, loss_, self.seed, additional_run_info_ cost = loss[self.metric.name] + # cost = loss["ensemble_opt_loss"] additional_run_info = ( {} if additional_run_info is None else additional_run_info @@ -297,7 +297,8 @@ def fit_predict_and_loss(self) -> None: y_pipeline_opt_pred, y_ensemble_opt_pred, y_valid_pred, - y_test_pred + y_test_pred, + y_ensemble_preds ) = self._fit_and_predict(pipeline, split_id, train_indices=train_split, test_indices=test_split) @@ -305,6 +306,7 @@ def fit_predict_and_loss(self) -> None: train_loss = self._loss(self.y_train[train_split], y_train_pred) loss = self._loss(self.y_train[test_split], y_ensemble_opt_pred) + # loss['ensemble_opt_loss'] = calculate_nomalised_margin_loss(y_ensemble_preds, self.y_train[test_split]) additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} @@ -334,7 +336,7 @@ def _fit_and_predict( fold: int, train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: # See autoPyTorch/pipeline/components/base_component.py::autoPyTorchComponent for more details # about fit_dictionary @@ -346,7 +348,7 @@ def _fit_and_predict( y = None fit_and_suppress_warnings(self.logger, pipeline, X, y) self.logger.info("Model fitted, now predicting") - Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred = self._predict( + Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred, Y_ensemble_preds = self._predict( pipeline, train_indices=train_indices, test_indices=test_indices, @@ -354,12 +356,14 @@ def _fit_and_predict( self.pipeline = pipeline - return Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred + return Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, 
Y_test_pred, Y_ensemble_preds - def _predict(self, pipeline: BaseEstimator, - test_indices: Union[np.ndarray, List], - train_indices: Union[np.ndarray, List] - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: + def _predict( + self, + pipeline: BaseEstimator, + test_indices: Union[np.ndarray, List], + train_indices: Union[np.ndarray, List] + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, self.y_train[train_indices]) @@ -371,8 +375,10 @@ def _predict(self, pipeline: BaseEstimator, old_ensemble = self.backend.load_ensemble(self.seed) assert isinstance(old_ensemble, StackingEnsemble) ensemble_opt_pred = old_ensemble.predict_with_current_pipeline(pipeline_opt_pred) + ensemble_preds = old_ensemble.get_ensemble_predictions_with_current_pipeline(pipeline_opt_pred) else: ensemble_opt_pred = pipeline_opt_pred.copy() + ensemble_preds = [pipeline_opt_pred] # self.logger.debug(f"for model {self.seed}_{self.num_run}_{self.budget} ensemble_predictions are {ensemble_opt_pred}") if self.X_valid is not None: @@ -387,7 +393,7 @@ def _predict(self, pipeline: BaseEstimator, else: test_pred = None - return train_pred, pipeline_opt_pred, ensemble_opt_pred, valid_pred, test_pred + return train_pred, pipeline_opt_pred, ensemble_opt_pred, valid_pred, test_pred, ensemble_preds # create closure for evaluating an algorithm From 26ff2a404285f5f7912eb0a1d14d9c6dd86f2b47 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 18:13:42 +0200 Subject: [PATCH 13/16] add option for use ensemble loss and minor fixes --- autoPyTorch/api/base_task.py | 15 ++++++++---- autoPyTorch/api/tabular_classification.py | 6 +++-- .../ensemble/ensemble_builder_manager.py | 8 +++++-- .../ensemble/stacking_ensemble_builder.py | 23 +++++++++---------- autoPyTorch/evaluation/abstract_evaluator.py | 5 +++- autoPyTorch/evaluation/stacking_evaluator.py | 19 ++++++++------- autoPyTorch/evaluation/tae.py | 7 ++++-- autoPyTorch/evaluation/train_evaluator.py | 10 +++++--- autoPyTorch/optimizer/smbo.py | 8 +++++-- 9 files changed, 64 insertions(+), 37 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index b4ba01c99..b8e2af296 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -48,7 +48,7 @@ NoResamplingStrategyTypes, ResamplingStrategies, ) -from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager +from autoPyTorch.ensemble.ensemble_builder_manager import EnsembleBuilderManager from autoPyTorch.ensemble.singlebest_ensemble import SingleBest from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings @@ -649,9 +649,10 @@ def _load_models(self) -> bool: if self.ensemble_: identifiers = self.ensemble_.get_selected_model_identifiers() - self.models_ = self._backend.load_models_by_identifiers(identifiers) + nonnull_identifiers = [i for i in identifiers if i is not None] + self.models_ = self._backend.load_models_by_identifiers(nonnull_identifiers) if isinstance(self.resampling_strategy, CrossValTypes): - self.cv_models_ = self._backend.load_cv_models_by_identifiers(identifiers) + self.cv_models_ = self._backend.load_cv_models_by_identifiers(nonnull_identifiers) if isinstance(self.resampling_strategy, CrossValTypes): if len(self.cv_models_) == 0: @@ -977,7 +978,8 @@ def _search( load_models: bool = 
True, portfolio_selection: Optional[str] = None, dask_client: Optional[dask.distributed.Client] = None, - smbo_class: Optional[SMBO] = None + smbo_class: Optional[SMBO] = None, + use_ensemble_opt_loss: bool = False ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -1234,6 +1236,7 @@ def _search( func_eval_time_limit_secs=func_eval_time_limit_secs) # ============> Starting ensemble + self.use_ensemble_opt_loss = use_ensemble_opt_loss self.precision = precision self.opt_metric = optimize_metric elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) @@ -1309,6 +1312,7 @@ def _search( portfolio_selection=portfolio_selection, pynisher_context=self._multiprocessing_context, smbo_class = smbo_class, + use_ensemble_opt_loss=self.use_ensemble_opt_loss, other_callbacks=[proc_runhistory_updater] if proc_runhistory_updater is not None else None ) try: @@ -1915,6 +1919,7 @@ def _init_ensemble_builder( random_state=self.seed, precision=precision, logger_port=self._logger_port, + use_ensemble_loss=self.use_ensemble_opt_loss ) self._stopwatch.stop_task(ensemble_task_name) @@ -2005,7 +2010,7 @@ def predict( joblib.delayed(_pipeline_predict)( models[identifier], X_test, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type] ) - for identifier in self.ensemble_.get_selected_model_identifiers() + for identifier in self.ensemble_.get_selected_model_identifiers() if identifier is not None ) if len(all_predictions) == 0: diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 5641c1005..3e6354c03 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -258,7 +258,8 @@ def search( load_models: bool = True, portfolio_selection: Optional[str] = None, dataset_compression: Union[Mapping[str, Any], bool] = False, - smbo_class: Optional[SMBO] = None + smbo_class: Optional[SMBO] = None, + use_ensemble_opt_loss=False ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -458,7 +459,8 @@ def search( disable_file_output=disable_file_output, load_models=load_models, portfolio_selection=portfolio_selection, - smbo_class=smbo_class + smbo_class=smbo_class, + use_ensemble_opt_loss=use_ensemble_opt_loss ) def predict( diff --git a/autoPyTorch/ensemble/ensemble_builder_manager.py b/autoPyTorch/ensemble/ensemble_builder_manager.py index 7c0786bb9..84ef362ba 100644 --- a/autoPyTorch/ensemble/ensemble_builder_manager.py +++ b/autoPyTorch/ensemble/ensemble_builder_manager.py @@ -47,6 +47,7 @@ def __init__( random_state: int, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, pynisher_context: str = 'fork', + use_ensemble_loss=False ): """ SMAC callback to handle ensemble building Args: @@ -135,6 +136,8 @@ def __init__( # Keep track of when we started to know when we need to finish! 
self.start_time = time.time() + self.use_ensemble_loss = use_ensemble_loss + def __call__( self, smbo: 'SMBO', @@ -226,6 +229,7 @@ def build_ensemble( pynisher_context=self.pynisher_context, logger_port=self.logger_port, unit_test=unit_test, + use_ensemble_opt_loss=self.use_ensemble_loss )) logger.info( @@ -268,6 +272,7 @@ def fit_and_return_ensemble( pynisher_context: str, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, unit_test: bool = False, + use_ensemble_opt_loss=False ) -> Tuple[ List[Dict[str, float]], int, @@ -352,6 +357,7 @@ def fit_and_return_ensemble( random_state=random_state, logger_port=logger_port, unit_test=unit_test, + use_ensemble_opt_loss=use_ensemble_opt_loss ).run( end_at=end_at, iteration=iteration, @@ -359,5 +365,3 @@ def fit_and_return_ensemble( pynisher_context=pynisher_context, ) return result - - diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index 17689657d..01c582410 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -20,7 +20,6 @@ from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score from autoPyTorch.utils.logging_ import get_named_client_logger -from autoPyTorch.metrics import zero_one_loss Y_ENSEMBLE = 0 @@ -29,18 +28,17 @@ MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' -def calculate_nomalised_margin_loss(ensemble_predictions, y_true) -> float: - nonnull_preds = 0 - margin: float = 0 +def calculate_nomalised_margin_loss(ensemble_predictions, y_true, task_type) -> float: + n_ensemble = 0 + loss = 0 for pred in ensemble_predictions: if pred is not None: - nonnull_preds += 1 - margin += (1 - 2*zero_one_loss(y_true, pred)) - - margin /= nonnull_preds - - return pow((1-margin), 2)/4 + n_ensemble += 1 + loss += 1 -2*(y_true != np.argmax(pred, axis=1)).astype(float) + loss /= n_ensemble + margin = np.power(1-loss, 2)/4 + return np.mean(margin) # TODO: make functions to support stacking. class StackingEnsembleBuilder(EnsembleBuilder): @@ -63,6 +61,7 @@ def __init__( random_state: Optional[Union[int, np.random.RandomState]] = None, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, unit_test: bool = False, + use_ensemble_opt_loss=False ): """ Constructor @@ -381,7 +380,7 @@ def compute_ensemble_loss_per_model(self) -> bool: ensemble_identifiers=ensemble_idenitfiers ) - self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] + self.read_losses[y_ens_fn]["ens_loss"] = losses["ensemble_opt_loss"] if self.use_ensemble_opt_loss else losses[self.opt_metric] # self.read_losses[y_ens_fn]["ens_loss"] = losses["ensemble_opt_loss"] # It is not needed to create the object here # To save memory, we just compute the loss. 
@@ -634,7 +633,7 @@ def get_ensemble_loss_with_model(self, prediction=average_predictions, task_type=self.task_type, ) - # loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble) + loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble) return loss def _get_ensemble_identifiers_filename(self): diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 7202096b6..3fcc64889 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -424,7 +424,8 @@ def __init__(self, backend: Backend, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False ) -> None: self.starttime = time.time() @@ -510,6 +511,8 @@ def __init__(self, backend: Backend, port=logger_port, ) + self.use_ensemble_opt_loss = use_ensemble_opt_loss + self._init_fit_dictionary(logger_port=logger_port, pipeline_config=pipeline_config, metrics_dict=metrics_dict) self.Y_optimization: Optional[np.ndarray] = None self.Y_actual_train: Optional[np.ndarray] = None diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index e3d77534e..4207e234f 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -16,7 +16,7 @@ CLASSIFICATION_TASKS, MULTICLASSMULTIOUTPUT, ) -from autoPyTorch.datasets.ensemble.stacking_ensemble_builder import calculate_nomalised_margin_loss +from autoPyTorch.ensemble.stacking_ensemble_builder import calculate_nomalised_margin_loss from autoPyTorch.evaluation.abstract_evaluator import ( AbstractEvaluator, fit_and_suppress_warnings @@ -115,7 +115,8 @@ def __init__(self, backend: Backend, queue: Queue, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None: + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False) -> None: super().__init__( backend=backend, queue=queue, @@ -133,10 +134,11 @@ def __init__(self, backend: Backend, queue: Queue, logger_port=logger_port, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, - search_space_updates=search_space_updates + search_space_updates=search_space_updates, + use_ensemble_opt_loss=use_ensemble_opt_loss ) - self.logger.debug("Search space updates :{}".format(self.search_space_updates)) + self.logger.debug("use_ensemble_loss :{}".format(self.use_ensemble_opt_loss)) def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], valid_pred: Optional[np.ndarray], @@ -173,8 +175,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], if loss_ is not None: return self.duration, loss_, self.seed, additional_run_info_ - cost = loss[self.metric.name] - # cost = loss["ensemble_opt_loss"] + cost = loss["ensemble_opt_loss"] if self.use_ensemble_opt_loss else loss[self.metric.name] additional_run_info = ( {} if additional_run_info is None else additional_run_info @@ -306,7 +307,7 @@ def fit_predict_and_loss(self) -> None: train_loss = self._loss(self.y_train[train_split], y_train_pred) loss = 
self._loss(self.y_train[test_split], y_ensemble_opt_pred) - # loss['ensemble_opt_loss'] = calculate_nomalised_margin_loss(y_ensemble_preds, self.y_train[test_split]) + loss['ensemble_opt_loss'] = calculate_nomalised_margin_loss(y_ensemble_preds, self.y_train[test_split], self.task_type) additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} @@ -415,6 +416,7 @@ def eval_function( logger_port: Optional[int] = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False, instance: str = None, ) -> None: """ @@ -496,6 +498,7 @@ def eval_function( logger_port=logger_port, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, - search_space_updates=search_space_updates + search_space_updates=search_space_updates, + use_ensemble_opt_loss=use_ensemble_opt_loss ) evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index c756d5e8e..4ac84c8ef 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -131,7 +131,8 @@ def __init__( logger_port: int = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - ensemble_method = None + ensemble_method = None, + use_ensemble_opt_loss=False ): self.backend = backend @@ -208,6 +209,7 @@ def __init__( self.memory_limit = memory_limit self.search_space_updates = search_space_updates + self.use_ensemble_opt_loss = use_ensemble_opt_loss def _check_and_get_default_budget(self) -> float: budget_type_choices = ('epochs', 'runtime') @@ -346,7 +348,8 @@ def run( pipeline_config=self.pipeline_config, logger_port=self.logger_port, all_supported_metrics=self.all_supported_metrics, - search_space_updates=self.search_space_updates + search_space_updates=self.search_space_updates, + use_ensemble_opt_loss=self.use_ensemble_opt_loss ) info: Optional[List[RunValue]] diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 5c937d614..e5884a9f7 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -131,7 +131,8 @@ def __init__(self, backend: Backend, queue: Queue, logger_port: Optional[int] = None, keep_models: Optional[bool] = None, all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None: + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False) -> None: super().__init__( backend=backend, queue=queue, @@ -149,7 +150,8 @@ def __init__(self, backend: Backend, queue: Queue, logger_port=logger_port, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, - search_space_updates=search_space_updates + search_space_updates=search_space_updates, + use_ensemble_opt_loss=use_ensemble_opt_loss ) if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)): @@ -428,6 +430,7 @@ def eval_train_function( logger_port: Optional[int] = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False, instance: str = None, ) -> None: """ @@ -509,6 +512,7 @@ def eval_train_function( logger_port=logger_port, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, - search_space_updates=search_space_updates + search_space_updates=search_space_updates, + 
use_ensemble_opt_loss=use_ensemble_opt_loss ) evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 1238e608d..a4a8ce20e 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -121,7 +121,8 @@ def __init__(self, max_budget: int = 50, ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, other_callbacks: Optional[List] = None, - smbo_class: Optional[SMBO] = None + smbo_class: Optional[SMBO] = None, + use_ensemble_opt_loss: bool = False ): """ Interface to SMAC. This method calls the SMAC optimize method, and allows @@ -253,6 +254,8 @@ def __init__(self, port=self.logger_port) self.logger.info("initialised {}".format(self.__class__.__name__)) + self.use_ensemble_opt_loss = use_ensemble_opt_loss + self.initial_configurations: Optional[List[Configuration]] = None if portfolio_selection is not None: initial_configurations = read_return_initial_configurations(config_space=config_space, @@ -303,7 +306,8 @@ def run_smbo(self, func: Optional[Callable] = None pipeline_config=self.pipeline_config, search_space_updates=self.search_space_updates, pynisher_context=self.pynisher_context, - ensemble_method=self.ensemble_method + ensemble_method=self.ensemble_method, + use_ensemble_opt_loss=self.use_ensemble_opt_loss ) ta = ExecuteTaFuncWithQueue self.logger.info("Finish creating Target Algorithm (TA) function") From 7e7001b36acbc195c598fb7023ec28a14b1a000c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 18:15:19 +0200 Subject: [PATCH 14/16] final working version of ensemble bayesian learning --- autoPyTorch/optimizer/smbo.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index a4a8ce20e..47fb4e619 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -258,11 +258,12 @@ def __init__(self, self.initial_configurations: Optional[List[Configuration]] = None if portfolio_selection is not None: - initial_configurations = read_return_initial_configurations(config_space=config_space, - portfolio_selection=portfolio_selection) - # incase we dont have any valid configuration from the portfolio - self.initial_configurations = initial_configurations \ - if len(initial_configurations) > 0 else None + self.initial_configurations = read_return_initial_configurations(config_space=config_space, + portfolio_selection=portfolio_selection) + if len(self.initial_configurations) == 0: + self.initial_configurations = None + self.logger.warning("None of the portfolio configurations are compatible" + " with the current search space. Skipping initial configuration...") def run_smbo(self, func: Optional[Callable] = None ) -> Tuple[RunHistory, List[TrajEntry], str]: From 32f0d2f82177cba65693764cdcfcd8503a3a33e3 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 19:11:19 +0200 Subject: [PATCH 15/16] minor fix for init use ensemble loss --- autoPyTorch/ensemble/stacking_ensemble_builder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index 01c582410..5097835f2 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -131,6 +131,7 @@ def __init__( # or passing them via the ensemble builder manager which has persistency with the futures stored. 
        self.ensemble_identifiers: Optional[List[Optional[str]]] = None
         self.read_losses = {}
+        self.use_ensemble_opt_loss = use_ensemble_opt_loss
 
     # This is the main wrapper to the EnsembleSelection class which fits the ensemble
     def main(
@@ -633,7 +634,7 @@ def get_ensemble_loss_with_model(self,
             prediction=average_predictions,
             task_type=self.task_type,
         )
-        loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble)
+        loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble, self.task_type)
         return loss
 
     def _get_ensemble_identifiers_filename(self):

From 5c09afcd38cc693247994812f46e7d006fc7e074 Mon Sep 17 00:00:00 2001
From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com>
Date: Wed, 22 Jun 2022 12:08:01 +0200
Subject: [PATCH 16/16] Stacking ensemble selection clean (#2)

* add repeated kfold
* working repeated k fold
* working stacking evaluator without changing dataset and no final predict
* replace datamanager
* fix prediction with stack ensembles
* adaptive repeats
* working version of stacking with changing dataset preserving categorical info
* working version of ensemble selection per layer, TODO: send predictions according to the weights associated with the model
* finish previous todo: send predictions according to the weights associated with the model
* working version of base repeat stacked ensembles, todo: check if other methods still work, add autogluon stacking
* working all stacking versions
* rename optimisation stacking ensemble
* Add autogluon stacking (#1)
* add working traditional models according to autogluon
* working pytorch embedding with skew and embed column splitting
* work in progress: autogluon ensembling
* working autogluon ensemble
* important fix for more than 2 stacking layers
* fix for running more than 2 stacking layers
* working autogluon with default nn config from autogluon
* working xgboost model
* add configurationspace to traditional classification models
* working autogluon stacking and stacking optimisation, todo: search for autogluon models and post hoc ensemble selection for ensemble optimisation
* added post fit ensemble optimization, working per layer selection, repeat models, stacking optimisation
* update config space for search, fix stratified resampling, fix printing model with weights for soe
* fix running traditional pipeline for all the ensembles, fix get config from run history
* fix cut off num run for all ensembles
* __init__ file for column splitting
* all requirements
* add __init__.py for trad ml
* pass smbo class to custom callback
* early stop also ensemble opt
* remove -1 from autogluon stacking
* reduce number of models stored after stacking
* fix issue with null identifiers in selected ensemble identifiers
* remove pointless line for debug
* set multiprocessing context to forkserver for n workers 1
* fix error when all repeats do not finish
* examples changed
---
 autoPyTorch/api/base_task.py                  | 1114 ++++++++++++-----
 autoPyTorch/api/tabular_classification.py     |   64 +-
 autoPyTorch/api/tabular_regression.py         |   10 +-
 autoPyTorch/api/utils.py                      |  139 ++
 autoPyTorch/data/base_feature_validator.py    |    2 +-
 autoPyTorch/data/tabular_feature_validator.py |   12 +-
 autoPyTorch/data/tabular_validator.py         |    7 +-
 autoPyTorch/datasets/base_dataset.py          |   91 +-
 autoPyTorch/datasets/resampling_strategy.py   |   91 +-
 autoPyTorch/datasets/tabular_dataset.py       |    7 +-
 autoPyTorch/datasets/utils.py                 |   48 +
 .../ensemble/autogluon_stacking_ensemble.py   |  158 +++
autoPyTorch/ensemble/ensemble_builder.py | 14 +- .../ensemble/ensemble_builder_manager.py | 66 +- ...nsemble_optimisation_stacking_ensemble.py} | 69 +- ...optimisation_stacking_ensemble_builder.py} | 173 ++- ...e_selection_per_layer_stacking_ensemble.py | 145 +++ ...ion_per_layer_stacking_ensemble_builder.py | 620 +++++++++ .../repeat_models_stacking_ensemble.py | 179 +++ autoPyTorch/ensemble/utils.py | 26 +- autoPyTorch/evaluation/abstract_evaluator.py | 65 +- .../ensemble_optimisation_evaluator.py | 648 ++++++++++ ...ator.py => repeated_crossval_evaluator.py} | 282 +++-- autoPyTorch/evaluation/tae.py | 39 +- autoPyTorch/evaluation/train_evaluator.py | 24 +- autoPyTorch/evaluation/utils.py | 9 + autoPyTorch/optimizer/run_history_callback.py | 1 + autoPyTorch/optimizer/smbo.py | 148 ++- autoPyTorch/optimizer/utils.py | 11 +- autoPyTorch/pipeline/base_pipeline.py | 27 - .../TabularColumnTransformer.py | 20 +- .../base_tabular_preprocessing.py | 5 +- .../column_splitting/ColumnSplitter.py | 100 ++ .../column_splitting/__init__.py | 0 .../encoding/NoEncoder.py | 12 - .../encoding/OneHotEncoder.py | 11 +- .../encoding/base_encoder.py | 10 +- .../scaling/MinMaxScaler.py | 3 +- .../tabular_preprocessing/scaling/NoScaler.py | 14 - .../scaling/Normalizer.py | 3 +- .../scaling/RobustScaler.py | 3 +- .../scaling/StandardScaler.py | 3 +- .../scaling/base_scaler.py | 9 +- .../skew_transformer/NoSkewTransformer.py | 42 + .../PowerTransformer.py | 9 +- .../QuantileTransformer.py | 9 +- .../skew_transformer/__init__.py | 143 +++ .../skew_transformer/base_skew_transformer.py | 33 + .../tabular_preprocessing/utils.py | 9 +- .../early_preprocessor/EarlyPreprocessing.py | 5 +- .../components/setup/network/base_network.py | 1 - .../setup/network_backbone/MLPBackbone.py | 10 +- .../network_backbone/base_network_backbone.py | 6 +- .../setup/network_backbone/utils.py | 5 +- .../LearnedEntityEmbedding.py | 109 +- .../setup/network_embedding/NoEmbedding.py | 2 +- .../base_network_embedding.py | 42 +- .../setup/traditional_ml/base_model.py | 32 +- .../estimator_configs/catboost.json | 4 - .../estimator_configs/extra_trees.json | 3 - .../traditional_ml/estimator_configs/knn.json | 3 - .../traditional_ml/estimator_configs/lgb.json | 9 - .../estimator_configs/random_forest.json | 3 - .../estimator_configs/rotation_forest.json | 2 - .../traditional_ml/estimator_configs/svm.json | 4 - .../tabular_traditional_model.py | 65 +- .../traditional_learner/__init__.py | 27 +- .../base_traditional_learner.py | 27 +- .../traditional_learner/catboost/__init__.py | 0 .../traditional_learner/catboost/catboost.py | 142 +++ .../traditional_learner/catboost/utils.py | 138 ++ .../extratrees/__init__.py | 0 .../extratrees/extratrees.py | 99 ++ .../traditional_learner/extratrees/utils.py | 7 + .../traditional_learner/knn/__init__.py | 0 .../traditional_learner/knn/knn.py | 108 ++ .../traditional_learner/knn/utils.py | 8 + .../traditional_learner/learners.py | 361 ------ .../traditional_learner/lgbm/__init__.py | 0 .../traditional_learner/lgbm/lgbm.py | 153 +++ .../traditional_learner/lgbm/utils.py | 298 +++++ .../random_forest/__init__.py | 0 .../random_forest/random_forest.py | 103 ++ .../random_forest/utils.py | 9 + .../traditional_learner/utils.py | 15 - .../traditional_learner/xgboost/__init__.py | 0 .../xgboost/early_stopping_custom.py | 90 ++ .../traditional_learner/xgboost/utils.py | 85 ++ .../traditional_learner/xgboost/xgboost.py | 198 +++ .../training/data_loader/base_data_loader.py | 10 +- 
.../components/training/trainer/__init__.py | 16 +- .../pipeline/tabular_classification.py | 8 +- .../traditional_tabular_classification.py | 22 +- autoPyTorch/utils/common.py | 44 +- autoPyTorch/utils/data_classes.py | 27 + autoPyTorch/utils/early_stopping.py | 47 + autoPyTorch/utils/parallel_model_runner.py | 167 +++ .../20_basics/example_autogluon_ensemble.py | 105 ++ .../20_basics/example_stacking_ensemble.py | 119 +- ...xample_stacking_ensemble_selection_base.py | 109 ++ ...e_stacking_ensemble_selection_per_layer.py | 107 ++ .../example_tabular_classification.py | 51 +- requirements.txt | 10 +- 103 files changed, 6480 insertions(+), 1262 deletions(-) create mode 100644 autoPyTorch/api/utils.py create mode 100644 autoPyTorch/datasets/utils.py create mode 100644 autoPyTorch/ensemble/autogluon_stacking_ensemble.py rename autoPyTorch/ensemble/{stacking_ensemble.py => ensemble_optimisation_stacking_ensemble.py} (80%) rename autoPyTorch/ensemble/{stacking_ensemble_builder.py => ensemble_optimisation_stacking_ensemble_builder.py} (77%) create mode 100644 autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble.py create mode 100644 autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble_builder.py create mode 100644 autoPyTorch/ensemble/repeat_models_stacking_ensemble.py create mode 100644 autoPyTorch/evaluation/ensemble_optimisation_evaluator.py rename autoPyTorch/evaluation/{stacking_evaluator.py => repeated_crossval_evaluator.py} (62%) create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/__init__.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/NoSkewTransformer.py rename autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/{scaling => skew_transformer}/PowerTransformer.py (76%) rename autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/{scaling => skew_transformer}/QuantileTransformer.py (90%) create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/__init__.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/base_skew_transformer.py delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/catboost.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/utils.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/__init__.py create mode 100644 
autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/extratrees.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/utils.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/knn.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/utils.py delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/learners.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/lgbm.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/utils.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/random_forest.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/utils.py delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/utils.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/early_stopping_custom.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/utils.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/xgboost.py create mode 100644 autoPyTorch/utils/data_classes.py create mode 100644 autoPyTorch/utils/early_stopping.py create mode 100644 autoPyTorch/utils/parallel_model_runner.py create mode 100644 examples/20_basics/example_autogluon_ensemble.py create mode 100644 examples/20_basics/example_stacking_ensemble_selection_base.py create mode 100644 examples/20_basics/example_stacking_ensemble_selection_per_layer.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index b8e2af296..fa4998917 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -8,6 +8,7 @@ import sys import tempfile import time +from turtle import pos import typing import unittest.mock import warnings @@ -33,6 +34,7 @@ from smac.tae import StatusType from autoPyTorch import metrics +from autoPyTorch.api.utils import get_autogluon_default_nn_config, get_config_from_run_history from autoPyTorch.automl_common.common.utils.backend import Backend, create from autoPyTorch.constants import ( REGRESSION_TASKS, @@ -40,6 +42,7 @@ STRING_TO_TASK_TYPES, ) from autoPyTorch.data.base_validator import BaseInputValidator +from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.utils import DatasetCompressionSpec from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( @@ -47,9 +50,15 @@ HoldoutValTypes, NoResamplingStrategyTypes, ResamplingStrategies, + RepeatedCrossValTypes ) +from autoPyTorch.datasets.utils import get_appended_dataset +from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection +from 
autoPyTorch.ensemble.repeat_models_stacking_ensemble import RepeatModelsStackingEnsemble from autoPyTorch.ensemble.ensemble_builder_manager import EnsembleBuilderManager from autoPyTorch.ensemble.singlebest_ensemble import SingleBest +from autoPyTorch.ensemble.autogluon_stacking_ensemble import AutogluonStackingEnsemble +from autoPyTorch.ensemble.ensemble_selection_per_layer_stacking_ensemble import EnsembleSelectionPerLayerStackingEnsemble from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash @@ -60,7 +69,8 @@ from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score, get_metrics -from autoPyTorch.utils.common import FitRequirement, dict_repr, replace_string_bool_to_bool +from autoPyTorch.utils.common import FitRequirement, ENSEMBLE_ITERATION_MULTIPLIER, dict_repr, replace_string_bool_to_bool, validate_config +from autoPyTorch.utils.parallel_model_runner import run_models_on_dataset from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.logging_ import ( PicklableClientLogger, @@ -172,9 +182,11 @@ def __init__( n_jobs: int = 1, n_threads: int = 1, logging_config: Optional[Dict] = None, - ensemble_size: int = 50, + ensemble_size: int = 5, ensemble_nbest: int = 50, - ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, + ensemble_method: EnsembleSelectionTypes = EnsembleSelectionTypes.ensemble_selection, + use_ensemble_opt_loss: bool = False, + num_stacking_layers: int = 1, max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, @@ -185,6 +197,7 @@ def __init__( backend: Optional[Backend] = None, resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, + feat_type: Optional[List[str]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, task_type: Optional[str] = None ) -> None: @@ -198,6 +211,9 @@ def __init__( self.ensemble_size = ensemble_size self.ensemble_nbest = ensemble_nbest self.ensemble_method = ensemble_method + self.num_stacking_layers = num_stacking_layers + self.use_ensemble_opt_loss = use_ensemble_opt_loss + self.max_models_on_disc = max_models_on_disc self.logging_config: Optional[Dict] = logging_config self.include_components: Optional[Dict] = include_components @@ -230,7 +246,7 @@ def __init__( self.precision: Optional[int] = None self.opt_metric: Optional[str] = None self.dataset: Optional[BaseDataset] = None - + self.ensemble_ = None self._results_manager = ResultsManager() # By default try to use the TCP logging port or get a new port @@ -239,7 +255,7 @@ def __init__( # Store the resampling strategy from the dataset, to load models as needed self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args - + self.feat_type = feat_type self.stop_logging_server: Optional[multiprocessing.synchronize.Event] = None # Single core, local runs should use fork @@ -249,8 +265,8 @@ def __init__( # possibility of a deadlock self._dask_client: Optional[dask.distributed.Client] = None 
self._multiprocessing_context = 'forkserver' - if self.n_jobs == 1: - self._multiprocessing_context = 'fork' + # if self.n_jobs == 1: + # self._multiprocessing_context = 'fork' self.input_validator: Optional[BaseInputValidator] = None @@ -305,6 +321,7 @@ def _get_dataset_input_validator( y_train: Union[List, pd.DataFrame, np.ndarray], X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + feat_type: Optional[List] = None, resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, @@ -351,6 +368,7 @@ def get_dataset( y_train: Union[List, pd.DataFrame, np.ndarray], X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + feat_type: Optional[List] = None, resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, @@ -422,7 +440,8 @@ def get_dataset( resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, dataset_name=dataset_name, - dataset_compression=dataset_compression) + dataset_compression=dataset_compression, + feat_type=feat_type) return dataset @@ -480,18 +499,31 @@ def get_search_space(self, dataset: BaseDataset = None) -> ConfigurationSpace: if self.search_space is not None: return self.search_space elif dataset is not None: - dataset_requirements = get_dataset_requirements( - info=dataset.get_required_dataset_info(), - include=self.include_components, - exclude=self.exclude_components, + return self._get_search_space( + dataset, + include_components=self.include_components, + exclude_components=self.exclude_components, search_space_updates=self.search_space_updates) - return get_configuration_space(info=dataset.get_dataset_properties(dataset_requirements), - include=self.include_components, - exclude=self.exclude_components, - search_space_updates=self.search_space_updates) raise ValueError("No search space initialised and no dataset passed. 
" "Can't create default search space without the dataset") + @staticmethod + def _get_search_space( + dataset: BaseDataset, + include_components, + exclude_components, + search_space_updates, + ) -> ConfigurationSpace: + dataset_requirements = get_dataset_requirements( + info=dataset.get_required_dataset_info(), + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates) + return get_configuration_space(info=dataset.get_dataset_properties(dataset_requirements), + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates) + def _get_logger(self, name: str) -> PicklableClientLogger: """ Instantiates the logger used throughout the experiment @@ -649,12 +681,28 @@ def _load_models(self) -> bool: if self.ensemble_: identifiers = self.ensemble_.get_selected_model_identifiers() - nonnull_identifiers = [i for i in identifiers if i is not None] - self.models_ = self._backend.load_models_by_identifiers(nonnull_identifiers) - if isinstance(self.resampling_strategy, CrossValTypes): - self.cv_models_ = self._backend.load_cv_models_by_identifiers(nonnull_identifiers) + # nonnull_identifiers = [i for i in identifiers if i is not None] + # self.models_ = self._backend.load_models_by_identifiers(nonnull_identifiers) + # if isinstance(self.resampling_strategy, CrossValTypes): + # self.cv_models_ = self._backend.load_cv_models_by_identifiers(nonnull_identifiers) + + # self._logger.debug(f"stacked ensemble identifiers are :{identifiers}") + if self.ensemble_method.is_stacking_ensemble(): + models = [] + cv_models = [] + for identifier in identifiers: + nonnull_identifiers = [i for i in identifier if i is not None] + models.append(self._backend.load_models_by_identifiers(nonnull_identifiers)) + cv_models.append(self._backend.load_cv_models_by_identifiers(nonnull_identifiers)) + # self._logger.debug(f"stacked ensemble models are :{models}") + self.models_ = models + self.cv_models_ = cv_models - if isinstance(self.resampling_strategy, CrossValTypes): + else: + self.models_ = self._backend.load_models_by_identifiers(identifiers) + if isinstance(self.resampling_strategy, (CrossValTypes, RepeatedCrossValTypes)): + self.cv_models_ = self._backend.load_cv_models_by_identifiers(identifiers) + if isinstance(self.resampling_strategy, (CrossValTypes, RepeatedCrossValTypes)): if len(self.cv_models_) == 0: raise ValueError('No models fitted!') @@ -765,7 +813,8 @@ def _do_dummy_prediction(self) -> None: memory_limit=memory_limit, disable_file_output=self._disable_file_output, all_supported_metrics=self._all_supported_metrics, - ensemble_method=self.ensemble_method + ensemble_method=self.ensemble_method, + pipeline_config=self.pipeline_options ) status, _, _, additional_info = ta.run(num_run, cutoff=self._time_for_task) @@ -814,113 +863,33 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: assert self._dask_client is not None self._logger.info("Starting to create traditional classifier predictions.") - starttime = time.time() - # Initialise run history for the traditional classifiers - run_history = RunHistory() - memory_limit = self._memory_limit - if memory_limit is not None: - memory_limit = int(math.ceil(memory_limit)) - available_classifiers = get_available_traditional_learners() - dask_futures = [] - - total_number_classifiers = len(available_classifiers) - for n_r, classifier in enumerate(available_classifiers): - - # Only launch a task if there is time - start_time = time.time() - if time_left >= 
func_eval_time_limit_secs: - self._logger.info(f"{n_r}: Started fitting {classifier} with cutoff={func_eval_time_limit_secs}") - scenario_mock = unittest.mock.Mock() - scenario_mock.wallclock_limit = time_left - # This stats object is a hack - maybe the SMAC stats object should - # already be generated here! - stats = Stats(scenario_mock) - stats.start_timing() - ta = ExecuteTaFuncWithQueue( - pynisher_context=self._multiprocessing_context, - backend=self._backend, - seed=self.seed, - multi_objectives=["cost"], - metric=self._metric, - logger_port=self._logger_port, - cost_for_crash=get_cost_of_crash(self._metric), - abort_on_first_run_crash=False, - initial_num_run=self._backend.get_next_num_run(), - stats=stats, - memory_limit=memory_limit, - disable_file_output=self._disable_file_output, - all_supported_metrics=self._all_supported_metrics - ) - dask_futures.append([ - classifier, - self._dask_client.submit( - ta.run, config=classifier, - cutoff=func_eval_time_limit_secs, - ) - ]) - - # When managing time, we need to take into account the allocated time resources, - # which are dependent on the number of cores. 'dask_futures' is a proxy to the number - # of workers /n_jobs that we have, in that if there are 4 cores allocated, we can run at most - # 4 task in parallel. Every 'cutoff' seconds, we generate up to 4 tasks. - # If we only have 4 workers and there are 4 futures in dask_futures, it means that every - # worker has a task. We would not like to launch another job until a worker is available. To this - # end, the following if-statement queries the number of active jobs, and forces to wait for a job - # completion via future.result(), so that a new worker is available for the next iteration. - if len(dask_futures) >= self.n_jobs: - - # How many workers to wait before starting fitting the next iteration - workers_to_wait = 1 - if n_r >= total_number_classifiers - 1 or time_left <= func_eval_time_limit_secs: - # If on the last iteration, flush out all tasks - workers_to_wait = len(dask_futures) - - while workers_to_wait >= 1: - workers_to_wait -= 1 - # We launch dask jobs only when there are resources available. 
- # This allow us to control time allocation properly, and early terminate - # the traditional machine learning pipeline - cls, future = dask_futures.pop(0) - status, cost, runtime, additional_info = future.result() - if status == StatusType.SUCCESS: - self._logger.info( - "Fitting {} took {} [sec] and got performance: {}.\n" - "additional info:\n{}".format(cls, runtime, cost, dict_repr(additional_info)) - ) - configuration = additional_info['pipeline_configuration'] - origin = additional_info['configuration_origin'] - additional_info.pop('pipeline_configuration') - run_history.add(config=configuration, cost=cost, - time=runtime, status=status, seed=self.seed, - starttime=starttime, endtime=starttime + runtime, - origin=origin, additional_info=additional_info) - else: - if additional_info.get('exitcode') == -6: - self._logger.error( - "Traditional prediction for {} failed with run state {},\n" - "because the provided memory limits were too tight.\n" - "Please increase the 'ml_memory_limit' and try again.\n" - "If you still get the problem, please open an issue\n" - "and paste the additional info.\n" - "Additional info:\n{}".format(cls, str(status), dict_repr(additional_info)) - ) - else: - self._logger.error( - "Traditional prediction for {} failed with run state {}.\nAdditional info:\n{}".format( - cls, str(status), dict_repr(additional_info) - ) - ) - - # In the case of a serial execution, calling submit halts the run for a resource - # dynamically adjust time in this case - time_left -= int(time.time() - start_time) - - # Exit if no more time is available for a new classifier - if time_left < func_eval_time_limit_secs: - self._logger.warning("Not enough time to fit all traditional machine learning models." - "Please consider increasing the run time to further improve performance.") - break + available_classifiers = get_available_traditional_learners(dataset_properties=self._get_dataset_properties(self.dataset)) + model_configs = [(key, self.pipeline_options[self.pipeline_options['budget_type']]) for key in available_classifiers.keys()] + + run_history, _ = run_models_on_dataset( + time_left=time_left, + func_eval_time_limit_secs=func_eval_time_limit_secs, + model_configs=model_configs, + logger=self._logger, + logger_port=self._logger_port, + metric=self._metric, + dask_client=self._dask_client, + backend=self._backend, + memory_limit=self._memory_limit, + disable_file_output=self._disable_file_output, + all_supported_metrics=self._all_supported_metrics, + ensemble_method=self.ensemble_method, + include=self.include_components, + exclude=self.exclude_components, + search_space_updates=self.search_space_updates, + pipeline_options=self.pipeline_options, + seed=self.seed, + multiprocessing_context=self._multiprocessing_context, + n_jobs=self.n_jobs, + current_search_space=self.search_space, + smac_initial_run=self._backend.get_next_num_run() + ) self._logger.debug("Run history traditional: {}".format(run_history)) # add run history of traditional to api run history @@ -958,6 +927,447 @@ def run_traditional_ml( ) self._stopwatch.stop_task(traditional_task_name) + def _fit_models_on_dataset( + self, + model_configs, + func_eval_time_limit_secs, + stacking_layer, + time_left, + current_search_space, + smac_initial_run, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> List[Tuple]:\ + + search_space_updates = search_space_updates if search_space_updates is not None else self.search_space_updates + + run_history, model_identifiers = run_models_on_dataset( + 
time_left=time_left, + func_eval_time_limit_secs=func_eval_time_limit_secs, + model_configs=model_configs, + logger=self._logger, + logger_port=self._logger_port, + metric=self._metric, + dask_client=self._dask_client, + backend=self._backend, + memory_limit=self._memory_limit, + disable_file_output=self._disable_file_output, + all_supported_metrics=self._all_supported_metrics, + ensemble_method=self.ensemble_method, + include=self.include_components, + exclude=self.exclude_components, + search_space_updates=search_space_updates, + pipeline_options=self.pipeline_options, + seed=self.seed, + multiprocessing_context=self._multiprocessing_context, + n_jobs=self.n_jobs, + current_search_space=current_search_space, + smac_initial_run=smac_initial_run + ) + + self._logger.debug("Run history for layer: {}: {}".format(stacking_layer, run_history)) + # add run history of traditional to api run history + self.run_history.update(run_history, DataOrigin.EXTERNAL_SAME_INSTANCES) + run_history.save_json(os.path.join(self._backend.internals_directory, f'run_history_{stacking_layer}.json'), + save_external=True) + return model_identifiers + + def _reset_datamanager_in_backend(self, datamanager)-> None: + self._backend.save_datamanager(datamanager) + + def _run_autogluon_stacking( + self, + optimize_metric: str, + dataset: BaseDataset, + max_budget: int = 50, + budget_type: str = 'epochs', + total_walltime_limit: int = 100, + func_eval_time_limit_secs: Optional[int] = None, + memory_limit: Optional[int] = 4096, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + dask_client: Optional[dask.distributed.Client] = None, + ): + """ + This function can be used to create a stacking ensemble + Args: + current_task_name (str): name of the current task, + runtime_limit (int): time limit for fitting traditional models, + func_eval_time_limit_secs (int): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. 
+ """ + experiment_task_name: str = 'runStacking' + self._init_required_args( + experiment_task_name=experiment_task_name, + optimize_metric=optimize_metric, + dataset=dataset, + budget_type=budget_type, + max_budget=max_budget, + total_walltime_limit=total_walltime_limit, + memory_limit=memory_limit, + all_supported_metrics=all_supported_metrics, + precision=precision, + disable_file_output=disable_file_output, + dask_client=dask_client + ) + self.pipeline_options['func_eval_time_limit_secs'] = func_eval_time_limit_secs + self.precision = precision + available_classifiers = get_available_traditional_learners(dataset_properties=self._get_dataset_properties(self.dataset)) + model_configs = [(key, self.pipeline_options[self.pipeline_options['budget_type']]) for key in available_classifiers.keys()] + + if self.feat_type is None: + raise ValueError("Cant run autogluon stacking without information about dataset features passed with `feat_type`") + autogluon_nn_search_space_updates = get_autogluon_default_nn_config(feat_type=self.feat_type) + autogluon_nn_search_space = self._get_search_space( + self.dataset, + include_components=self.include_components, + exclude_components=self.exclude_components, + search_space_updates=autogluon_nn_search_space_updates) + + default_nn_config = autogluon_nn_search_space.get_default_configuration() + model_configs.append((default_nn_config, self.pipeline_options[self.pipeline_options['budget_type']])) + self._logger.info("Starting Autogluon Stacking.") + + model_identifiers = [] + stacked_weights = [] + last_successful_smac_initial_num_run = None + for stacking_layer in range(self.num_stacking_layers): + smac_initial_run=self._backend.get_next_num_run() + updated_model_configs, current_search_space = self._update_configs_for_current_config_space( + model_configs, + dataset, + autogluon_nn_search_space_updates, + assert_skew_transformer_quantile=True) + layer_model_identifiers = self._fit_models_on_dataset( + updated_model_configs, + func_eval_time_limit_secs, + stacking_layer, + time_left=(0.9*total_walltime_limit)/(self.num_stacking_layers), + current_search_space=current_search_space, + smac_initial_run=smac_initial_run, + search_space_updates=autogluon_nn_search_space_updates) + nonnull_identifiers = [identifier for identifier in layer_model_identifiers if identifier is not None] + if len(nonnull_identifiers) > 0: + model_identifiers.append( + nonnull_identifiers + ) + last_successful_smac_initial_num_run = smac_initial_run + ensemble_size = len(nonnull_identifiers) + weights = [1/ensemble_size] * ensemble_size + stacked_weights.append(weights) + _, previous_layer_predictions_train, previous_layer_predictions_test = self._get_previous_predictions(smac_initial_run, model_identifiers[-1], weights, ensemble_size) + dataset = get_appended_dataset( + original_dataset=self.dataset, + previous_layer_predictions_train=previous_layer_predictions_train, + previous_layer_predictions_test=previous_layer_predictions_test, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + ) + self._reset_datamanager_in_backend(datamanager=dataset) + + ensemble = AutogluonStackingEnsemble() + iteration = 0 + time_left_for_ensemble = total_walltime_limit-self._stopwatch.wall_elapsed(experiment_task_name) + final_model_identifiers, final_weights = self._posthoc_fit_ensemble( + optimize_metric, + time_left_for_ensemble, + last_successful_smac_initial_num_run, + ensemble_size, + iteration) + model_identifiers[-1] = 
final_model_identifiers + stacked_weights[-1] = final_weights + ensemble = ensemble.fit(model_identifiers, stacked_weights) + self._backend.save_ensemble(ensemble, iteration+1, self.seed) + self._load_models() + + def _posthoc_fit_ensemble( + self, + optimize_metric, + time_left_for_ensemble, + last_successful_smac_initial_num_run, + ensemble_size, + iteration, + enable_traditional_pipeline=False, + cleanup=True, + func_eval_time_limit_secs: int = 50, + ): + self.fit_ensemble( + optimize_metric=optimize_metric, + precision=self.precision, + ensemble_size=ensemble_size, + ensemble_nbest=self.ensemble_nbest, + initial_num_run=last_successful_smac_initial_num_run, + time_for_task=time_left_for_ensemble, + enable_traditional_pipeline=enable_traditional_pipeline, + func_eval_time_limit_secs=func_eval_time_limit_secs, + iteration=iteration, + cleanup=cleanup, + load_models=False + ) + final_ensemble: EnsembleSelection = self._backend.load_ensemble(self.seed) + final_model_identifiers = final_ensemble.get_selected_model_identifiers() + final_model_identifiers_dict = {identifier: identifier for identifier in final_model_identifiers} + models_with_weights = final_ensemble.get_models_with_weights(final_model_identifiers_dict) + final_model_identifiers = [identifier[1] for identifier in models_with_weights] + final_weights = [identifier[0] for identifier in models_with_weights] + return final_model_identifiers,final_weights + + def _run_search_stacking( + self, + optimize_metric: str, + min_budget, + max_budget, + precision, + portfolio_selection, + experiment_task_name, + tae_func = None, + budget_type: str = 'epochs', + total_walltime_limit: int = 400, + func_eval_time_limit_secs: Optional[int] = None, + smac_scenario_args: Optional[Dict[str, Any]] = None, + get_smac_object_callback: Optional[Callable] = None, + ): + stacking_task_name = "runStacking" + self._stopwatch.start_task(stacking_task_name) + self.precision = precision + self.opt_metric = optimize_metric + time_left_for_search_base_models = math.floor(0.5*total_walltime_limit) + proc_ensemble = None + if time_left_for_search_base_models <= 0: + # Fit only raises error when ensemble_size is not zero but + # time_left_for_search_base_models is zero. + if self.ensemble_size > 0: + raise ValueError("Not starting ensemble builder because there " + "is no time left. 
Try increasing the value " + "of time_left_for_this_task.") + elif self.ensemble_size <= 0: + self._logger.info("Not starting ensemble builder as ensemble size is 0") + else: + self._logger.info("Starting ensemble") + proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_search_base_models, + ensemble_size=self.ensemble_size, + ensemble_nbest=self.ensemble_nbest, + precision=precision, + optimize_metric=self.opt_metric, + ensemble_method=self.ensemble_method, + num_stacking_layers=1 + ) + + smac_initial_run = self._run_smbo( + min_budget=min_budget, + max_budget=max_budget, + total_walltime_limit=time_left_for_search_base_models, + func_eval_time_limit_secs=func_eval_time_limit_secs, + smac_scenario_args=smac_scenario_args, + get_smac_object_callback=get_smac_object_callback, + tae_func=tae_func, + portfolio_selection=portfolio_selection, + experiment_task_name=experiment_task_name, + proc_ensemble=proc_ensemble, + num_stacking_layers=1 + ) + if proc_ensemble is not None: + self._collect_results_ensemble(proc_ensemble) + base_ensemble = self._backend.load_ensemble(self.seed) + model_identifiers = [base_ensemble.get_selected_model_identifiers()] + ensemble = RepeatModelsStackingEnsemble(base_ensemble=base_ensemble) + + weights = [weight for weight in base_ensemble.weights_ if weight > 0] + ensemble_size = self.ensemble_size + model_configs, previous_layer_predictions_train, previous_layer_predictions_test = self._get_previous_predictions(smac_initial_run, model_identifiers[-1], weights, ensemble_size) + + self._logger.debug(f"Finished search for base models, starting fitting next layers") + for stacking_layer in range(1, self.num_stacking_layers): + smac_layer_initial_run = self._backend.get_next_num_run() + time_left_for_higher_stacking_layers = total_walltime_limit -self._stopwatch.wall_elapsed(stacking_task_name) + if time_left_for_higher_stacking_layers < func_eval_time_limit_secs: + break + self._logger.debug(f"Original feat types len: {len(self.dataset.feat_type)}") + nonnull_model_predictions_train = [pred for pred in previous_layer_predictions_train if pred is not None] + nonnull_model_predictions_test = [pred for pred in previous_layer_predictions_test if pred is not None] + assert len(nonnull_model_predictions_train) == len(nonnull_model_predictions_test) + self._logger.debug(f"length Non null predictions: {len(nonnull_model_predictions_train)}") + dataset = get_appended_dataset( + original_dataset=self.dataset, + previous_layer_predictions_train=nonnull_model_predictions_train, + previous_layer_predictions_test=nonnull_model_predictions_test, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + ) + self._logger.debug(f"new feat_types len: {len(dataset.feat_type)}") + updated_model_configs, current_search_space = self._update_configs_for_current_config_space(model_configs, dataset) + self._reset_datamanager_in_backend(datamanager=dataset) + layer_model_identifiers = self._fit_models_on_dataset(updated_model_configs, func_eval_time_limit_secs, stacking_layer, time_left=time_left_for_higher_stacking_layers/(self.num_stacking_layers - 1), current_search_space=current_search_space, smac_initial_run=smac_layer_initial_run) + if any([identifier is not None for identifier in layer_model_identifiers]): + model_identifiers.append( + layer_model_identifiers + ) + _, previous_layer_predictions_train, previous_layer_predictions_test = self._get_previous_predictions(smac_initial_run, model_identifiers[-1], 
weights, ensemble_size) + + ensemble = ensemble.fit(model_identifiers) + self._backend.save_ensemble(ensemble, proc_ensemble.iteration+10, self.seed) + self._load_models() + + def _get_previous_predictions(self, smac_initial_run, model_identifiers, weights, ensemble_size): + model_configs = [] + previous_layer_predictions_train = [] + previous_layer_predictions_test = [] + self._logger.debug(f'id_config: {self.run_history.ids_config}') + for weight, model_identifier in zip(weights, model_identifiers): + if model_identifier is None: + model_configs.append(None) + previous_layer_predictions_train.append(None) + previous_layer_predictions_test.append(None) + continue + seed, num_run, budget = model_identifier + + self._logger.debug(f'num_run: {num_run}') + config = get_config_from_run_history(self.run_history, num_run=num_run) # self.run_history.ids_config.get(num_run-smac_initial_run, None) + self._logger.debug(f'Configuration from previous layer: {config}') + model_configs.append((config, budget)) + previous_layer_predictions_train.extend( + [np.load(os.path.join( + self._backend.get_numrun_directory(seed=seed, num_run=num_run, budget=budget), + self._backend.get_prediction_filename('ensemble', seed, num_run, budget) + ), allow_pickle=True)] * int(weight * ensemble_size)) + previous_layer_predictions_test.extend([np.load(os.path.join( + self._backend.get_numrun_directory(seed=seed, num_run=num_run, budget=budget), + self._backend.get_prediction_filename('test', seed, num_run, budget) + ), allow_pickle=True)] * int(weight * ensemble_size)) + return model_configs,previous_layer_predictions_train,previous_layer_predictions_test + + def _update_configs_for_current_config_space( + self, + model_description: List[Tuple], + dataset: BaseDataset, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + assert_skew_transformer_quantile: bool = False + ) -> List[Tuple]: + + search_space_updates = search_space_updates if search_space_updates is not None else self.search_space_updates + + dataset_properties = self._get_dataset_properties(dataset=dataset) + current_search_space = self._get_search_space( + dataset, + include_components=self.include_components, + exclude_components=self.exclude_components, + search_space_updates=search_space_updates) + self._logger.debug(f"dataset properties after appending predictions: {dict_repr(dataset_properties)}") + n_numerical_in_incumbent_on_task_id = len(self.dataset.numerical_columns) + num_numerical = len(dataset.numerical_columns) + updated_model_descriptions = [] + for config, budget in model_description: + if config is None: + continue + + if not isinstance(config, (Configuration, dict)): + updated_model_descriptions.append((config, budget)) + continue + + updated_config = validate_config( + config=config, + search_space=current_search_space, + num_numerical=num_numerical, + n_numerical_in_incumbent_on_task_id=n_numerical_in_incumbent_on_task_id, + assert_autogluon_numerical_hyperparameters=assert_skew_transformer_quantile + ) + updated_model_descriptions.append((updated_config, budget)) + return updated_model_descriptions, current_search_space + + def _run_smbo( + self, + min_budget, + max_budget, + total_walltime_limit, + func_eval_time_limit_secs, + smac_scenario_args, + portfolio_selection, + experiment_task_name, + proc_ensemble, + num_stacking_layers, + get_smac_object_callback=None, + tae_func=None, + smbo_class=None, + ) -> int: + smac_initial_num_run = self._backend.get_next_num_run(peek=True) + proc_runhistory_updater = None + 
if ( + self.ensemble_method == EnsembleSelectionTypes.stacking_optimisation_ensemble + and smbo_class is not None + ): + proc_runhistory_updater = self._init_result_history_updater(initial_num_run=smac_initial_num_run) + + # ==> Run SMAC + smac_task_name: str = 'runSMAC' + self._stopwatch.start_task(smac_task_name) + elapsed_time = self._stopwatch.wall_elapsed(experiment_task_name) + time_left_for_smac = max(0, total_walltime_limit - elapsed_time) + + self._logger.info("Starting SMAC with %5.2f sec time left" % time_left_for_smac) + if time_left_for_smac <= 0: + self._logger.warning(" Not starting SMAC because there is no time left") + else: + _proc_smac = AutoMLSMBO( + config_space=self.search_space, + dataset_name=str(self.dataset_name), + backend=self._backend, + total_walltime_limit=total_walltime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs, + dask_client=self._dask_client, + memory_limit=self._memory_limit, + n_jobs=self.n_jobs, + watcher=self._stopwatch, + metric=self._metric, + seed=self.seed, + include=self.include_components, + exclude=self.exclude_components, + disable_file_output=self._disable_file_output, + all_supported_metrics=self._all_supported_metrics, + smac_scenario_args=smac_scenario_args, + get_smac_object_callback=get_smac_object_callback, + pipeline_config=self.pipeline_options, + min_budget=min_budget, + max_budget=max_budget, + ensemble_callback=proc_ensemble, + ensemble_method=self.ensemble_method, + logger_port=self._logger_port, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + # We do not increase the num_run here, this is something + # smac does internally + start_num_run=smac_initial_num_run, + search_space_updates=self.search_space_updates, + portfolio_selection=portfolio_selection, + pynisher_context=self._multiprocessing_context, + smbo_class=smbo_class, + use_ensemble_opt_loss=self.use_ensemble_opt_loss, + other_callbacks=[proc_runhistory_updater] if proc_runhistory_updater is not None else None, + num_stacking_layers=num_stacking_layers + ) + try: + run_history, self._results_manager.trajectory, budget_type = \ + _proc_smac.run_smbo(func=tae_func) + self.run_history.update(run_history, DataOrigin.INTERNAL) + trajectory_filename = os.path.join( + self._backend.get_smac_output_directory_for_run(self.seed), + 'trajectory.json') + + assert self.trajectory is not None # mypy check + saveable_trajectory = \ + [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:]) + for entry in self.trajectory] + try: + with open(trajectory_filename, 'w') as fh: + json.dump(saveable_trajectory, fh) + except Exception as e: + self._logger.warning(f"Cannot save {trajectory_filename} due to {e}...") + except Exception as e: + self._logger.exception(str(e)) + raise + return smac_initial_num_run + def _search( self, optimize_metric: str, @@ -979,7 +1389,8 @@ def _search( portfolio_selection: Optional[str] = None, dask_client: Optional[dask.distributed.Client] = None, smbo_class: Optional[SMBO] = None, - use_ensemble_opt_loss: bool = False + use_ensemble_opt_loss: bool = False, + posthoc_ensemble_fit_stacking_ensemble_optimization: bool = False ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. 
@@ -1112,6 +1523,184 @@ def _search( self """ + experiment_task_name: str = 'runSearch' + + self._init_required_args( + experiment_task_name=experiment_task_name, + optimize_metric=optimize_metric, + dataset=dataset, + budget_type=budget_type, + max_budget=max_budget, + total_walltime_limit=total_walltime_limit, + memory_limit=memory_limit, + all_supported_metrics=all_supported_metrics, + precision=precision, + disable_file_output=disable_file_output, + dask_client=dask_client + ) + + # Handle time resource allocation + elapsed_time = self._stopwatch.wall_elapsed(experiment_task_name) + time_left_for_modelfit = int(max(0, total_walltime_limit - elapsed_time)) + if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_left_for_modelfit: + self._logger.warning( + 'Time limit for a single run is higher than total time ' + 'limit. Capping the limit for a single run to the total ' + 'time given to SMAC (%f)' % time_left_for_modelfit + ) + func_eval_time_limit_secs = time_left_for_modelfit + + # Make sure that at least 2 models are created for the ensemble process + num_models = time_left_for_modelfit // func_eval_time_limit_secs + if num_models < 2 and self.ensemble_size > 0: + func_eval_time_limit_secs = time_left_for_modelfit // 2 + self._logger.warning( + "Capping the func_eval_time_limit_secs to {} to have " + "time for a least 2 models to ensemble.".format( + func_eval_time_limit_secs + ) + ) + + self.pipeline_options['func_eval_time_limit_secs'] = func_eval_time_limit_secs + # ============> Run dummy predictions + # We only want to run dummy predictions in case we want to build an ensemble + if self.ensemble_size > 0 and self.ensemble_method != EnsembleSelectionTypes.stacking_optimisation_ensemble: + dummy_task_name = 'runDummy' + self._stopwatch.start_task(dummy_task_name) + self._do_dummy_prediction() + self._stopwatch.stop_task(dummy_task_name) + + # ============> Run traditional ml + # We only want to run traditional predictions in case we want to build an ensemble + # We want time for at least 1 Neural network in SMAC + if enable_traditional_pipeline and self.ensemble_size > 0 and self.ensemble_method != EnsembleSelectionTypes.stacking_optimisation_ensemble: + traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs) + self.run_traditional_ml(current_task_name=self.dataset_name, + runtime_limit=traditional_runtime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs) + + # ============> Starting ensemble + self.use_ensemble_opt_loss = use_ensemble_opt_loss + if self.ensemble_method == EnsembleSelectionTypes.stacking_repeat_models: + elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) + time_left_for_stacking = max(0, total_walltime_limit - elapsed_time) + self._run_search_stacking( + optimize_metric=optimize_metric, + min_budget=min_budget, + max_budget=max_budget, + smac_scenario_args=smac_scenario_args, + total_walltime_limit=time_left_for_stacking, + func_eval_time_limit_secs=func_eval_time_limit_secs, + budget_type=budget_type, + portfolio_selection=portfolio_selection, + tae_func=tae_func, + precision=precision, + experiment_task_name=experiment_task_name + ) + else: + self.precision = precision + self.opt_metric = optimize_metric + elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) + time_left_for_ensembles = max(0, total_walltime_limit - elapsed_time) + posthoc_ensemble_fit_stacking_ensemble_optimization = posthoc_ensemble_fit_stacking_ensemble_optimization \ + and self.ensemble_method == 
EnsembleSelectionTypes.stacking_optimisation_ensemble + TIME_ALLOCATION_FACTOR_POSTHOC_ENSEMBLE_FIT = 0.95 + time_left_for_ensembles = int(time_left_for_ensembles * TIME_ALLOCATION_FACTOR_POSTHOC_ENSEMBLE_FIT) if posthoc_ensemble_fit_stacking_ensemble_optimization else time_left_for_ensembles + proc_ensemble = None + if time_left_for_ensembles <= 0: + # Fit only raises error when ensemble_size is not zero but + # time_left_for_ensembles is zero. + if self.ensemble_size > 0: + raise ValueError("Not starting ensemble builder because there " + "is no time left. Try increasing the value " + "of time_left_for_this_task.") + elif self.ensemble_size <= 0: + self._logger.info("Not starting ensemble builder as ensemble size is 0") + else: + self._logger.info("Starting ensemble") + proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles, + ensemble_size=self.ensemble_size, + ensemble_nbest=self.ensemble_nbest, + precision=precision, + optimize_metric=self.opt_metric, + ensemble_method=self.ensemble_method, + num_stacking_layers=self.num_stacking_layers + ) + + self._run_smbo( + min_budget=min_budget, + max_budget=max_budget, + total_walltime_limit=total_walltime_limit * TIME_ALLOCATION_FACTOR_POSTHOC_ENSEMBLE_FIT \ + if posthoc_ensemble_fit_stacking_ensemble_optimization \ + else total_walltime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs, + smac_scenario_args=smac_scenario_args, + get_smac_object_callback=get_smac_object_callback, + tae_func=tae_func, + portfolio_selection=portfolio_selection, + smbo_class=smbo_class, + experiment_task_name=experiment_task_name, + proc_ensemble=proc_ensemble, + num_stacking_layers=self.num_stacking_layers + ) + + if proc_ensemble is not None: + self._collect_results_ensemble(proc_ensemble) + # Wait until the ensemble process is finished to avoid shutting down + # while the ensemble builder tries to access the data + self._logger.info("Starting Shutdown") + + if posthoc_ensemble_fit_stacking_ensemble_optimization: + ensemble = self._backend.load_ensemble(self.seed) + initial_num_run = int(open(os.path.join(self._backend.internals_directory, 'ensemble_cutoff_run.txt'), 'r').read()) + time_for_post_fit_ensemble = max(0, total_walltime_limit-self._stopwatch.wall_elapsed(self.dataset_name)) + iteration = (self.num_stacking_layers+1)*ENSEMBLE_ITERATION_MULTIPLIER + final_model_identifiers, final_weights = self._posthoc_fit_ensemble( + optimize_metric=self.opt_metric, + time_left_for_ensemble=time_for_post_fit_ensemble, + last_successful_smac_initial_num_run=initial_num_run + 1, + ensemble_size=self.ensemble_size, + iteration=iteration, + enable_traditional_pipeline=enable_traditional_pipeline, + cleanup=False, + func_eval_time_limit_secs=0.5*func_eval_time_limit_secs + ) + ensemble.identifiers_ = final_model_identifiers + stacked_ensemble_identifiers = ensemble.stacked_ensemble_identifiers + broken = False + for i, layer_identifiers in enumerate(stacked_ensemble_identifiers): + if all([identifier is None for identifier in layer_identifiers]): + broken = True + break + last_nonnull_layer = i-1 if broken else i + self._logger.debug(f"broken: {broken}, lastnonnull layer: {last_nonnull_layer}, i: {i}") + ensemble.stacked_ensemble_identifiers[last_nonnull_layer] = final_model_identifiers + ensemble.weights_ = final_weights + self._backend.save_ensemble(ensemble, iteration+1, self.seed) + + if load_models: + self._logger.info("Loading models...") + self._load_models() + self._logger.info("Finished loading models...") + + 
self._cleanup() + + return self + + def _init_required_args( + self, + experiment_task_name: str, + optimize_metric: str, + dataset: BaseDataset, + budget_type: str, + max_budget: int, + total_walltime_limit: int, + memory_limit: int, + all_supported_metrics: bool, + precision: int, + dask_client: Optional[dask.distributed.Client] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None + ) -> None: if self.task_type != dataset.task_type: raise ValueError("Incompatible dataset entered for current task," "expected dataset to have task type :{} but got " @@ -1120,14 +1709,7 @@ def _search( raise ValueError("precision must be one of 16, 32, 64 but got {}".format(precision)) # Initialise information needed for the experiment - experiment_task_name: str = 'runSearch' - dataset_requirements = get_dataset_requirements( - info=dataset.get_required_dataset_info(), - include=self.include_components, - exclude=self.exclude_components, - search_space_updates=self.search_space_updates) - self._dataset_requirements = dataset_requirements - dataset_properties = dataset.get_dataset_properties(dataset_requirements) + dataset_properties = self._get_dataset_properties(dataset) self._stopwatch.start_task(experiment_task_name) self.dataset_name = dataset.dataset_name assert self.dataset_name is not None @@ -1195,161 +1777,16 @@ def _search( else: self._dask_client = dask_client self._is_dask_client_internally_created = False + return - # Handle time resource allocation - elapsed_time = self._stopwatch.wall_elapsed(experiment_task_name) - time_left_for_modelfit = int(max(0, total_walltime_limit - elapsed_time)) - if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_left_for_modelfit: - self._logger.warning( - 'Time limit for a single run is higher than total time ' - 'limit. 
Capping the limit for a single run to the total ' - 'time given to SMAC (%f)' % time_left_for_modelfit - ) - func_eval_time_limit_secs = time_left_for_modelfit - - # Make sure that at least 2 models are created for the ensemble process - num_models = time_left_for_modelfit // func_eval_time_limit_secs - if num_models < 2 and self.ensemble_size > 0: - func_eval_time_limit_secs = time_left_for_modelfit // 2 - self._logger.warning( - "Capping the func_eval_time_limit_secs to {} to have " - "time for a least 2 models to ensemble.".format( - func_eval_time_limit_secs - ) - ) - - # ============> Run dummy predictions - # We only want to run dummy predictions in case we want to build an ensemble - if self.ensemble_size > 0 and self.ensemble_method != EnsembleSelectionTypes.stacking_ensemble: - dummy_task_name = 'runDummy' - self._stopwatch.start_task(dummy_task_name) - self._do_dummy_prediction() - self._stopwatch.stop_task(dummy_task_name) - - # ============> Run traditional ml - # We only want to run traditional predictions in case we want to build an ensemble - # We want time for at least 1 Neural network in SMAC - if enable_traditional_pipeline and self.ensemble_size > 0: - traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs) - self.run_traditional_ml(current_task_name=self.dataset_name, - runtime_limit=traditional_runtime_limit, - func_eval_time_limit_secs=func_eval_time_limit_secs) - - # ============> Starting ensemble - self.use_ensemble_opt_loss = use_ensemble_opt_loss - self.precision = precision - self.opt_metric = optimize_metric - elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) - time_left_for_ensembles = max(0, total_walltime_limit - elapsed_time) - proc_ensemble = None - if time_left_for_ensembles <= 0: - # Fit only raises error when ensemble_size is not zero but - # time_left_for_ensembles is zero. - if self.ensemble_size > 0: - raise ValueError("Not starting ensemble builder because there " - "is no time left. 
Try increasing the value " - "of time_left_for_this_task.") - elif self.ensemble_size <= 0: - self._logger.info("Not starting ensemble builder as ensemble size is 0") - else: - self._logger.info("Starting ensemble") - proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles, - ensemble_size=self.ensemble_size, - ensemble_nbest=self.ensemble_nbest, - precision=precision, - optimize_metric=self.opt_metric, - ensemble_method=self.ensemble_method - ) - - smac_initial_num_run = self._backend.get_next_num_run(peek=True) - - proc_runhistory_updater = None - if ( - self.ensemble_method == EnsembleSelectionTypes.stacking_ensemble - and smbo_class is not None - ): - proc_runhistory_updater = self._init_result_history_updater(initial_num_run=smac_initial_num_run) - - # ==> Run SMAC - smac_task_name: str = 'runSMAC' - self._stopwatch.start_task(smac_task_name) - elapsed_time = self._stopwatch.wall_elapsed(experiment_task_name) - time_left_for_smac = max(0, total_walltime_limit - elapsed_time) - - self._logger.info("Starting SMAC with %5.2f sec time left" % time_left_for_smac) - if time_left_for_smac <= 0: - self._logger.warning(" Not starting SMAC because there is no time left") - else: - - _proc_smac = AutoMLSMBO( - config_space=self.search_space, - dataset_name=str(dataset.dataset_name), - backend=self._backend, - total_walltime_limit=total_walltime_limit, - func_eval_time_limit_secs=func_eval_time_limit_secs, - dask_client=self._dask_client, - memory_limit=self._memory_limit, - n_jobs=self.n_jobs, - watcher=self._stopwatch, - metric=self._metric, - seed=self.seed, - include=self.include_components, - exclude=self.exclude_components, - disable_file_output=self._disable_file_output, - all_supported_metrics=self._all_supported_metrics, - smac_scenario_args=smac_scenario_args, - get_smac_object_callback=get_smac_object_callback, - pipeline_config=self.pipeline_options, - min_budget=min_budget, - max_budget=max_budget, - ensemble_callback=proc_ensemble, - ensemble_method=self.ensemble_method, - logger_port=self._logger_port, - # We do not increase the num_run here, this is something - # smac does internally - start_num_run=smac_initial_num_run, - search_space_updates=self.search_space_updates, - portfolio_selection=portfolio_selection, - pynisher_context=self._multiprocessing_context, - smbo_class = smbo_class, - use_ensemble_opt_loss=self.use_ensemble_opt_loss, - other_callbacks=[proc_runhistory_updater] if proc_runhistory_updater is not None else None - ) - try: - run_history, self._results_manager.trajectory, budget_type = \ - _proc_smac.run_smbo(func=tae_func) - self.run_history.update(run_history, DataOrigin.INTERNAL) - trajectory_filename = os.path.join( - self._backend.get_smac_output_directory_for_run(self.seed), - 'trajectory.json') - - assert self.trajectory is not None # mypy check - saveable_trajectory = \ - [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:]) - for entry in self.trajectory] - try: - with open(trajectory_filename, 'w') as fh: - json.dump(saveable_trajectory, fh) - except Exception as e: - self._logger.warning(f"Cannot save {trajectory_filename} due to {e}...") - except Exception as e: - self._logger.exception(str(e)) - raise - # Wait until the ensemble process is finished to avoid shutting down - # while the ensemble builder tries to access the data - self._logger.info("Starting Shutdown") - - if proc_ensemble is not None: - self._collect_results_ensemble(proc_ensemble) - - if load_models: - self._logger.info("Loading 
models...") - self._load_models() - self._logger.info("Finished loading models...") - - self._cleanup() - - return self + def _get_dataset_properties(self, dataset): + dataset_requirements = get_dataset_requirements( + info=dataset.get_required_dataset_info(), + include=self.include_components, + exclude=self.exclude_components, + search_space_updates=self.search_space_updates) + dataset_properties = dataset.get_dataset_properties(dataset_requirements) + return dataset_properties def _get_fit_dictionary( self, @@ -1451,7 +1888,7 @@ def fit_pipeline( X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, dataset_name: Optional[str] = None, - resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, run_time_limit_secs: int = 60, memory_limit: Optional[int] = None, @@ -1631,7 +2068,7 @@ def fit_pipeline( pipeline_options = self.pipeline_options.copy().update(pipeline_options) if pipeline_options is not None \ else self.pipeline_options.copy() - + pipeline_options['func_eval_time_limit_secs'] = run_time_limit_secs assert pipeline_options is not None if budget_type is not None: @@ -1727,10 +2164,14 @@ def fit_ensemble( ensemble_nbest: int = 50, ensemble_size: int = 50, ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, + num_stacking_layers: int = 1, + initial_num_run: int = 0, load_models: bool = True, time_for_task: int = 100, func_eval_time_limit_secs: int = 50, enable_traditional_pipeline: bool = True, + iteration: int = 0, + cleanup: bool = True ) -> 'BaseTask': """ Enables post-hoc fitting of the ensemble after the `search()` @@ -1778,7 +2219,7 @@ def fit_ensemble( self """ # Make sure that input is valid - if self.dataset is None or self.opt_metric is None: + if self.dataset is None: raise ValueError("fit_ensemble() can only be called after `search()`. " "Please call the `search()` method of {} prior to " "fit_ensemble().".format(self.__class__.__name__)) @@ -1837,6 +2278,10 @@ def fit_ensemble( precision=precision, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + ensemble_method=ensemble_method, + num_stacking_layers=num_stacking_layers, + initial_num_run=initial_num_run, + iteration=iteration ) manager.build_ensemble(self._dask_client) @@ -1848,7 +2293,8 @@ def fit_ensemble( self._stopwatch.stop_task(ensemble_fit_task_name) - self._cleanup() + if cleanup: + self._cleanup() return self @@ -1859,7 +2305,10 @@ def _init_ensemble_builder( ensemble_method: int, ensemble_nbest: int, ensemble_size: int, + num_stacking_layers: Optional[int] = None, precision: int = 32, + initial_num_run: int = 0, + iteration: int = 0, ) -> EnsembleBuilderManager: """ Initializes an `EnsembleBuilderManager`. 
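A minimal post-hoc usage sketch for the extended fit_ensemble() signature above; `api` stands for an already-searched task instance, and every argument value here is illustrative rather than taken from a real run:

    api.fit_ensemble(
        optimize_metric='accuracy',
        ensemble_size=5,
        ensemble_nbest=10,
        num_stacking_layers=1,        # new: number of stacking layers the builder should produce
        initial_num_run=0,            # new: first num_run whose predictions are considered
        iteration=0,                  # new: iteration tag under which the ensemble is saved
        cleanup=False,                # new: keep the temporary directory for a later predict()
        time_for_task=100,
        func_eval_time_limit_secs=50,
    )
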
@@ -1919,7 +2368,10 @@ def _init_ensemble_builder( random_state=self.seed, precision=precision, logger_port=self._logger_port, - use_ensemble_loss=self.use_ensemble_opt_loss + use_ensemble_loss=self.use_ensemble_opt_loss, + num_stacking_layers=num_stacking_layers, + initial_num_run=initial_num_run, + iteration=iteration ) self._stopwatch.stop_task(ensemble_task_name) @@ -2000,19 +2452,51 @@ def predict( # Mypy assert assert self.ensemble_ is not None, "Load models should error out if no ensemble" + predictions = self._predict_with_ensemble(X_test=X_test, batch_size=batch_size, n_jobs=n_jobs) + + self._cleanup() + + return predictions + + def _predict_with_ensemble(self, X_test, batch_size, n_jobs) -> np.ndarray: + + assert self.ensemble_ is not None, "Load models should error out if no ensemble" if isinstance(self.resampling_strategy, (HoldoutValTypes, NoResamplingStrategyTypes)): models = self.models_ - elif isinstance(self.resampling_strategy, CrossValTypes): + elif isinstance(self.resampling_strategy, (CrossValTypes, RepeatedCrossValTypes)): models = self.cv_models_ - all_predictions = joblib.Parallel(n_jobs=n_jobs)( - joblib.delayed(_pipeline_predict)( - models[identifier], X_test, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type] - ) - for identifier in self.ensemble_.get_selected_model_identifiers() if identifier is not None - ) + X_test_copy = X_test.copy() + if self.ensemble_method.is_stacking_ensemble(): + ensemble_identifiers = self.ensemble_.get_selected_model_identifiers() + self._logger.debug(f"ensemble identifiers: {ensemble_identifiers}") + for i, (model, layer_identifiers) in enumerate(zip(models, ensemble_identifiers)): + if all([identifier is None for identifier in layer_identifiers]): + break + self._logger.debug(f"layer : {i} of stacking ensemble,\n layer identifiers: {layer_identifiers},\n model: {model}") + all_predictions = joblib.Parallel(n_jobs=n_jobs)( + joblib.delayed(_pipeline_predict)( + model[identifier], X_test_copy, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type] + ) + for identifier in layer_identifiers if identifier is not None + ) + if self.ensemble_method in (EnsembleSelectionTypes.stacking_ensemble_selection_per_layer, EnsembleSelectionTypes.stacking_repeat_models, EnsembleSelectionTypes.stacking_autogluon): + concat_all_predictions = self.ensemble_.get_expanded_layer_stacking_ensemble_predictions( + stacking_layer=i, raw_stacking_layer_ensemble_predictions=all_predictions) + else: + concat_all_predictions = all_predictions + + X_test_copy = np.concatenate([X_test, *concat_all_predictions], axis=1) + else: + all_predictions = joblib.Parallel(n_jobs=n_jobs)( + joblib.delayed(_pipeline_predict)( + models[identifier], X_test_copy, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type] + ) + for identifier in self.ensemble_.get_selected_model_identifiers() + ) + if len(all_predictions) == 0: raise ValueError('Something went wrong generating the predictions. ' 'The ensemble should consist of the following ' @@ -2118,13 +2602,23 @@ def show_models(self) -> str: str: Markdown table of models. 
""" - df = [] - for weight, model in self.get_models_with_weights(): - representation = model.get_pipeline_representation() - representation.update({'Weight': weight}) - df.append(representation) - models_markdown: str = pd.DataFrame(df).to_markdown() - return models_markdown + if self.ensemble_method.is_stacking_ensemble(): + df = [] + for layer, model_weight in enumerate(self.get_models_with_weights()): + for weight, model in model_weight: + representation = model.get_pipeline_representation() + representation.update({'Weight': weight, "Stacking Layer": layer}) + df.append(representation) + models_markdown: str = pd.DataFrame(df).to_markdown() + return models_markdown + else: + df = [] + for weight, model in self.get_models_with_weights(): + representation = model.get_pipeline_representation() + representation.update({'Weight': weight}) + df.append(representation) + models_markdown: str = pd.DataFrame(df).to_markdown() + return models_markdown def _print_debug_info_to_log(self) -> None: """ diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 3e6354c03..f1ac64d58 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -1,5 +1,7 @@ from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union +import dask.distributed + import numpy as np import pandas as pd @@ -91,6 +93,7 @@ def __init__( ensemble_size: int = 50, ensemble_nbest: int = 50, ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, + num_stacking_layers: int = 1, max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, @@ -100,6 +103,7 @@ def __init__( exclude_components: Optional[Dict[str, Any]] = None, resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, + feat_type: Optional[List[str]] = None, backend: Optional[Backend] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ): @@ -121,8 +125,10 @@ def __init__( backend=backend, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, + feat_type=feat_type, search_space_updates=search_space_updates, task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION], + num_stacking_layers=num_stacking_layers ) def build_pipeline( @@ -169,6 +175,7 @@ def _get_dataset_input_validator( y_train: Union[List, pd.DataFrame, np.ndarray], X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + feat_type: Optional[List] = None, resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, @@ -210,13 +217,14 @@ def _get_dataset_input_validator( resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ self.resampling_strategy_args - + feat_type = feat_type if feat_type is not None else self.feat_type # Create a validator object to make sure that the data provided by # the user matches the autopytorch requirements input_validator = TabularInputValidator( is_classification=True, logger_port=self._logger_port, - dataset_compression=dataset_compression + dataset_compression=dataset_compression, + feat_type=feat_type ) # Fit a input validator to check the provided data 
@@ -235,6 +243,51 @@ def _get_dataset_input_validator( return dataset, input_validator + def run_autogluon_stacking( + self, + optimize_metric: str, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + dataset_name: Optional[str] = None, + max_budget: int = 50, + budget_type: str = 'epochs', + total_walltime_limit: int = 100, + func_eval_time_limit_secs: Optional[int] = None, + memory_limit: Optional[int] = 4096, + dataset_compression: Union[Mapping[str, Any], bool] = False, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + dask_client: Optional[dask.distributed.Client] = None + ): + self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression) + + self.dataset, self.input_validator = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + dataset_name=dataset_name, + dataset_compression=self._dataset_compression) + + return self._run_autogluon_stacking( + optimize_metric=optimize_metric, + dataset=self.dataset, + max_budget=max_budget, + budget_type=budget_type, + total_walltime_limit=total_walltime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs, + memory_limit=memory_limit, + all_supported_metrics=all_supported_metrics, + precision=precision, + disable_file_output=disable_file_output, + dask_client=dask_client, + ) + def search( self, optimize_metric: str, @@ -259,7 +312,8 @@ def search( portfolio_selection: Optional[str] = None, dataset_compression: Union[Mapping[str, Any], bool] = False, smbo_class: Optional[SMBO] = None, - use_ensemble_opt_loss=False + use_ensemble_opt_loss=False, + posthoc_ensemble_fit_stacking_ensemble_optimization: bool = False ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. 
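A minimal call sketch for the run_autogluon_stacking() entry point added above, reusing the `api` instance from the previous sketch; the data and time limits are illustrative only:

    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=10, random_state=1)
    api.run_autogluon_stacking(
        optimize_metric='accuracy',
        X_train=X[:150], y_train=y[:150],
        X_test=X[150:], y_test=y[150:],
        total_walltime_limit=300,          # overall budget in seconds
        func_eval_time_limit_secs=50,      # per-pipeline budget in seconds
    )
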
@@ -460,7 +514,8 @@ def search( load_models=load_models, portfolio_selection=portfolio_selection, smbo_class=smbo_class, - use_ensemble_opt_loss=use_ensemble_opt_loss + use_ensemble_opt_loss=use_ensemble_opt_loss, + posthoc_ensemble_fit_stacking_ensemble_optimization=posthoc_ensemble_fit_stacking_ensemble_optimization ) def predict( @@ -504,3 +559,4 @@ def predict_proba(self, "the estimator search() method.") X_test = self.input_validator.feature_validator.transform(X_test) return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) + diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index c9f21e453..b932cda2e 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -89,6 +89,7 @@ def __init__( ensemble_size: int = 50, ensemble_nbest: int = 50, ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, + num_stacking_layers: int = 1, max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, @@ -98,6 +99,7 @@ def __init__( exclude_components: Optional[Dict[str, Any]] = None, resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, + feat_type: Optional[List[str]] = None, backend: Optional[Backend] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ): @@ -109,6 +111,7 @@ def __init__( ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, ensemble_method=ensemble_method, + num_stacking_layers=num_stacking_layers, max_models_on_disc=max_models_on_disc, temporary_directory=temporary_directory, output_directory=output_directory, @@ -119,6 +122,7 @@ def __init__( backend=backend, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, + feat_type=feat_type, search_space_updates=search_space_updates, task_type=TASK_TYPES_TO_STRING[TABULAR_REGRESSION], ) @@ -167,6 +171,7 @@ def _get_dataset_input_validator( y_train: Union[List, pd.DataFrame, np.ndarray], X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + feat_type: Optional[List[str]] = None, resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, @@ -207,13 +212,14 @@ def _get_dataset_input_validator( resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ self.resampling_strategy_args - + feat_type = feat_type if feat_type is not None else self.feat_type # Create a validator object to make sure that the data provided by # the user matches the autopytorch requirements input_validator = TabularInputValidator( is_classification=False, logger_port=self._logger_port, - dataset_compression=dataset_compression + dataset_compression=dataset_compression, + feat_type=feat_type ) # Fit a input validator to check the provided data diff --git a/autoPyTorch/api/utils.py b/autoPyTorch/api/utils.py new file mode 100644 index 000000000..4559854f8 --- /dev/null +++ b/autoPyTorch/api/utils.py @@ -0,0 +1,139 @@ +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates +from smac.runhistory.runhistory import RunHistory + +def get_autogluon_default_nn_config(feat_type): + has_numerical_features = 
"numerical" in feat_type + has_cat_features = "categorical" in feat_type + search_space_updates = HyperparameterSearchSpaceUpdates() + + + # architecture head + search_space_updates.append( + node_name='network_head', + hyperparameter='__choice__', + value_range=['no_head'], + default_value='no_head', + ) + search_space_updates.append( + node_name='network_head', + hyperparameter='no_head:activation', + value_range=['relu', 'elu'], + default_value='relu', + ) + + # backbone architecture + search_space_updates.append( + node_name='network_backbone', + hyperparameter='__choice__', + value_range=['MLPBackbone'], + default_value='MLPBackbone', + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='MLPBackbone:num_groups', + value_range=(2, 4), + default_value=4, + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='MLPBackbone:num_units', + value_range=[128, 512], + default_value=128, + log=True + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='MLPBackbone:dropout', + value_range=(0.1, 0.5), + default_value=0.1, + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='MLPBackbone:activation', + value_range=['relu', 'elu'], + default_value='relu', + ) + + # training updates + search_space_updates.append( + node_name='lr_scheduler', + hyperparameter='__choice__', + value_range=['NoScheduler'], + default_value='NoScheduler', + ) + search_space_updates.append( + node_name='optimizer', + hyperparameter='__choice__', + value_range=['AdamOptimizer', 'SGDOptimizer'], + default_value='AdamOptimizer', + ) + search_space_updates.append( + node_name='optimizer', + hyperparameter='AdamOptimizer:lr', + value_range=[1e-4, 3e-2], + default_value=3e-4, + ) + search_space_updates.append( + node_name='optimizer', + hyperparameter='AdamOptimizer:weight_decay', + value_range=(1E-12, 0.1), + default_value=1e-6, + ) + search_space_updates.append( + node_name='data_loader', + hyperparameter='max_batch_size', + value_range=[512], + default_value=512, + ) + + # preprocessing + search_space_updates.append( + node_name='feature_preprocessor', + hyperparameter='__choice__', + value_range=['NoFeaturePreprocessor'], + default_value='NoFeaturePreprocessor', + ) + + if has_numerical_features: + search_space_updates.append( + node_name='imputer', + hyperparameter='numerical_strategy', + value_range=['median', 'mean', 'most_frequent'], + default_value='median', + ) + search_space_updates.append( + node_name='scaler', + hyperparameter='__choice__', + value_range=['StandardScaler'], + default_value='StandardScaler', + ) + # preprocessing + search_space_updates.append( + node_name='skew_transformer', + hyperparameter='__choice__', + value_range=['QuantileTransformer'], + default_value='QuantileTransformer', + ) + + if has_cat_features: + search_space_updates.append( + node_name='encoder', + hyperparameter='__choice__', + value_range=['OneHotEncoder', 'NoEncoder'], + default_value='OneHotEncoder', + ) + search_space_updates.append( + node_name="network_embedding", + hyperparameter="__choice__", + value_range=('NoEmbedding', 'LearnedEntityEmbedding'), + default_value='LearnedEntityEmbedding' + ) + + return search_space_updates + + +def get_config_from_run_history(run_history: RunHistory, num_run: int): + for _, run_value in run_history.data.items(): + if run_value.additional_info.get('num_run', -1) == num_run: # to ensure that unsuccessful configs are not returned + return 
run_value.additional_info['configuration'] + \ No newline at end of file diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index c2d3b1c91..3b0a45d01 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -46,7 +46,7 @@ def __init__( # Required for dataset properties self.num_features: Optional[int] = None - self.categories: List[List[int]] = [] + self.num_categories_per_col: List[List[int]] = [] self.categorical_columns: List[int] = [] self.numerical_columns: List[int] = [] diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index af7932557..c17bd4416 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -98,8 +98,10 @@ class TabularFeatureValidator(BaseFeatureValidator): def __init__( self, logger: Optional[Union[PicklableClientLogger, Logger]] = None, + feat_type: Optional[List[str]] = None ): super().__init__(logger) + self.feat_type = feat_type @staticmethod def _comparator(cmp1: str, cmp2: str) -> int: @@ -168,7 +170,10 @@ def _fit( self.dtypes = [dt.name for dt in X.dtypes] # Also note this change in self.dtypes self.all_nan_columns = set(all_nan_columns) - self.enc_columns, self.feat_type = self._get_columns_info(X) + if self.feat_type is not None: + self.enc_columns = [X.columns[i] for i, col in enumerate(self.feat_type) if col.lower() == 'categorical'] + else: + self.enc_columns, self.feat_type = self._get_columns_info(X) if len(self.enc_columns) > 0: @@ -193,10 +198,7 @@ def _fit( encoded_categories = self.column_transformer.\ named_transformers_['categorical_pipeline'].\ named_steps['ordinalencoder'].categories_ - self.categories = [ - list(range(len(cat))) - for cat in encoded_categories - ] + self.num_categories_per_col = [len(cat) for cat in encoded_categories] # differently to categorical_columns and numerical_columns, # this saves the index of the column. 
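A small sketch of the representational change from `categories` to `num_categories_per_col` above; the example category arrays are illustrative:

    import numpy as np

    encoded_categories = [np.array(['a', 'b', 'c']), np.array(['x', 'y'])]

    # old attribute: one list of category indices per categorical column -> [[0, 1, 2], [0, 1]]
    categories = [list(range(len(cat))) for cat in encoded_categories]

    # new attribute: just the cardinality per categorical column -> [3, 2]
    num_categories_per_col = [len(cat) for cat in encoded_categories]
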
diff --git a/autoPyTorch/data/tabular_validator.py b/autoPyTorch/data/tabular_validator.py index 492327fbe..347708d92 100644 --- a/autoPyTorch/data/tabular_validator.py +++ b/autoPyTorch/data/tabular_validator.py @@ -1,6 +1,6 @@ # -*- encoding: utf-8 -*- import logging -from typing import Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy as np @@ -48,12 +48,14 @@ def __init__( logger_port: Optional[int] = None, dataset_compression: Optional[DatasetCompressionSpec] = None, seed: int = 42, + feat_type: Optional[List[str]] = None ): self.dataset_compression = dataset_compression self._reduced_dtype: Optional[DatasetDTypeContainerType] = None self.is_classification = is_classification self.logger_port = logger_port self.seed = seed + self.feat_type = feat_type if self.logger_port is not None: self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger( name='Validation', @@ -63,7 +65,8 @@ def __init__( self.logger = logging.getLogger('Validation') self.feature_validator = TabularFeatureValidator( - logger=self.logger) + logger=self.logger, + feat_type=self.feat_type) self.target_validator = TabularTargetValidator( is_classification=self.is_classification, logger=self.logger diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index baea81680..36729c807 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -25,7 +25,10 @@ NoResamplingFunc, NoResamplingFuncs, NoResamplingStrategyTypes, - ResamplingStrategies + ResamplingStrategies, + RepeatedCrossValFunc, + RepeatedCrossValFuncs, + RepeatedCrossValTypes ) from autoPyTorch.utils.common import FitRequirement, ispandas @@ -154,6 +157,7 @@ def __init__( self.cross_validators: Dict[str, CrossValFunc] = {} self.holdout_validators: Dict[str, HoldOutFunc] = {} self.no_resampling_validators: Dict[str, NoResamplingFunc] = {} + self.repeated_cross_validators: Dict[str, RepeatedCrossValFunc] = {} self.random_state = np.random.RandomState(seed=seed) self.shuffle = shuffle self.resampling_strategy = resampling_strategy @@ -167,7 +171,7 @@ def __init__( # Make sure cross validation splits are created once self.cross_validators = CrossValFuncs.get_cross_validators(*CrossValTypes) self.holdout_validators = HoldOutFuncs.get_holdout_validators(*HoldoutValTypes) - + self.repeated_cross_validators = RepeatedCrossValFuncs.get_repeated_cross_validators(*RepeatedCrossValTypes) self.no_resampling_validators = NoResamplingFuncs.get_no_resampling_validators(*NoResamplingStrategyTypes) self.splits = self.get_splits_from_resampling_strategy() @@ -237,12 +241,12 @@ def __len__(self) -> int: def _get_indices(self) -> np.ndarray: return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self)) - def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[List[int]]]]: + def get_splits_from_resampling_strategy(self) -> List[List[Tuple[List[int], Optional[List[int]]]]]: """ Creates a set of splits based on a resampling strategy provided Returns - (List[Tuple[List[int], List[int]]]): splits in the [train_indices, val_indices] format + (List[List[Tuple[List[int], Optional[List[int]]]]]): splits in the [train_indices, val_indices] format """ splits = [] if isinstance(self.resampling_strategy, HoldoutValTypes): @@ -251,10 +255,12 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[ if self.resampling_strategy_args is not None: val_share = 
self.resampling_strategy_args.get('val_share', val_share) splits.append( - self.create_holdout_val_split( - holdout_val_type=self.resampling_strategy, - val_share=val_share, - ) + [ + self.create_holdout_val_split( + holdout_val_type=self.resampling_strategy, + val_share=val_share, + ) + ] ) elif isinstance(self.resampling_strategy, CrossValTypes): num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( @@ -262,15 +268,32 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[ if self.resampling_strategy_args is not None: num_splits = self.resampling_strategy_args.get('num_splits', num_splits) # Create the split if it was not created before - splits.extend( + splits.append( self.create_cross_val_splits( cross_val_type=self.resampling_strategy, num_splits=cast(int, num_splits), ) ) + elif isinstance(self.resampling_strategy, RepeatedCrossValTypes): + num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( + 'num_splits', None) + num_repeats = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( + 'num_repeats', None + ) + if self.resampling_strategy_args is not None: + num_splits = self.resampling_strategy_args.get('num_splits', num_splits) + num_repeats = self.resampling_strategy_args.get('num_repeats', num_splits) + # Create the split if it was not created before + splits.extend( + self.create_repeated_cross_val_splits( + repeated_cross_val_type=self.resampling_strategy, + num_splits=cast(int, num_splits), + num_repeats=cast(int, num_repeats) + ) + ) elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): - splits.append((self.no_resampling_validators[self.resampling_strategy.name](self.random_state, - self._get_indices()), None)) + splits.append([(self.no_resampling_validators[self.resampling_strategy.name](self.random_state, + self._get_indices()), None)]) else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") return splits @@ -307,6 +330,38 @@ def create_cross_val_splits( self.random_state, num_splits, self._get_indices(), **kwargs) return splits + def create_repeated_cross_val_splits( + self, + repeated_cross_val_type: RepeatedCrossValTypes, + num_splits: int, + num_repeats: int + ) -> List[List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]]: + """ + This function creates the cross validation split for the given task. + It is done once per dataset to have comparable results among pipelines + Args: + repeated_cross_val_type (RepeatedCrossValTypes): + num_splits (int): number of splits to be created + num_repeats (int): number of repeats of splits to be created + Returns: + (List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]): + list containing 'num_splits' splits. + """ + # Create just the split once + # This is gonna be called multiple times, because the current dataset + # is being used for multiple pipelines. That is, to be efficient with memory + # we dump the dataset to memory and read it on a need basis. 
So this function + # should be robust against multiple calls, and it does so by remembering the splits + if not isinstance(repeated_cross_val_type, RepeatedCrossValTypes): + raise NotImplementedError(f'The selected `repeated_cross_val_type` "{repeated_cross_val_type}" is not implemented.') + kwargs = {} + if repeated_cross_val_type.is_stratified(): + # we need additional information about the data for stratification + kwargs["stratify"] = self.train_tensors[-1] + splits = self.repeated_cross_validators[repeated_cross_val_type.name]( + random_state=self.random_state, num_splits=num_splits, num_repeats=num_repeats, indices=self._get_indices(), **kwargs) + return splits + def create_holdout_val_split( self, holdout_val_type: HoldoutValTypes, @@ -342,7 +397,7 @@ def create_holdout_val_split( self.random_state, val_share, self._get_indices(), **kwargs) return train, val - def get_dataset(self, split_id: int, train: bool) -> Dataset: + def get_dataset(self, split_id: int, train: bool, repeat_id: int = 0) -> Dataset: """ The above split methods employ the Subset to internally subsample the whole dataset. @@ -358,10 +413,14 @@ def get_dataset(self, split_id: int, train: bool) -> Dataset: Dataset: the reduced dataset to be used for testing """ # Subset creates a dataset. Splits is a (train_indices, test_indices) tuple - if split_id >= len(self.splits): # old version: split_id > len(self.splits) - raise IndexError(f"self.splits index out of range, got split_id={split_id}" - f" (>= num_splits={len(self.splits)})") - indices = self.splits[split_id][int(not train)] # 0: for training, 1: for evaluation + if repeat_id >= len(self.splits): + raise IndexError("repeat_id out of range, got repeat_id={}" + " (>= num_repeats={})".format(split_id, len(self.splits))) + if split_id >= len(self.splits[repeat_id]): + raise IndexError("split_id out of range, got split_id={}" + " (>= num_splits={})".format(split_id, len(self.splits[repeat_id]))) + subset = int(not train) + indices = self.splits[repeat_id][split_id][subset] if indices is None: raise ValueError("Specified fold (or subset) does not exist") diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 78447a04e..12750d8a4 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -9,7 +9,9 @@ StratifiedKFold, StratifiedShuffleSplit, TimeSeriesSplit, - train_test_split + train_test_split, + RepeatedKFold, + RepeatedStratifiedKFold ) from typing_extensions import Protocol @@ -39,6 +41,16 @@ def __call__(self, random_state: np.random.RandomState, val_share: float, ... +class RepeatedCrossValFunc(Protocol): + def __call__(self, + random_state: np.random.RandomState, + num_splits: int, + num_repeats: int, + indices: np.ndarray, + stratify: Optional[Any]) -> List[List[Tuple[np.ndarray, np.ndarray]]]: + ... + + class CrossValTypes(IntEnum): """The type of cross validation @@ -90,8 +102,29 @@ def is_stratified(self) -> bool: return False +class RepeatedCrossValTypes(IntEnum): + """The type of repeated cross validation + This class is used to specify the cross validation function + and is not supposed to be instantiated. 
+ Examples: This class is supposed to be used as follows + >>> cv_type = RepeatedCrossValTypes.repeated_k_fold_cross_validation + >>> print(cv_type.name) + repeated_k_fold_cross_validation + >>> for cross_val_type in CrossValTypes: + print(cross_val_type.name, cross_val_type.value) + stratified_repeated_k_fold_cross_validation 1 + repeated_k_fold_cross_validation 2 + """ + stratified_repeated_k_fold_cross_validation = 1 + repeated_k_fold_cross_validation = 2 + + def is_stratified(self) -> bool: + stratified = [self.stratified_repeated_k_fold_cross_validation] + return getattr(self, self.name) in stratified + + # TODO: replace it with another way -ResamplingStrategies = Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes] +ResamplingStrategies = Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes, RepeatedCrossValTypes] DEFAULT_RESAMPLING_PARAMETERS: Dict[ ResamplingStrategies, @@ -115,7 +148,11 @@ def is_stratified(self) -> bool: CrossValTypes.time_series_cross_validation: { 'num_splits': 5, }, - NoResamplingStrategyTypes.no_resampling: {} + NoResamplingStrategyTypes.no_resampling: {}, + RepeatedCrossValTypes.repeated_k_fold_cross_validation: { + 'num_splits': 2, + 'num_repeats': 2 + }, } @@ -270,3 +307,51 @@ def no_resampling(random_state: np.random.RandomState, np.ndarray: array of indices """ return indices + + +# TODO: Add resampling strategy for stacking, depends on the choice of implementation +class RepeatedCrossValFuncs: + @staticmethod + def repeated_k_fold_cross_validation(random_state: np.random.RandomState, + num_splits: int, + num_repeats: int, + indices: np.ndarray, + **kwargs: Any + ) -> List[List[Tuple[np.ndarray, np.ndarray]]]: + cv = RepeatedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=random_state) + + tmp_splits = list(cv.split(indices)) + splits = [] + for i in range(num_repeats): + folds = [] + for j in range(num_splits): + folds.append(tmp_splits[i*num_splits + j]) + splits.append(folds) + return splits + + @staticmethod + def stratified_repeated_k_fold_cross_validation(random_state: np.random.RandomState, + num_splits: int, + num_repeats: int, + indices: np.ndarray, + **kwargs: Any + ) -> List[List[Tuple[np.ndarray, np.ndarray]]]: + cv = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=random_state) + y=kwargs["stratify"] + tmp_splits = list(cv.split(indices, y[indices])) + splits = [] + for i in range(num_repeats): + folds = [] + for j in range(num_splits): + folds.append(tmp_splits[i*num_splits + j]) + splits.append(folds) + return splits + + @classmethod + def get_repeated_cross_validators(cls, *repeated_cross_validator_types: RepeatedCrossValTypes + ) -> Dict[str, RepeatedCrossValFunc]: + repeated_cross_validators: Dict[str, RepeatedCrossValFunc] = { + repeated_cross_validator.name: getattr(cls, repeated_cross_validator.name) + for repeated_cross_validator in repeated_cross_validator_types + } + return repeated_cross_validators diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py index 6cabfe525..5a15e759b 100644 --- a/autoPyTorch/datasets/tabular_dataset.py +++ b/autoPyTorch/datasets/tabular_dataset.py @@ -16,7 +16,7 @@ TABULAR_REGRESSION, TASK_TYPES_TO_STRING, ) -from autoPyTorch.data.base_validator import BaseInputValidator +from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, 
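A self-contained sketch of the nested split layout produced by RepeatedCrossValFuncs above and consumed by BaseDataset.get_dataset(split_id, train, repeat_id); the sizes are illustrative:

    import numpy as np
    from sklearn.model_selection import RepeatedKFold

    num_splits, num_repeats = 2, 2
    indices = np.arange(8)
    cv = RepeatedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=0)
    tmp_splits = list(cv.split(indices))       # flat list of num_splits * num_repeats folds

    # regroup so that splits[repeat_id][split_id] == (train_indices, val_indices)
    splits = [
        [tmp_splits[i * num_splits + j] for j in range(num_splits)]
        for i in range(num_repeats)
    ]
    train_indices, val_indices = splits[1][0]  # repeat 1, fold 0
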
@@ -65,7 +65,7 @@ def __init__(self, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, dataset_name: Optional[str] = None, - validator: Optional[BaseInputValidator] = None, + validator: Optional[TabularInputValidator] = None, ): # Take information from the validator, which guarantees clean data for the @@ -81,7 +81,8 @@ def __init__(self, self.categorical_columns = validator.feature_validator.categorical_columns self.numerical_columns = validator.feature_validator.numerical_columns self.num_features = validator.feature_validator.num_features - self.categories = validator.feature_validator.categories + self.num_categories_per_col = validator.feature_validator.num_categories_per_col + self.feat_type = validator.feature_validator.feat_type super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), shuffle=shuffle, resampling_strategy=resampling_strategy, diff --git a/autoPyTorch/datasets/utils.py b/autoPyTorch/datasets/utils.py new file mode 100644 index 000000000..aaa5d8df2 --- /dev/null +++ b/autoPyTorch/datasets/utils.py @@ -0,0 +1,48 @@ +from typing import Dict, List, Optional + +import numpy as np + +import pandas as pd + +from autoPyTorch.data.base_validator import BaseInputValidator +from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.datasets.resampling_strategy import ResamplingStrategies +from autoPyTorch.constants import ( + STRING_TO_TASK_TYPES, + CLASSIFICATION_TASKS, +) +from autoPyTorch.utils.data_classes import get_data_validator_class, get_dataset_class + + +def get_appended_dataset( + original_dataset: BaseDataset, + previous_layer_predictions_train: List[Optional[np.ndarray]], + previous_layer_predictions_test: List[Optional[np.ndarray]], + resampling_strategy: ResamplingStrategies, + resampling_strategy_args: Optional[Dict] + ) -> BaseDataset: + + X_train, y_train = original_dataset.train_tensors + X_test, y_test = original_dataset.test_tensors + + X_train = pd.DataFrame(np.concatenate([X_train, *previous_layer_predictions_train], axis=1)) + X_test = pd.DataFrame(np.concatenate([X_test, *previous_layer_predictions_test], axis=1)) + + new_feat_types: List[str] = original_dataset.feat_type.copy() + new_feat_types.extend(['numerical'] * (original_dataset.num_classes * len(previous_layer_predictions_train))) + validator: BaseInputValidator = get_data_validator_class(original_dataset.task_type)( + is_classification=STRING_TO_TASK_TYPES[original_dataset.task_type] in CLASSIFICATION_TASKS, + feat_type=new_feat_types) + validator.fit(X_train, y_train, X_test=X_test, y_test=y_test) + + dataset = get_dataset_class(original_dataset.task_type)( + X=X_train, + Y=y_train, + X_test=X_test, + Y_test=y_test, + validator=validator, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args) + + return dataset + diff --git a/autoPyTorch/ensemble/autogluon_stacking_ensemble.py b/autoPyTorch/ensemble/autogluon_stacking_ensemble.py new file mode 100644 index 000000000..4f255b4ab --- /dev/null +++ b/autoPyTorch/ensemble/autogluon_stacking_ensemble.py @@ -0,0 +1,158 @@ +from collections import Counter +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np + +from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble +from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection +from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.pipeline.components.training.metrics.base import 
autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss + + +class AutogluonStackingEnsemble(AbstractEnsemble): + def __init__( + self, + ) -> None: + self.ensemble_identifiers: Optional[List[List[Tuple[int, int, float]]]] = None + self.ensemble_weights: Optional[List[List]] = None + + def fit( + self, + identifiers: List[List[Tuple[int, int, float]]], + weights: List[List] + ) -> AbstractEnsemble: + """ + Builds a ensemble given the individual models out of fold predictions. + Fundamentally, defines a set of weights on how to perform a soft-voting + aggregation of the models in the given identifiers. + + Args: + predictions (List[np.ndarray]): + A list of individual model predictions of shape (n_datapoints, n_targets) + corresponding to the OutOfFold estimate of the ground truth + labels (np.ndarray): + The ground truth targets of shape (n_datapoints, n_targets) + identifiers: List[Tuple[int, int, float]] + A list of model identifiers, each with the form + (seed, number of run, budget) + + Returns: + A copy of self + """ + self.ensemble_identifiers = identifiers + self.ensemble_weights = weights + return self + + def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: + """ + Given a list of predictions from the individual model, this method + aggregates the predictions using a soft voting scheme with the weights + found during training. + + Args: + predictions (List[np.ndarray]): + A list of predictions from the individual base models. + + Returns: + average (np.ndarray): Soft voting predictions of ensemble models, using + the weights found during ensemble selection (self._weights) + """ + + average = np.zeros_like(predictions[0], dtype=np.float64) + tmp_predictions = np.empty_like(predictions[0], dtype=np.float64) + + # if predictions.shape[0] == len(self.weights_), + # predictions include those of zero-weight models. + if len(predictions) == len(self.ensemble_weights[-1]): + for pred, weight in zip(predictions, self.ensemble_weights[-1]): + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) + + # if prediction model.shape[0] == len(non_null_weights), + # predictions do not include those of zero-weight models. + elif len(predictions) == np.count_nonzero(self.ensemble_weights[-1]): + non_null_weights = [w for w in self.ensemble_weights[-1] if w > 0] + for pred, weight in zip(predictions, non_null_weights): + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) + + # If none of the above applies, then something must have gone wrong. + else: + raise ValueError("The dimensions of ensemble predictions" + " and ensemble weights do not match!") + del tmp_predictions + return average + + def __str__(self) -> str: + return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ + '\n\tWeights: %s\n\tIdentifiers: %s' % \ + (' '.join(['%d: %5f' % (idx, performance) + for idx, performance in enumerate(self.trajectory_)]), + self.indices_, self.ensemble_weights[-1], + ' '.join([str(identifier) for idx, identifier in + enumerate(self.identifiers_) + if self.ensemble_weights[-1][idx] > 0])) + + def get_models_with_weights( + self, + models: Dict[Any, BasePipeline] + ) -> List[Tuple[float, BasePipeline]]: + """ + Handy function to tag the provided input models with a given weight. + + Args: + models (List[Tuple[float, BasePipeline]]): + A dictionary that maps a model's name to it's actual python object. 
+ + Returns: + output (List[Tuple[float, BasePipeline]]): + each model with the related weight, sorted by ascending + performance. Notice that ensemble selection solves a minimization + problem. + """ + outputs = [] + for layer_models, identifiers, layer_weights in zip(models, self.ensemble_identifiers, self.ensemble_weights): + output = [] + for identifier, weight in zip(identifiers, layer_weights): + model = layer_models[identifier] + output.append((weight, model)) + output.sort(reverse=True, key=lambda t: t[0]) + outputs.append(output) + + return outputs + + def get_expanded_layer_stacking_ensemble_predictions( + self, + stacking_layer, + raw_stacking_layer_ensemble_predictions + ) -> List[np.ndarray]: + layer_weights = self.ensemble_weights[stacking_layer] + layer_size = len(self.ensemble_weights[stacking_layer]) + ensemble_predictions = [] + for weight, pred in zip(layer_weights, raw_stacking_layer_ensemble_predictions): + ensemble_predictions.extend([pred] * int(weight * layer_size)) + return ensemble_predictions + + def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: + """ + After training of ensemble selection, not all models will be used. + Some of them will have zero weight. This procedure filters this models + out. + + Returns: + output (List[Tuple[int, int, float]]): + The models actually used by ensemble selection + """ + return self.ensemble_identifiers + + def get_validation_performance(self) -> float: + """ + Returns the best optimization performance seen during hill climbing + + Returns: + (float): + best ensemble training performance + """ + return 0 + diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index ea2b77c97..1d075e151 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -59,6 +59,9 @@ def __init__( random_state: Optional[Union[int, np.random.RandomState]] = None, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, unit_test: bool = False, + initial_num_run: int = 0, + num_stacking_layers: Optional[int] = None, + use_ensemble_opt_loss = False ): """ Constructor @@ -125,6 +128,7 @@ def __init__( self.ensemble_size = ensemble_size self.performance_range_threshold = performance_range_threshold + self.initial_num_run = initial_num_run if isinstance(ensemble_nbest, numbers.Integral) and ensemble_nbest < 1: raise ValueError("Integer ensemble_nbest has to be larger 1: %s" % ensemble_nbest) @@ -593,6 +597,8 @@ def compute_loss_per_model(self) -> bool: # Mypy assumes sorted returns an object because of the lambda. 
Can't get to recognize the list # as a returning list, so as a work-around we skip next line for y_ens_fn, match, _seed, _num_run, _budget in sorted(to_read, key=lambda x: x[3]): # type: ignore + if _num_run < self.initial_num_run: + continue if self.read_at_most and n_read_files >= self.read_at_most: # limit the number of files that will be read # to limit memory consumption @@ -649,7 +655,6 @@ def compute_loss_per_model(self) -> bool: os.path.getmtime(y_ens_fn), ) - self.logger.debug(f"keys in losses {losses.keys()}") self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] # It is not needed to create the object here @@ -1107,7 +1112,7 @@ def _get_list_of_sorted_preds(self) -> List[Tuple[str, float, int]]: # We want small num_run first key=lambda x: (x[1], x[2]), )) - self.logger.debug(f"Selected keys: {sorted_keys}") + # self.logger.debug(f"Selected keys: {sorted_keys}") return sorted_keys def _delete_excess_models(self, selected_keys: List[str]) -> None: @@ -1130,6 +1135,8 @@ def _delete_excess_models(self, selected_keys: List[str]) -> None: # Don't waste time if not enough models to delete return + self.logger.debug(f"num sorted_keys before delete: {len(sorted_keys)}, pred files: {len(self.y_ens_files)}") + # The top self.max_resident_models models would be the candidates # Any other low performance model will be deleted # The list is in ascending order of score @@ -1154,7 +1161,8 @@ def _delete_excess_models(self, selected_keys: List[str]) -> None: _budget = float(match.group(3)) # Do not delete the dummy prediction - if _num_run == 1: + if _num_run == 1 or _num_run < self.initial_num_run: + self.logger.debug(f"skipping for numrun {_num_run}") continue numrun_dir = self.backend.get_numrun_directory(_seed, _num_run, _budget) diff --git a/autoPyTorch/ensemble/ensemble_builder_manager.py b/autoPyTorch/ensemble/ensemble_builder_manager.py index 84ef362ba..0e22f4c96 100644 --- a/autoPyTorch/ensemble/ensemble_builder_manager.py +++ b/autoPyTorch/ensemble/ensemble_builder_manager.py @@ -19,10 +19,10 @@ from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import BINARY -from autoPyTorch.ensemble.utils import get_ensemble_builder_class +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes, get_ensemble_builder_class from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.logging_ import get_named_client_logger - +from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder class EnsembleBuilderManager(IncorporateRunResultCallback): def __init__( @@ -37,7 +37,7 @@ def __init__( opt_metric: str, ensemble_size: int, ensemble_nbest: int, - ensemble_method: int, + ensemble_method: EnsembleSelectionTypes, max_models_on_disc: Union[float, int], seed: int, precision: int, @@ -47,7 +47,10 @@ def __init__( random_state: int, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, pynisher_context: str = 'fork', - use_ensemble_loss=False + initial_num_run: int = 0, + use_ensemble_loss=False, + num_stacking_layers: Optional[int] = None, + iteration=0 ): """ SMAC callback to handle ensemble building Args: @@ -114,6 +117,11 @@ def __init__( self.ensemble_size = ensemble_size self.ensemble_nbest = ensemble_nbest self.ensemble_method = ensemble_method + self.cur_stacking_layer = 0 if self.ensemble_method.is_stacking_ensemble() else None + if self.ensemble_method.is_stacking_ensemble() and num_stacking_layers is None: + raise ValueError("Cant be none for stacked ensembles") + + 
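# Editor's sketch (not part of the original patch): for stacking ensembles,
# cur_stacking_layer starts at 0 and is advanced via update_for_new_stacking_layer()
# defined below; e.g. with num_stacking_layers=2 the manager first builds layer 0,
# then moves to layer 1, resetting iteration and initial_num_run for the new layer.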
self.num_stacking_layers = num_stacking_layers self.max_models_on_disc: Union[float, int] = max_models_on_disc self.seed = seed self.precision = precision @@ -124,6 +132,7 @@ def __init__( self.logger_port = logger_port self.pynisher_context = pynisher_context + self.is_new_layer = False # Store something similar to SMAC's runhistory self.history: List[Dict[str, float]] = [] @@ -131,12 +140,13 @@ def __init__( self.futures: List[dask.Future] = [] # The last criteria is the number of iterations - self.iteration = 0 + self.iteration = iteration # Keep track of when we started to know when we need to finish! self.start_time = time.time() self.use_ensemble_loss = use_ensemble_loss + self.initial_num_run = initial_num_run def __call__( self, @@ -229,7 +239,11 @@ def build_ensemble( pynisher_context=self.pynisher_context, logger_port=self.logger_port, unit_test=unit_test, - use_ensemble_opt_loss=self.use_ensemble_loss + use_ensemble_opt_loss=self.use_ensemble_loss, + cur_stacking_layer=self.cur_stacking_layer, + is_new_layer=self.is_new_layer, + num_stacking_layers=self.num_stacking_layers, + initial_num_run=self.initial_num_run )) logger.info( @@ -243,12 +257,23 @@ def build_ensemble( ), ) self.iteration += 1 + # reset to False so only signal from smbo sets is_new_layer = True + self.is_new_layer = False except Exception as e: exception_traceback = traceback.format_exc() error_message = repr(e) logger.critical(exception_traceback) logger.critical(error_message) + def update_for_new_stacking_layer(self, cur_stacking_layer: int, initial_num_run: int) -> None: + if cur_stacking_layer >= self.num_stacking_layers: + raise ValueError(f"Unexpected value '{cur_stacking_layer}' for cur_stacking_layer. " + f"Max stacking layers are : {self.num_stacking_layers}.") + self.cur_stacking_layer = cur_stacking_layer + self.iteration = 0 + self.initial_num_run = initial_num_run + self.is_new_layer = True + def fit_and_return_ensemble( backend: Backend, @@ -259,7 +284,7 @@ def fit_and_return_ensemble( opt_metric: str, ensemble_size: int, ensemble_nbest: int, - ensemble_method: int, + ensemble_method: EnsembleSelectionTypes, max_models_on_disc: Union[float, int], seed: int, precision: int, @@ -272,7 +297,11 @@ def fit_and_return_ensemble( pynisher_context: str, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, unit_test: bool = False, - use_ensemble_opt_loss=False + use_ensemble_opt_loss=False, + cur_stacking_layer: Optional[int] = None, + is_new_layer: bool = False, + num_stacking_layers: Optional[int] = None, + initial_num_run: int = 0, ) -> Tuple[ List[Dict[str, float]], int, @@ -340,6 +369,18 @@ def fit_and_return_ensemble( [[pandas_timestamp, train_performance, val_performance, test_performance], ...] 
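For example (values purely illustrative), a single history entry could be
[Timestamp('2022-04-26 14:21:36'), 0.123, 0.145, 0.151], i.e. the time the
ensemble was built followed by its train, validation and test performance.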
""" ensemble_builder = get_ensemble_builder_class(ensemble_method) + ensemble_builder_run_kwargs = { + 'end_at': end_at, + 'iteration': iteration, + 'return_predictions': return_predictions, + 'pynisher_context': pynisher_context, + } + if ensemble_method.is_stacking_ensemble() and ensemble_method != EnsembleSelectionTypes.stacking_repeat_models: + ensemble_builder_run_kwargs.update({'cur_stacking_layer': cur_stacking_layer}) + + if ensemble_method == EnsembleSelectionTypes.stacking_ensemble_selection_per_layer: + ensemble_builder_run_kwargs.update({'is_new_layer': is_new_layer}) + result = ensemble_builder( backend=backend, dataset_name=dataset_name, @@ -357,11 +398,10 @@ def fit_and_return_ensemble( random_state=random_state, logger_port=logger_port, unit_test=unit_test, - use_ensemble_opt_loss=use_ensemble_opt_loss + use_ensemble_opt_loss=use_ensemble_opt_loss, + num_stacking_layers=num_stacking_layers, + initial_num_run=initial_num_run ).run( - end_at=end_at, - iteration=iteration, - return_predictions=return_predictions, - pynisher_context=pynisher_context, + **ensemble_builder_run_kwargs ) return result diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/ensemble_optimisation_stacking_ensemble.py similarity index 80% rename from autoPyTorch/ensemble/stacking_ensemble.py rename to autoPyTorch/ensemble/ensemble_optimisation_stacking_ensemble.py index 40ca5bc98..66541ad51 100644 --- a/autoPyTorch/ensemble/stacking_ensemble.py +++ b/autoPyTorch/ensemble/ensemble_optimisation_stacking_ensemble.py @@ -12,20 +12,26 @@ # TODO: Think of functionality of the functions in this class adjusted for stacking. -class StackingEnsemble(AbstractEnsemble): +class EnsembleOptimisationStackingEnsemble(AbstractEnsemble): def __init__( self, ensemble_size: int, metric: autoPyTorchMetric, task_type: int, random_state: np.random.RandomState, - ensemble_slot_j: int + ensemble_slot_j: int, + cur_stacking_layer: int, + stacked_ensemble_identifiers: List[List[Optional[Tuple[int, int, float]]]], + predictions_stacking_ensemble: List[List[Dict[str, Optional[np.ndarray]]]] ) -> None: self.ensemble_size = ensemble_size self.metric = metric self.random_state = random_state self.task_type = task_type self.ensemble_slot_j = ensemble_slot_j + self.cur_stacking_layer = cur_stacking_layer + self.stacked_ensemble_identifiers = stacked_ensemble_identifiers + self.predictions_stacking_ensemble = predictions_stacking_ensemble def __getstate__(self) -> Dict[str, Any]: # Cannot serialize a metric if @@ -41,7 +47,8 @@ def __getstate__(self) -> Dict[str, Any]: def fit( self, predictions_ensemble: List[np.ndarray], - best_model_predictions: np.ndarray, + best_model_predictions_ensemble: np.ndarray, + best_model_predictions_test: np.ndarray, labels: np.ndarray, ensemble_identifiers: List[Tuple[int, int, float]], best_model_identifier: Tuple[int, int, float], @@ -64,10 +71,15 @@ def fit( Returns: A copy of self """ - predictions_ensemble[self.ensemble_slot_j] = best_model_predictions + predictions_ensemble[self.ensemble_slot_j] = best_model_predictions_ensemble ensemble_identifiers[self.ensemble_slot_j] = best_model_identifier self._fit(predictions_ensemble, labels) self.identifiers_ = ensemble_identifiers + self.stacked_ensemble_identifiers[self.cur_stacking_layer] = ensemble_identifiers + self.predictions_stacking_ensemble[self.cur_stacking_layer][self.ensemble_slot_j] = { + 'ensemble': best_model_predictions_ensemble, + 'test': best_model_predictions_test + } self._calculate_weights() return self @@ -91,9 
+103,10 @@ def _fit( A list of model identifiers, each with the form (seed, number of run, budget) """ + nonnull_predictions = [pred for pred in predictions if pred is not None] weighted_ensemble_prediction = np.zeros( - predictions[0].shape, + nonnull_predictions[0].shape, dtype=np.float64, ) @@ -102,7 +115,6 @@ def _fit( dtype=np.float64, ) - nonnull_predictions = [pred for pred in predictions if pred is not None] size = len(nonnull_predictions) for pred in nonnull_predictions: np.add( @@ -129,8 +141,6 @@ def _fit( self.train_loss_: float = loss - # TODO: return 1 for models in layer 0, 2 for next and so on - # TODO: 0 for models that are not in stack def _calculate_weights(self) -> None: """ Calculates the contribution each of the individual models @@ -167,12 +177,13 @@ def _predict(self, predictions, weights): the weights """ - average = np.zeros_like(predictions[0], dtype=np.float64) - tmp_predictions = np.empty_like(predictions[0], dtype=np.float64) + nonnull_predictions = [pred for pred in predictions if pred is not None] + average = np.zeros_like(nonnull_predictions[0], dtype=np.float64) + tmp_predictions = np.empty_like(nonnull_predictions[0], dtype=np.float64) # if prediction model.shape[0] == len(non_null_weights), # predictions do not include those of zero-weight models. - if len([pred for pred in predictions if pred is not None]) == np.count_nonzero(weights): + if len(nonnull_predictions) == np.count_nonzero(weights): for pred, weight in zip(predictions, weights): if pred is not None: np.multiply(pred, weight, out=tmp_predictions) @@ -187,9 +198,17 @@ def _predict(self, predictions, weights): return average def __str__(self) -> str: - return f"Ensemble Selection:\n\tWeights: {self.weights_}\ + return f"Ensemble Optimisation Stacking Ensemble:\n\tWeights: {self.weights_}\ \n\tIdentifiers: {' '.join([str(identifier) for idx, identifier in enumerate(self.identifiers_) if self.weights_[idx] > 0])}" + def get_layer_stacking_ensemble_predictions( + self, + stacking_layer: int, + dataset: str = 'ensemble' + ) -> List[Optional[np.ndarray]]: + + return [predictions[dataset] if predictions is not None else None for predictions in self.predictions_stacking_ensemble[stacking_layer]] + def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: """ After training of ensemble selection, not all models will be used. @@ -200,7 +219,7 @@ def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: output (List[Tuple[int, int, float]]): The models actually used by ensemble selection """ - return self.identifiers_ + return self.stacked_ensemble_identifiers def get_validation_performance(self) -> float: """ @@ -255,13 +274,17 @@ def get_models_with_weights( performance. Notice that ensemble selection solves a minimization problem. 
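For example (illustrative only), a two-layer stack could return
[[(0.50, model_a), (0.25, model_b)], [(1.00, model_c)]], i.e. one list of
(weight, model) pairs per stacking layer, sorted by descending weight.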
""" - output = [] - for i, weight in enumerate(self.weights_): - if weight > 0.0: - identifier = self.identifiers_[i] - model = models[identifier] - output.append((weight, model)) - - output.sort(reverse=True, key=lambda t: t[0]) - - return output \ No newline at end of file + outputs = [] + for i, layer_models in enumerate(models): + output = [] + num_models = len(layer_models) + if i == len(models): + weights = self.weights_ + else: + weights = [1/num_models] * len(models) + for weight, model in zip(weights, layer_models): + output.append((weight, layer_models[model])) + output.sort(reverse=True, key=lambda t: t[0]) + outputs.append(output) + + return outputs diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/ensemble_optimisation_stacking_ensemble_builder.py similarity index 77% rename from autoPyTorch/ensemble/stacking_ensemble_builder.py rename to autoPyTorch/ensemble/ensemble_optimisation_stacking_ensemble_builder.py index 5097835f2..27c9f1527 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_optimisation_stacking_ensemble_builder.py @@ -3,11 +3,9 @@ import logging.handlers import os import pickle -import re import time import traceback import warnings -import zlib from typing import Dict, List, Optional, Tuple, Union import numpy as np @@ -16,11 +14,11 @@ from autoPyTorch.constants import BINARY from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder -from autoPyTorch.ensemble.stacking_ensemble import StackingEnsemble +from autoPyTorch.ensemble.ensemble_optimisation_stacking_ensemble import EnsembleOptimisationStackingEnsemble from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score from autoPyTorch.utils.logging_ import get_named_client_logger - +from autoPyTorch.utils.common import ENSEMBLE_ITERATION_MULTIPLIER Y_ENSEMBLE = 0 Y_TEST = 1 @@ -28,7 +26,7 @@ MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' -def calculate_nomalised_margin_loss(ensemble_predictions, y_true, task_type) -> float: +def calculate_nomalised_margin_loss(ensemble_predictions, y_true) -> float: n_ensemble = 0 loss = 0 for pred in ensemble_predictions: @@ -41,7 +39,7 @@ def calculate_nomalised_margin_loss(ensemble_predictions, y_true, task_type) -> return np.mean(margin) # TODO: make functions to support stacking. -class StackingEnsembleBuilder(EnsembleBuilder): +class EnsembleOptimisationStackingEnsembleBuilder(EnsembleBuilder): def __init__( self, backend: Backend, @@ -61,7 +59,10 @@ def __init__( random_state: Optional[Union[int, np.random.RandomState]] = None, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, unit_test: bool = False, - use_ensemble_opt_loss=False + use_ensemble_opt_loss=False, + num_stacking_layers: int = 2, + cur_stacking_layer: int = 0, + initial_num_run: int = 0 ): """ Constructor @@ -117,7 +118,7 @@ def __init__( better solution, please let us know by opening an issue. 
""" - super(StackingEnsembleBuilder, self).__init__( + super(EnsembleOptimisationStackingEnsembleBuilder, self).__init__( backend=backend, dataset_name=dataset_name, task_type=task_type, output_type=output_type, metrics=metrics, opt_metric=opt_metric, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, @@ -125,13 +126,27 @@ def __init__( performance_range_threshold=performance_range_threshold, seed=seed, precision=precision, memory_limit=memory_limit, read_at_most=read_at_most, random_state=random_state, - logger_port=logger_port, unit_test=unit_test) + logger_port=logger_port, unit_test=unit_test, initial_num_run=initial_num_run) # we still need to store ensemble identifiers as this class is not persistant # we can do this by either storing and reading them in this class # or passing them via the ensemble builder manager which has persistency with the futures stored. - self.ensemble_identifiers: Optional[List[Optional[str]]] = None self.read_losses = {} self.use_ensemble_opt_loss = use_ensemble_opt_loss + self.num_stacking_layers = num_stacking_layers + self.cur_stacking_layer = cur_stacking_layer + + def run( + self, + iteration: int, + pynisher_context: str, + cur_stacking_layer: int, + time_left: Optional[float] = None, + end_at: Optional[float] = None, + time_buffer: int = 5, + return_predictions: bool = False, + ) -> Tuple[List[Dict[str, float]], int, Optional[np.ndarray], Optional[np.ndarray]]: + self.cur_stacking_layer = cur_stacking_layer + return super().run(iteration, pynisher_context, time_left, end_at, time_buffer, return_predictions) # This is the main wrapper to the EnsembleSelection class which fits the ensemble def main( @@ -199,10 +214,17 @@ def main( time_left - used_time, ) + self.current_ensemble_identifiers = self._load_current_ensemble_identifiers(cur_stacking_layer=self.cur_stacking_layer) self.ensemble_slot_j = np.mod(iteration, self.ensemble_size) - self.ensemble_identifiers = self._load_ensemble_identifiers() + # self.cutoff_num_run = self._load_ensemble_cutoff_num_run() + # # checks if we have moved to a new stacking layer. + # if self.cutoff_num_run == None: + # self.cutoff_num_run = self.initial_num_run self.logger.debug(f"Iteration for ensemble building:{iteration}, " - f"current model to be updated: {self.ensemble_identifiers[self.ensemble_slot_j]} at slot : {self.ensemble_slot_j}") + f"current model to be updated: {self.current_ensemble_identifiers[self.ensemble_slot_j]}" + f" at slot : {self.ensemble_slot_j}" + f" with cur_stacking_layer: {self.cur_stacking_layer}" + f" cut off num run: {self.initial_num_run}") # populates self.read_preds and self.read_losses with individual model predictions and ensemble loss. if not self.compute_ensemble_loss_per_model(): if return_predictions: @@ -241,12 +263,14 @@ def main( # Save the ensemble for later use in the main module! 
if ensemble is not None and self.SAVE2DISC: - self.backend.save_ensemble(ensemble, iteration, self.seed) + self.backend.save_ensemble(ensemble, (self.cur_stacking_layer)*ENSEMBLE_ITERATION_MULTIPLIER + iteration, self.seed) ensemble_identifiers=self._get_identifiers_from_num_runs(ensemble.identifiers_) - # self.logger.debug(f"ensemble_identifiers being saved are {ensemble_identifiers}") - self._save_ensemble_identifiers( - ensemble_identifiers=ensemble_identifiers + self.logger.debug(f"ensemble_identifiers being saved are {ensemble_identifiers}") + self._save_current_ensemble_identifiers( + ensemble_identifiers=ensemble_identifiers, + cur_stacking_layer=self.cur_stacking_layer ) + self._save_ensemble_cutoff_num_run(cutoff_num_run=self.initial_num_run) # Delete files of non-candidate models - can only be done after fitting the ensemble and # saving it to disc so we do not accidentally delete models in the previous ensemble if self.max_resident_models is not None: @@ -286,6 +310,7 @@ def main( else: return self.ensemble_history, self.ensemble_nbest, None, None + # TODO: change to calculate stacked ensemble loss per model def compute_ensemble_loss_per_model(self) -> bool: """ Compute the loss of the predictions on ensemble building data set; @@ -337,6 +362,11 @@ def compute_ensemble_loss_per_model(self) -> bool: # Mypy assumes sorted returns an object because of the lambda. Can't get to recognize the list # as a returning list, so as a work-around we skip next line for y_ens_fn, match, _seed, _num_run, _budget in sorted(to_read, key=lambda x: x[3]): # type: ignore + + # skip models that were part of previous stacking layer + if _num_run < self.initial_num_run: + continue + if self.read_at_most and n_read_files >= self.read_at_most: # limit the number of files that will be read # to limit memory consumption @@ -373,9 +403,10 @@ def compute_ensemble_loss_per_model(self) -> bool: # actually read the predictions and compute their respective loss try: - ensemble_idenitfiers = self.ensemble_identifiers.copy() + ensemble_idenitfiers = self.current_ensemble_identifiers.copy() ensemble_idenitfiers[self.ensemble_slot_j] = y_ens_fn y_ensemble = self._read_np_fn(y_ens_fn) + # self.logger.debug(f"predictions: {y_ensemble}, ensemble_identiifers: {ensemble_idenitfiers}") losses = self.get_ensemble_loss_with_model( model_predictions=y_ensemble, ensemble_identifiers=ensemble_idenitfiers @@ -412,7 +443,7 @@ def compute_ensemble_loss_per_model(self) -> bool: def fit_ensemble( self, best_model_identifier: str, - ) -> Optional[StackingEnsemble]: + ) -> Optional[EnsembleOptimisationStackingEnsemble]: """ fit ensemble @@ -426,23 +457,16 @@ def fit_ensemble( ensemble: StackingEnsemble trained Ensemble """ - - assert self.ensemble_identifiers is not None + assert self.current_ensemble_identifiers is not None if self.unit_test: raise MemoryError() - predictions_train = [self.read_preds[k][Y_ENSEMBLE] if k is not None else None for k in self.ensemble_identifiers] - best_model_predictions = self.read_preds[best_model_identifier][Y_ENSEMBLE] + predictions_train = [self.read_preds[k][Y_ENSEMBLE] if k is not None else None for k in self.current_ensemble_identifiers] + best_model_predictions_ensemble = self.read_preds[best_model_identifier][Y_ENSEMBLE] + best_model_predictions_test = self.read_preds[best_model_identifier][Y_TEST] - ensemble_num_runs = [ - ( - self.read_losses[k]["seed"], - self.read_losses[k]["num_run"], - self.read_losses[k]["budget"], - ) - if k is not None else None - for k in self.ensemble_identifiers] 
+ ensemble_num_runs = self._get_num_runs_from_identifiers(self.current_ensemble_identifiers) best_model_num_run = ( self.read_losses[best_model_identifier]["seed"], @@ -450,17 +474,34 @@ def fit_ensemble( self.read_losses[best_model_identifier]["budget"], ) + stacked_ensemble_identifiers = self._load_stacked_ensemble_identifiers() + self.logger.debug(f"Stacked ensemble identifiers: {stacked_ensemble_identifiers}") + stacked_ensemble_num_runs = [ + self._get_num_runs_from_identifiers(layer_identifiers) + for layer_identifiers in stacked_ensemble_identifiers + ] + + predictions_stacking_ensemble = [ + [ + {'ensemble': self.read_preds[k][Y_ENSEMBLE], 'test': self.read_preds[k][Y_TEST]} if k is not None else None for k in layer_identifiers + ] + for layer_identifiers in stacked_ensemble_identifiers + ] + opt_metric = [m for m in self.metrics if m.name == self.opt_metric][0] if not opt_metric: raise ValueError(f"Cannot optimize for {self.opt_metric} in {self.metrics} " "as more than one unique optimization metric was found.") - ensemble = StackingEnsemble( + ensemble = EnsembleOptimisationStackingEnsemble( ensemble_size=self.ensemble_size, metric=opt_metric, random_state=self.random_state, task_type=self.task_type, - ensemble_slot_j=self.ensemble_slot_j + ensemble_slot_j=self.ensemble_slot_j, + cur_stacking_layer=self.cur_stacking_layer, + stacked_ensemble_identifiers=stacked_ensemble_num_runs, + predictions_stacking_ensemble=predictions_stacking_ensemble ) try: @@ -468,11 +509,13 @@ def fit_ensemble( # "Fitting the ensemble on %d models.", # len(predictions_train), # ) - # self.logger.debug(f"predictions sent to ensemble: {predictions_train}") + # self.logger.debug(f"predictions sent to ensemble: {predictions_train}, ensemble_num_runs: {ensemble_num_runs}") + # self.logger.debug(f"best model predictions: {best_model_predictions_ensemble}, ensemble_slot: {ensemble.ensemble_slot_j}, best_model_num_run: {best_model_num_run}") start_time = time.time() ensemble.fit( predictions_train, - best_model_predictions, + best_model_predictions_ensemble, + best_model_predictions_test, self.y_true_ensemble, ensemble_num_runs, best_model_num_run @@ -535,6 +578,8 @@ def predict(self, set_: str, predictions = [self.read_preds[k][pred_set] if k is not None else None for k in selected_keys] + # self.logger.debug(f" in predic(), selected_keys: {selected_keys}" + # f"predictions sent to ensemble.predict: {predictions}") if n_preds == len(predictions): y = ensemble.predict(predictions) if self.output_type == BINARY: @@ -622,11 +667,12 @@ def get_ensemble_loss_with_model(self, else: predictions = self.read_preds[identifier][Y_ENSEMBLE] else: - break + predictions=None ensemble_predictions.append(predictions) - np.multiply(predictions, weight, out=tmp_predictions) - np.add(average_predictions, tmp_predictions, out=average_predictions) + if predictions is not None: + np.multiply(predictions, weight, out=tmp_predictions) + np.add(average_predictions, tmp_predictions, out=average_predictions) loss = calculate_loss( metrics=self.metrics, @@ -634,24 +680,46 @@ def get_ensemble_loss_with_model(self, prediction=average_predictions, task_type=self.task_type, ) - loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble, self.task_type) + loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble) return loss - def _get_ensemble_identifiers_filename(self): - return os.path.join(self.backend.internals_directory, 'ensemble_identifiers.pkl') + 
def _get_ensemble_identifiers_filename(self, cur_stacking_layer) -> str: + return os.path.join(self.backend.internals_directory, f'ensemble_identifiers_{cur_stacking_layer}.pkl') + + def _get_ensemble_cutoff_num_run_filename(self): + return os.path.join(self.backend.internals_directory, 'ensemble_cutoff_run.txt') + + def _save_ensemble_cutoff_num_run(self, cutoff_num_run: int) -> None: + with open(self._get_ensemble_cutoff_num_run_filename(), "w") as file: + file.write(str(cutoff_num_run)) - def _save_ensemble_identifiers(self, ensemble_identifiers: List[Optional[str]]) -> None: - with open(self._get_ensemble_identifiers_filename(), "wb") as file: + def _load_ensemble_cutoff_num_run(self) -> Optional[int]: + if os.path.exists(self._get_ensemble_cutoff_num_run_filename()): + with open(self._get_ensemble_cutoff_num_run_filename(), "r") as file: + cutoff_num_run = int(file.read()) + else: + cutoff_num_run = None + return cutoff_num_run + + def _save_current_ensemble_identifiers(self, ensemble_identifiers: List[Optional[str]], cur_stacking_layer) -> None: + with open(self._get_ensemble_identifiers_filename(cur_stacking_layer=cur_stacking_layer), "wb") as file: pickle.dump(ensemble_identifiers, file=file) - def _load_ensemble_identifiers(self) -> List[Optional[str]]: - if os.path.exists(self._get_ensemble_identifiers_filename()): - with open(self._get_ensemble_identifiers_filename(), "rb") as file: + def _load_current_ensemble_identifiers(self, cur_stacking_layer) -> List[Optional[str]]: + file_name = self._get_ensemble_identifiers_filename(cur_stacking_layer) + if os.path.exists(file_name): + with open(file_name, "rb") as file: identifiers = pickle.load(file) else: identifiers = [None]*self.ensemble_size return identifiers + def _load_stacked_ensemble_identifiers(self) -> List[List[Optional[str]]]: + ensemble_identifiers = list() + for i in range(self.num_stacking_layers): + ensemble_identifiers.append(self._load_current_ensemble_identifiers(cur_stacking_layer=i)) + return ensemble_identifiers + def _get_identifiers_from_num_runs(self, num_runs, subset='ensemble') -> List[Optional[str]]: identifiers: List[Optional[str]] = [] for num_run in num_runs: @@ -665,3 +733,18 @@ def _get_identifiers_from_num_runs(self, num_runs, subset='ensemble') -> List[Op identifiers.append(identifier) return identifiers + def _get_num_runs_from_identifiers(self, identifiers) -> List[Optional[Tuple[int, int, float]]]: + num_runs: List[Optional[Tuple[int, int, float]]] = [] + for identifier in identifiers: + num_run = None + if identifier is not None: + match = self.model_fn_re.search(identifier) + if match is None: + raise ValueError(f"Could not interpret file {identifier} " + "Something went wrong while scoring predictions") + _seed = int(match.group(1)) + _num_run = int(match.group(2)) + _budget = float(match.group(3)) + num_run = (_seed, _num_run, _budget) + num_runs.append(num_run) + return num_runs diff --git a/autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble.py b/autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble.py new file mode 100644 index 000000000..a20a8fc9c --- /dev/null +++ b/autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble.py @@ -0,0 +1,145 @@ +from copyreg import pickle +from ctypes import cast +from glob import glob +from typing import Any, Dict, List, Optional, Tuple, Union +import warnings + +import numpy as np + + +from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble +from autoPyTorch.pipeline.base_pipeline import BasePipeline 
+from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss +from autoPyTorch.automl_common.common.utils.backend import Backend + +# TODO: Think of functionality of the functions in this class adjusted for stacking. +class EnsembleSelectionPerLayerStackingEnsemble(AbstractEnsemble): + def __init__( + self, + num_stacking_layers, + cur_stacking_layer, + ensembles = None, + ensemble_predictions = None, + ) -> None: + self.ensembles: List[Optional[AbstractEnsemble]] = [None] * num_stacking_layers if ensembles is None else ensembles + self.cur_stacking_layer = cur_stacking_layer + self.ensemble_predictions = [None] * num_stacking_layers if ensemble_predictions is None else ensemble_predictions + + # def __getstate__(self) -> Dict[str, Any]: + # # Cannot serialize a metric if + # # it is user defined. + # # That is, if doing pickle dump + # # the metric won't be the same as the + # # one in __main__. we don't use the metric + # # in the EnsembleSelection so this should + # # be fine + # self.metric = None # type: ignore + # return self.__dict__ + + def fit( + self, + cur_ensemble: AbstractEnsemble, + cur_ensemble_predictions, + ) -> AbstractEnsemble: + """ + Builds a ensemble given the individual models out of fold predictions. + Fundamentally, defines a set of weights on how to perform a soft-voting + aggregation of the models in the given identifiers. + + Args: + predictions (List[np.ndarray]): + A list of individual model predictions of shape (n_datapoints, n_targets) + corresponding to the OutOfFold estimate of the ground truth + labels (np.ndarray): + The ground truth targets of shape (n_datapoints, n_targets) + identifiers: List[Tuple[int, int, float]] + A list of model identifiers, each with the form + (seed, number of run, budget) + + Returns: + A copy of self + """ + self.ensembles[self.cur_stacking_layer] = cur_ensemble + self.ensemble_predictions[self.cur_stacking_layer] = cur_ensemble_predictions + return self + + def predict(self, predictions: List[np.ndarray]) -> np.ndarray: + # should be the last layer + return self.ensembles[self.cur_stacking_layer].predict(predictions) + + def __str__(self) -> str: + return f"Ensemble Selection Per Layer Stacking Ensemble:\n\tWeights: {self.ensembles[self.cur_stacking_layer].weights_}\ + \n\tIdentifiers: {' '.join([str(identifier) for idx, identifier in enumerate(self.ensembles[self.cur_stacking_layer].identifiers_) if self.ensembles[self.cur_stacking_layer].weights_[idx] > 0])}" + + def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: + """ + After training of ensemble selection, not all models will be used. + Some of them will have zero weight. This procedure filters this models + out. 
+ + Returns: + output (List[Tuple[int, int, float]]): + The models actually used by ensemble selection + """ + ensemble_identifiers = list() + for ensemble in self.ensembles: + if ensemble is None: + return ensemble_identifiers + ensemble_identifiers.append(ensemble.get_selected_model_identifiers()) + + return ensemble_identifiers + + def get_validation_performance(self) -> float: + """ + Returns the best optimization performance seen during hill climbing + + Returns: + (float): + best ensemble training performance + """ + return self.ensembles[self.cur_stacking_layer].trajectory_[-1] + + def get_models_with_weights( + self, + models: Dict[Any, BasePipeline] + ) -> List[Tuple[float, BasePipeline]]: + """ + Handy function to tag the provided input models with a given weight. + + Args: + models (List[Tuple[float, BasePipeline]]): + A dictionary that maps a model's name to it's actual python object. + + Returns: + output (List[Tuple[float, BasePipeline]]): + each model with the related weight, sorted by ascending + performance. Notice that ensemble selection solves a minimization + problem. + """ + outputs = [] + for ensemble, layer_models in zip(self.ensembles, models): + outputs.append(ensemble.get_models_with_weights(layer_models)) + + return outputs + + def get_expanded_layer_stacking_ensemble_predictions( + self, + stacking_layer: int, + raw_stacking_layer_ensemble_predictions + ) -> List[np.ndarray]: + layer_weights = [weight for weight in self.ensembles[stacking_layer].weights_ if weight > 0] + layer_size = self.ensembles[stacking_layer].ensemble_size + ensemble_predictions = [] + for weight, pred in zip(layer_weights, raw_stacking_layer_ensemble_predictions): + ensemble_predictions.extend([pred] * int(weight * layer_size)) + return ensemble_predictions + + def get_layer_stacking_ensemble_predictions( + self, + stacking_layer: int, + dataset: str = 'ensemble' + ) -> List[Optional[np.ndarray]]: + raw_stacking_layer_ensemble_predictions = self.ensemble_predictions[stacking_layer][dataset] + + return self.get_expanded_layer_stacking_ensemble_predictions(stacking_layer=stacking_layer, raw_stacking_layer_ensemble_predictions=raw_stacking_layer_ensemble_predictions) diff --git a/autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble_builder.py b/autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble_builder.py new file mode 100644 index 000000000..bb7a868ed --- /dev/null +++ b/autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble_builder.py @@ -0,0 +1,620 @@ +import glob +import logging +import logging.handlers +import os +import pickle +import re +import time +import traceback +import warnings +from typing import Dict, List, Optional, Tuple, Union +import zlib + +import numpy as np + +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants import BINARY +from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble +from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder +from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection +from autoPyTorch.ensemble.ensemble_selection_per_layer_stacking_ensemble import EnsembleSelectionPerLayerStackingEnsemble +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score +from autoPyTorch.utils.logging_ import get_named_client_logger + +Y_ENSEMBLE = 0 +Y_TEST = 1 + +MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' + 
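# Editor's sketch (not part of the original patch): MODEL_FN_RE extracts the
# (seed, num_run, budget) triplet from a prediction file name, assuming the
# predictions_ensemble_<seed>_<num_run>_<budget>.npy naming used by the glob below:
#
#   import re
#   match = re.search(MODEL_FN_RE, 'predictions_ensemble_1_23_50.0.npy')  # hypothetical file name
#   seed, num_run, budget = int(match.group(1)), int(match.group(2)), float(match.group(3))
#   # seed == 1, num_run == 23, budget == 50.0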
+ +# TODO: make functions to support stacking. +class EnsembleSelectionPerLayerStackingEnsembleBuilder(EnsembleBuilder): + def __init__( + self, + backend: Backend, + dataset_name: str, + task_type: int, + output_type: int, + metrics: List[autoPyTorchMetric], + opt_metric: str, + ensemble_size: int = 10, + ensemble_nbest: int = 100, + max_models_on_disc: Union[float, int] = 100, + performance_range_threshold: float = 0, + seed: int = 1, + precision: int = 32, + memory_limit: Optional[int] = 1024, + read_at_most: int = 5, + random_state: Optional[Union[int, np.random.RandomState]] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + unit_test: bool = False, + use_ensemble_opt_loss=False, + num_stacking_layers: int = 2, + cur_stacking_layer: int = 0, + initial_num_run: int = 0 + ): + """ + Constructor + Parameters + ---------- + backend: util.backend.Backend + backend to write and read files + dataset_name: str + name of dataset + task_type: int + type of ML task + metrics: List[autoPyTorchMetric], + name of metric to score predictions + opt_metric: str + name of the metric to optimize + ensemble_size: int + maximal size of ensemble (passed to ensemble.ensemble_selection) + ensemble_nbest: int/float + if int: consider only the n best prediction + if float: consider only this fraction of the best models + Both wrt to validation predictions + If performance_range_threshold > 0, might return less models + max_models_on_disc: Union[float, int] + Defines the maximum number of models that are kept in the disc. + If int, it must be greater or equal than 1, and dictates the max number of + models to keep. + If float, it will be interpreted as the max megabytes allowed of disc space. That + is, if the number of ensemble candidates require more disc space than this float + value, the worst models will be deleted to keep within this budget. + Models and predictions of the worst-performing models will be deleted then. + If None, the feature is disabled. + It defines an upper bound on the models that can be used in the ensemble. + performance_range_threshold: float + Keep only models that are better than: + dummy + (best - dummy)*performance_range_threshold + E.g dummy=2, best=4, thresh=0.5 --> only consider models with score > 3 + Will at most return the minimum between ensemble_nbest models, + and max_models_on_disc. Might return less + seed: int + random seed + precision: [16,32,64,128] + precision of floats to read the predictions + memory_limit: Optional[int] + memory limit in mb. If ``None``, no memory limit is enforced. + read_at_most: int + read at most n new prediction files in each iteration + logger_port: int + port that receives logging records + unit_test: bool + Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError. + Having this is very bad coding style, but I did not find a way to make + unittest.mock work through the pynisher with all spawn contexts. If you know a + better solution, please let us know by opening an issue. 
+ """ + + super(EnsembleSelectionPerLayerStackingEnsembleBuilder, self).__init__( + backend=backend, dataset_name=dataset_name, task_type=task_type, + output_type=output_type, metrics=metrics, opt_metric=opt_metric, + ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, + performance_range_threshold=performance_range_threshold, + seed=seed, precision=precision, memory_limit=memory_limit, + read_at_most=read_at_most, random_state=random_state, + logger_port=logger_port, unit_test=unit_test, initial_num_run=initial_num_run if cur_stacking_layer==0 else 1) + + self.num_stacking_layers = num_stacking_layers + self.cur_stacking_layer = cur_stacking_layer + self.ensembles = None + self.ensemble_predictions = None + old_ensemble: Optional[EnsembleSelectionPerLayerStackingEnsemble] = None + if os.path.exists(self.backend.get_ensemble_dir()) and len(os.listdir(self.backend.get_ensemble_dir())) >= 1: + old_ensemble = self.backend.load_ensemble(seed=seed) + self.ensembles = old_ensemble.ensembles + self.ensemble_predictions = old_ensemble.ensemble_predictions + + def run( + self, + iteration: int, + pynisher_context: str, + cur_stacking_layer: int, + time_left: Optional[float] = None, + end_at: Optional[float] = None, + time_buffer: int = 5, + return_predictions: bool = False, + is_new_layer: bool = False, + ) -> Tuple[List[Dict[str, float]], int, Optional[np.ndarray], Optional[np.ndarray]]: + self.cur_stacking_layer = cur_stacking_layer + self.is_new_layer = is_new_layer + return super().run(iteration, pynisher_context, time_left, end_at, time_buffer, return_predictions) + + # This is the main wrapper to the EnsembleSelection class which fits the ensemble + def main( + self, time_left: float, iteration: int, return_predictions: bool, + ) -> Tuple[ + List[Dict[str, float]], + int, + Optional[np.ndarray], + Optional[np.ndarray], + ]: + """ + This is the main function of the ensemble builder process and can be considered + a wrapper over the ensemble selection method implemented y EnsembleSelection class. + + This method is going to be called multiple times by the main process, to + build and ensemble, in case the SMAC process produced new models and to provide + anytime results. + + On this regard, this method mainly: + 1- select from all the individual models that smac created, the N-best candidates + (this in the scenario that N > ensemble_nbest argument to this class). This is + done based on a score calculated via the metrics argument. + 2- This pre-selected candidates are provided to the ensemble selection method + and if a ensemble is found under the provided memory/time constraints, a new + ensemble is proposed. + 3- Because this process will be called multiple times, it performs checks to make + sure a new ensenmble is only proposed if new predictions are available, as well + as making sure we do not run out of resources (like disk space) + + Args: + time_left (float): + How much time is left for the ensemble builder process + iteration (int): + Which is the current iteration + return_predictions (bool): + Whether we want to return the predictions of the current model or not + + Returns: + ensemble_history (Dict): + A snapshot of both test and optimization performance. For debugging. + ensemble_nbest (int): + The user provides a direction on how many models to use in ensemble selection. + This number can be reduced internally if the memory requirements force it. 
+ train_predictions (np.ndarray): + The optimization prediction from the current ensemble. + test_predictions (np.ndarray): + The train prediction from the current ensemble. + """ + + # Pynisher jobs inside dask 'forget' + # the logger configuration. So we have to set it up + # accordingly + self.logger = get_named_client_logger( + name='EnsembleBuilder', + port=self.logger_port, + ) + + self.start_time = time.time() + train_pred, test_pred = None, None + + used_time = time.time() - self.start_time + self.logger.debug( + 'Starting iteration %d, time left: %f', + iteration, + time_left - used_time, + ) + + + # self.cutoff_num_run = self._load_ensemble_cutoff_num_run() + # # TODO: check how to handle this now. + # # checks if we have moved to a new stacking layer. + # if self.cutoff_num_run is None or self.is_new_layer: + # # to exclude the latest model we subtract 1 from last available num run + # self.cutoff_num_run = self.backend.get_next_num_run(peek=True) - 1 + # self.logger.debug(f"Updated cut off num run to : {self.cutoff_num_run}") + + # populates self.read_preds and self.read_losses with individual model predictions and ensemble loss. + if not self.compute_loss_per_model(): + if return_predictions: + return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred + else: + return self.ensemble_history, self.ensemble_nbest, None, None + + # Only the models with the n_best predictions are candidates + # to be in the ensemble + candidate_models = self.get_n_best_preds() + if not candidate_models: # no candidates yet + if return_predictions: + return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred + else: + return self.ensemble_history, self.ensemble_nbest, None, None + + # populates test predictions in self.read_preds + # reduces selected models if file reading failed + n_sel_test = self.get_test_preds(selected_keys=candidate_models) + + # If any of n_sel_* is not empty and overlaps with candidate_models, + # then ensure candidate_models AND n_sel_test are sorted the same + candidate_models_set = set(candidate_models) + if candidate_models_set.intersection(n_sel_test): + candidate_models = sorted(list(candidate_models_set.intersection( + n_sel_test))) + n_sel_test = candidate_models + else: + # This has to be the case + n_sel_test = [] + + if os.environ.get('ENSEMBLE_KEEP_ALL_CANDIDATES'): + for candidate in candidate_models: + self._has_been_candidate.add(candidate) + + # self.logger.debug(f"for iteration {iteration}, best_model_identifier: {best_model_identifier} \n candidate_models: \n{candidate_models}") + # train ensemble + ensemble = self.fit_ensemble(selected_keys=candidate_models) + + # Save the ensemble for later use in the main module! 
+ if ensemble is not None and self.SAVE2DISC: + self.backend.save_ensemble(ensemble, iteration + (pow(10, 9))* self.cur_stacking_layer, self.seed) + # self._save_ensemble_cutoff_num_run(cutoff_num_run=self.cutoff_num_run) + # Delete files of non-candidate models - can only be done after fitting the ensemble and + # saving it to disc so we do not accidentally delete models in the previous ensemble + if self.max_resident_models is not None: + self._delete_excess_models(selected_keys=candidate_models) + + # Save the read losses status for the next iteration + with open(self.ensemble_loss_file, "wb") as memory: + pickle.dump(self.read_losses, memory) + + if ensemble is not None: + train_pred = self.predict(set_="train", + ensemble=ensemble, + selected_keys=candidate_models, + n_preds=len(candidate_models), + index_run=iteration) + # TODO if predictions fails, build the model again during the + # next iteration! + test_pred = self.predict(set_="test", + ensemble=ensemble, + selected_keys=n_sel_test, + n_preds=len(candidate_models), + index_run=iteration) + + # Add a score to run history to see ensemble progress + self._add_ensemble_trajectory( + train_pred, + test_pred + ) + + # The loaded predictions and the hash can only be saved after the ensemble has been + # built, because the hash is computed during the construction of the ensemble + with open(self.ensemble_memory_file, "wb") as memory: + pickle.dump((self.read_preds, self.last_hash), memory) + + if return_predictions: + return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred + else: + return self.ensemble_history, self.ensemble_nbest, None, None + + def compute_loss_per_model(self) -> bool: + """ + Compute the loss of the predictions on ensemble building data set; + populates self.read_preds and self.read_losses + """ + + self.logger.debug("Read ensemble data set predictions") + + if self.y_true_ensemble is None: + try: + self.y_true_ensemble = self.backend.load_targets_ensemble() + except FileNotFoundError: + self.logger.debug( + "Could not find true targets on ensemble data set: %s", + traceback.format_exc(), + ) + return False + + pred_path = os.path.join( + glob.escape(self.backend.get_runs_directory()), + '%d_*_*' % self.seed, + 'predictions_ensemble_%s_*_*.npy*' % self.seed, + ) + y_ens_files = glob.glob(pred_path) + y_ens_files = [y_ens_file for y_ens_file in y_ens_files + if y_ens_file.endswith('.npy') or y_ens_file.endswith('.npy.gz')] + self.y_ens_files = y_ens_files + # no validation predictions so far -- no files + if len(self.y_ens_files) == 0: + self.logger.debug("Found no prediction files on ensemble data set:" + " %s" % pred_path) + return False + + # First sort files chronologically + to_read = [] + for y_ens_fn in self.y_ens_files: + match = self.model_fn_re.search(y_ens_fn) + if match is None: + raise ValueError(f"Could not interpret file {y_ens_fn} " + "Something went wrong while scoring predictions") + _seed = int(match.group(1)) + _num_run = int(match.group(2)) + _budget = float(match.group(3)) + + to_read.append([y_ens_fn, match, _seed, _num_run, _budget]) + + n_read_files = 0 + # Now read file wrt to num_run + # Mypy assumes sorted returns an object because of the lambda. 
Can't get to recognize the list + # as a returning list, so as a work-around we skip next line + for y_ens_fn, match, _seed, _num_run, _budget in sorted(to_read, key=lambda x: x[3]): # type: ignore + # skip models that were part of previous stacking layer + if _num_run < self.initial_num_run: + if y_ens_fn in self.read_losses: + del self.read_losses[y_ens_fn] + continue + + if self.read_at_most and n_read_files >= self.read_at_most: + # limit the number of files that will be read + # to limit memory consumption + break + + if not y_ens_fn.endswith(".npy") and not y_ens_fn.endswith(".npy.gz"): + self.logger.info('Error loading file (not .npy or .npy.gz): %s', y_ens_fn) + continue + + if not self.read_losses.get(y_ens_fn): + self.read_losses[y_ens_fn] = { + "ens_loss": np.inf, + "mtime_ens": 0, + "mtime_test": 0, + "seed": _seed, + "num_run": _num_run, + "budget": _budget, + "disc_space_cost_mb": None, + # Lazy keys so far: + # 0 - not loaded + # 1 - loaded and in memory + # 2 - loaded but dropped again + # 3 - deleted from disk due to space constraints + "loaded": 0 + } + if not self.read_preds.get(y_ens_fn): + self.read_preds[y_ens_fn] = { + Y_ENSEMBLE: None, + Y_TEST: None, + } + + if self.read_losses[y_ens_fn]["mtime_ens"] == os.path.getmtime(y_ens_fn): + # same time stamp; nothing changed; + continue + + # actually read the predictions and compute their respective loss + try: + y_ensemble = self._read_np_fn(y_ens_fn) + losses = calculate_loss( + metrics=self.metrics, + target=self.y_true_ensemble, + prediction=y_ensemble, + task_type=self.task_type, + ) + + if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): + self.logger.debug( + 'Changing ensemble loss for file %s from %f to %f ' + 'because file modification time changed? %f - %f', + y_ens_fn, + self.read_losses[y_ens_fn]["ens_loss"], + losses[self.opt_metric], + self.read_losses[y_ens_fn]["mtime_ens"], + os.path.getmtime(y_ens_fn), + ) + + self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] + + # It is not needed to create the object here + # To save memory, we just compute the loss. + self.read_losses[y_ens_fn]["mtime_ens"] = os.path.getmtime(y_ens_fn) + self.read_losses[y_ens_fn]["loaded"] = 2 + self.read_losses[y_ens_fn]["disc_space_cost_mb"] = self.get_disk_consumption( + y_ens_fn + ) + + n_read_files += 1 + + except Exception: + self.logger.warning( + 'Error loading %s: %s', + y_ens_fn, + traceback.format_exc(), + ) + self.read_losses[y_ens_fn]["ens_loss"] = np.inf + + self.logger.debug( + 'Done reading %d new prediction files. 
Loaded %d predictions in ' + 'total.', + n_read_files, + np.sum([pred["loaded"] > 0 for pred in self.read_losses.values()]) + ) + return True + + def fit_ensemble( + self, + selected_keys: List[str] + ) -> Optional[EnsembleSelectionPerLayerStackingEnsemble]: + """ + fit ensemble + + Parameters + --------- + selected_keys: list + list of selected keys of self.read_losses + + Returns + ------- + ensemble: StackingEnsemble + trained Ensemble + """ + + if self.unit_test: + raise MemoryError() + + predictions_train = [self.read_preds[k][Y_ENSEMBLE] for k in selected_keys] + include_num_runs = [ + ( + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"], + ) + for k in selected_keys] + + # check hash if ensemble training data changed + current_hash = "".join([ + str(zlib.adler32(predictions_train[i].data.tobytes())) + for i in range(len(predictions_train)) + ]) + if self.last_hash == current_hash: + self.logger.debug( + "No new model predictions selected -- skip ensemble building " + "-- current performance: %f", + self.validation_performance_, + ) + + return None + self.last_hash = current_hash + + opt_metric = [m for m in self.metrics if m.name == self.opt_metric][0] + if not opt_metric: + raise ValueError(f"Cannot optimize for {self.opt_metric} in {self.metrics} " + "as more than one unique optimization metric was found.") + + + cur_ensemble = EnsembleSelection( + ensemble_size=self.ensemble_size, + metric=opt_metric, + random_state=self.random_state, + task_type=self.task_type, + ) + + try: + # self.logger.debug( + # "Fitting the ensemble on %d models.", + # len(predictions_train), + # ) + + start_time = time.time() + cur_ensemble.fit( + predictions_train, + self.y_true_ensemble, + include_num_runs, + ) + + end_time = time.time() + self.logger.debug( + "Fitting the ensemble took %.2f seconds.", + end_time - start_time, + ) + # self.logger.debug(f"weights = {ensemble.weights_}") + self.logger.info(str(cur_ensemble)) + self.validation_performance_ = min( + self.validation_performance_, + cur_ensemble.get_validation_performance(), + ) + cur_ensemble_model_identifiers = self._get_identifiers_from_num_runs( + cur_ensemble.get_selected_model_identifiers() + ) + + ensemble = EnsembleSelectionPerLayerStackingEnsemble( + num_stacking_layers=self.num_stacking_layers, + cur_stacking_layer=self.cur_stacking_layer, + ensembles=self.ensembles, + ensemble_predictions=self.ensemble_predictions + ) + cur_ensemble_predictions_ensemble_set = [self.read_preds[k][Y_ENSEMBLE] for k in cur_ensemble_model_identifiers] + cur_ensemble_predictions_test_set = [self.read_preds[k][Y_TEST] for k in cur_ensemble_model_identifiers] + ensemble.fit(cur_ensemble=cur_ensemble, cur_ensemble_predictions={ + 'ensemble': cur_ensemble_predictions_ensemble_set, + 'test': cur_ensemble_predictions_test_set + }) + + except ValueError: + self.logger.error('Caught ValueError: %s', traceback.format_exc()) + return None + except IndexError: + self.logger.error('Caught IndexError: %s' + traceback.format_exc()) + return None + finally: + # Explicitly free memory + del predictions_train + + return ensemble + + def _get_ensemble_identifiers_filename(self, cur_stacking_layer) -> str: + return os.path.join(self.backend.internals_directory, f'ensemble_identifiers_{cur_stacking_layer}.pkl') + + def _get_ensemble_cutoff_num_run_filename(self): + return os.path.join(self.backend.internals_directory, 'ensemble_cutoff_run.txt') + + def _save_ensemble_cutoff_num_run(self, cutoff_num_run: int) -> None: + with 
open(self._get_ensemble_cutoff_num_run_filename(), "w") as file: + file.write(str(cutoff_num_run)) + + def _load_ensemble_cutoff_num_run(self) -> Optional[int]: + if os.path.exists(self._get_ensemble_cutoff_num_run_filename()): + with open(self._get_ensemble_cutoff_num_run_filename(), "r") as file: + cutoff_num_run = int(file.read()) + else: + cutoff_num_run = None + return cutoff_num_run + + def _save_current_ensemble_identifiers(self, ensemble_identifiers: List[Optional[str]], cur_stacking_layer) -> None: + with open(self._get_ensemble_identifiers_filename(cur_stacking_layer=cur_stacking_layer), "wb") as file: + pickle.dump(ensemble_identifiers, file=file) + + def _load_current_ensemble_identifiers(self, cur_stacking_layer) -> List[Optional[str]]: + file_name = self._get_ensemble_identifiers_filename(cur_stacking_layer) + if os.path.exists(file_name): + with open(file_name, "rb") as file: + identifiers = pickle.load(file) + else: + identifiers = [None]*self.ensemble_size + return identifiers + + def _load_stacked_ensemble_identifiers(self) -> List[List[Optional[str]]]: + ensemble_identifiers = list() + for i in range(self.num_stacking_layers): + ensemble_identifiers.append(self._load_current_ensemble_identifiers(cur_stacking_layer=i)) + return ensemble_identifiers + + def _get_identifiers_from_num_runs(self, num_runs, subset='ensemble') -> List[Optional[str]]: + identifiers: List[Optional[str]] = [] + for num_run in num_runs: + identifier = None + if num_run is not None: + seed, idx, budget = num_run + identifier = os.path.join( + self.backend.get_numrun_directory(seed, idx, budget), + self.backend.get_prediction_filename(subset, seed, idx, budget) + ) + identifiers.append(identifier) + return identifiers + + def _get_num_runs_from_identifiers(self, identifiers) -> List[Optional[Tuple[int, int, float]]]: + num_runs: List[Optional[Tuple[int, int, float]]] = [] + for identifier in identifiers: + num_run = None + if identifier is not None: + match = self.model_fn_re.search(identifier) + if match is None: + raise ValueError(f"Could not interpret file {identifier} " + "Something went wrong while scoring predictions") + _seed = int(match.group(1)) + _num_run = int(match.group(2)) + _budget = float(match.group(3)) + num_run = (_seed, _num_run, _budget) + num_runs.append(num_run) + + return num_runs \ No newline at end of file diff --git a/autoPyTorch/ensemble/repeat_models_stacking_ensemble.py b/autoPyTorch/ensemble/repeat_models_stacking_ensemble.py new file mode 100644 index 000000000..78d65220b --- /dev/null +++ b/autoPyTorch/ensemble/repeat_models_stacking_ensemble.py @@ -0,0 +1,179 @@ +from collections import Counter +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np + +from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble +from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection +from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss + + +class RepeatModelsStackingEnsemble(AbstractEnsemble): + def __init__( + self, + base_ensemble: EnsembleSelection + ) -> None: + self.ensemble_identifiers: Optional[List[List[Tuple[int, int, float]]]] = None + self.base_ensemble = base_ensemble + self.base_weights = [w for w in base_ensemble.weights_ if w > 0] + self.ensemble_weights = None + + def fit( + self, + identifiers: List[Tuple[int, int, float]], + ) -> 
AbstractEnsemble: + """ + Builds a ensemble given the individual models out of fold predictions. + Fundamentally, defines a set of weights on how to perform a soft-voting + aggregation of the models in the given identifiers. + + Args: + predictions (List[np.ndarray]): + A list of individual model predictions of shape (n_datapoints, n_targets) + corresponding to the OutOfFold estimate of the ground truth + labels (np.ndarray): + The ground truth targets of shape (n_datapoints, n_targets) + identifiers: List[Tuple[int, int, float]] + A list of model identifiers, each with the form + (seed, number of run, budget) + + Returns: + A copy of self + """ + self.ensemble_identifiers = identifiers + self.ensemble_weights = [] + for layer_identifiers in identifiers: + layer_weights = [] + for i, identifier in enumerate(layer_identifiers): + if identifier is not None: + layer_weights.append(self.base_weights[i]) + self.ensemble_weights.append(layer_weights) + return self + + def _predict(self, predictions, weights): + """ + Given a list of predictions from the individual model, this method + aggregates the predictions using a soft voting scheme with the weights + found during training. + + Args: + predictions (List[np.ndarray]): + A list of predictions from the individual base models. + + Returns: + average (np.ndarray): Soft voting predictions of ensemble models, using + the weights found during ensemble selection (self._weights) + """ + + average = np.zeros_like(predictions[0], dtype=np.float64) + tmp_predictions = np.empty_like(predictions[0], dtype=np.float64) + + # if predictions.shape[0] == len(weights), + # predictions include those of zero-weight models. + if len(predictions) == len(weights): + for pred, weight in zip(predictions, weights): + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) + + # if prediction model.shape[0] == len(non_null_weights), + # predictions do not include those of zero-weight models. + elif len(predictions) == np.count_nonzero(weights): + non_null_weights = [w for w in weights if w > 0] + for pred, weight in zip(predictions, non_null_weights): + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) + + # If none of the above applies, then something must have gone wrong. + else: + raise ValueError("The dimensions of ensemble predictions" + " and ensemble weights do not match!") + del tmp_predictions + return average + + def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: + """ + Given a list of predictions from the individual model, this method + aggregates the predictions using a soft voting scheme with the weights + found during training. + + Args: + predictions (List[np.ndarray]): + A list of predictions from the individual base models. 
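The weighted soft vote computed by _predict above reduces to a per-class weighted average of the base models' probability estimates. A tiny worked example with made-up numbers:

import numpy as np

# Two base models, three samples, two classes; weights come from ensemble selection.
p1 = np.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]])
p2 = np.array([[0.6, 0.4], [0.4, 0.6], [0.1, 0.9]])
weights = [0.75, 0.25]

average = np.zeros_like(p1)
for pred, weight in zip([p1, p2], weights):
    average += weight * pred

print(average[0])  # 0.75 * [0.9, 0.1] + 0.25 * [0.6, 0.4] = [0.825, 0.175]
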
+ + Returns: + average (np.ndarray): Soft voting predictions of ensemble models, using + the weights found during ensemble selection (self._weights) + """ + + return self._predict(predictions=predictions, weights=self.ensemble_weights[-1]) + + def __str__(self) -> str: + return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ + '\n\tWeights: %s\n\tIdentifiers: %s' % \ + (' '.join(['%d: %5f' % (idx, performance) + for idx, performance in enumerate(self.trajectory_)]), + self.indices_, self.weights_, + ' '.join([str(identifier) for idx, identifier in + enumerate(self.identifiers_) + if self.weights_[idx] > 0])) + + def get_models_with_weights( + self, + models: Dict[Any, BasePipeline] + ) -> List[Tuple[float, BasePipeline]]: + """ + Handy function to tag the provided input models with a given weight. + + Args: + models (List[Tuple[float, BasePipeline]]): + A dictionary that maps a model's name to it's actual python object. + + Returns: + output (List[Tuple[float, BasePipeline]]): + each model with the related weight, sorted by ascending + performance. Notice that ensemble selection solves a minimization + problem. + """ + outputs = [] + first_layer_models = models[0] + for _ in models: + outputs.append(self.base_ensemble.get_models_with_weights(first_layer_models)) + + return outputs + + def get_expanded_layer_stacking_ensemble_predictions( + self, + stacking_layer, + raw_stacking_layer_ensemble_predictions + ) -> List[np.ndarray]: + layer_weights = [weight for weight in self.base_ensemble.weights_ if weight > 0] + layer_size = self.base_ensemble.ensemble_size + ensemble_predictions = [] + for weight, pred in zip(layer_weights, raw_stacking_layer_ensemble_predictions): + ensemble_predictions.extend([pred] * int(weight * layer_size)) + return ensemble_predictions + + def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: + """ + After training of ensemble selection, not all models will be used. + Some of them will have zero weight. This procedure filters this models + out. 
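get_expanded_layer_stacking_ensemble_predictions above repeats each member's predictions in proportion to its ensemble-selection weight, so a later layer can treat the previous one as a flat bag of equally weighted predictors. Roughly, with illustrative numbers:

import numpy as np

ensemble_size = 5
layer_weights = [0.4, 0.2, 0.2, 0.2]            # non-zero weights of the layer members
layer_preds = [np.full((3, 2), i) for i in range(4)]

expanded = []
for weight, pred in zip(layer_weights, layer_preds):
    # Each member contributes int(weight * ensemble_size) copies of its predictions.
    expanded.extend([pred] * int(weight * ensemble_size))

print(len(expanded))  # 2 + 1 + 1 + 1 = 5 entries in total
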
+ + Returns: + output (List[Tuple[int, int, float]]): + The models actually used by ensemble selection + """ + return self.ensemble_identifiers + + def get_validation_performance(self) -> float: + """ + Returns the best optimization performance seen during hill climbing + + Returns: + (float): + best ensemble training performance + """ + return self.base_ensemble.trajectory_[-1] + diff --git a/autoPyTorch/ensemble/utils.py b/autoPyTorch/ensemble/utils.py index 705d17e24..17fe011d0 100644 --- a/autoPyTorch/ensemble/utils.py +++ b/autoPyTorch/ensemble/utils.py @@ -1,16 +1,32 @@ from enum import IntEnum from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder -from autoPyTorch.ensemble.stacking_ensemble_builder import StackingEnsembleBuilder +from autoPyTorch.ensemble.ensemble_optimisation_stacking_ensemble_builder import EnsembleOptimisationStackingEnsembleBuilder +from autoPyTorch.ensemble.ensemble_selection_per_layer_stacking_ensemble_builder import EnsembleSelectionPerLayerStackingEnsembleBuilder class EnsembleSelectionTypes(IntEnum): ensemble_selection = 1 - stacking_ensemble = 2 + stacking_optimisation_ensemble = 2 + stacking_ensemble_selection_per_layer = 3 + stacking_repeat_models = 4 + stacking_autogluon = 5 + + def is_stacking_ensemble(self) -> bool: + stacked = [self.stacking_optimisation_ensemble, + self.stacking_ensemble_selection_per_layer, + self.stacking_repeat_models, + self.stacking_autogluon] + return getattr(self, self.name) in stacked def get_ensemble_builder_class(ensemble_method: int): - if ensemble_method == EnsembleSelectionTypes.ensemble_selection: + if ( + ensemble_method == EnsembleSelectionTypes.ensemble_selection + or ensemble_method == EnsembleSelectionTypes.stacking_repeat_models + ): return EnsembleBuilder - elif ensemble_method == EnsembleSelectionTypes.stacking_ensemble: - return StackingEnsembleBuilder + elif ensemble_method == EnsembleSelectionTypes.stacking_optimisation_ensemble: + return EnsembleOptimisationStackingEnsembleBuilder + elif ensemble_method == EnsembleSelectionTypes.stacking_ensemble_selection_per_layer: + return EnsembleSelectionPerLayerStackingEnsembleBuilder diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 3fcc64889..f19f24b0d 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -83,15 +83,19 @@ def __init__(self, config: str, self.init_params = init_params self.pipeline = autoPyTorch.pipeline.traditional_tabular_classification. 
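The enum added to autoPyTorch/ensemble/utils.py above gives a single switch for choosing the builder; under the assumptions of that hunk (and the patched module paths), its use looks like this, with the results the hunk implies noted in comments:

from autoPyTorch.ensemble.utils import EnsembleSelectionTypes, get_ensemble_builder_class

method = EnsembleSelectionTypes.stacking_ensemble_selection_per_layer
print(method.is_stacking_ensemble())       # True for every stacking_* member
print(get_ensemble_builder_class(method))  # EnsembleSelectionPerLayerStackingEnsembleBuilder

# Plain ensemble selection and the repeat-models variant reuse the existing builder:
print(get_ensemble_builder_class(EnsembleSelectionTypes.stacking_repeat_models))  # EnsembleBuilder
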
\ TraditionalTabularClassificationPipeline(dataset_properties=dataset_properties, - random_state=self.random_state) - configuration_space = self.pipeline.get_hyperparameter_search_space() - default_configuration = configuration_space.get_default_configuration().get_dictionary() - default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config - self.configuration = Configuration(configuration_space, default_configuration) - self.pipeline.set_hyperparameters(self.configuration) + random_state=self.random_state, + search_space_updates=self._get_search_space_updates()) + # configuration_space = self.pipeline.get_hyperparameter_search_space() + # default_configuration = configuration_space.get_default_configuration().get_dictionary() + # default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config + # self.configuration = Configuration(configuration_space, default_configuration) + # self.pipeline.set_hyperparameters(self.configuration) + self.configuration = self.pipeline.config + self.is_fitted_ = False def fit(self, X: Dict[str, Any], y: Any, sample_weight: Optional[np.ndarray] = None) -> object: + self.is_fitted_ = True return self.pipeline.fit(X, y) def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], @@ -113,12 +117,18 @@ def get_additional_run_info(self) -> Dict[str, Any]: Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs """ return {'pipeline_configuration': self.configuration, - 'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config(), + # 'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config(), 'configuration_origin': 'traditional'} def get_pipeline_representation(self) -> Dict[str, str]: return self.pipeline.get_pipeline_representation() + def _get_search_space_updates(self): + from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + updates = HyperparameterSearchSpaceUpdates() + updates.append(node_name='model_trainer', hyperparameter='traditional_learner', value_range=(self.config,), default_value=self.config) + return updates + @staticmethod def get_default_pipeline_options() -> Dict[str, Any]: return autoPyTorch.pipeline.traditional_tabular_classification. \ @@ -153,15 +163,19 @@ def __init__(self, config: str, self.init_params = init_params self.pipeline = autoPyTorch.pipeline.traditional_tabular_regression. 
\ TraditionalTabularRegressionPipeline(dataset_properties=dataset_properties, - random_state=self.random_state) - configuration_space = self.pipeline.get_hyperparameter_search_space() - default_configuration = configuration_space.get_default_configuration().get_dictionary() - default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config - self.configuration = Configuration(configuration_space, default_configuration) - self.pipeline.set_hyperparameters(self.configuration) + random_state=self.random_state, + search_space_updates=self._get_search_space_updates()) + # configuration_space = self.pipeline.get_hyperparameter_search_space() + # default_configuration = configuration_space.get_default_configuration().get_dictionary() + # default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config + # self.configuration = Configuration(configuration_space, default_configuration) + # self.pipeline.set_hyperparameters(self.configuration) + self.configuration = self.pipeline.config + self.is_fitted_ = False def fit(self, X: Dict[str, Any], y: Any, sample_weight: Optional[np.ndarray] = None) -> object: + self.is_fitted_ = True return self.pipeline.fit(X, y) def predict(self, X: Union[np.ndarray, pd.DataFrame], @@ -179,15 +193,22 @@ def get_additional_run_info(self) -> Dict[str, Any]: Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs """ return {'pipeline_configuration': self.configuration, - 'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config()} + # 'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config(), + 'configuration_origin': 'traditional'} def get_pipeline_representation(self) -> Dict[str, str]: return self.pipeline.get_pipeline_representation() + def _get_search_space_updates(self): + from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + updates = HyperparameterSearchSpaceUpdates() + updates.append(node_name='model_trainer', hyperparameter='traditional_learner', value_range=(self.config,), default_value=self.config) + return updates + @staticmethod def get_default_pipeline_options() -> Dict[str, Any]: - return autoPyTorch.pipeline.traditional_tabular_regression.\ - TraditionalTabularRegressionPipeline.get_default_pipeline_options() + return autoPyTorch.pipeline.traditional_tabular_classification. 
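Both traditional pipeline wrappers now pin the learner through a search-space update instead of assembling a Configuration by hand. Standalone, the pattern from _get_search_space_updates reads as follows (the 'lgb' value is only an example):

from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates

updates = HyperparameterSearchSpaceUpdates()
# Pin the traditional_learner hyperparameter of the model_trainer node to one value,
# so the pipeline can only instantiate the requested traditional model.
updates.append(node_name='model_trainer',
               hyperparameter='traditional_learner',
               value_range=('lgb',),
               default_value='lgb')
# The resulting object is what the wrappers pass as search_space_updates=... above.
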
\ + TraditionalTabularClassificationPipeline.get_default_pipeline_options() class DummyClassificationPipeline(DummyClassifier): @@ -216,9 +237,11 @@ def __init__(self, config: Configuration, super(DummyClassificationPipeline, self).__init__(strategy="uniform") else: super(DummyClassificationPipeline, self).__init__(strategy="most_frequent") + self.is_fitted_ = False def fit(self, X: Dict[str, Any], y: Any, sample_weight: Optional[np.ndarray] = None) -> object: + self.is_fitted_ = True X_train = subsampler(X['X_train'], X['train_indices']) y_train = subsampler(X['y_train'], X['train_indices']) return super(DummyClassificationPipeline, self).fit(np.ones((X_train.shape[0], 1)), y_train, @@ -278,9 +301,11 @@ def __init__(self, config: Configuration, super(DummyRegressionPipeline, self).__init__(strategy='mean') else: super(DummyRegressionPipeline, self).__init__(strategy='median') + self.is_fitted_ = False def fit(self, X: Dict[str, Any], y: Any, sample_weight: Optional[np.ndarray] = None) -> object: + self.is_fitted_ = True X_train = subsampler(X['X_train'], X['train_indices']) y_train = subsampler(X['y_train'], X['train_indices']) return super(DummyRegressionPipeline, self).fit(np.ones((X_train.shape[0], 1)), y_train, @@ -425,7 +450,8 @@ def __init__(self, backend: Backend, logger_port: Optional[int] = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - use_ensemble_opt_loss=False + use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0 ) -> None: self.starttime = time.time() @@ -499,7 +525,7 @@ def __init__(self, backend: Backend, else self.pipeline_class.get_default_pipeline_options() self.budget_type = pipeline_config['budget_type'] if budget_type is None else budget_type self.budget = pipeline_config[self.budget_type] if budget == 0 else budget - + self.cutoff = pipeline_config['func_eval_time_limit_secs'] * 0.9 self.num_run = 0 if num_run is None else num_run logger_name = '%s(%d)' % (self.__class__.__name__.split('.')[-1], @@ -768,6 +794,9 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], if test_loss is not None: additional_run_info['test_loss'] = test_loss + additional_run_info['configuration'] = self.configuration if not isinstance(self.configuration, Configuration) else self.configuration.get_dictionary() + additional_run_info['budget'] = self.budget + rval_dict = {'loss': cost, 'additional_run_info': additional_run_info, 'status': status} diff --git a/autoPyTorch/evaluation/ensemble_optimisation_evaluator.py b/autoPyTorch/evaluation/ensemble_optimisation_evaluator.py new file mode 100644 index 000000000..569068154 --- /dev/null +++ b/autoPyTorch/evaluation/ensemble_optimisation_evaluator.py @@ -0,0 +1,648 @@ +from math import floor +from multiprocessing.queues import Queue +import os +import time +from typing import Any, Dict, List, Optional, Tuple, Union + +from ConfigSpace.configuration_space import Configuration + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.ensemble import VotingClassifier + +from smac.tae import StatusType + +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants import ( + CLASSIFICATION_TASKS, + MULTICLASSMULTIOUTPUT, +) +from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes, RepeatedCrossValTypes +from autoPyTorch.ensemble.ensemble_optimisation_stacking_ensemble_builder import calculate_nomalised_margin_loss +from autoPyTorch.evaluation.abstract_evaluator import ( + 
AbstractEvaluator, + fit_and_suppress_warnings +) +from autoPyTorch.ensemble.ensemble_optimisation_stacking_ensemble import EnsembleOptimisationStackingEnsemble +from autoPyTorch.evaluation.utils import VotingRegressorWrapper, check_pipeline_is_fitted +from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.utils.common import dict_repr, subsampler +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + +__all__ = ['EnsembleOptimisationEvaluator', 'eval_ensemble_optimise_function'] + + +def _get_y_array(y: np.ndarray, task_type: int) -> np.ndarray: + if task_type in CLASSIFICATION_TASKS and task_type != \ + MULTICLASSMULTIOUTPUT: + return y.ravel() + else: + return y + + +class EnsembleOptimisationEvaluator(AbstractEvaluator): + """ + This class builds a pipeline using the provided configuration. + A pipeline implementing the provided configuration is fitted + using the datamanager object retrieved from disc, via the backend. + After the pipeline is fitted, it is save to disc and the performance estimate + is communicated to the main process via a Queue. + + Attributes: + backend (Backend): + An object to interface with the disk storage. In particular, allows to + access the train and test datasets + queue (Queue): + Each worker available will instantiate an evaluator, and after completion, + it will return the evaluation result via a multiprocessing queue + metric (autoPyTorchMetric): + A scorer object that is able to evaluate how good a pipeline was fit. It + is a wrapper on top of the actual score method (a wrapper on top of scikit + lean accuracy for example) that formats the predictions accordingly. + budget: (float): + The amount of epochs/time a configuration is allowed to run. + budget_type (str): + The budget type, which can be epochs or time + pipeline_config (Optional[Dict[str, Any]]): + Defines the content of the pipeline being evaluated. For example, it + contains pipeline specific settings like logging name, or whether or not + to use tensorboard. + configuration (Union[int, str, Configuration]): + Determines the pipeline to be constructed. A dummy estimator is created for + integer configurations, a traditional machine learning pipeline is created + for string based configuration, and NAS is performed when a configuration + object is passed. + seed (int): + A integer that allows for reproducibility of results + output_y_hat_optimization (bool): + Whether this worker should output the target predictions, so that they are + stored on disk. Fundamentally, the resampling strategy might shuffle the + Y_train targets, so we store the split in order to re-use them for ensemble + selection. + num_run (Optional[int]): + An identifier of the current configuration being fit. This number is unique per + configuration. + include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Union[bool, List[str]]): + By default, the model, it's predictions and other metadata is stored on disk + for each finished configuration. This argument allows the user to skip + saving certain file type, for example the model, from being written to disk. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. 
It is the equivalent of + kwargs for the pipeline steps. + logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + all_supported_metrics (bool): + Whether all supported metric should be calculated for every configuration. + search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + An object used to fine tune the hyperparameter search space of the pipeline + """ + def __init__(self, backend: Backend, queue: Queue, + metric: autoPyTorchMetric, + budget: float, + configuration: Union[int, str, Configuration], + budget_type: str = None, + pipeline_config: Optional[Dict[str, Any]] = None, + seed: int = 1, + output_y_hat_optimization: bool = True, + num_run: Optional[int] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + disable_file_output: Union[bool, List] = False, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0) -> None: + super().__init__( + backend=backend, + queue=queue, + configuration=configuration, + metric=metric, + seed=seed, + output_y_hat_optimization=output_y_hat_optimization, + num_run=num_run, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates, + use_ensemble_opt_loss=use_ensemble_opt_loss + ) + + self.cur_stacking_layer = cur_stacking_layer + self.num_repeats = len(self.splits) + self.num_folds = len(self.splits[0]) + self.logger.debug("use_ensemble_loss :{}".format(self.use_ensemble_opt_loss)) + self.old_ensemble: Optional[EnsembleOptimisationStackingEnsemble] = None + ensemble_dir = self.backend.get_ensemble_dir() + if os.path.exists(ensemble_dir) and len(os.listdir(ensemble_dir)) >= 1: + self.old_ensemble = self.backend.load_ensemble(self.seed) + assert isinstance(self.old_ensemble, EnsembleOptimisationStackingEnsemble) + + self.logger.debug(f"for num run: {num_run}, X_train.shape: {self.X_train.shape} and X_test.shape: {self.X_test.shape}") + + def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], + valid_pred: Optional[np.ndarray], + test_pred: Optional[np.ndarray], + pipeline_opt_pred: np.ndarray, + ensemble_opt_pred: np.ndarray, + additional_run_info: Optional[Dict], + file_output: bool, status: StatusType, + ) -> Optional[Tuple[float, float, int, Dict]]: + """This function does everything necessary after the fitting is done: + * predicting + * saving the necessary files + We use it as the signal handler so we can recycle the code for the + normal usecase and when the runsolver kills us here :)""" + + self.duration = time.time() - self.starttime + + if file_output: + loss_, additional_run_info_ = self.file_output( + pipeline_opt_pred, valid_pred, test_pred + ) + else: + loss_ = None + additional_run_info_ = {} + + validation_loss, test_loss = self.calculate_auxiliary_losses( + valid_pred, test_pred + ) + + pipeline_loss, _ = self.calculate_auxiliary_losses( + pipeline_opt_pred, None + ) + + if loss_ is not 
None: + return self.duration, loss_, self.seed, additional_run_info_ + + cost = loss["ensemble_opt_loss"] if self.use_ensemble_opt_loss else loss[self.metric.name] + + additional_run_info = ( + {} if additional_run_info is None else additional_run_info + ) + for metric_name, value in loss.items(): + additional_run_info[metric_name] = value + additional_run_info['duration'] = self.duration + additional_run_info['num_run'] = self.num_run + if pipeline_loss is not None: + additional_run_info['pipeline_loss'] = pipeline_loss + if train_loss is not None: + additional_run_info['train_loss'] = train_loss + if validation_loss is not None: + additional_run_info['validation_loss'] = validation_loss + if test_loss is not None: + additional_run_info['test_loss'] = test_loss + additional_run_info['configuration'] = self.configuration if not isinstance(self.configuration, Configuration) else self.configuration.get_dictionary() + additional_run_info['budget'] = self.budget + + additional_run_info['opt_loss'] = loss + rval_dict = {'loss': cost, + 'additional_run_info': additional_run_info, + 'status': status} + + self.queue.put(rval_dict) + return None + + def get_sorted_preds(self, preds: List[List[np.ndarray]], repeat_id: int) -> np.ndarray: + predictions = np.concatenate([pred for pred in preds if pred is not None]) + indices = np.concatenate([test_indices for _, test_indices in self.splits[repeat_id]]) + zipped_lists = zip(indices, predictions) + + sorted_zipped_lists = sorted(zipped_lists) + predictions = [pred for _, pred in sorted_zipped_lists] + return predictions + + def get_sorted_train_preds(self, preds: List[List[np.ndarray]], repeat_id: int): + predictions = np.concatenate([pred for pred in preds if pred is not None]) + indices = np.concatenate([train_indices for train_indices, _ in self.splits[repeat_id]]) + + unique_indices = set(indices) + sorted_predictions = np.zeros((len(unique_indices), self.num_classes)) + + for i in unique_indices: + positions = np.where(indices == i) + tmp = list() + for position in positions: + tmp.append(predictions[position]) + mean_tmp = np.squeeze(np.mean(tmp, axis=1)) + for j, mean in enumerate(mean_tmp): + sorted_predictions[i][j] = mean + return sorted_predictions + + def get_sorted_train_targets(self, preds: List[List[np.ndarray]], repeat_id: int): + predictions = np.concatenate([pred for pred in preds if pred is not None]) + indices = np.concatenate([train_indices for train_indices, _ in self.splits[repeat_id]]) + + unique_indices = set(indices) + sorted_predictions = np.zeros(len(unique_indices)) + + for i in unique_indices: + positions = np.where(indices == i) + tmp = list() + for position in positions: + tmp.append(predictions[position]) + mean_tmp = np.squeeze(np.mean(tmp, axis=1)) + sorted_predictions[i] = mean_tmp + return sorted_predictions + + def file_output( + self, + Y_optimization_pred: np.ndarray, + Y_valid_pred: np.ndarray, + Y_test_pred: np.ndarray, + ) -> Tuple[Optional[float], Dict]: + + # Abort in case of shape misalignment + if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]: + return ( + 1.0, + { + 'error': + "Targets %s and prediction %s don't have " + "the same length. Probably training didn't " + "finish" % (self.Y_optimization.shape, Y_optimization_pred.shape) + }, + ) + + # Abort if predictions contain NaNs + for y, s in [ + # Y_train_pred deleted here. Fix unittest accordingly. 
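get_sorted_preds above stitches the per-fold out-of-fold predictions back into the original row order of the training set, so that row i of the result corresponds to training sample i. A compact sketch with made-up folds (a sort key is used here to avoid comparing arrays on ties):

import numpy as np

# Two folds: predictions arrive in fold order, together with each fold's test indices.
fold_preds = [np.array([[0.9, 0.1], [0.8, 0.2]]), np.array([[0.3, 0.7], [0.4, 0.6]])]
fold_test_indices = [np.array([2, 0]), np.array([3, 1])]

predictions = np.concatenate(fold_preds)
indices = np.concatenate(fold_test_indices)
ordered = [pred for _, pred in sorted(zip(indices, predictions), key=lambda t: t[0])]

print(np.array(ordered))
# row 0 -> [0.8, 0.2], row 1 -> [0.4, 0.6], row 2 -> [0.9, 0.1], row 3 -> [0.3, 0.7]
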
+ [Y_optimization_pred, 'optimization'], + [Y_valid_pred, 'validation'], + [Y_test_pred, 'test'] + ]: + if y is not None and not np.all(np.isfinite(y)): + return ( + 1.0, + { + 'error': + 'Model predictions for %s set contains NaNs.' % s + }, + ) + + # Abort if we don't want to output anything. + if hasattr(self, 'disable_file_output'): + if self.disable_file_output: + return None, {} + else: + self.disabled_file_outputs = [] + + # This file can be written independently of the others down bellow + if 'y_optimization' not in self.disabled_file_outputs: + if self.output_y_hat_optimization: + self.backend.save_targets_ensemble(self.Y_optimization) + + if hasattr(self, 'pipelines') and self.pipelines is not None and isinstance(self.resampling_strategy, RepeatedCrossValTypes): + if self.pipelines[0] is not None and len(self.pipelines) > 0: + if 'pipelines' not in self.disabled_file_outputs: + if self.task_type in CLASSIFICATION_TASKS: + pipelines = VotingClassifier(estimators=None, voting='soft', ) + else: + pipelines = VotingRegressorWrapper(estimators=None) + pipelines.estimators_ = [pipeline for repeat_pipelines in self.pipelines for pipeline in repeat_pipelines if check_pipeline_is_fitted(pipeline, self.configuration)] + else: + pipelines = None + else: + pipelines = None + else: + pipelines = None + + if hasattr(self, 'pipeline') and self.pipeline is not None and isinstance(self.resampling_strategy, HoldoutValTypes): + if 'pipeline' not in self.disabled_file_outputs: + pipeline = self.pipeline + else: + pipeline = None + else: + # need a pipeline to get representation of the model. + # see https://github.com/automl/Auto-PyTorch/blob/master/autoPyTorch/api/base_task.py#L467 + pipeline = self.pipelines[-1][-1] + + self.logger.debug("Saving model {}_{}_{} to disk".format(self.seed, self.num_run, self.budget)) + self.backend.save_numrun_to_dir( + seed=int(self.seed), + idx=int(self.num_run), + budget=float(self.budget), + model=pipeline, + cv_model=pipelines, + ensemble_predictions=( + Y_optimization_pred if 'y_optimization' not in + self.disabled_file_outputs else None + ), + valid_predictions=( + Y_valid_pred if 'y_valid' not in + self.disabled_file_outputs else None + ), + test_predictions=( + Y_test_pred if 'y_test' not in + self.disabled_file_outputs else None + ), + ) + + return None, {} + + def fit_predict_and_loss(self) -> None: + """Fit, predict and compute the loss for cross-validation and + holdout""" + assert self.splits is not None, "Can't fit pipeline in {} is datamanager.splits is None" \ + .format(self.__class__.__name__) + + Y_train_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + Y_pipeline_optimization_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + Y_valid_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + Y_test_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + # Y_train_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + # Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + + self.pipelines = [[self._get_pipeline() for _ in range(self.num_folds)] for _ in range(self.num_repeats)] + + additional_run_info = {} + + total_repeats = self.num_repeats + for repeat_id, folds in enumerate(self.splits): + if repeat_id >= total_repeats: + break + y_train_pred_folds = [None] * self.num_folds + y_pipeline_optimization_pred_folds = [None] * self.num_folds + y_valid_pred_folds = [None] * self.num_folds + y_test_pred_folds = [None] * self.num_folds + # y_train_targets: 
List[Optional[np.ndarray]] = [None] * self.num_folds + # y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + + for i, (train_split, test_split) in enumerate(folds): + starttime = time.time() + self.logger.info(f"Starting fit for repeat: {repeat_id} and fold: {i}") + pipeline = self.pipelines[repeat_id][i] + ( + y_train_pred, + y_pipeline_opt_pred, + y_valid_pred, + y_test_pred, + ) = self._fit_and_predict(pipeline, i, repeat_id, + train_indices=train_split, + test_indices=test_split) + y_train_pred_folds[i] = y_train_pred + y_pipeline_optimization_pred_folds[i] = y_pipeline_opt_pred + if y_valid_pred is not None: + y_valid_pred_folds[i] = y_valid_pred + if y_test_pred is not None: + y_test_pred_folds[i] = y_test_pred + + # y_train_targets[i] = self.y_train[train_split] + # y_targets[i] = self.y_train[test_split] + + additional_run_info.update(pipeline.get_additional_run_info() if hasattr( + pipeline, 'get_additional_run_info') and pipeline.get_additional_run_info() is not None else {}) + duration_fit_single = time.time() - starttime + if repeat_id == 0 and i == 0: + expected_num_folds = floor(self.cutoff/(1.15*duration_fit_single)) + self.logger.debug(f"cutoff :{self.cutoff}, expected num folds: {expected_num_folds}, duration_fit_single: {duration_fit_single}") + expected_total_repeats = floor(expected_num_folds/self.num_folds) + if expected_total_repeats < total_repeats: + self.logger.debug(f"For num_run: {self.num_run}, expected repeats of cross validation: {expected_total_repeats} " + f"is less than the given value: {total_repeats}. Will only run for {expected_total_repeats}") + total_repeats = expected_total_repeats + if total_repeats <= repeat_id: + raise ValueError("Not expected to complete first repeat, terminating configuration") + + Y_train_pred[repeat_id] = self.get_sorted_train_preds(y_train_pred_folds, repeat_id) + Y_pipeline_optimization_pred[repeat_id] = self.get_sorted_preds(y_pipeline_optimization_pred_folds, repeat_id) + if self.X_valid is not None: + Y_valid_pred[repeat_id] = np.array([y_valid_pred_folds[i] for i in range(self.num_folds) if y_valid_pred_folds[i] is not None]) + # Average the predictions of several pipelines + if len(Y_valid_pred[repeat_id].shape) == 3: + Y_valid_pred[repeat_id] = np.nanmean(Y_valid_pred[repeat_id], axis=0) + else: + Y_valid_pred = None + + if self.X_test is not None: + Y_test_pred[repeat_id] = np.array([y_test_pred_folds[i] for i in range(self.num_folds) if y_test_pred_folds[i] is not None]) + # Average the predictions of several pipelines of the folds + if len(Y_test_pred[repeat_id].shape) == 3: + Y_test_pred[repeat_id] = np.nanmean(Y_test_pred[repeat_id], axis=0) + else: + Y_test_pred = None + + # # as targets do change within repeats + # Y_targets = self.y_train.copy() # self.get_sorted_preds(y_targets, -1) + # Y_train_targets = self.y_train.copy() # self.get_sorted_train_targets(y_train_targets, -1) + + # Average prediction values accross repeats + Y_train_pred = np.nanmean(Y_train_pred[:total_repeats], axis=0) + Y_pipeline_optimization_pred = np.nanmean(Y_pipeline_optimization_pred[:total_repeats], axis=0) + Y_valid_pred = np.nanmean(Y_valid_pred[:total_repeats], axis=0) if Y_valid_pred is not None else None + Y_test_pred = np.nanmean(Y_test_pred[:total_repeats], axis=0) if Y_test_pred is not None else None + + if self.old_ensemble is not None: + Y_ensemble_optimization_pred = self.old_ensemble.predict_with_current_pipeline(Y_pipeline_optimization_pred) + Y_ensemble_preds = 
self.old_ensemble.get_ensemble_predictions_with_current_pipeline(Y_pipeline_optimization_pred) + else: + Y_ensemble_optimization_pred = Y_pipeline_optimization_pred.copy() + Y_ensemble_preds = [Y_pipeline_optimization_pred] + + self.Y_optimization = self.y_train # np.array(Y_targets) + self.Y_actual_train = self.y_train # np.array(Y_train_targets) + + self.pipeline = self._get_pipeline() + + train_loss = self._loss(self.Y_actual_train, Y_train_pred) + opt_loss = self._loss(self.Y_optimization, Y_ensemble_optimization_pred) + + opt_loss ['ensemble_opt_loss'] = calculate_nomalised_margin_loss(Y_ensemble_preds, self.Y_optimization) + status = StatusType.SUCCESS + self.logger.debug("In train evaluator fit_predict_and_loss, num_run: {} loss:{}".format( + self.num_run, + opt_loss + )) + self.finish_up( + loss=opt_loss, + train_loss=train_loss, + ensemble_opt_pred=Y_ensemble_optimization_pred, + valid_pred=Y_valid_pred, + test_pred=Y_test_pred, + additional_run_info=additional_run_info, + file_output=True, + status=status, + pipeline_opt_pred=Y_pipeline_optimization_pred + ) + + def _fit_and_predict( + self, + pipeline: BaseEstimator, + fold: int, + repeat_id: int, + train_indices: Union[np.ndarray, List], + test_indices: Union[np.ndarray, List], + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: + + # See autoPyTorch/pipeline/components/base_component.py::autoPyTorchComponent for more details + # about fit_dictionary + X = {'train_indices': train_indices, + 'val_indices': test_indices, + 'split_id': fold, + 'repeat_id': repeat_id, + 'num_run': self.num_run, + **self.fit_dictionary} # fit dictionary + y = None + fit_and_suppress_warnings(self.logger, pipeline, X, y) + self.logger.info("Model fitted, now predicting") + Y_train_pred, Y_pipeline_opt_pred, Y_valid_pred, Y_test_pred = self._predict( + pipeline, + train_indices=train_indices, + test_indices=test_indices, + ) + + self.pipeline = pipeline + + return Y_train_pred, Y_pipeline_opt_pred, Y_valid_pred, Y_test_pred + + def _predict( + self, + pipeline: BaseEstimator, + test_indices: Union[np.ndarray, List], + train_indices: Union[np.ndarray, List] + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: + train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, + self.y_train[train_indices]) + + pipeline_opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, + self.y_train[train_indices]) + + # self.logger.debug(f"for model {self.seed}_{self.num_run}_{self.budget} ensemble_predictions are {ensemble_opt_pred}") + if self.X_valid is not None: + valid_pred = self.predict_function(self.X_valid, pipeline, + self.y_valid) + else: + valid_pred = None + + if self.X_test is not None: + test_pred = self.predict_function(self.X_test, pipeline, + self.y_train[train_indices]) + else: + test_pred = None + + return train_pred, pipeline_opt_pred, valid_pred, test_pred + + +# create closure for evaluating an algorithm +def eval_ensemble_optimise_function( + backend: Backend, + queue: Queue, + metric: autoPyTorchMetric, + budget: float, + config: Optional[Configuration], + seed: int, + num_run: int, + include: Optional[Dict[str, Any]], + exclude: Optional[Dict[str, Any]], + disable_file_output: Union[bool, List], + output_y_hat_optimization: bool, + pipeline_config: Optional[Dict[str, Any]] = None, + budget_type: str = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = 
None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0, + instance: str = None, +) -> None: + """ + This closure allows the communication between the ExecuteTaFuncWithQueue and the + pipeline trainer (TrainEvaluator). + + Fundamentally, smac calls the ExecuteTaFuncWithQueue.run() method, which internally + builds a TrainEvaluator. The TrainEvaluator builds a pipeline, stores the output files + to disc via the backend, and puts the performance result of the run in the queue. + + + Attributes: + backend (Backend): + An object to interface with the disk storage. In particular, allows to + access the train and test datasets + queue (Queue): + Each worker available will instantiate an evaluator, and after completion, + it will return the evaluation result via a multiprocessing queue + metric (autoPyTorchMetric): + A scorer object that is able to evaluate how good a pipeline was fit. It + is a wrapper on top of the actual score method (a wrapper on top of scikit + lean accuracy for example) that formats the predictions accordingly. + budget: (float): + The amount of epochs/time a configuration is allowed to run. + budget_type (str): + The budget type, which can be epochs or time + pipeline_config (Optional[Dict[str, Any]]): + Defines the content of the pipeline being evaluated. For example, it + contains pipeline specific settings like logging name, or whether or not + to use tensorboard. + config (Union[int, str, Configuration]): + Determines the pipeline to be constructed. + seed (int): + A integer that allows for reproducibility of results + output_y_hat_optimization (bool): + Whether this worker should output the target predictions, so that they are + stored on disk. Fundamentally, the resampling strategy might shuffle the + Y_train targets, so we store the split in order to re-use them for ensemble + selection. + num_run (Optional[int]): + An identifier of the current configuration being fit. This number is unique per + configuration. + include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Union[bool, List[str]]): + By default, the model, it's predictions and other metadata is stored on disk + for each finished configuration. This argument allows the user to skip + saving certain file type, for example the model, from being written to disk. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. It is the equivalent of + kwargs for the pipeline steps. + logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + instance (str): + An instance on which to evaluate the current pipeline. By default we work + with a single instance, being the provided X_train, y_train of a single dataset. + This instance is a compatibility argument for SMAC, that is capable of working + with multiple datasets at the same time. 
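fit_predict_and_loss above decides after the very first fold how many cross-validation repeats fit into the evaluation cutoff, applying a 15% safety margin to the measured fold duration. The heuristic in isolation, with illustrative numbers:

from math import floor

cutoff = 540.0               # seconds allowed for this configuration (example value)
duration_fit_single = 30.0   # measured duration of the first fold (example value)
num_folds = 5                # folds per repeat (example value)

expected_num_folds = floor(cutoff / (1.15 * duration_fit_single))  # 15 folds fit
expected_total_repeats = floor(expected_num_folds / num_folds)     # 3 repeats fit

# If this is smaller than the configured number of repeats, only that many repeats
# are run; if it is zero, the configuration is aborted after the first fold.
print(expected_num_folds, expected_total_repeats)
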
+ """ + evaluator = EnsembleOptimisationEvaluator( + backend=backend, + queue=queue, + metric=metric, + configuration=config, + seed=seed, + num_run=num_run, + output_y_hat_optimization=output_y_hat_optimization, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates, + use_ensemble_opt_loss=use_ensemble_opt_loss, + cur_stacking_layer=cur_stacking_layer + ) + evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/repeated_crossval_evaluator.py similarity index 62% rename from autoPyTorch/evaluation/stacking_evaluator.py rename to autoPyTorch/evaluation/repeated_crossval_evaluator.py index 4207e234f..c71be49a9 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/repeated_crossval_evaluator.py @@ -1,6 +1,10 @@ +from math import floor from multiprocessing.queues import Queue +from optparse import Option import os +import re import time +from timeit import repeat from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration @@ -8,6 +12,7 @@ import numpy as np from sklearn.base import BaseEstimator +from sklearn.ensemble import VotingClassifier from smac.tae import StatusType @@ -16,17 +21,17 @@ CLASSIFICATION_TASKS, MULTICLASSMULTIOUTPUT, ) -from autoPyTorch.ensemble.stacking_ensemble_builder import calculate_nomalised_margin_loss +from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes, RepeatedCrossValTypes from autoPyTorch.evaluation.abstract_evaluator import ( AbstractEvaluator, fit_and_suppress_warnings ) -from autoPyTorch.ensemble.stacking_ensemble import StackingEnsemble +from autoPyTorch.evaluation.utils import VotingRegressorWrapper, check_pipeline_is_fitted from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric -from autoPyTorch.utils.common import dict_repr, subsampler +from autoPyTorch.utils.common import subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates -__all__ = ['StackingEvaluator', 'eval_function'] +__all__ = ['RepeatedCrossValEvaluator', 'eval_repeated_cv_function'] def _get_y_array(y: np.ndarray, task_type: int) -> np.ndarray: @@ -37,7 +42,7 @@ def _get_y_array(y: np.ndarray, task_type: int) -> np.ndarray: return y -class StackingEvaluator(AbstractEvaluator): +class RepeatedCrossValEvaluator(AbstractEvaluator): """ This class builds a pipeline using the provided configuration. 
A pipeline implementing the provided configuration is fitted @@ -116,7 +121,9 @@ def __init__(self, backend: Backend, queue: Queue, logger_port: Optional[int] = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - use_ensemble_opt_loss=False) -> None: + use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0 + ) -> None: super().__init__( backend=backend, queue=queue, @@ -135,20 +142,23 @@ def __init__(self, backend: Backend, queue: Queue, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, search_space_updates=search_space_updates, - use_ensemble_opt_loss=use_ensemble_opt_loss + use_ensemble_opt_loss=use_ensemble_opt_loss, ) - self.logger.debug("use_ensemble_loss :{}".format(self.use_ensemble_opt_loss)) + self.num_repeats = len(self.splits) + self.num_folds = len(self.splits[0]) + + self.logger.debug(f"for num run: {num_run}, X_train.shape: {self.X_train.shape} and X_test.shape: {self.X_test.shape}") def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], valid_pred: Optional[np.ndarray], test_pred: Optional[np.ndarray], - pipeline_opt_pred: np.ndarray, - ensemble_opt_pred: np.ndarray, + opt_pred: np.ndarray, additional_run_info: Optional[Dict], - file_output: bool, status: StatusType, + file_output: bool, status: StatusType, ) -> Optional[Tuple[float, float, int, Dict]]: """This function does everything necessary after the fitting is done: + * predicting * saving the necessary files We use it as the signal handler so we can recycle the code for the @@ -158,7 +168,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], if file_output: loss_, additional_run_info_ = self.file_output( - pipeline_opt_pred, valid_pred, test_pred + opt_pred, valid_pred, test_pred ) else: loss_ = None @@ -169,13 +179,12 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], ) pipeline_loss, _ = self.calculate_auxiliary_losses( - pipeline_opt_pred, None + opt_pred, None ) - if loss_ is not None: return self.duration, loss_, self.seed, additional_run_info_ - cost = loss["ensemble_opt_loss"] if self.use_ensemble_opt_loss else loss[self.metric.name] + cost = loss[self.metric.name] additional_run_info = ( {} if additional_run_info is None else additional_run_info @@ -184,16 +193,18 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], additional_run_info[metric_name] = value additional_run_info['duration'] = self.duration additional_run_info['num_run'] = self.num_run - if pipeline_loss is not None: - additional_run_info['pipeline_loss'] = pipeline_loss if train_loss is not None: additional_run_info['train_loss'] = train_loss if validation_loss is not None: additional_run_info['validation_loss'] = validation_loss if test_loss is not None: additional_run_info['test_loss'] = test_loss - + if pipeline_loss is not None: + additional_run_info['pipeline_loss'] = pipeline_loss additional_run_info['opt_loss'] = loss + additional_run_info['configuration'] = self.configuration if not isinstance(self.configuration, Configuration) else self.configuration.get_dictionary() + additional_run_info['budget'] = self.budget + rval_dict = {'loss': cost, 'additional_run_info': additional_run_info, 'status': status} @@ -201,6 +212,48 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], self.queue.put(rval_dict) return None + def get_sorted_preds(self, preds: List[List[np.ndarray]], repeat_id: int) -> np.ndarray: + predictions = 
np.concatenate([pred for pred in preds if pred is not None]) + indices = np.concatenate([test_indices for _, test_indices in self.splits[repeat_id]]) + zipped_lists = zip(indices, predictions) + + sorted_zipped_lists = sorted(zipped_lists) + predictions = [pred for _, pred in sorted_zipped_lists] + return predictions + + def get_sorted_train_preds(self, preds: List[List[np.ndarray]], repeat_id: int): + predictions = np.concatenate([pred for pred in preds if pred is not None]) + indices = np.concatenate([train_indices for train_indices, _ in self.splits[repeat_id]]) + + unique_indices = set(indices) + sorted_predictions = np.zeros((len(unique_indices), self.num_classes)) + + for i in unique_indices: + positions = np.where(indices == i) + tmp = list() + for position in positions: + tmp.append(predictions[position]) + mean_tmp = np.squeeze(np.mean(tmp, axis=1)) + for j, mean in enumerate(mean_tmp): + sorted_predictions[i][j] = mean + return sorted_predictions + + def get_sorted_train_targets(self, preds: List[List[np.ndarray]], repeat_id: int): + predictions = np.concatenate([pred for pred in preds if pred is not None]) + indices = np.concatenate([train_indices for train_indices, _ in self.splits[repeat_id]]) + + unique_indices = set(indices) + sorted_predictions = np.zeros(len(unique_indices)) + + for i in unique_indices: + positions = np.where(indices == i) + tmp = list() + for position in positions: + tmp.append(predictions[position]) + mean_tmp = np.squeeze(np.mean(tmp, axis=1)) + sorted_predictions[i] = mean_tmp + return sorted_predictions + def file_output( self, Y_optimization_pred: np.ndarray, @@ -225,7 +278,7 @@ def file_output( # Y_train_pred deleted here. Fix unittest accordingly. [Y_optimization_pred, 'optimization'], [Y_valid_pred, 'validation'], - [Y_test_pred, 'test'] + [Y_test_pred, 'test'], ]: if y is not None and not np.all(np.isfinite(y)): return ( @@ -248,13 +301,30 @@ def file_output( if self.output_y_hat_optimization: self.backend.save_targets_ensemble(self.Y_optimization) - if hasattr(self, 'pipeline') and self.pipeline is not None: + if hasattr(self, 'pipelines') and self.pipelines is not None and isinstance(self.resampling_strategy, RepeatedCrossValTypes): + if self.pipelines[0] is not None and len(self.pipelines) > 0: + if 'pipelines' not in self.disabled_file_outputs: + if self.task_type in CLASSIFICATION_TASKS: + pipelines = VotingClassifier(estimators=None, voting='soft', ) + else: + pipelines = VotingRegressorWrapper(estimators=None) + pipelines.estimators_ = [pipeline for repeat_pipelines in self.pipelines for pipeline in repeat_pipelines if check_pipeline_is_fitted(pipeline, self.configuration)] + else: + pipelines = None + else: + pipelines = None + else: + pipelines = None + + if hasattr(self, 'pipeline') and self.pipeline is not None and isinstance(self.resampling_strategy, HoldoutValTypes): if 'pipeline' not in self.disabled_file_outputs: pipeline = self.pipeline else: pipeline = None else: - pipeline = None + # need a pipeline to get representation of the model. 
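file_output above bundles the fitted fold pipelines into a single soft VotingClassifier by assigning the already-fitted estimators directly instead of calling fit on the voter. A sketch of that pattern with plain scikit-learn models; whether an unfitted voter with manually populated estimators_ exposes predict_proba can depend on the scikit-learn version:

import numpy as np

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X, y = rng.rand(40, 3), rng.randint(0, 2, 40)

# Stand-ins for the per-fold pipelines that were already fitted during the repeats.
fold_models = [LogisticRegression().fit(X, y) for _ in range(3)]

voter = VotingClassifier(estimators=None, voting='soft')
voter.estimators_ = fold_models        # reuse the fitted models instead of refitting
print(voter.predict_proba(X[:2]))      # averaged class probabilities over the folds
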
+ # see https://github.com/automl/Auto-PyTorch/blob/master/autoPyTorch/api/base_task.py#L467 + pipeline = self.pipelines[-1][-1] self.logger.debug("Saving model {}_{}_{} to disk".format(self.seed, self.num_run, self.budget)) self.backend.save_numrun_to_dir( @@ -262,7 +332,7 @@ def file_output( idx=int(self.num_run), budget=float(self.budget), model=pipeline, - cv_model=None, + cv_model=pipelines, ensemble_predictions=( Y_optimization_pred if 'y_optimization' not in self.disabled_file_outputs else None @@ -284,57 +354,124 @@ def fit_predict_and_loss(self) -> None: holdout""" assert self.splits is not None, "Can't fit pipeline in {} is datamanager.splits is None" \ .format(self.__class__.__name__) - additional_run_info: Optional[Dict] = None - split_id = 0 - self.logger.info("Starting fit {}".format(split_id)) - pipeline = self._get_pipeline() + Y_train_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + Y_optimization_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + Y_valid_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + Y_test_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + # Y_train_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + # Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + + + self.pipelines = [[self._get_pipeline() for _ in range(self.num_folds)] for _ in range(self.num_repeats)] + + additional_run_info = {} + + total_repeats = self.num_repeats + for repeat_id, folds in enumerate(self.splits): + if repeat_id >= total_repeats: + break + + y_train_pred_folds = [None] * self.num_folds + y_optimization_pred_folds = [None] * self.num_folds + y_valid_pred_folds = [None] * self.num_folds + y_test_pred_folds = [None] * self.num_folds + # y_train_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + # y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + + for i, (train_split, test_split) in enumerate(folds): + starttime = time.time() + self.logger.info(f"Starting fit for repeat: {repeat_id} and fold: {i}") + pipeline = self.pipelines[repeat_id][i] + ( + y_train_pred, + y_opt_pred, + y_valid_pred, + y_test_pred, + ) = self._fit_and_predict(pipeline, i, repeat_id, + train_indices=train_split, + test_indices=test_split) + y_train_pred_folds[i] = y_train_pred + y_optimization_pred_folds[i] = y_opt_pred + if y_valid_pred is not None: + y_valid_pred_folds[i] = y_valid_pred + if y_test_pred is not None: + y_test_pred_folds[i] = y_test_pred + + # y_train_targets[i] = self.y_train[train_split] + # y_targets[i] = self.y_train[test_split] + + additional_run_info.update(pipeline.get_additional_run_info() if hasattr( + pipeline, 'get_additional_run_info') and pipeline.get_additional_run_info() is not None else {}) + duration_fit_single = time.time() - starttime + if repeat_id == 0 and i == 0: + expected_num_folds = floor(self.cutoff/(1.15*duration_fit_single)) + self.logger.debug(f"cutoff :{self.cutoff}, expected num folds: {expected_num_folds}, duration_fit_single: {duration_fit_single}") + expected_total_repeats = floor(expected_num_folds/self.num_folds) + if expected_total_repeats < total_repeats: + self.logger.debug(f"For num_run: {self.num_run}, expected repeats of cross validation: {expected_total_repeats} " + f"is less than the given value: {total_repeats}. 
Will only run for {expected_total_repeats}") + total_repeats = expected_total_repeats + if total_repeats <= repeat_id: + raise ValueError("Not expected to complete first repeat, terminating configuration") + + Y_train_pred[repeat_id] = self.get_sorted_train_preds(y_train_pred_folds, repeat_id) + Y_optimization_pred[repeat_id] = self.get_sorted_preds(y_optimization_pred_folds, repeat_id) + if self.X_valid is not None: + Y_valid_pred[repeat_id] = np.array([y_valid_pred_folds[i] for i in range(self.num_folds) if y_valid_pred_folds[i] is not None]) + # Average the predictions of several pipelines + if len(Y_valid_pred[repeat_id].shape) == 3: + Y_valid_pred[repeat_id] = np.nanmean(Y_valid_pred[repeat_id], axis=0) + else: + Y_valid_pred = None - train_split, test_split = self.splits[split_id] - self.Y_optimization = self.y_train[test_split] - self.Y_actual_train = self.y_train[train_split] - ( - y_train_pred, - y_pipeline_opt_pred, - y_ensemble_opt_pred, - y_valid_pred, - y_test_pred, - y_ensemble_preds - ) = self._fit_and_predict(pipeline, split_id, - train_indices=train_split, - test_indices=test_split) - - train_loss = self._loss(self.y_train[train_split], y_train_pred) - loss = self._loss(self.y_train[test_split], y_ensemble_opt_pred) - - loss['ensemble_opt_loss'] = calculate_nomalised_margin_loss(y_ensemble_preds, self.y_train[test_split], self.task_type) - additional_run_info = pipeline.get_additional_run_info() if hasattr( - pipeline, 'get_additional_run_info') else {} + if self.X_test is not None: + Y_test_pred[repeat_id] = np.array([y_test_pred_folds[i] for i in range(self.num_folds) if y_test_pred_folds[i] is not None]) + # Average the predictions of several pipelines of the folds + if len(Y_test_pred[repeat_id].shape) == 3: + Y_test_pred[repeat_id] = np.nanmean(Y_test_pred[repeat_id], axis=0) + else: + Y_test_pred = None - status = StatusType.SUCCESS + # # as targets do change within repeats + # Y_targets = self.y_train.copy() # self.get_sorted_preds(y_targets, -1) + # Y_train_targets = self.y_train.copy() # self.get_sorted_train_targets(y_train_targets, -1) + + # Average prediction values accross repeats + Y_train_pred = np.nanmean(Y_train_pred[:total_repeats], axis=0) + Y_optimization_pred = np.nanmean(Y_optimization_pred[:total_repeats], axis=0) + Y_valid_pred = np.nanmean(Y_valid_pred[:total_repeats], axis=0) if Y_valid_pred is not None else None + Y_test_pred = np.nanmean(Y_test_pred[:total_repeats], axis=0) if Y_test_pred is not None else None + + self.Y_optimization = self.y_train # np.array(Y_targets) + self.Y_actual_train = self.y_train # np.array(Y_train_targets) + + self.pipeline = self._get_pipeline() - self.logger.debug("In train evaluator.fit_predict_and_loss, num_run: {} loss:{}," - " status: {},\nadditional run info:\n{}".format(self.num_run, - loss, - dict_repr(additional_run_info), - status)) + train_loss = self._loss(self.Y_actual_train, Y_train_pred) + opt_loss = self._loss(self.Y_optimization, Y_optimization_pred) + + status = StatusType.SUCCESS + self.logger.debug("In train evaluator fit_predict_and_loss, num_run: {} loss:{}".format( + self.num_run, + opt_loss + )) self.finish_up( - loss=loss, + loss=opt_loss, train_loss=train_loss, - ensemble_opt_pred=y_ensemble_opt_pred, - valid_pred=y_valid_pred, - test_pred=y_test_pred, + opt_pred=Y_optimization_pred, + valid_pred=Y_valid_pred, + test_pred=Y_test_pred, additional_run_info=additional_run_info, file_output=True, status=status, - pipeline_opt_pred=y_pipeline_opt_pred ) - def _fit_and_predict( self, pipeline: 
BaseEstimator, fold: int, + repeat_id: int, train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: @@ -344,12 +481,15 @@ def _fit_and_predict( X = {'train_indices': train_indices, 'val_indices': test_indices, 'split_id': fold, + 'repeat_id': repeat_id, 'num_run': self.num_run, **self.fit_dictionary} # fit dictionary y = None fit_and_suppress_warnings(self.logger, pipeline, X, y) self.logger.info("Model fitted, now predicting") - Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred, Y_ensemble_preds = self._predict( + ( + Y_train_pred, Y_opt_pred, Y_valid_pred, Y_test_pred + ) = self._predict( pipeline, train_indices=train_indices, test_indices=test_indices, @@ -357,7 +497,7 @@ def _fit_and_predict( self.pipeline = pipeline - return Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred, Y_ensemble_preds + return Y_train_pred, Y_opt_pred, Y_valid_pred, Y_test_pred def _predict( self, @@ -368,19 +508,9 @@ def _predict( train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, self.y_train[train_indices]) - pipeline_opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, + opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, self.y_train[train_indices]) - ensemble_dir = self.backend.get_ensemble_dir() - if os.path.exists(ensemble_dir) and len(os.listdir(ensemble_dir)) >= 1: - old_ensemble = self.backend.load_ensemble(self.seed) - assert isinstance(old_ensemble, StackingEnsemble) - ensemble_opt_pred = old_ensemble.predict_with_current_pipeline(pipeline_opt_pred) - ensemble_preds = old_ensemble.get_ensemble_predictions_with_current_pipeline(pipeline_opt_pred) - else: - ensemble_opt_pred = pipeline_opt_pred.copy() - ensemble_preds = [pipeline_opt_pred] - # self.logger.debug(f"for model {self.seed}_{self.num_run}_{self.budget} ensemble_predictions are {ensemble_opt_pred}") if self.X_valid is not None: valid_pred = self.predict_function(self.X_valid, pipeline, @@ -394,11 +524,11 @@ def _predict( else: test_pred = None - return train_pred, pipeline_opt_pred, ensemble_opt_pred, valid_pred, test_pred, ensemble_preds + return train_pred, opt_pred, valid_pred, test_pred # create closure for evaluating an algorithm -def eval_function( +def eval_repeated_cv_function( backend: Backend, queue: Queue, metric: autoPyTorchMetric, @@ -418,6 +548,7 @@ def eval_function( search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, use_ensemble_opt_loss=False, instance: str = None, + cur_stacking_layer: int = 0, ) -> None: """ This closure allows the communication between the ExecuteTaFuncWithQueue and the @@ -481,7 +612,7 @@ def eval_function( This instance is a compatibility argument for SMAC, that is capable of working with multiple datasets at the same time. 
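+        cur_stacking_layer: int
+            Index of the stacking layer whose configurations are currently being
+            evaluated; it is forwarded unchanged to the evaluator.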
""" - evaluator = StackingEvaluator( + evaluator = RepeatedCrossValEvaluator( backend=backend, queue=queue, metric=metric, @@ -499,6 +630,7 @@ def eval_function( all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, search_space_updates=search_space_updates, - use_ensemble_opt_loss=use_ensemble_opt_loss + use_ensemble_opt_loss=use_ensemble_opt_loss, + cur_stacking_layer=cur_stacking_layer, ) evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 4ac84c8ef..fe8513aec 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -28,9 +28,11 @@ from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, - NoResamplingStrategyTypes + NoResamplingStrategyTypes, + RepeatedCrossValTypes ) -import autoPyTorch.evaluation.stacking_evaluator +from autoPyTorch.evaluation.ensemble_optimisation_evaluator import eval_ensemble_optimise_function +from autoPyTorch.evaluation.repeated_crossval_evaluator import eval_repeated_cv_function from autoPyTorch.evaluation.test_evaluator import eval_test_function from autoPyTorch.evaluation.train_evaluator import eval_train_function from autoPyTorch.evaluation.utils import ( @@ -131,8 +133,9 @@ def __init__( logger_port: int = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - ensemble_method = None, - use_ensemble_opt_loss=False + ensemble_method: EnsembleSelectionTypes = None, + use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0 ): self.backend = backend @@ -151,10 +154,25 @@ def __init__( self.resampling_strategy_args = dm.resampling_strategy_args if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): - if ensemble_method is None or ensemble_method == EnsembleSelectionTypes.ensemble_selection: - eval_function = eval_train_function - elif ensemble_method == EnsembleSelectionTypes.stacking_ensemble: - eval_function = autoPyTorch.evaluation.stacking_evaluator.eval_function + eval_function = eval_train_function + if ( + ensemble_method == EnsembleSelectionTypes.stacking_optimisation_ensemble + or ensemble_method == EnsembleSelectionTypes.stacking_repeat_models + or ensemble_method == EnsembleSelectionTypes.stacking_autogluon + or ensemble_method == EnsembleSelectionTypes.stacking_ensemble_selection_per_layer + ): + raise ValueError(f"fitting ensemble stacking requires resampling strategy to be of {RepeatedCrossValTypes} but got {self.resampling_strategy}") + elif isinstance(self.resampling_strategy, RepeatedCrossValTypes): + if ensemble_method == EnsembleSelectionTypes.stacking_optimisation_ensemble: + eval_function = eval_ensemble_optimise_function + elif ( + ensemble_method == EnsembleSelectionTypes.stacking_ensemble_selection_per_layer + or ensemble_method == EnsembleSelectionTypes.stacking_repeat_models + or ensemble_method == EnsembleSelectionTypes.stacking_autogluon + or ensemble_method is None + or ensemble_method == EnsembleSelectionTypes.ensemble_selection + ): + eval_function = eval_repeated_cv_function self.output_y_hat_optimization = output_y_hat_optimization elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): eval_function = eval_test_function @@ -209,6 +227,7 @@ def __init__( self.memory_limit = memory_limit self.search_space_updates = search_space_updates + self.cur_stacking_layer = cur_stacking_layer self.use_ensemble_opt_loss = use_ensemble_opt_loss def _check_and_get_default_budget(self) -> float: @@ -349,7 
+368,8 @@ def run( logger_port=self.logger_port, all_supported_metrics=self.all_supported_metrics, search_space_updates=self.search_space_updates, - use_ensemble_opt_loss=self.use_ensemble_opt_loss + use_ensemble_opt_loss=self.use_ensemble_opt_loss, + cur_stacking_layer=self.cur_stacking_layer ) info: Optional[List[RunValue]] @@ -511,3 +531,4 @@ def run( ) ) return status, cost, runtime, additional_run_info + diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index e5884a9f7..891fdeb46 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -132,7 +132,8 @@ def __init__(self, backend: Backend, queue: Queue, keep_models: Optional[bool] = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - use_ensemble_opt_loss=False) -> None: + use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0) -> None: super().__init__( backend=backend, queue=queue, @@ -160,7 +161,7 @@ def __init__(self, backend: Backend, queue: Queue, f'(CrossValTypes, HoldoutValTypes), but got {self.resampling_strategy}' ) - self.num_folds: int = len(self.splits) + self.num_folds: int = len(self.splits[0]) self.Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN self.pipelines: List[Optional[BaseEstimator]] = [None] * self.num_folds @@ -177,14 +178,15 @@ def fit_predict_and_loss(self) -> None: additional_run_info: Optional[Dict] = None if self.num_folds == 1: split_id = 0 + repeat_id = 0 self.logger.info("Starting fit {}".format(split_id)) pipeline = self._get_pipeline() - train_split, test_split = self.splits[split_id] + train_split, test_split = self.splits[repeat_id][split_id] self.Y_optimization = self.y_train[test_split] self.Y_actual_train = self.y_train[train_split] - y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, + y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, repeat_id, train_indices=train_split, test_indices=test_split, add_pipeline_to_self=True) @@ -231,11 +233,12 @@ def fit_predict_and_loss(self) -> None: opt_fold_weights = [np.NaN] * self.num_folds additional_run_info = {} + repeat_id = 0 - for i, (train_split, test_split) in enumerate(self.splits): + for i, (train_split, test_split) in enumerate(self.splits[repeat_id]): pipeline = self.pipelines[i] - train_pred, opt_pred, valid_pred, test_pred = self._fit_and_predict(pipeline, i, + train_pred, opt_pred, valid_pred, test_pred = self._fit_and_predict(pipeline, i, repeat_id, train_indices=train_split, test_indices=test_split, add_pipeline_to_self=False) @@ -350,7 +353,9 @@ def fit_predict_and_loss(self) -> None: status=status, ) - def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Union[np.ndarray, List], + def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, + repeat_id: int, + train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], add_pipeline_to_self: bool ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: @@ -362,6 +367,7 @@ def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Un X = {'train_indices': train_indices, 'val_indices': test_indices, 'split_id': fold, + 'repeat_id': repeat_id, 'num_run': self.num_run, **self.fit_dictionary} # fit dictionary y = None @@ -431,6 +437,7 @@ def eval_train_function( 
all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0, instance: str = None, ) -> None: """ @@ -513,6 +520,7 @@ def eval_train_function( all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, search_space_updates=search_space_updates, - use_ensemble_opt_loss=use_ensemble_opt_loss + use_ensemble_opt_loss=use_ensemble_opt_loss, + cur_stacking_layer=cur_stacking_layer ) evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/utils.py b/autoPyTorch/evaluation/utils.py index 37e5fa36d..094e373ac 100644 --- a/autoPyTorch/evaluation/utils.py +++ b/autoPyTorch/evaluation/utils.py @@ -2,6 +2,8 @@ from multiprocessing.queues import Queue from typing import List, Optional, Union +from ConfigSpace.configuration_space import Configuration + import numpy as np from sklearn.ensemble import VotingRegressor @@ -20,6 +22,13 @@ ] +def check_pipeline_is_fitted(pipeline, configuration): + if isinstance(configuration, Configuration): + return hasattr(pipeline.named_steps['network'], 'is_fitted_') and pipeline.named_steps['network'].is_fitted_ + else: + return pipeline.is_fitted_ + + def read_queue(queue_: Queue) -> List[RunValue]: stack: List[RunValue] = [] while True: diff --git a/autoPyTorch/optimizer/run_history_callback.py b/autoPyTorch/optimizer/run_history_callback.py index 376478813..1ee56666e 100644 --- a/autoPyTorch/optimizer/run_history_callback.py +++ b/autoPyTorch/optimizer/run_history_callback.py @@ -244,6 +244,7 @@ def run(self, iteration: int) -> Optional[List[Tuple[RunKey, float]]]: try: with (open(self.ensemble_loss_file, "rb")) as memory: read_losses = pickle.load(memory) + self.logger.debug(f"read losses at iteration: {iteration}: {read_losses.keys()}") except Exception as e: self.logger.debug(f"Could not read losses at iteration: {iteration} with exception {e}") return None diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 47fb4e619..5cad8c10c 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -2,34 +2,44 @@ import json import logging.handlers from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import os import ConfigSpace from ConfigSpace.configuration_space import Configuration import dask.distributed +import numpy as np + from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.hyperband import Hyperband from smac.optimizer.smbo import SMBO -from smac.runhistory.runhistory import RunHistory +from smac.runhistory.runhistory import RunHistory, DataOrigin from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost from smac.scenario.scenario import Scenario from smac.tae.dask_runner import DaskParallelRunner from smac.tae.serial_runner import SerialRunner from smac.utils.io.traj_logging import TrajEntry +from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, + ResamplingStrategies, DEFAULT_RESAMPLING_PARAMETERS, HoldoutValTypes, - NoResamplingStrategyTypes + CrossValTypes ) +from autoPyTorch.datasets.utils import get_appended_dataset from autoPyTorch.ensemble.ensemble_builder_manager import EnsembleBuilderManager +from 
autoPyTorch.ensemble.ensemble_optimisation_stacking_ensemble import EnsembleOptimisationStackingEnsemble +from autoPyTorch.ensemble.ensemble_selection_per_layer_stacking_ensemble import EnsembleSelectionPerLayerStackingEnsemble from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash -from autoPyTorch.optimizer.utils import read_return_initial_configurations +from autoPyTorch.optimizer.utils import delete_other_runs, read_return_initial_configurations from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.utils.pipeline import get_configuration_space, get_dataset_requirements from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.logging_ import get_named_client_logger from autoPyTorch.utils.stopwatch import StopWatch @@ -102,9 +112,7 @@ def __init__(self, pipeline_config: Dict[str, Any], start_num_run: int = 1, seed: int = 1, - resampling_strategy: Union[HoldoutValTypes, - CrossValTypes, - NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, @@ -113,13 +121,14 @@ def __init__(self, get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, ensemble_callback: Optional[EnsembleBuilderManager] = None, + num_stacking_layers: Optional[int] = None, logger_port: Optional[int] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, portfolio_selection: Optional[str] = None, pynisher_context: str = 'spawn', min_budget: int = 5, max_budget: int = 50, - ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, + ensemble_method: EnsembleSelectionTypes = EnsembleSelectionTypes.ensemble_selection, other_callbacks: Optional[List] = None, smbo_class: Optional[SMBO] = None, use_ensemble_opt_loss: bool = False @@ -200,6 +209,7 @@ def __init__(self, """ super(AutoMLSMBO, self).__init__() # data related + self.datamanager: Optional[BaseDataset] = None self.dataset_name = dataset_name self.metric = metric @@ -239,6 +249,13 @@ def __init__(self, self.ensemble_method = ensemble_method self.ensemble_callback = ensemble_callback + if self.ensemble_method.is_stacking_ensemble() and num_stacking_layers is None: + raise ValueError("'num_stacking_layers' can't be none for stacked ensembles") + + self.num_stacking_layers = num_stacking_layers + + self.run_history = RunHistory() + self.trajectory: List[TrajEntry] = [] self.other_callbacks = other_callbacks self.smbo_class = smbo_class @@ -265,18 +282,46 @@ def __init__(self, self.logger.warning("None of the portfolio configurations are compatible" " with the current search space. 
Skipping initial configuration...") - def run_smbo(self, func: Optional[Callable] = None - ) -> Tuple[RunHistory, List[TrajEntry], str]: - - self.watcher.start_task('SMBO') - self.logger.info("Started run of SMBO") + def reset_data_manager(self) -> None: + if self.datamanager is not None: + del self.datamanager + self.datamanager = self.backend.load_datamanager() + if self.datamanager is not None and self.datamanager.task_type is not None: + self.task = self.datamanager.task_type + + def reset_attributes(self, datamanager: BaseDataset) -> None: + self.backend.save_datamanager(datamanager=datamanager) + + dataset_requirements = get_dataset_requirements( + info=datamanager.get_required_dataset_info(), + include=self.include, + exclude=self.exclude, + search_space_updates=self.search_space_updates) + self._dataset_requirements = dataset_requirements + dataset_properties = datamanager.get_dataset_properties(dataset_requirements) + self.config_space = get_configuration_space(dataset_properties, include=self.include, exclude=self.exclude, search_space_updates=self.search_space_updates) + + def _run_smbo( + self, + cur_stacking_layer: int, + walltime_limit: int, + initial_num_run: int, + func: Optional[Callable] = None, + ) -> Tuple[RunHistory, List[TrajEntry], str]: + + current_task_name = f'SMBO_{cur_stacking_layer}' + + self.watcher.start_task(current_task_name) + self.logger.info(f"Started {cur_stacking_layer} run of SMBO") + + # # == first things first: load the datamanager + # self.reset_data_manager() # == Initialize non-SMBO stuff # first create a scenario seed = self.seed self.config_space.seed(seed) # allocate a run history - num_run = self.start_num_run # Initialize some SMAC dependencies @@ -295,7 +340,7 @@ def run_smbo(self, func: Optional[Callable] = None ta_kwargs = dict( backend=copy.deepcopy(self.backend), seed=seed, - initial_num_run=num_run, + initial_num_run=initial_num_run, include=self.include if self.include is not None else dict(), exclude=self.exclude if self.exclude is not None else dict(), metric=self.metric, @@ -308,13 +353,14 @@ def run_smbo(self, func: Optional[Callable] = None search_space_updates=self.search_space_updates, pynisher_context=self.pynisher_context, ensemble_method=self.ensemble_method, - use_ensemble_opt_loss=self.use_ensemble_opt_loss + use_ensemble_opt_loss=self.use_ensemble_opt_loss, + cur_stacking_layer=cur_stacking_layer ) ta = ExecuteTaFuncWithQueue self.logger.info("Finish creating Target Algorithm (TA) function") - startup_time = self.watcher.wall_elapsed(self.dataset_name) - total_walltime_limit = self.total_walltime_limit - startup_time - 5 + startup_time = self.watcher.wall_elapsed(current_task_name) + walltime_limit = walltime_limit - startup_time - 5 scenario_dict = { 'abort_on_first_run_crash': False, 'cs': self.config_space, @@ -324,7 +370,7 @@ def run_smbo(self, func: Optional[Callable] = None 'memory_limit': self.memory_limit, 'output-dir': self.backend.get_smac_output_directory(), 'run_obj': 'quality', - 'wallclock_limit': total_walltime_limit, + 'wallclock_limit': walltime_limit, 'cost_for_crash': self.worst_possible_result, } if self.smac_scenario_args is not None: @@ -365,7 +411,8 @@ def run_smbo(self, func: Optional[Callable] = None initial_budget=self.min_budget, max_budget=self.max_budget, dask_client=self.dask_client, - initial_configurations=self.initial_configurations) + initial_configurations=self.initial_configurations, + smbo_class=self.smbo_class) else: smac = get_smac_object(scenario_dict=scenario_dict, seed=seed, @@ 
-378,20 +425,22 @@ def run_smbo(self, func: Optional[Callable] = None initial_configurations=self.initial_configurations, smbo_class=self.smbo_class) + if self.ensemble_method.is_stacking_ensemble(): + self.ensemble_callback.update_for_new_stacking_layer(cur_stacking_layer, initial_num_run) if self.ensemble_callback is not None: smac.register_callback(self.ensemble_callback) - if self.other_callbacks is not None: for callback in self.other_callbacks: smac.register_callback(callback) + self.logger.info("initialised SMBO, running SMBO.optimize()") smac.optimize() self.logger.info("finished SMBO.optimize()") - self.runhistory = smac.solver.runhistory - self.trajectory = smac.solver.intensifier.traj_logger.trajectory + runhistory = smac.solver.runhistory + trajectory = smac.solver.intensifier.traj_logger.trajectory if isinstance(smac.solver.tae_runner, DaskParallelRunner): self._budget_type = smac.solver.tae_runner.single_worker.budget_type elif isinstance(smac.solver.tae_runner, SerialRunner): @@ -399,4 +448,55 @@ def run_smbo(self, func: Optional[Callable] = None else: raise NotImplementedError(type(smac.solver.tae_runner)) - return self.runhistory, self.trajectory, self._budget_type + return runhistory, trajectory, self._budget_type + + def run_smbo(self, func: Optional[Callable] = None + ) -> Tuple[RunHistory, List[TrajEntry], str]: + individual_wall_times = self.total_walltime_limit / self.num_stacking_layers + initial_num_run = self.start_num_run + self.reset_data_manager() + for cur_stacking_layer in range(self.num_stacking_layers): + if cur_stacking_layer == 0: + self.logger.debug(f"Initial feat_types = {self.datamanager.feat_type}") + run_history, trajectory, _ = self._run_smbo( + walltime_limit=individual_wall_times, + cur_stacking_layer=cur_stacking_layer, + initial_num_run=initial_num_run, + func=func + ) + self.run_history.update(run_history, origin=DataOrigin.INTERNAL) + self.trajectory.extend(trajectory) + if self.num_stacking_layers <= 1: + break + old_ensemble: Optional[Union[EnsembleSelectionPerLayerStackingEnsemble, EnsembleOptimisationStackingEnsemble]] = None + ensemble_dir = self.backend.get_ensemble_dir() + if os.path.exists(ensemble_dir) and len(os.listdir(ensemble_dir)) >= 1: + old_ensemble = self.backend.load_ensemble(self.seed) + assert isinstance(old_ensemble, (EnsembleOptimisationStackingEnsemble, EnsembleSelectionPerLayerStackingEnsemble)) + if cur_stacking_layer != self.num_stacking_layers -1: + selected_identifiers = old_ensemble.get_selected_model_identifiers()[old_ensemble.cur_stacking_layer] + nonnull_identifiers = [identifier for identifier in selected_identifiers if identifier is not None] + ensemble_runs = [self.backend.get_numrun_directory(seed=seed, num_run=num_run, budget=budget).split('/')[-1] for seed, num_run, budget in nonnull_identifiers] + self.logger.debug(f"deleting runs other than {ensemble_runs}") + delete_other_runs(ensemble_runs=ensemble_runs, runs_directory=self.backend.get_runs_directory()) + previous_layer_predictions_train = old_ensemble.get_layer_stacking_ensemble_predictions(stacking_layer=cur_stacking_layer) + previous_layer_predictions_test = old_ensemble.get_layer_stacking_ensemble_predictions(stacking_layer=cur_stacking_layer, dataset='test') + self.logger.debug(f"Original feat types len: {len(self.datamanager.feat_type)}") + nonnull_model_predictions_train = [pred for pred in previous_layer_predictions_train if pred is not None] + nonnull_model_predictions_test = [pred for pred in previous_layer_predictions_test if pred is not 
None] + assert len(nonnull_model_predictions_train) == len(nonnull_model_predictions_test) + self.logger.debug(f"length Non nulll predictions: {len(nonnull_model_predictions_train)}") + datamanager = get_appended_dataset( + original_dataset=self.datamanager, + previous_layer_predictions_train=nonnull_model_predictions_train, + previous_layer_predictions_test=nonnull_model_predictions_test, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + ) + self.logger.debug(f"new feat_types len: {len(datamanager.feat_type)}") + self.reset_attributes(datamanager=datamanager) + + initial_num_run = self.backend.get_next_num_run() + self.logger.debug(f"cutoff num_run: {initial_num_run}") + + return self.run_history, self.trajectory, self._budget_type diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index c44252021..23af08c0f 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -1,5 +1,6 @@ import json import os +import shutil import warnings from typing import Any, Dict, List, Union @@ -48,6 +49,14 @@ def read_return_initial_configurations( f"configuration as it does not match the current config space. ") return initial_configurations + +def delete_other_runs(ensemble_runs, runs_directory): + all_runs = os.listdir(runs_directory) + for run in all_runs: + if run not in ensemble_runs: + shutil.rmtree(os.path.join(runs_directory, run)) + + class AdjustRunHistoryCallback: """ Allows manipulating run history for custom needs @@ -55,6 +64,7 @@ class AdjustRunHistoryCallback: def __call__(self, smbo: 'SMBO') -> RunHistory: pass + class autoPyTorchSMBO(SMBO): def __init__(self, scenario: Scenario, @@ -135,7 +145,6 @@ def _incorporate_run_results(self, run_info: RunInfo, result: RunValue, time_lef "configuration does not crashes. (To deactivate this exception, use the SMAC scenario option " "'abort_on_first_run_crash'). 
Additional run info: %s" % result.additional_info ) - self.logger.debug(f"\nbefore ensemble, result: {result}, \nrunhistory: {self.runhistory.data}") for callback in self._callbacks['_incorporate_run_results']: response = callback(smbo=self, run_info=run_info, result=result, time_left=time_left) # If a callback returns False, the optimization loop should be interrupted diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index fe9727502..f87d36cf7 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -310,33 +310,6 @@ def _add_forbidden_conditions(self, cs): """ - # Learned Entity Embedding is only valid when encoder is one hot encoder - if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): - embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices - if 'LearnedEntityEmbedding' in embeddings: - encoders = cs.get_hyperparameter('encoder:__choice__').choices - possible_default_embeddings = copy(list(embeddings)) - del possible_default_embeddings[possible_default_embeddings.index('LearnedEntityEmbedding')] - - for encoder in encoders: - if encoder == 'OneHotEncoder': - continue - while True: - try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder) - )) - break - except ValueError: - # change the default and try again - try: - default = possible_default_embeddings.pop() - except IndexError: - raise ValueError("Cannot find a legal default configuration") - cs.get_hyperparameter('network_embedding:__choice__').default_value = default - # Disable CyclicLR until todo is completed. 
if 'lr_scheduler' in self.named_steps.keys() and 'trainer' in self.named_steps.keys(): trainers = cs.get_hyperparameter('trainer:__choice__').choices diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index 6b38b4650..bfb54610e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -23,7 +23,10 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N self.preprocessor: Optional[ColumnTransformer] = None self.add_fit_requirements([ FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), - FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)]) + FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), + FitRequirement('skew_columns', (List,), user_defined=True, dataset_property=False), + FitRequirement('encode_columns', (List,), user_defined=True, dataset_property=False), + FitRequirement('embed_columns', (List,), user_defined=True, dataset_property=False)]) def get_column_transformer(self) -> ColumnTransformer: """ @@ -63,6 +66,21 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": column_transformers.append( ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns']) ) + if len(preprocessors['skew']) > 0: + skew_pipeline = make_pipeline(*preprocessors['skew']) + column_transformers.append( + ('skew_pipeline', skew_pipeline, X['skew_columns']) + ) + if len(preprocessors['encode']) > 0: + encode_pipeline = make_pipeline(*preprocessors['encode']) + column_transformers.append( + ('encode_pipeline', encode_pipeline, X['encode_columns']) + ) + if len(preprocessors['scale']) > 0: + scale_pipeline = make_pipeline(*preprocessors['scale']) + column_transformers.append( + ('scale_pipeline', scale_pipeline, X['scale_columns']) + ) # in case the preprocessing steps are disabled # i.e, NoEncoder for categorical, we want to diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py index aefe9ddf8..18d7f815e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py @@ -14,7 +14,7 @@ class autoPyTorchTabularPreprocessingComponent(autoPyTorchPreprocessingComponent def __init__(self) -> None: super().__init__() self.preprocessor: Union[Dict[str, Optional[BaseEstimator]], BaseEstimator] = dict( - numerical=None, categorical=None) + numerical=None, encode=None, skew=None, scale=None, categorical=None) def get_preprocessor_dict(self) -> Dict[str, BaseEstimator]: """ @@ -26,9 +26,6 @@ def get_preprocessor_dict(self) -> Dict[str, BaseEstimator]: Returns: Dict[str, BaseEstimator]: early_preprocessor dictionary """ - if (self.preprocessor['numerical'] and self.preprocessor['categorical']) is None: - raise AttributeError("{} can't return early_preprocessor dict without fitting first" - .format(self.__class__.__name__)) return self.preprocessor def __str__(self) -> str: diff --git 
a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py
new file mode 100644
index 000000000..0333c3cab
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py
@@ -0,0 +1,100 @@
+from typing import Any, Dict, List, Optional, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
+)
+
+import pandas as pd
+
+import numpy as np
+
+from scipy.stats import skew
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
+    autoPyTorchTabularPreprocessingComponent
+from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, ispandas
+
+
+def _get_skew(
+    data: Union[np.ndarray, pd.DataFrame]
+) -> float:
+    return data.skew() if ispandas(data) else skew(data)
+
+class ColumnSplitter(autoPyTorchTabularPreprocessingComponent):
+    """
+    Splits the dataset's columns into embed, encode, skew and scale groups, based on the number of categories of each categorical column and the skewness of each numerical column.
+    """
+    def __init__(
+        self,
+        min_categories_for_embedding: float = 5,
+        skew_threshold: float = 0.99,
+        random_state: Optional[np.random.RandomState] = None
+    ):
+        self.min_categories_for_embedding = min_categories_for_embedding
+        self.skew_threshold = skew_threshold
+
+        self.special_feature_types = dict(skew_columns=[], encode_columns=[], embed_columns=[], scale_columns=[])
+        self.num_categories_per_col: Optional[List] = None
+        super().__init__()
+
+    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'ColumnSplitter':
+
+        self.check_requirements(X, y)
+
+        if len(X['dataset_properties']['categorical_columns']) > 0:
+            self.num_categories_per_col = []
+            for categories_per_column, column in zip(X['dataset_properties']['num_categories_per_col'], X['dataset_properties']['categorical_columns']):
+                if (
+                    categories_per_column >= self.min_categories_for_embedding
+                ):
+                    self.special_feature_types['embed_columns'].append(column)
+                    self.num_categories_per_col.append(categories_per_column)
+                else:
+                    self.special_feature_types['encode_columns'].append(column)
+
+        # Route each numerical column to the skew or the scale group based on its skewness
+        for column in X['dataset_properties']['numerical_columns']:
+
+            if np.abs(_get_skew(X['X_train'][X['train_indices']][column])) > self.skew_threshold:
+                self.special_feature_types['skew_columns'].append(column)
+            else:
+                self.special_feature_types['scale_columns'].append(column)
+
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        if self.num_categories_per_col is not None:
+            X['dataset_properties']['num_categories_per_col'] = self.num_categories_per_col
+        X.update(self.special_feature_types)
+        return X
+
+    @staticmethod
+    def get_properties(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+    ) -> Dict[str, Union[str, bool]]:
+
+        return {
+            'shortname': 'ColumnSplitter',
+            'name': 'Column Splitter',
+            'handles_sparse': False,
+        }
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+        min_categories_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter="min_categories_for_embedding",
+            value_range=(3, 4, 10, 100, 1000),
+            default_value=4),
+        skew_threshold: HyperparameterSearchSpace =
HyperparameterSearchSpace(hyperparameter="skew_threshold", + value_range=(0.2, 0.3, 0.5, 0.8, 0.99, 10.0, 100.0), + default_value=0.99,) + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + + add_hyperparameter(cs, min_categories_for_embedding, CategoricalHyperparameter) + add_hyperparameter(cs, skew_threshold, CategoricalHyperparameter) + + return cs \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py index 929e99048..341cc5065 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py @@ -31,18 +31,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: return self - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - """ - Adds the self into the 'X' dictionary and returns it. - Args: - X (Dict[str, Any]): 'X' dictionary - - Returns: - (Dict[str, Any]): the updated 'X' dictionary - """ - X.update({'encoder': self.preprocessor}) - return X - @staticmethod def get_properties( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py index 5c9281891..b91387d66 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py @@ -20,12 +20,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: self.check_requirements(X, y) - self.preprocessor['categorical'] = OHE( - # It is safer to have the OHE produce a 0 array than to crash a good configuration - categories=X['dataset_properties']['categories'] - if len(X['dataset_properties']['categories']) > 0 else 'auto', - sparse=False, - handle_unknown='ignore') + if self._has_encode_columns(X): + self.preprocessor['encode'] = OHE( + # It is safer to have the OHE produce a 0 array than to crash a good configuration + sparse=False, + handle_unknown='ignore') return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py index eadc0a188..b62822107 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py @@ -13,8 +13,11 @@ class BaseEncoder(autoPyTorchTabularPreprocessingComponent): def __init__(self) -> None: super().__init__() self.add_fit_requirements([ - FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), - FitRequirement('categories', (List,), user_defined=True, dataset_property=True)]) + FitRequirement('encode_columns', (List,), user_defined=True, dataset_property=False)]) + + @staticmethod + def _has_encode_columns(X: Dict[str, Any]): + return 
len(X.get('encode_columns', [])) > 0 def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ @@ -25,8 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: - raise ValueError("cant call transform on {} without fitting first." - .format(self.__class__.__name__)) X.update({'encoder': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/MinMaxScaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/MinMaxScaler.py index 97766217b..7f19f44d6 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/MinMaxScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/MinMaxScaler.py @@ -23,7 +23,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: self.check_requirements(X, y) - self.preprocessor['numerical'] = SklearnMinMaxScaler(feature_range=self.feature_range, copy=False) + if self._has_scale_columns(X): + self.preprocessor['scale'] = SklearnMinMaxScaler(feature_range=self.feature_range, copy=False) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py index 9d50aa8f5..e5fc369f0 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py @@ -32,20 +32,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: return self - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - """ - The transform function calls the transform function of the - underlying model and returns the transformed array. 
- - Args: - X (np.ndarray): input features - - Returns: - np.ndarray: Transformed features - """ - X.update({'scaler': self.preprocessor}) - return X - @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None ) -> Dict[str, Union[str, bool]]: diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/Normalizer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/Normalizer.py index 678071378..cb6e2daf4 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/Normalizer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/Normalizer.py @@ -34,7 +34,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: self.check_requirements(X, y) map_norm = dict({"mean_abs": "l1", "mean_squared": "l2", "max": "max"}) - self.preprocessor['numerical'] = SklearnNormalizer(norm=map_norm[self.norm], copy=False) + if self._has_scale_columns(X): + self.preprocessor['scale'] = SklearnNormalizer(norm=map_norm[self.norm], copy=False) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py index 2c59d77c2..5d18794b7 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py @@ -40,7 +40,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: self.check_requirements(X, y) with_centering = bool(not X['dataset_properties']['issparse']) - self.preprocessor['numerical'] = SklearnRobustScaler(quantile_range=(self.q_min, self.q_max), + if self._has_scale_columns(X): + self.preprocessor['scale'] = SklearnRobustScaler(quantile_range=(self.q_min, self.q_max), with_centering=with_centering, copy=False) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/StandardScaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/StandardScaler.py index 664f45e04..173b959fa 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/StandardScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/StandardScaler.py @@ -27,7 +27,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: self.check_requirements(X, y) with_mean, with_std = (False, False) if X['dataset_properties']['issparse'] else (True, True) - self.preprocessor['numerical'] = SklearnStandardScaler(with_mean=with_mean, with_std=with_std, copy=False) + if self._has_scale_columns(X): + self.preprocessor['scale'] = SklearnStandardScaler(with_mean=with_mean, with_std=with_std, copy=False) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py index 39834dd2b..f9f43c58f 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py @@ -14,8 +14,12 @@ class BaseScaler(autoPyTorchTabularPreprocessingComponent): def __init__(self) -> None: super().__init__() self.add_fit_requirements([ - FitRequirement('numerical_columns', (List,), 
user_defined=True, dataset_property=True)]) + FitRequirement('scale_columns', (List,), user_defined=True, dataset_property=False)]) + @staticmethod + def _has_scale_columns(X: Dict[str, Any]): + return len(X.get('scale_columns', [])) > 0 + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ Adds the fitted scalar into the 'X' dictionary and returns it. @@ -25,8 +29,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: - raise ValueError("cant call transform on {} without fitting first." - .format(self.__class__.__name__)) X.update({'scaler': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/NoSkewTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/NoSkewTransformer.py new file mode 100644 index 000000000..9ea4801e8 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/NoSkewTransformer.py @@ -0,0 +1,42 @@ +from typing import Any, Dict, Optional, Union + +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.skew_transformer.base_skew_transformer import BaseSkewTransformer + + +class NoSkewTransformer(BaseSkewTransformer): + """ + No scaling performed + """ + def __init__(self, + random_state: Optional[Union[np.random.RandomState, int]] = None + ): + super().__init__() + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseSkewTransformer: + """ + The fit function calls the fit function of the underlying model + and returns the transformed array. 
+ Args: + X (np.ndarray): input features + y (Optional[np.ndarray]): input labels + + Returns: + instance of self + """ + + self.check_requirements(X, y) + + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'NoSkewTransformer', + 'name': 'No Skew Transformer', + 'handles_sparse': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/PowerTransformer.py similarity index 76% rename from autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py rename to autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/PowerTransformer.py index 7dd2502f9..0cd231666 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/PowerTransformer.py @@ -5,10 +5,10 @@ from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.skew_transformer.base_skew_transformer import BaseSkewTransformer -class PowerTransformer(BaseScaler): +class PowerTransformer(BaseSkewTransformer): """ Map data to as close to a Gaussian distribution as possible in order to reduce variance and minimize skewness. @@ -21,11 +21,12 @@ def __init__(self, super().__init__() self.random_state = random_state - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseSkewTransformer: self.check_requirements(X, y) - self.preprocessor['numerical'] = SklearnPowerTransformer(method='yeo-johnson', copy=False) + if self._has_skew_columns(X): + self.preprocessor['skew'] = SklearnPowerTransformer(method='yeo-johnson', copy=False) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/QuantileTransformer.py similarity index 90% rename from autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py rename to autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/QuantileTransformer.py index cc0b4fa7a..7bd4e5482 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/QuantileTransformer.py @@ -11,11 +11,11 @@ from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.skew_transformer.base_skew_transformer import BaseSkewTransformer from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter -class QuantileTransformer(BaseScaler): +class 
QuantileTransformer(BaseSkewTransformer): """ Transform the features to follow a uniform or a normal distribution using quantiles information. @@ -34,11 +34,12 @@ def __init__( self.n_quantiles = n_quantiles self.output_distribution = output_distribution - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseSkewTransformer: self.check_requirements(X, y) - self.preprocessor['numerical'] = SklearnQuantileTransformer(n_quantiles=self.n_quantiles, + if self._has_skew_columns(X): + self.preprocessor['skew'] = SklearnQuantileTransformer(n_quantiles=self.n_quantiles, output_distribution=self.output_distribution, copy=False) return self diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/__init__.py new file mode 100644 index 000000000..421632101 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/__init__.py @@ -0,0 +1,143 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.skew_transformer.base_skew_transformer import BaseSkewTransformer + +skew_transforming_directory = os.path.split(__file__)[0] +_skew_transformers = find_components(__package__, + skew_transforming_directory, + BaseSkewTransformer) + +_addons = ThirdPartyComponents(BaseSkewTransformer) + + +def add_skew_transformer(skew_transformer: BaseSkewTransformer) -> None: + _addons.add_component(skew_transformer) + + +class SkewTransformerChoice(autoPyTorchChoice): + """ + Allows for dynamically choosing skew_transforming component at runtime + """ + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available skew_transformer components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseSkewTransformers components available + as choices for skew_transforming + """ + components = OrderedDict() + components.update(_skew_transformers) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space(self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None) -> ConfigurationSpace: + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = dict() + + dataset_properties = {**self.dataset_properties, **dataset_properties} + + available_skew_transformers = self.get_available_components(dataset_properties=dataset_properties, + include=include, + exclude=exclude) + + if len(available_skew_transformers) == 0: + raise ValueError("no skew_transformers found, please add a skew_transformer") + + if default is None: + defaults = [ + 'PowerTransformer', + 'QuantileTransformer', + 'NoSkewTransformer' + ] + for default_ in defaults: + if default_ in available_skew_transformers: + if include is not None and default_ not in include: + continue + if exclude is 
not None and default_ in exclude: + continue + default = default_ + break + + numerical_columns = dataset_properties['numerical_columns']\ + if isinstance(dataset_properties['numerical_columns'], List) else [] + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_skew_transformers): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_skew_transformers, + choice_hyperparameter.value_range)) + if len(numerical_columns) == 0: + assert len(choice_hyperparameter.value_range) == 1 + if 'NoSkewTransformer' not in choice_hyperparameter.value_range: + raise ValueError("Provided {} in choices, however, the dataset " + "is incompatible with it".format(choice_hyperparameter.value_range)) + + preprocessor = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + # add only no skew_transformer to choice hyperparameters in case the dataset is only categorical + if len(numerical_columns) == 0: + default = 'NoSkewTransformer' + if include is not None and default not in include: + raise ValueError("Provided {} in include, however, " + "the dataset is incompatible with it".format(include)) + preprocessor = CSH.CategoricalHyperparameter('__choice__', + ['NoSkewTransformer'], + default_value=default) + else: + preprocessor = CSH.CategoricalHyperparameter('__choice__', + list(available_skew_transformers.keys()), + default_value=default) + cs.add_hyperparameter(preprocessor) + + # add only child hyperparameters of preprocessor choices + for name in preprocessor.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_skew_transformers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore + **updates) + parent_hyperparameter = {'parent': preprocessor, 'value': name} + cs.add_configuration_space(name, config_space, + parent_hyperparameter=parent_hyperparameter) + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs + + def _check_dataset_properties(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> None: + """ + A mechanism in code to ensure the correctness of the fit dictionary + It recursively makes sure that the children and parent level requirements + are honored before fit. 
+ Args: + dataset_properties: + + """ + super()._check_dataset_properties(dataset_properties) + assert 'numerical_columns' in dataset_properties.keys() and \ + 'categorical_columns' in dataset_properties.keys(), \ + "Dataset properties must contain information about the type of columns" diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/base_skew_transformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/base_skew_transformer.py new file mode 100644 index 000000000..d62055f6f --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/base_skew_transformer.py @@ -0,0 +1,33 @@ +from typing import Any, Dict, List + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import ( + autoPyTorchTabularPreprocessingComponent +) +from autoPyTorch.utils.common import FitRequirement + + +class BaseSkewTransformer(autoPyTorchTabularPreprocessingComponent): + """ + Provides abstract class interface for Scalers in AutoPytorch + """ + + def __init__(self) -> None: + super().__init__() + self.add_fit_requirements([ + FitRequirement('skew_columns', (List,), user_defined=True, dataset_property=False)]) + + @staticmethod + def _has_skew_columns(X: Dict[str, Any]): + return len(X.get('skew_columns', [])) > 0 + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the fitted scalar into the 'X' dictionary and returns it. + Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + X.update({'skew_transformer': self.preprocessor}) + return X \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py index e71583e3e..d6d1a60da 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py @@ -21,7 +21,8 @@ def get_tabular_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator Returns: (Dict[str, List[BaseEstimator]]): dictionary with list of numerical and categorical preprocessors """ - preprocessor: Dict[str, List[BaseEstimator]] = dict(numerical=list(), categorical=list()) + preprocessor: Dict[str, List[BaseEstimator]] = dict(numerical=list(), categorical=list(), scale=list(), encode=list(), skew=list()) + for key, value in X.items(): if isinstance(value, dict): # as each preprocessor is child of BaseEstimator @@ -29,5 +30,11 @@ def get_tabular_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator preprocessor['numerical'].append(value['numerical']) if 'categorical' in value and isinstance(value['categorical'], BaseEstimator): preprocessor['categorical'].append(value['categorical']) + if 'scale' in value and isinstance(value['scale'], BaseEstimator): + preprocessor['scale'].append(value['scale']) + if 'encode' in value and isinstance(value['encode'], BaseEstimator): + preprocessor['encode'].append(value['encode']) + if 'skew' in value and isinstance(value['skew'], BaseEstimator): + preprocessor['skew'].append(value['skew']) return preprocessor diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py index 597f14ca6..5b60ff4ed 100644 --- 
a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py @@ -40,7 +40,10 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X['X_train'] = preprocess(dataset=X_train, transforms=transforms) # We need to also save the preprocess transforms for inference - X.update({'preprocess_transforms': transforms}) + X.update({ + 'preprocess_transforms': transforms, + 'shape_after_preprocessing': X['X_train'].shape[1:] + }) return X @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index 7ec872b96..6b68fe973 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -36,7 +36,6 @@ def __init__( FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), ]) - self.network = network self.final_activation: Optional[torch.nn.Module] = None def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py index 5b6e48bf1..bee4a6abc 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py @@ -55,7 +55,8 @@ def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: in """ layers.append(nn.Linear(in_features, out_features)) - layers.append(nn.BatchNorm1d(out_features)) + if self.config['use_batch_norm']: + layers.append(nn.BatchNorm1d(out_features)) layers.append(_activations[self.config["activation"]]()) if self.config['use_dropout']: layers.append(nn.Dropout(self.config["dropout_%d" % layer_id])) @@ -86,6 +87,10 @@ def get_hyperparameter_search_space( value_range=(True, False), default_value=False, ), + use_batch_norm: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_batch_norm", + value_range=(True, False), + default_value=False, + ), num_units: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_units", value_range=(10, 1024), default_value=200, @@ -105,6 +110,9 @@ def get_hyperparameter_search_space( num_groups = get_hyperparameter(num_groups, UniformIntegerHyperparameter) add_hyperparameter(cs, activation, CategoricalHyperparameter) + # whether to use batch normalization + add_hyperparameter(cs, use_batch_norm, CategoricalHyperparameter) + # We can have dropout in the network for # better generalization dropout_flag = False diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index ef3cc1768..f3957bbc2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -30,7 +30,7 @@ def __init__(self, self.add_fit_requirements([ FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True, dataset_property=False), - FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('shape_after_preprocessing', (Iterable,), user_defined=False, 
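The MLPBackbone hunk above turns batch normalization into a searchable `use_batch_norm` hyperparameter instead of always inserting `nn.BatchNorm1d`. A small standalone sketch of the resulting layer construction (the feature sizes and dropout probability are illustrative, not values from the PR):

    import torch.nn as nn

    def build_mlp_block(in_features: int, out_features: int,
                        use_batch_norm: bool, use_dropout: bool, dropout: float = 0.5) -> nn.Sequential:
        layers = [nn.Linear(in_features, out_features)]
        if use_batch_norm:
            # only added when the hyperparameter is switched on
            layers.append(nn.BatchNorm1d(out_features))
        layers.append(nn.ReLU())
        if use_dropout:
            layers.append(nn.Dropout(dropout))
        return nn.Sequential(*layers)

    block = build_mlp_block(20, 64, use_batch_norm=True, use_dropout=False)
    print(block)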
dataset_property=False), FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), FitRequirement('network_embedding', (nn.Module,), user_defined=False, dataset_property=False) ]) @@ -49,9 +49,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Self """ self.check_requirements(X, y) - X_train = X['X_train'] - - input_shape = X_train.shape[1:] + input_shape = X['shape_after_preprocessing'] input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) self.input_shape = input_shape diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index 1af7ad7af..fd8f5eca5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -12,7 +12,8 @@ _activations = { "relu": torch.nn.ReLU, "tanh": torch.nn.Tanh, - "sigmoid": torch.nn.Sigmoid + "sigmoid": torch.nn.Sigmoid, + "elu": torch.nn.ELU } @@ -25,7 +26,7 @@ def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...] :param input_shape: shape of the input :return: output_shape """ - placeholder = torch.randn((2, *input_shape), dtype=torch.float) + placeholder = torch.randint(high=2, size=(2, *input_shape), dtype=torch.float) with torch.no_grad(): output = network(placeholder) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 49ecf40b7..2a391f754 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -1,9 +1,11 @@ +from math import ceil from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, UniformFloatHyperparameter, - UniformIntegerHyperparameter + UniformIntegerHyperparameter, ) import numpy as np @@ -16,39 +18,45 @@ from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]: + """ Returns a list of embedding sizes, one per categorical variable. + Selects these adaptively based on the training dataset. + Note: Assumes there is at least one embed feature.
+ """ + max_embedding_dim = config['max_embedding_dim'] + embed_exponent = config['embed_exponent'] + size_factor = config['embedding_size_factor'] + num_output_dimensions = [int(size_factor*max( + 2, + min(max_embedding_dim, + 1.6 * num_categories**embed_exponent))) + if num_categories > 0 else 1 for num_categories in num_categs_per_feature] + return num_output_dimensions + + class _LearnedEntityEmbedding(nn.Module): """ Learned entity embedding module for categorical features""" - def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_numerical_features: int): + def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, num_features_excl_embed: int): """ Args: config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer num_input_features (np.ndarray): column wise information of number of output columns after transformation for each categorical column and 0 for numerical columns - num_numerical_features (int): number of numerical features in X + num_features_excl_embed (int): number of features in X excluding the features that need to be embedded """ super().__init__() self.config = config - - self.num_numerical = num_numerical_features # list of number of categories of categorical data # or 0 for numerical data - self.num_input_features = num_input_features - categorical_features = self.num_input_features > 0 - - self.num_categorical_features = self.num_input_features[categorical_features] - - self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in - self.num_input_features] - self.num_output_dimensions = [0] * num_numerical_features - self.num_output_dimensions.extend([config["dimension_reduction_" + str(i)] * num_in for i, num_in in - enumerate(self.num_categorical_features)]) - self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in - zip(self.num_output_dimensions, self.num_input_features)] - self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in - zip(self.num_output_dimensions, self.embed_features, - self.num_input_features)] - self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + self.num_categories_per_col = num_categories_per_col + self.embed_features = self.num_categories_per_col > 0 + + self.num_embed_features = self.num_categories_per_col[self.embed_features] + + self.num_output_dimensions = get_num_output_dimensions(config, self.num_categories_per_col) + + self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions) self.ee_layers = self._create_ee_layers() @@ -56,32 +64,30 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # pass the columns of each categorical feature through entity embedding layer # before passing it through the model concat_seq = [] - last_concat = 0 + x_pointer = 0 layer_pointer = 0 - for num_in, embed in zip(self.num_input_features, self.embed_features): + for x_pointer, embed in enumerate(self.embed_features): + current_feature_slice = x[:, x_pointer] if not embed: x_pointer += 1 + concat_seq.append(current_feature_slice.view(-1, 1)) continue - if x_pointer > last_concat: - concat_seq.append(x[:, last_concat: x_pointer]) - categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] - concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) + current_feature_slice = current_feature_slice.to(torch.int) + concat_seq.append(self.ee_layers[layer_pointer](current_feature_slice)) layer_pointer += 1 - x_pointer += num_in - 
last_concat = x_pointer - concat_seq.append(x[:, last_concat:]) return torch.cat(concat_seq, dim=1) def _create_ee_layers(self) -> nn.ModuleList: # entity embeding layers are Linear Layers layers = nn.ModuleList() - for i, (num_in, embed, num_out) in enumerate(zip(self.num_input_features, self.embed_features, - self.num_output_dimensions)): + for num_cat, embed, num_out in zip(self.num_categories_per_col, + self.embed_features, + self.num_output_dimensions): if not embed: continue - layers.append(nn.Linear(num_in, num_out)) + layers.append(nn.Embedding(num_cat, num_out)) return layers @@ -94,33 +100,32 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None, **kwarg super().__init__(random_state=random_state) self.config = kwargs - def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: + def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_embed: int) -> nn.Module: return _LearnedEntityEmbedding(config=self.config, - num_input_features=num_input_features, - num_numerical_features=num_numerical_features) + num_categories_per_col=num_categories_per_col, + num_features_excl_embed=num_features_excl_embed) @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - min_unique_values_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="min_unique_values_for_embedding", - value_range=(3, 7), - default_value=5, - log=True), - dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction", - value_range=(0, 1), - default_value=0.5), + embed_exponent: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embed_exponent", + value_range=(0.56,), + default_value=0.56), + max_embedding_dim: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="max_embedding_dim", + value_range=(100,), + default_value=100), + embedding_size_factor: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embedding_size_factor", + value_range=(1.0, 0.5, 1.5, 0.7, 0.6, 0.8, 0.9, 1.1, 1.2, 1.3, 1.4), + default_value=1, + ), ) -> ConfigurationSpace: cs = ConfigurationSpace() - add_hyperparameter(cs, min_unique_values_for_embedding, UniformIntegerHyperparameter) if dataset_properties is not None: - for i in range(len(dataset_properties['categorical_columns']) - if isinstance(dataset_properties['categorical_columns'], List) else 0): - ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i), - value_range=dimension_reduction.value_range, - default_value=dimension_reduction.default_value, - log=dimension_reduction.log) - add_hyperparameter(cs, ee_dimensions_search_space, UniformFloatHyperparameter) + if len(dataset_properties['categorical_columns']) > 0: + add_hyperparameter(cs, embed_exponent, UniformFloatHyperparameter) + add_hyperparameter(cs, max_embedding_dim, UniformIntegerHyperparameter) + add_hyperparameter(cs, embedding_size_factor, CategoricalHyperparameter) + return cs @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index 830bdbb00..73d4708a0 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -24,7 +24,7 @@ class NoEmbedding(NetworkEmbeddingComponent): def 
__init__(self, random_state: Optional[np.random.RandomState] = None): super().__init__(random_state=random_state) - def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: + def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_embed: int) -> nn.Module: return _NoEmbedding() @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 998055d2b..6b88e4929 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,5 +1,4 @@ -import copy -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import numpy as np @@ -8,27 +7,32 @@ from torch import nn from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent +from autoPyTorch.utils.common import FitRequirement class NetworkEmbeddingComponent(autoPyTorchSetupComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None): super().__init__(random_state=random_state) + self.add_fit_requirements([ + FitRequirement('num_categories_per_col', (List,), user_defined=True, dataset_property=True), + FitRequirement('shape_after_preprocessing', (Tuple,), user_defined=False, dataset_property=False)]) + self.embedding: Optional[nn.Module] = None def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - num_numerical_columns, num_input_features = self._get_required_info_from_data(X) + num_features_excl_embed, num_categories_per_col = self._get_required_info_from_data(X) self.embedding = self.build_embedding( - num_input_features=num_input_features, - num_numerical_features=num_numerical_columns) + num_categories_per_col=num_categories_per_col, + num_features_excl_embed=num_features_excl_embed) return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'network_embedding': self.embedding}) return X - def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: + def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_embed: int) -> nn.Module: raise NotImplementedError def _get_required_info_from_data(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: @@ -48,22 +52,16 @@ def _get_required_info_from_data(self, X: Dict[str, Any]) -> Tuple[int, np.ndarr number of categories for categorical columns and 0 for numerical columns """ - # Feature preprocessors can alter numerical columns - if len(X['dataset_properties']['numerical_columns']) == 0: - num_numerical_columns = 0 - else: - X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - - numerical_column_transformer = X['tabular_transformer'].preprocessor. 
\ - named_transformers_['numerical_pipeline'] - num_numerical_columns = numerical_column_transformer.transform( - X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] + num_cols = X['shape_after_preprocessing'] + # only works for 2D(rows, features) tabular data + num_features_excl_embed = num_cols[0] - len(X['embed_columns']) + + num_categories_per_col = np.zeros(num_cols, dtype=np.int16) - num_cols = num_numerical_columns + len(X['dataset_properties']['categorical_columns']) - num_input_feats = np.zeros(num_cols, dtype=np.int32) + categories_per_embed_col = X['dataset_properties']['num_categories_per_col'] - categories = X['dataset_properties']['categories'] - for idx, cats in enumerate(categories, start=num_numerical_columns): - num_input_feats[idx] = len(cats) + # only fill num categories for embedding columns + for idx, cats in enumerate(categories_per_embed_col, start=num_features_excl_embed): + num_categories_per_col[idx] = cats - return num_numerical_columns, num_input_feats + return num_features_excl_embed, num_categories_per_col diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py b/autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py index 7d26c5481..d53298665 100644 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py @@ -4,10 +4,13 @@ from abc import abstractmethod from typing import Any, Dict, List, Optional, Tuple, Union +from ConfigSpace.configuration_space import Configuration + import numpy as np import pandas as pd +from sklearn.base import BaseEstimator from sklearn.utils import check_random_state import torch @@ -84,9 +87,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchSetupComponent: logger_port=X['logger_port'] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, output_shape=output_shape, + dataset_properties=X['dataset_properties'], task_type=X['dataset_properties']['task_type'], output_type=X['dataset_properties']['output_type'], - optimize_metric=X['optimize_metric'] if 'optimize_metric' in X else None) + optimize_metric=X['optimize_metric'] if 'optimize_metric' in X else None, + time_limit=X['func_eval_time_limit_secs']) # train model blockPrint() @@ -102,6 +107,30 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchSetupComponent: self.fit_output["test_preds"] = test_preds return self + def set_hyperparameters(self, + configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None + ) -> BaseEstimator: + """ + Applies a configuration to the given component. + This method translate a hierarchical configuration key, + to an actual parameter of the autoPyTorch component. 
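With the rewrite above, `_get_required_info_from_data` no longer reloads the data manager or re-runs the tabular transformer: it reads the post-preprocessing column count from `X['shape_after_preprocessing']`, subtracts the columns earmarked for embedding, and fills category counts only at those trailing positions. A toy sketch of that bookkeeping with invented numbers (the column names and counts are hypothetical):

    import numpy as np

    shape_after_preprocessing = (6,)        # hypothetical: 6 columns after early preprocessing
    embed_columns = ['f4', 'f5']            # hypothetical: two columns routed to the embedding
    categories_per_embed_col = [7, 12]      # dataset_properties['num_categories_per_col'] for them

    num_cols = shape_after_preprocessing
    num_features_excl_embed = num_cols[0] - len(embed_columns)

    num_categories_per_col = np.zeros(num_cols, dtype=np.int16)
    for idx, cats in enumerate(categories_per_embed_col, start=num_features_excl_embed):
        num_categories_per_col[idx] = cats

    print(num_features_excl_embed, num_categories_per_col)     # 4 [ 0  0  0  0  7 12]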
+ + Args: + configuration (Configuration): + Which configuration to apply to the chosen component + init_params (Optional[Dict[str, any]]): + Optional arguments to initialize the chosen component + + Returns: + An instance of self + """ + params = configuration.get_dictionary() + + setattr(self, 'config', params) + + return self + @abstractmethod def build_model( self, @@ -110,6 +139,7 @@ def build_model( logger_port: int, task_type: str, output_type: str, + time_limit: Optional[int] = None, optimize_metric: Optional[str] = None ) -> BaseTraditionalLearner: """ diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json deleted file mode 100644 index c65a311fe..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "iterations" : 10000, - "learning_rate" : 0.1 -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json deleted file mode 100644 index 81f1d6383..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "n_estimators" : 300 -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json deleted file mode 100644 index 0fa7f95d4..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "weights" : "uniform" -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json deleted file mode 100644 index d8e061f5e..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "num_rounds" : 10000, - "num_leaves" : 128, - "two_round" : "True", - "min_data_in_leaf" : 3, - "feature_fraction" : 0.9, - "boosting_type" : "gbdt", - "learning_rate" : 0.03 -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json deleted file mode 100644 index 81f1d6383..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "n_estimators" : 300 -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json deleted file mode 100644 index 2c63c0851..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json +++ /dev/null @@ -1,2 +0,0 @@ -{ -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json deleted file mode 100644 index e5f3c5622..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "C" : 1.0, - "degree" : 3 -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_traditional_model.py 
b/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_traditional_model.py index 588fb83ed..b20427e77 100644 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_traditional_model.py +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_traditional_model.py @@ -1,4 +1,5 @@ from typing import Any, Dict, List, Optional, Tuple, Type, Union +import re from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -11,8 +12,10 @@ from autoPyTorch.pipeline.components.setup.traditional_ml.base_model import BaseModelComponent from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import ( BaseTraditionalLearner, get_available_traditional_learners) +from autoPyTorch.utils.common import HyperparameterSearchSpace +# TODO: Make this a choice and individual components for each traditional classifier class TabularTraditionalModel(BaseModelComponent): """ Implementation of a dynamic model, that consists of a learner and a head @@ -38,25 +41,66 @@ def get_properties( "name": "Tabular Traditional Model", } - @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + + def get_hyperparameter_search_space(self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, **kwargs: Any) -> ConfigurationSpace: cs = ConfigurationSpace() - traditional_learners: Dict[str, Type[BaseTraditionalLearner]] = get_available_traditional_learners() + available_traditional_learners: Dict[str, Type[BaseTraditionalLearner]] = get_available_traditional_learners() # Remove knn if data is all categorical if dataset_properties is not None: numerical_columns = dataset_properties['numerical_columns'] \ if isinstance(dataset_properties['numerical_columns'], List) else [] if len(numerical_columns) == 0: - del traditional_learners['knn'] - learner_hp = CategoricalHyperparameter("traditional_learner", choices=traditional_learners.keys()) + del available_traditional_learners['knn'] + + updates = self._get_search_space_updates() + + if 'traditional_learner' in updates: + learner_hp = CategoricalHyperparameter("traditional_learner", choices=updates['traditional_learner'].value_range) + else: + learner_hp = CategoricalHyperparameter("traditional_learner", choices=available_traditional_learners.keys()) cs.add_hyperparameters([learner_hp]) + for name in learner_hp.choices: + child_updates = self._get_child_search_space_updates(prefix=name) + model_configuration_space = available_traditional_learners[name]. \ + get_hyperparameter_search_space(dataset_properties, **child_updates) + parent_hyperparameter = {'parent': learner_hp, 'value': name} + cs.add_configuration_space( + name, + model_configuration_space, + parent_hyperparameter=parent_hyperparameter + ) + return cs + def _get_child_search_space_updates(self, prefix: Optional[str] = None) -> Dict[str, HyperparameterSearchSpace]: + """Get the search space updates with the given prefix + + Args: + prefix (str): + Only return search space updates with given prefix (default: {None}) + + Returns: + Dict[str, HyperparameterSearchSpace]: + Mapping of search space updates. Keys don't contain the prefix. 
+ """ + + result: Dict[str, HyperparameterSearchSpace] = dict() + + # iterate over all search space updates of this node and keep the ones that have the given prefix + for key in self._cs_updates.keys(): + if prefix is None: + result[key] = self._cs_updates[key].get_search_space() + elif re.search(f'^{prefix}', key) is not None: + result[key[len(prefix) + 1:]] = self._cs_updates[key].get_search_space(remove_prefix=prefix) + return result + def build_model(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], - logger_port: int, task_type: str, output_type: str, optimize_metric: Optional[str] = None + dataset_properties: Dict[str, BaseDatasetPropertiesType], + logger_port: int, task_type: str, output_type: str, optimize_metric: Optional[str] = None, + time_limit: Optional[int] = None, ) -> BaseTraditionalLearner: """ This method returns a traditional learner, that is dynamically @@ -64,14 +108,19 @@ def build_model(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ... the additional configuration hyperparameters to build a domain specific model """ - learner_name = self.config["traditional_learner"] + learner_name = self.config.pop("traditional_learner") Learner = self._traditional_learners[learner_name] + config = self._remove_prefix_config(learner_name=learner_name) learner = Learner(random_state=self.random_state, logger_port=logger_port, - task_type=task_type, output_type=output_type, optimize_metric=optimize_metric) + task_type=task_type, output_type=output_type, optimize_metric=optimize_metric, + dataset_properties=dataset_properties, time_limit=time_limit, **config) return learner + def _remove_prefix_config(self, learner_name): + return {key.replace(f'{learner_name}:', ''): value for key, value in self.config.items()} + def __str__(self) -> str: """ Allow a nice understanding of what components where used """ return f"TabularTraditionalModel: {self.model.name if self.model is not None else None}" diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/__init__.py index f4a7b98de..34e71bf05 100644 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/__init__.py +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/__init__.py @@ -1,17 +1,17 @@ -from typing import Any, Dict, Type, Union +from typing import Any, Dict, Optional, Type, Union +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, ) from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ BaseTraditionalLearner -from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.learners import ( - CatboostModel, - ExtraTreesModel, - KNNModel, - LGBModel, - RFModel, - SVMModel) +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.lgbm.lgbm import LGBModel +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.catboost.catboost import CatboostModel +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.random_forest.random_forest import RFModel +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.extratrees.extratrees import ExtraTreesModel +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.knn.knn import KNNModel +from 
autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.xgboost.xgboost import XGBModel _traditional_learners = { # Sort by more robust models @@ -28,8 +28,8 @@ 'catboost': CatboostModel, 'random_forest': RFModel, 'extra_trees': ExtraTreesModel, - 'svm': SVMModel, 'knn': KNNModel, + 'xgboost': XGBModel } _addons = ThirdPartyComponents(BaseTraditionalLearner) @@ -38,7 +38,14 @@ def add_traditional_learner(traditional_learner: BaseTraditionalLearner) -> None _addons.add_component(traditional_learner) -def get_available_traditional_learners() -> Dict[str, Union[Type[BaseTraditionalLearner], Any]]: +def get_available_traditional_learners( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, +) -> Dict[str, Union[Type[BaseTraditionalLearner], Any]]: traditional_learners = dict() traditional_learners.update(_traditional_learners) + traditional_learners.update(_addons.components) + + if dataset_properties is not None and len(dataset_properties['numerical_columns']) ==0: + traditional_learners.pop('knn', None) + return traditional_learners diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/base_traditional_learner.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/base_traditional_learner.py index 9c0166a9f..a9b306475 100644 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/base_traditional_learner.py +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/base_traditional_learner.py @@ -2,7 +2,7 @@ import logging.handlers import os as os from abc import abstractmethod -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union from catboost import CatBoost @@ -13,7 +13,8 @@ from sklearn.base import BaseEstimator from sklearn.utils import check_random_state -from autoPyTorch.constants import REGRESSION_TASKS, STRING_TO_TASK_TYPES +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.constants import REGRESSION_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from autoPyTorch.utils.logging_ import get_named_client_logger @@ -42,9 +43,12 @@ class BaseTraditionalLearner: def __init__(self, task_type: str, output_type: str, + params_func: Optional[Callable], + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, optimize_metric: Optional[str] = None, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, name: Optional[str] = None): self.model: Optional[Union[CatBoost, BaseEstimator]] = None @@ -61,13 +65,15 @@ def __init__(self, self.random_state = check_random_state(1) else: self.random_state = check_random_state(random_state) - self.config = self.get_config() + + self.output_type = STRING_TO_OUTPUT_TYPES[output_type] + self.config = params_func(self.output_type) self.all_nan: Optional[np.ndarray] = None self.num_classes: Optional[int] = None - + self.time_limit = time_limit self.is_classification = STRING_TO_TASK_TYPES[task_type] not in REGRESSION_TASKS - + self.dataset_properties = dataset_properties self.metric = get_metrics(dataset_properties={'task_type': task_type, 'output_type': output_type}, names=[optimize_metric] if optimize_metric is not None else None)[0] @@ -76,16 +82,7 @@ def get_config(self) -> Dict[str, Union[int, str, float, bool]]: """ Load 
the parameters for the classifier model from ../estimator_configs/modelname.json. """ - dirname = os.path.dirname(os.path.abspath(__file__)) - config_path = os.path.join(dirname, "../estimator_configs", self.name + ".json") - with open(config_path, "r") as f: - config: Dict[str, Union[int, str, float, bool]] = json.load(f) - for k, v in config.items(): - if v == "True": - config[k] = True - if v == "False": - config[k] = False - return config + return self.config def _preprocess(self, X: np.ndarray diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/catboost.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/catboost.py new file mode 100644 index 000000000..c5f81fb7b --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/catboost.py @@ -0,0 +1,142 @@ +import logging.handlers +import tempfile +from typing import Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformIntegerHyperparameter, + UniformFloatHyperparameter +) + +import numpy as np + +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.catboost.utils import ( + AutoPyTorchToCatboostMetrics, + EarlyStoppingCallback, + MemoryCheckCallback, + get_params +) + +from catboost import CatBoostClassifier, CatBoostRegressor, Pool +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter + +from autoPyTorch.utils.early_stopping import get_early_stopping_rounds + + +class CatboostModel(BaseTraditionalLearner): + + def __init__(self, + task_type: str, + output_type: str, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, + **kwargs + ): + super(CatboostModel, self).__init__(name="catboost", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric, + dataset_properties=dataset_properties, + time_limit=time_limit, + params_func=get_params) + self.config["train_dir"] = tempfile.gettempdir() + self.config.update(kwargs) + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + if not self.is_classification: + self.config['eval_metric'] = AutoPyTorchToCatboostMetrics[self.metric.name].value + # CatBoost Cannot handle a random state object, just the seed + self.model = CatBoostRegressor(**self.config, random_state=self.random_state.get_state()[1][0]) + else: + self.config['eval_metric'] = AutoPyTorchToCatboostMetrics[self.metric.name].value + # CatBoost Cannot handle a random state object, just the seed + self.model = CatBoostClassifier(**self.config, random_state=self.random_state.get_state()[1][0]) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray) -> 
None: + + assert self.model is not None, "No model found. Can't fit without preparing the model" + early_stopping = get_early_stopping_rounds(num_rows_train=X_train.shape[0]) + callbacks = [] + callbacks.append(EarlyStoppingCallback(stopping_rounds=early_stopping, eval_metric=self.config['eval_metric'])) + num_rows_train = X_train.shape[0] + num_cols_train = X_train.shape[1] + self.num_classes = len(np.unique(y_train)) if len(np.unique(y_train)) != 2 else 1 + if num_rows_train * num_cols_train * self.num_classes > 5_000_000: + # The data is large enough to potentially cause memory issues during training, so monitor memory usage via callback. + callbacks.append(MemoryCheckCallback()) + categoricals = [ind for ind in range(X_train.shape[1]) if isinstance(X_train[0, ind], str)] + + X_train_pooled = Pool(data=X_train, label=y_train, cat_features=categoricals) + X_val_pooled = Pool(data=X_val, label=y_val, cat_features=categoricals) + + self.model.fit(X_train_pooled, + eval_set=X_val_pooled, + use_best_model=True, + early_stopping_rounds=early_stopping, + callbacks=callbacks, + verbose=False) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + learning_rate: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='learning_rate', + value_range=(5e-3, 0.2), + default_value=0.05, + log=True + ), + depth: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='depth', + value_range=(5, 8), + default_value=6, + ), + l2_leaf_reg: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='l2_leaf_reg', + value_range=(1, 5), + default_value=3, + ), + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) 
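For reference, the `_fit` above boils down to the standard CatBoost training pattern: wrap train and validation data in `Pool` objects with the detected categorical column indices, then train with early stopping on the validation set. A compact, self-contained version on synthetic numeric data (iteration counts and the stopping budget are illustrative, and the custom callbacks from catboost/utils.py are omitted):

    import numpy as np
    from catboost import CatBoostClassifier, Pool

    X_train = np.random.rand(200, 4)
    y_train = np.random.randint(0, 2, size=200)
    X_val = np.random.rand(50, 4)
    y_val = np.random.randint(0, 2, size=50)

    # string-valued columns would be declared as categorical; none exist in this toy data
    cat_features = [ind for ind in range(X_train.shape[1]) if isinstance(X_train[0, ind], str)]

    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)

    model = CatBoostClassifier(iterations=200, learning_rate=0.1, verbose=False)
    model.fit(train_pool, eval_set=val_pool, use_best_model=True, early_stopping_rounds=20)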
+ The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + + add_hyperparameter(cs, l2_leaf_reg, UniformIntegerHyperparameter) + add_hyperparameter(cs, depth, UniformIntegerHyperparameter) + add_hyperparameter(cs, learning_rate, UniformFloatHyperparameter) + + return cs + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'CBLearner', + 'name': 'Categorical Boosting Learner', + } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/utils.py new file mode 100644 index 000000000..ffac75e6c --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/utils.py @@ -0,0 +1,138 @@ +from typing import Any, Dict +import logging +import time +import psutil +from enum import Enum + + +class AutoPyTorchToCatboostMetrics(Enum): + mean_absolute_error = "MAE" + root_mean_squared_error = "RMSE" + mean_squared_log_error = "MSLE" + r2 = "R2" + accuracy = "Accuracy" + balanced_accuracy = "BalancedAccuracy" + f1 = "F1" + roc_auc = "AUC" + precision = "Precision" + recall = "Recall" + log_loss = "Logloss" + + +class MemoryCheckCallback: + """ + Callback to ensure memory usage is safe, otherwise early stops the model to avoid OOM errors. + + This callback is CatBoost specific. + + Args: + + period : int, default = 10 + Number of iterations between checking memory status. Higher values are less precise but use less compute. + verbose : bool, default = False + Whether to log information on memory status even if memory usage is low. + """ + def __init__(self, period: int = 10, verbose=False): + self.period = period + self.mem_status = psutil.Process() + self.init_mem_rss = self.mem_status.memory_info().rss + self.init_mem_avail = psutil.virtual_memory().available + self.verbose = verbose + + self._cur_period = 1 + + def after_iteration(self, info): + iteration = info.iteration + if iteration % self._cur_period == 0: + not_enough_memory = self.memory_check(iteration) + if not_enough_memory: + return False + return True + + def memory_check(self, iter) -> bool: + """Checks if memory usage is unsafe. If so, then returns True to signal the model to stop training early.""" + available_bytes = psutil.virtual_memory().available + cur_rss = self.mem_status.memory_info().rss + + if cur_rss < self.init_mem_rss: + self.init_mem_rss = cur_rss + estimated_model_size_mb = (cur_rss - self.init_mem_rss) >> 20 + available_mb = available_bytes >> 20 + model_size_memory_ratio = estimated_model_size_mb / available_mb + + early_stop = False + if model_size_memory_ratio > 1.0: + early_stop = True + + if available_mb < 512: # Less than 500 MB + early_stop = True + + if early_stop: + return True + elif self.verbose or (model_size_memory_ratio > 0.25): + + if model_size_memory_ratio > 0.5: + self._cur_period = 1 # Increase rate of memory check if model gets large enough to cause OOM potentially + elif iter > self.period: + self._cur_period = self.period + + return False + + +class EarlyStoppingCallback: + """ + Early stopping callback. + + This callback is CatBoost specific. + + Args: + stopping_rounds : int or tuple + If int, The possible number of rounds without the trend occurrence. 
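The memory guard above works by comparing how much the training process has grown since the callback was created against the memory that is still free, and aborting (returning False from `after_iteration`) when that ratio exceeds 1.0 or fewer than roughly 512 MB remain. A stripped-down sketch of that check, ignoring the period throttling and the init_mem_rss reset in the real callback:

    import psutil

    def memory_unsafe(init_rss: int) -> bool:
        cur_rss = psutil.Process().memory_info().rss
        available_mb = psutil.virtual_memory().available >> 20
        estimated_model_size_mb = max(cur_rss - init_rss, 0) >> 20
        # stop when the model alone could exhaust the remaining memory, or headroom is already tiny
        return estimated_model_size_mb / max(available_mb, 1) > 1.0 or available_mb < 512

    init_rss = psutil.Process().memory_info().rss   # snapshot taken before training starts
    print(memory_unsafe(init_rss))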
+ If tuple, contains early stopping class as first element and class init kwargs as second element. + eval_metric : str + The eval_metric to use for early stopping. Must also be specified in the CatBoost model params. + compare_key : str, default = 'validation' + The data to use for scoring. It is recommended to keep as default. + """ + def __init__(self, stopping_rounds, eval_metric, compare_key='validation'): + if isinstance(stopping_rounds, int): + from autoPyTorch.utils.early_stopping import SimpleEarlyStopper + self.es = SimpleEarlyStopper(patience=stopping_rounds) + else: + self.es = stopping_rounds[0](**stopping_rounds[1]) + self.best_score = None + self.compare_key = compare_key + + if isinstance(eval_metric, str): + from catboost._catboost import is_maximizable_metric + is_max_optimal = is_maximizable_metric(eval_metric) + eval_metric_name = eval_metric + else: + is_max_optimal = eval_metric.is_max_optimal() + + eval_metric_name = eval_metric.__class__.__name__ + + self.eval_metric_name = eval_metric_name + self.is_max_optimal = is_max_optimal + + def after_iteration(self, info): + is_best_iter = False + cur_score = info.metrics[self.compare_key][self.eval_metric_name][-1] + if not self.is_max_optimal: + cur_score *= -1 + if self.best_score is None: + self.best_score = cur_score + elif cur_score > self.best_score: + is_best_iter = True + self.best_score = cur_score + + should_stop = self.es.update(current_epoch=info.iteration, is_best=is_best_iter) + return not should_stop + + +def get_params(output_type: int) -> Dict[str, Any]: + + return { + "iterations" : 10000, + "learning_rate" : 0.1 +} \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/extratrees.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/extratrees.py new file mode 100644 index 000000000..f237ed61c --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/extratrees.py @@ -0,0 +1,99 @@ +import logging.handlers +from typing import Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from sklearn.ensemble import ( + ExtraTreesClassifier, + ExtraTreesRegressor, +) + +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.extratrees.utils import get_params + + +class ExtraTreesModel(BaseTraditionalLearner): + + def __init__(self, + task_type: str, + output_type: str, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, + **kwargs + ): + super(ExtraTreesModel, self).__init__(name="extra_trees", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric, + dataset_properties=dataset_properties, + time_limit=time_limit, + 
params_func=get_params) + + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + self.config["warm_start"] = False + + if not self.is_classification: + self.model = ExtraTreesRegressor(**self.config, random_state=self.random_state) + else: + self.num_classes = len(np.unique(y_train)) + if self.num_classes > 2: + self.logger.info("==> Using warmstarting for multiclass") + self.final_n_estimators = self.config["n_estimators"] + self.config["n_estimators"] = 8 + self.config["warm_start"] = True + + self.model = ExtraTreesClassifier(**self.config, random_state=self.random_state) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray) -> None: + assert self.model is not None, "No model found. Can't fit without preparing the model" + self.model.fit(X_train, y_train) + if self.config["warm_start"]: + self.model.n_estimators = self.final_n_estimators + self.model.fit(X_train, y_train) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) + The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + + return cs + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'ETLearner', + 'name': 'ExtraTreesLearner', + } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/utils.py new file mode 100644 index 000000000..e480dfed8 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/utils.py @@ -0,0 +1,7 @@ +from typing import Any, Dict + + +def get_params(output_type: int) -> Dict[str, Any]: + return { + "n_estimators" : 300 + } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/knn.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/knn.py new file mode 100644 index 000000000..9e20ccbd6 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/knn.py @@ -0,0 +1,108 @@ +import logging.handlers +from typing import Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor + + +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.knn.utils 
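The extra-trees learner above keeps the existing multiclass warm-start trick: fit a tiny forest first (n_estimators=8, warm_start=True), then raise n_estimators to the full budget and call fit again so scikit-learn only grows the missing trees instead of refitting from scratch. A self-contained sketch of the same pattern on random data (the final tree count of 300 matches the default from extratrees/utils.py; the toy data is invented):

    import numpy as np
    from sklearn.ensemble import ExtraTreesClassifier

    X = np.random.rand(200, 5)
    y = np.random.randint(0, 3, size=200)          # 3 classes -> the multiclass warm-start path

    model = ExtraTreesClassifier(n_estimators=8, warm_start=True, random_state=0)
    model.fit(X, y)                                # cheap initial fit
    model.n_estimators = 300                       # bump to the full budget
    model.fit(X, y)                                # adds the remaining trees incrementally
    print(len(model.estimators_))                  # 300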
import get_params as knn_get_params + + +class KNNModel(BaseTraditionalLearner): + + def __init__(self, + task_type: str, + output_type: str, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, + **kwargs + ): + super(KNNModel, self).__init__(name="knn", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric, + dataset_properties=dataset_properties, + time_limit=time_limit, + params_func=knn_get_params) + self.categoricals: Optional[np.ndarray[bool]] = None + self.config.update(kwargs) + + def _preprocess(self, + X: np.ndarray + ) -> np.ndarray: + + super(KNNModel, self)._preprocess(X) + if self.categoricals is None: + self.categoricals = np.array([isinstance(X[0, ind], str) for ind in range(X.shape[1])]) + X = X[:, ~self.categoricals] if self.categoricals is not None else X + + return X + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + try: + # TODO: Add more granular switch, currently this affects all future KNN models even if they had `use_daal=False` + from sklearnex import patch_sklearn + patch_sklearn("knn_classifier") + patch_sklearn("knn_regressor") + # sklearnex backend for KNN seems to be 20-40x+ faster than native sklearn with no downsides. + self.logger.log(15, '\tUsing sklearnex KNN backend...') + except: + pass + if not self.is_classification: + self.model = KNeighborsRegressor(**self.config) + else: + self.num_classes = len(np.unique(y_train)) + # KNN is deterministic, no random seed needed + self.model = KNeighborsClassifier(**self.config) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray) -> None: + assert self.model is not None, "No model found. Can't fit without preparing the model" + self.model.fit(X_train, y_train) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + + return cs + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'KNNLearner', + 'name': 'K Nearest Neighbors Learner', + } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/utils.py new file mode 100644 index 000000000..61ca3ca8f --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/utils.py @@ -0,0 +1,8 @@ +from typing import Any, Dict + + +def get_params(output_type: int) -> Dict[str, Any]: + + return dict( + weights="uniform" + ) diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/learners.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/learners.py deleted file mode 100644 index 220c52dcd..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/learners.py +++ /dev/null @@ -1,361 +0,0 @@ -import logging.handlers -import tempfile -from typing import Dict, Optional, Union - -from catboost import CatBoostClassifier, CatBoostRegressor, Pool - -from lightgbm import LGBMClassifier, LGBMRegressor - -import numpy as np - -from sklearn.ensemble import ( - ExtraTreesClassifier, - ExtraTreesRegressor, - RandomForestClassifier, - RandomForestRegressor -) -from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor -from sklearn.svm import SVC, SVR - -from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ - BaseTraditionalLearner -from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.utils import ( - AutoPyTorchToCatboostMetrics -) - - -class LGBModel(BaseTraditionalLearner): - - def __init__(self, - task_type: str, - output_type: str, - optimize_metric: Optional[str] = None, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None - ): - super(LGBModel, self).__init__(name="lgb", - logger_port=logger_port, - random_state=random_state, - task_type=task_type, - output_type=output_type, - optimize_metric=optimize_metric) - - def _prepare_model(self, - X_train: np.ndarray, - y_train: np.ndarray - ) -> None: - early_stopping = 150 if X_train.shape[0] > 10000 else max(round(150 * 10000 / X_train.shape[0]), 10) - self.config["early_stopping_rounds"] = early_stopping - if not self.is_classification: - self.model = LGBMRegressor(**self.config, random_state=self.random_state) - else: - self.num_classes = len(np.unique(y_train)) if len(np.unique(y_train)) != 2 else 1 # this fixes a bug - self.config["num_class"] = self.num_classes - - self.model = LGBMClassifier(**self.config, random_state=self.random_state) - - def _fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray - ) -> None: - assert self.model is not None, "No model found. 
Can't fit without preparing the model" - self.model.fit(X_train, y_train, eval_set=[(X_val, y_val)]) - - def predict(self, X_test: np.ndarray, - predict_proba: bool = False, - preprocess: bool = True) -> np.ndarray: - assert self.model is not None, "No model found. Can't " \ - "predict before fitting. " \ - "Call fit before predicting" - if preprocess: - X_test = self._preprocess(X_test) - - if predict_proba: - if not self.is_classification: - raise ValueError("Can't predict probabilities for a regressor") - y_pred_proba = self.model.predict_proba(X_test) - if self.num_classes == 2: - y_pred_proba = y_pred_proba.transpose()[0:len(X_test)] - return y_pred_proba - - y_pred = self.model.predict(X_test) - return y_pred - - @staticmethod - def get_properties( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'LGBMLearner', - 'name': 'Light Gradient Boosting Machine Learner', - } - - -class CatboostModel(BaseTraditionalLearner): - - def __init__(self, - task_type: str, - output_type: str, - optimize_metric: Optional[str] = None, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None - ): - super(CatboostModel, self).__init__(name="catboost", - logger_port=logger_port, - random_state=random_state, - task_type=task_type, - output_type=output_type, - optimize_metric=optimize_metric) - self.config["train_dir"] = tempfile.gettempdir() - - def _prepare_model(self, - X_train: np.ndarray, - y_train: np.ndarray - ) -> None: - if not self.is_classification: - self.config['eval_metric'] = AutoPyTorchToCatboostMetrics[self.metric.name].value - # CatBoost Cannot handle a random state object, just the seed - self.model = CatBoostRegressor(**self.config, random_state=self.random_state.get_state()[1][0]) - else: - self.config['eval_metric'] = AutoPyTorchToCatboostMetrics[self.metric.name].value - # CatBoost Cannot handle a random state object, just the seed - self.model = CatBoostClassifier(**self.config, random_state=self.random_state.get_state()[1][0]) - - def _fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> None: - - assert self.model is not None, "No model found. 
Can't fit without preparing the model" - early_stopping = 150 if X_train.shape[0] > 10000 else max(round(150 * 10000 / X_train.shape[0]), 10) - categoricals = [ind for ind in range(X_train.shape[1]) if isinstance(X_train[0, ind], str)] - - X_train_pooled = Pool(data=X_train, label=y_train, cat_features=categoricals) - X_val_pooled = Pool(data=X_val, label=y_val, cat_features=categoricals) - - self.model.fit(X_train_pooled, - eval_set=X_val_pooled, - use_best_model=True, - early_stopping_rounds=early_stopping, - verbose=False) - - @staticmethod - def get_properties( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'CBLearner', - 'name': 'Categorical Boosting Learner', - } - - -class RFModel(BaseTraditionalLearner): - - def __init__(self, - task_type: str, - output_type: str, - optimize_metric: Optional[str] = None, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None - ): - super(RFModel, self).__init__(name="random_forest", - logger_port=logger_port, - random_state=random_state, - task_type=task_type, - output_type=output_type, - optimize_metric=optimize_metric) - - def _prepare_model(self, - X_train: np.ndarray, - y_train: np.ndarray - ) -> None: - - self.config["warm_start"] = False - # TODO: Check if we need to warmstart for regression. - # In autogluon, they warm start when usinf daal backend, see - # ('https://github.com/awslabs/autogluon/blob/master/tabular/src/autogluon/tabular/models/rf/rf_model.py#L35') - if not self.is_classification: - self.model = RandomForestRegressor(**self.config, random_state=self.random_state) - else: - self.num_classes = len(np.unique(y_train)) - if self.num_classes > 2: - self.logger.info("==> Using warmstarting for multiclass") - self.final_n_estimators = self.config["n_estimators"] - self.config["n_estimators"] = 8 - self.config["warm_start"] = True - self.model = RandomForestClassifier(**self.config, random_state=self.random_state) - - def _fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> None: - assert self.model is not None, "No model found. 
Can't fit without preparing the model" - - self.model.fit(X_train, y_train) - if self.config["warm_start"]: - self.model.n_estimators = self.final_n_estimators - self.model.fit(X_train, y_train) - - @staticmethod - def get_properties( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'RFLearner', - 'name': 'Random Forest Learner', - } - - -class ExtraTreesModel(BaseTraditionalLearner): - - def __init__(self, - task_type: str, - output_type: str, - optimize_metric: Optional[str] = None, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None - ): - super(ExtraTreesModel, self).__init__(name="extra_trees", - logger_port=logger_port, - random_state=random_state, - task_type=task_type, - output_type=output_type, - optimize_metric=optimize_metric) - - def _prepare_model(self, - X_train: np.ndarray, - y_train: np.ndarray - ) -> None: - self.config["warm_start"] = False - - if not self.is_classification: - self.model = ExtraTreesRegressor(**self.config, random_state=self.random_state) - else: - self.num_classes = len(np.unique(y_train)) - if self.num_classes > 2: - self.logger.info("==> Using warmstarting for multiclass") - self.final_n_estimators = self.config["n_estimators"] - self.config["n_estimators"] = 8 - self.config["warm_start"] = True - - self.model = ExtraTreesClassifier(**self.config, random_state=self.random_state) - - def _fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> None: - assert self.model is not None, "No model found. Can't fit without preparing the model" - self.model.fit(X_train, y_train) - if self.config["warm_start"]: - self.model.n_estimators = self.final_n_estimators - self.model.fit(X_train, y_train) - - @staticmethod - def get_properties( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'ETLearner', - 'name': 'ExtraTreesLearner', - } - - -class KNNModel(BaseTraditionalLearner): - - def __init__(self, - task_type: str, - output_type: str, - optimize_metric: Optional[str] = None, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None - ): - super(KNNModel, self).__init__(name="knn", - logger_port=logger_port, - random_state=random_state, - task_type=task_type, - output_type=output_type, - optimize_metric=optimize_metric) - self.categoricals: Optional[np.ndarray[bool]] = None - - def _preprocess(self, - X: np.ndarray - ) -> np.ndarray: - - super(KNNModel, self)._preprocess(X) - if self.categoricals is None: - self.categoricals = np.array([isinstance(X[0, ind], str) for ind in range(X.shape[1])]) - X = X[:, ~self.categoricals] if self.categoricals is not None else X - - return X - - def _prepare_model(self, - X_train: np.ndarray, - y_train: np.ndarray - ) -> None: - if not self.is_classification: - self.model = KNeighborsRegressor(**self.config) - else: - self.num_classes = len(np.unique(y_train)) - # KNN is deterministic, no random seed needed - self.model = KNeighborsClassifier(**self.config) - - def _fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> None: - assert self.model is not None, "No model found. 
Can't fit without preparing the model" - self.model.fit(X_train, y_train) - - @staticmethod - def get_properties( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'KNNLearner', - 'name': 'K Nearest Neighbors Learner', - } - - -class SVMModel(BaseTraditionalLearner): - - def __init__(self, - task_type: str, - output_type: str, - optimize_metric: Optional[str] = None, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None - ): - super(SVMModel, self).__init__(name="svm", - logger_port=logger_port, - random_state=random_state, - task_type=task_type, - output_type=output_type, - optimize_metric=optimize_metric) - - def _prepare_model(self, - X_train: np.ndarray, - y_train: np.ndarray - ) -> None: - if not self.is_classification: - # Does not take random state. - self.model = SVR(**self.config) - else: - self.model = SVC(**self.config, probability=True, random_state=self.random_state) - - def _fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> None: - assert self.model is not None, "No model found. Can't fit without preparing the model" - self.model.fit(X_train, y_train) - - @staticmethod - def get_properties( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'SVMLearner', - 'name': 'Support Vector Machine Learner', - } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/lgbm.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/lgbm.py new file mode 100644 index 000000000..644c9fde6 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/lgbm.py @@ -0,0 +1,153 @@ +import logging.handlers +from time import time +from typing import Dict, Optional, Union + +import logging + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformIntegerHyperparameter, + UniformFloatHyperparameter +) + +from lightgbm import LGBMClassifier, LGBMRegressor + +import numpy as np + +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.lgbm.utils import early_stopping_custom, get_metric, get_params as lgb_get_params, get_train_loss_name +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +from autoPyTorch.utils.early_stopping import get_early_stopping_rounds + + + +class LGBModel(BaseTraditionalLearner): + def __init__(self, + task_type: str, + output_type: str, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, + **kwargs + ): + super(LGBModel, self).__init__(name="lgb", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, 
+ output_type=output_type, + optimize_metric=optimize_metric, + dataset_properties=dataset_properties, + time_limit=time_limit, + params_func=lgb_get_params) + self.config.update(kwargs) + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + early_stopping = get_early_stopping_rounds(X_train.shape[0]) + self.config["early_stopping_rounds"] = early_stopping + self.stopping_metric_name = get_metric(output_type=self.output_type, optimize_metric=self.metric.name) + self.training_objective = get_train_loss_name(self.output_type) + if not self.is_classification: + self.model = LGBMRegressor(**self.config, random_state=self.random_state) + else: + self.num_classes = len(np.unique(y_train)) if len(np.unique(y_train)) != 2 else 1 # this fixes a bug + self.config["num_class"] = self.num_classes + + self.model = LGBMClassifier(**self.config, random_state=self.random_state) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray + ) -> None: + assert self.model is not None, "No model found. Can't fit without preparing the model" + start_time = time() + callbacks = [ + # TODO: pass start time and time limit to early stopping + early_stopping_custom(self.config["early_stopping_rounds"], logger=self.logger, metrics_to_use=[('valid_set', self.stopping_metric_name)], max_diff=None, start_time=start_time, time_limit=self.time_limit, + ignore_dart_warning=True, verbose=False, manual_stop_file=False, train_loss_name=self.training_objective), + ] + self.model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=self.training_objective, callbacks=callbacks) + + def predict(self, X_test: np.ndarray, + predict_proba: bool = False, + preprocess: bool = True) -> np.ndarray: + assert self.model is not None, "No model found. Can't " \ + "predict before fitting. " \ + "Call fit before predicting" + if preprocess: + X_test = self._preprocess(X_test) + + if predict_proba: + if not self.is_classification: + raise ValueError("Can't predict probabilities for a regressor") + y_pred_proba = self.model.predict_proba(X_test) + if self.num_classes == 2: + y_pred_proba = y_pred_proba.transpose()[0:len(X_test)] + return y_pred_proba + + y_pred = self.model.predict(X_test) + return y_pred + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + learning_rate: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='learning_rate', + value_range=(5e-3, 0.2), + default_value=0.05, + log=True + ), + feature_fraction: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='feature_fraction', + value_range=(0.75, 1), + default_value=1, + ), + min_data_in_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='min_data_in_leaf', + value_range=(2, 60), + default_value=20, + ), + num_leaves: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='num_leaves', + value_range=(16, 96), + default_value=31, + ), + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + + add_hyperparameter(cs, num_leaves, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_data_in_leaf, UniformIntegerHyperparameter) + add_hyperparameter(cs, feature_fraction, UniformFloatHyperparameter) + add_hyperparameter(cs, learning_rate, UniformFloatHyperparameter) + + return cs + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'LGBMLearner', + 'name': 'Light Gradient Boosting Machine Learner', + } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/utils.py new file mode 100644 index 000000000..466031433 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/utils.py @@ -0,0 +1,298 @@ +from typing import Any, Dict +from autoPyTorch.constants import ( + MULTICLASS, + BINARY, + CONTINUOUS, + OUTPUT_TYPES_TO_STRING +) +import logging.handlers +from typing import Dict, Optional, Union + +import copy +import logging +import os +import psutil +import time +import warnings +from operator import gt, lt + +from lightgbm.callback import _format_eval_result, EarlyStopException +from autoPyTorch.utils.early_stopping import SimpleEarlyStopper + + +DEFAULT_METRIC_INDEX = 0 + + +def get_common_params(): + return { + "num_rounds": 10000, + "num_leaves": 128, + "feature_fraction": 0.9, + "boosting_type": "gbdt", + } + + +def get_params_binary(): + return { + "min_data_in_leaf": 3, + "learning_rate": 0.03 + } + +def get_params_multiclass(): + return { + "min_data_in_leaf": 3, + "learning_rate": 0.03 + } + +def get_params_continuous(): + return { + "min_data_in_leaf": 3, + "learning_rate": 0.03 + } + +def get_params(output_type: int) -> Dict[str, Any]: + + common_params = get_common_params() + if output_type == BINARY: + common_params.update(get_params_binary()) + elif output_type == MULTICLASS: + common_params.update(get_params_multiclass()) + elif output_type == CONTINUOUS: + common_params.update(get_params_continuous()) + else: + raise ValueError(f"Unknown output_type: {OUTPUT_TYPES_TO_STRING[output_type]}") + return common_params + + +def early_stopping_custom(stopping_rounds, logger, first_metric_only=False, metrics_to_use=None, start_time=None, time_limit=None, verbose=True, max_diff=None, ignore_dart_warning=False, manual_stop_file=None, train_loss_name=None, reporter=None): + """Create a callback that activates early stopping. + Note: + Implementation from autogluon + Note + ---- + Activates early stopping. + The model will train until the validation score stops improving. + Validation score needs to improve at least every ``early_stopping_rounds`` round(s) + to continue training. + Requires at least one validation data and one metric. + If there's more than one, will check all of them. But the training data is ignored anyway. + To check only the first metric set ``first_metric_only`` to True. + Parameters + ---------- + stopping_rounds : int or tuple + If int, The possible number of rounds without the trend occurrence. + If tuple, contains early stopping class as first element and class init kwargs as second element. 
+ first_metric_only : bool, optional (default=False) + Whether to use only the first metric for early stopping. + verbose : bool, optional (default=True) + Whether to print message with early stopping information. + train_loss_name : str, optional (default=None): + Name of metric that contains training loss value. + reporter : optional (default=None): + reporter object from AutoGluon scheduler. + Returns + ------- + callback : function + The callback that activates early stopping. + """ + best_score = [] + best_iter = [] + best_score_list = [] + best_trainloss = [] # stores training losses at corresponding best_iter + cmp_op = [] + enabled = [True] + indices_to_check = [] + mem_status = psutil.Process() + init_mem_rss = [] + init_mem_avail = [] + es = [] + + def _init(env): + if not ignore_dart_warning: + enabled[0] = not any((boost_alias in env.params + and env.params[boost_alias] == 'dart') for boost_alias in ('boosting', + 'boosting_type', + 'boost')) + if not enabled[0]: + warnings.warn('Early stopping is not available in dart mode') + return + if not env.evaluation_result_list: + raise ValueError('For early stopping, ' + 'at least one dataset and eval metric is required for evaluation') + + if verbose: + msg = "Training until validation scores don't improve for {} rounds." + logger.debug(msg.format(stopping_rounds)) + if manual_stop_file: + logger.debug('Manually stop training by creating file at location: ', manual_stop_file) + + if isinstance(stopping_rounds, int): + es_template = SimpleEarlyStopper(patience=stopping_rounds) + else: + es_template = stopping_rounds[0](**stopping_rounds[1]) + + for eval_ret in env.evaluation_result_list: + best_iter.append(0) + best_score_list.append(None) + best_trainloss.append(None) + es.append(copy.deepcopy(es_template)) + if eval_ret[3]: + best_score.append(float('-inf')) + cmp_op.append(gt) + else: + best_score.append(float('inf')) + cmp_op.append(lt) + + if metrics_to_use is None: + for i in range(len(env.evaluation_result_list)): + indices_to_check.append(i) + if first_metric_only: + break + else: + for i, eval in enumerate(env.evaluation_result_list): + if (eval[0], eval[1]) in metrics_to_use: + indices_to_check.append(i) + if first_metric_only: + break + + init_mem_rss.append(mem_status.memory_info().rss) + init_mem_avail.append(psutil.virtual_memory().available) + + def _callback(env): + if not cmp_op: + _init(env) + if not enabled[0]: + return + train_loss_val = 0.0 + for i in indices_to_check: + is_best_iter = False + eval_result = env.evaluation_result_list[i] + _, eval_metric, score, greater_is_better = eval_result + if best_score_list[i] is None or cmp_op[i](score, best_score[i]): + is_best_iter = True + best_score[i] = score + best_iter[i] = env.iteration + best_score_list[i] = env.evaluation_result_list + best_trainloss[i] = train_loss_val + if reporter is not None: # Report current best scores for iteration, used in HPO + if i == indices_to_check[0]: # TODO: documentation needs to note that we assume 0th index is the 'official' validation performance metric. 
+ if cmp_op[i] == gt: + validation_perf = score + else: + validation_perf = -score + reporter(epoch=env.iteration + 1, + validation_performance=validation_perf, + train_loss=best_trainloss[i], + best_iter_sofar=best_iter[i] + 1, + best_valperf_sofar=best_score[i], + eval_metric=eval_metric, # eval_metric here is the stopping_metric from LGBModel + greater_is_better=greater_is_better, + ) + early_stop = es[i].update(cur_round=env.iteration, is_best=is_best_iter) + if early_stop: + if verbose: + logger.log(15, 'Early stopping, best iteration is:\n[%d]\t%s' % ( + best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]]))) + raise EarlyStopException(best_iter[i], best_score_list[i]) + elif (max_diff is not None) and (abs(score - best_score[i]) > max_diff): + if verbose: + logger.debug('max_diff breached!') + logger.debug(abs(score - best_score[i])) + logger.log(15, 'Early stopping, best iteration is:\n[%d]\t%s' % ( + best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]]))) + raise EarlyStopException(best_iter[i], best_score_list[i]) + if env.iteration == env.end_iteration - 1: + if verbose: + logger.log(15, 'Did not meet early stopping criterion. Best iteration is:\n[%d]\t%s' % ( + best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]]))) + raise EarlyStopException(best_iter[i], best_score_list[i]) + if verbose: + logger.debug((env.iteration - best_iter[i], eval_result)) + if manual_stop_file: + if os.path.exists(manual_stop_file): + i = indices_to_check[0] + logger.log(20, 'Found manual stop file, early stopping. Best iteration is:\n[%d]\t%s' % ( + best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]]))) + raise EarlyStopException(best_iter[i], best_score_list[i]) + if time_limit: + time_elapsed = time.time() - start_time + time_left = time_limit - time_elapsed + if time_left <= 0: + i = indices_to_check[0] + logger.log(20, '\tRan out of time, early stopping on iteration ' + str(env.iteration+1) + '. 
Best iteration is:\n\t[%d]\t%s' % ( + best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]]))) + raise EarlyStopException(best_iter[i], best_score_list[i]) + + # TODO: Add toggle parameter to early_stopping to disable this + # TODO: Identify optimal threshold values for early_stopping based on lack of memory + if env.iteration % 10 == 0: + available = psutil.virtual_memory().available + cur_rss = mem_status.memory_info().rss + + if cur_rss < init_mem_rss[0]: + init_mem_rss[0] = cur_rss + estimated_model_size_mb = (cur_rss - init_mem_rss[0]) >> 20 + available_mb = available >> 20 + + model_size_memory_ratio = estimated_model_size_mb / available_mb + if verbose or (model_size_memory_ratio > 0.25): + logging.debug('Available Memory: '+str(available_mb)+' MB') + logging.debug('Estimated Model Size: '+str(estimated_model_size_mb)+' MB') + + early_stop = False + if model_size_memory_ratio > 1.0: + logger.warning('Warning: Large GBM model size may cause OOM error if training continues') + logger.warning('Available Memory: '+str(available_mb)+' MB') + logger.warning('Estimated GBM model size: '+str(estimated_model_size_mb)+' MB') + early_stop = True + + # TODO: We will want to track size of model as well, even if we early stop before OOM, we will still crash when saving if the model is large enough + if available_mb < 512: # Less than 500 MB + logger.warning('Warning: Low available memory may cause OOM error if training continues') + logger.warning('Available Memory: '+str(available_mb)+' MB') + logger.warning('Estimated GBM model size: '+str(estimated_model_size_mb)+' MB') + early_stop = True + + if early_stop: + logger.warning('Warning: Early stopped GBM model prior to optimal result to avoid OOM error. Please increase available memory to avoid subpar model quality.') + logger.log(15, 'Early stopping, best iteration is:\n[%d]\t%s' % ( + best_iter[0] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[0]]))) + raise EarlyStopException(best_iter[0], best_score_list[0]) + + _callback.order = 30 + return _callback + +def get_compatible_metric_dict(output_type: int) -> Dict[str, str]: + if output_type == BINARY: + return dict( + accuracy='binary_error', + log_loss='binary_logloss', + roc_auc='auc', + ) + elif output_type == MULTICLASS: + return dict( + accuracy='multi_error', + log_loss='multi_logloss', + ) + elif output_type == CONTINUOUS: + return dict( + mean_absolute_error='l1', + mean_squared_error='l2', + root_mean_squared_error='rmse', + ) + + +def get_metric(output_type: int, optimize_metric: str) -> str: + metric_dict = get_compatible_metric_dict(output_type=output_type) + return metric_dict.get(optimize_metric, list(metric_dict.values())[DEFAULT_METRIC_INDEX]) + +def get_train_loss_name(output_type: int): + if output_type == BINARY: + train_loss_name = 'binary_logloss' + elif output_type == MULTICLASS: + train_loss_name = 'multi_logloss' + elif output_type == CONTINUOUS: + train_loss_name = 'l2' + else: + raise ValueError(f"unknown output_type for LGBModel: {output_type}") + return train_loss_name diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/random_forest.py 
b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/random_forest.py new file mode 100644 index 000000000..f368331ef --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/random_forest.py @@ -0,0 +1,103 @@ +import logging.handlers +import tempfile +from typing import Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from sklearn.ensemble import ( + RandomForestClassifier, + RandomForestRegressor +) + +from autoPyTorch.constants import MULTICLASS +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.random_forest.utils import get_params + + +class RFModel(BaseTraditionalLearner): + def __init__(self, + task_type: str, + output_type: str, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, + **kwargs + ): + super(RFModel, self).__init__(name="random_forest", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric, + dataset_properties=dataset_properties, + time_limit=time_limit, + params_func=get_params) + self.config.update(kwargs) + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + + self.config["warm_start"] = False + # TODO: Check if we need to warmstart for regression. + # In autogluon, they warm start when usinf daal backend, see + # ('https://github.com/awslabs/autogluon/blob/master/tabular/src/autogluon/tabular/models/rf/rf_model.py#L35') + if not self.is_classification: + self.model = RandomForestRegressor(**self.config, random_state=self.random_state) + else: + self.num_classes = len(np.unique(y_train)) + if self.num_classes > 2: + self.logger.info("==> Using warmstarting for multiclass") + self.final_n_estimators = self.config["n_estimators"] + self.config["n_estimators"] = 8 + self.config["warm_start"] = True + self.model = RandomForestClassifier(**self.config, random_state=self.random_state) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray) -> None: + assert self.model is not None, "No model found. Can't fit without preparing the model" + + self.model = self.model.fit(X_train, y_train) + if self.config["warm_start"]: + self.model.n_estimators = self.final_n_estimators + self.model.fit(X_train, y_train) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + + return cs + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'RFLearner', + 'name': 'Random Forest Learner', + } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/utils.py new file mode 100644 index 000000000..320ca413e --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/utils.py @@ -0,0 +1,9 @@ +from typing import Any, Dict + + +def get_params(output_type: int) -> Dict[str, Any]: + + return { + "n_estimators" : 300, + 'bootstrap': True + } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/utils.py deleted file mode 100644 index b45161aa9..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/utils.py +++ /dev/null @@ -1,15 +0,0 @@ -from enum import Enum - - -class AutoPyTorchToCatboostMetrics(Enum): - mean_absolute_error = "MAE" - root_mean_squared_error = "RMSE" - mean_squared_log_error = "MSLE" - r2 = "R2" - accuracy = "Accuracy" - balanced_accuracy = "BalancedAccuracy" - f1 = "F1" - roc_auc = "AUC" - precision = "Precision" - recall = "Recall" - log_loss = "Logloss" diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/early_stopping_custom.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/early_stopping_custom.py new file mode 100644 index 000000000..31dd386b1 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/early_stopping_custom.py @@ -0,0 +1,90 @@ +import time +import psutil +import logging + +from xgboost.callback import EarlyStopping + +from autoPyTorch.utils.early_stopping import SimpleEarlyStopper + +logger = logging.getLogger(__name__) + + +class EarlyStoppingCustom(EarlyStopping): + """ + Augments early stopping in XGBoost to also consider time_limit, memory usage, and usage of adaptive early stopping methods. + + Parameters + ---------- + rounds : int or tuple + If int, The possible number of rounds without the trend occurrence. + If tuple, contains early stopping class as first element and class init kwargs as second element. 
+ """ + def __init__(self, rounds, time_limit=None, start_time=None, verbose=False, **kwargs): + if rounds is None: + # Disable early stopping via rounds + rounds = 999999 + super().__init__(rounds=999999, **kwargs) + if isinstance(rounds, int): + self.es = SimpleEarlyStopper(patience=rounds) + else: + self.es = rounds[0](**rounds[1]) + self.time_limit = time_limit + self.start_time = start_time + self.verbose = verbose + self._mem_status = None + self._mem_init_rss = None + + def before_training(self, model): + model = super().before_training(model=model) + if self.start_time is None: + self.start_time = time.time() + self._mem_status = psutil.Process() + self._mem_init_rss = self._mem_status.memory_info().rss + return model + + def after_iteration(self, model, epoch, evals_log): + should_stop = super().after_iteration(model, epoch, evals_log) + if should_stop: + return should_stop + is_best_iter = self.current_rounds == 0 + should_stop = self.es.update(current_epoch=epoch, is_best=is_best_iter) + if should_stop: + return should_stop + if self._time_check(model=model, epoch=epoch): + return True + if epoch % 10 == 0 and self._memory_check(model=model): + return True + return should_stop + + def _time_check(self, model, epoch): + if self.time_limit is not None: + time_elapsed = time.time() - self.start_time + time_left = self.time_limit - time_elapsed + if time_left <= 0: + if self.verbose: + logger.log(20, f"Ran out of time, early stopping on iteration {epoch}. Best iteration is: \t[{model.attr('best_iteration')}]\t{model.attr('best_score')}") + return True + return False + + def _memory_check(self, model): + available = psutil.virtual_memory().available + cur_rss = self._mem_status.memory_info().rss + if cur_rss < self._mem_init_rss: + self._mem_init_rss = cur_rss + estimated_model_size_mb = (cur_rss - self._mem_init_rss) >> 20 + available_mb = available >> 20 + + model_size_memory_ratio = estimated_model_size_mb / available_mb + + if (model_size_memory_ratio > 1.0) or (available_mb < 512): + logger.warning('Warning: Large XGB model size may cause OOM error if training continues') + logger.warning(f'Available Memory: {available_mb} MB') + logger.warning(f'Estimated XGB model size: {estimated_model_size_mb} MB') + if self.verbose: + logger.warning(f'Warning: Early stopped XGB model prior to optimal result to avoid OOM error. Please increase available memory to avoid subpar model quality.\n') + logger.warning(f"Early stopping. 
Best iteration is: \t[{model.attr('best_iteration')}]\t{model.attr('best_score')}") + return True + elif self.verbose and (model_size_memory_ratio > 0.25): + logger.log(15, f'Available Memory: {available_mb} MB') + logger.log(15, f'Estimated XGB model size: {estimated_model_size_mb} MB') + return False diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/utils.py new file mode 100644 index 000000000..4124c171b --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/utils.py @@ -0,0 +1,85 @@ +from typing import Dict +from autoPyTorch.constants import BINARY, MULTICLASS, CONTINUOUS +from enum import Enum + + +DEFAULT_METRIC_INDEX = 0 + + +def get_compatible_metric_dict(output_type: int) -> Dict[str, str]: + if output_type == BINARY: + return dict( + accuracy='error', + log_loss='logloss', + roc_auc='auc', + ) + elif output_type == MULTICLASS: + return dict( + accuracy='merror', + log_loss='mlogloss', + ) + elif output_type == CONTINUOUS: + return dict( + mean_absolute_error='mae', + root_mean_squared_error='rmse', + ) + +def get_metric(output_type: int, optimize_metric: str) -> str: + metric_dict = get_compatible_metric_dict(output_type=output_type) + return metric_dict.get(optimize_metric, list(metric_dict.values())[DEFAULT_METRIC_INDEX]) + + +DEFAULT_NUM_BOOST_ROUND = 10000 +# Options: [10, 100, 200, 300, 400, 500, 1000, 10000] + + +def get_param_baseline(output_type): + if output_type == BINARY: + return get_param_binary_baseline() + elif output_type == MULTICLASS: + return get_param_multiclass_baseline() + elif output_type == CONTINUOUS: + return get_param_regression_baseline() + else: + return get_param_binary_baseline() + + +def get_base_params(): + base_params = { + 'n_estimators': DEFAULT_NUM_BOOST_ROUND, + 'learning_rate': 0.1, + 'n_jobs': -1, + } + return base_params + + +def get_param_binary_baseline(): + params = get_base_params() + baseline_params = { + 'objective': 'binary:logistic', + 'booster': 'gbtree', + 'use_label_encoder': False, + } + params.update(baseline_params) + return params + + +def get_param_multiclass_baseline(): + params = get_base_params() + baseline_params = { + 'objective': 'multi:softmax', + 'booster': 'gbtree', + 'use_label_encoder': False, + } + params.update(baseline_params) + return params + + +def get_param_regression_baseline(): + params = get_base_params() + baseline_params = { + 'objective': 'reg:squarederror', + 'booster': 'gbtree', + } + params.update(baseline_params) + return params diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/xgboost.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/xgboost.py new file mode 100644 index 000000000..f95427085 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/xgboost.py @@ -0,0 +1,198 @@ +import logging.handlers +from time import time +from typing import Dict, Optional, Union + +import logging + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformIntegerHyperparameter, + UniformFloatHyperparameter +) + +import numpy as np + +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from 
autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.xgboost.utils import get_metric, get_param_baseline as xgb_get_params +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.xgboost.early_stopping_custom import EarlyStoppingCustom +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +from autoPyTorch.utils.early_stopping import get_early_stopping_rounds + + +class XGBModel(BaseTraditionalLearner): + def __init__(self, + task_type: str, + output_type: str, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, + **kwargs + ): + super(XGBModel, self).__init__(name="xgboost", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric, + dataset_properties=dataset_properties, + time_limit=time_limit, + params_func=xgb_get_params) + self.config.update(kwargs) + self.encoder = None + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + from xgboost import XGBClassifier, XGBRegressor + self.eval_metric = get_metric(self.output_type, optimize_metric=self.metric.name) + # avoid unnecessary warnings + self.config['eval_metric'] = get_metric(self.output_type, optimize_metric=self.metric.name) + if not self.is_classification: + self.model = XGBRegressor(**self.config, random_state=self.random_state) + else: + self.config["num_class"] = len(np.unique(y_train)) if len(np.unique(y_train)) != 2 else 1 # this fixes a bug + + self.model = XGBClassifier(**self.config, random_state=self.random_state) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray + ) -> None: + start_time = time() + + assert self.model is not None, "No model found. Can't fit without preparing the model" + eval_set = [] + if X_val is None: + early_stopping_rounds = None + eval_set = None + else: + eval_set.append((X_val, y_val)) + early_stopping_rounds = get_early_stopping_rounds(X_train.shape[0]) + + callbacks = [] + if eval_set is not None: + callbacks.append(EarlyStoppingCustom(early_stopping_rounds, start_time=start_time, time_limit=self.time_limit)) + self.model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=self.eval_metric, callbacks=callbacks, verbose=False) + + def _preprocess(self, + X: np.ndarray + ) -> np.ndarray: + from sklearn.compose import make_column_transformer + from sklearn.preprocessing import OneHotEncoder + + super(XGBModel, self)._preprocess(X) + + if len(self.dataset_properties['categorical_columns']) > 0: + if self.encoder is None: + self.encoder = make_column_transformer((OneHotEncoder(sparse=False, handle_unknown='ignore'), self.dataset_properties['categorical_columns']), remainder="passthrough") + self.encoder.fit(X) + X = self.encoder.transform(X) + + return X + + def predict(self, X_test: np.ndarray, + predict_proba: bool = False, + preprocess: bool = True) -> np.ndarray: + assert self.model is not None, "No model found. Can't " \ + "predict before fitting. 
" \ + "Call fit before predicting" + if preprocess: + X_test = self._preprocess(X_test) + + if predict_proba: + if not self.is_classification: + raise ValueError("Can't predict probabilities for a regressor") + y_pred_proba = self.model.predict_proba(X_test) + if self.num_classes == 2: + y_pred_proba = y_pred_proba.transpose()[0:len(X_test)] + return y_pred_proba + + y_pred = self.model.predict(X_test) + return y_pred + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + learning_rate: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='learning_rate', + value_range=(5e-3, 0.2), + default_value=0.1, + log=True + ), + max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='max_depth', + value_range=(3, 10), + default_value=6, + ), + min_child_weight: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='min_child_weight', + value_range=(1, 5), + default_value=1, + ), + gamma: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='gamma', + value_range=(0, 5), + default_value=0.01, + ), + subsample: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='subsample', + value_range=(0.5, 1), + default_value=1, + ), + colsample_bytree: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='colsample_bytree', + value_range=(0.5, 1), + default_value=1, + ), + reg_alpha: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='reg_alpha', + value_range=(0, 10), + default_value=0, + ), + reg_lambda: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='reg_lambda', + value_range=(0, 10), + default_value=0, + ), + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + + add_hyperparameter(cs, colsample_bytree, UniformFloatHyperparameter) + add_hyperparameter(cs, subsample, UniformFloatHyperparameter) + add_hyperparameter(cs, reg_alpha, UniformFloatHyperparameter) + add_hyperparameter(cs, gamma, UniformFloatHyperparameter) + add_hyperparameter(cs, min_child_weight, UniformIntegerHyperparameter) + add_hyperparameter(cs, learning_rate, UniformFloatHyperparameter) + add_hyperparameter(cs, reg_lambda, UniformFloatHyperparameter) + add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter) + + return cs + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'XGBLearner', + 'name': 'Xtreme Gradient Boosting Machine Learner', + } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index 0cea0b2c7..38b508ae4 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -31,10 +31,10 @@ class BaseDataLoaderComponent(autoPyTorchTrainingComponent): """ - def __init__(self, batch_size: int = 64, + def __init__(self, max_batch_size: int = 64, random_state: Optional[np.random.RandomState] = None) -> None: super().__init__(random_state=random_state) - self.batch_size = batch_size + self.max_batch_size = max_batch_size self.train_data_loader: Optional[torch.utils.data.DataLoader] = None self.val_data_loader: Optional[torch.utils.data.DataLoader] = None self.test_data_loader: Optional[torch.utils.data.DataLoader] = None @@ -108,6 +108,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: train_dataset = datamanager.get_dataset(split_id=X['split_id'], train=True) + self.batch_size = min(int(2 ** (3 + np.floor(np.log10(len(train_dataset))))), self.max_batch_size) + self.train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=min(self.batch_size, len(train_dataset)), @@ -258,13 +260,13 @@ def get_torchvision_datasets(self) -> Dict[str, torchvision.datasets.VisionDatas @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - batch_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="batch_size", + max_batch_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="max_batch_size", value_range=(32, 320), default_value=64, log=True) ) -> ConfigurationSpace: cs = ConfigurationSpace() - add_hyperparameter(cs, batch_size, UniformIntegerHyperparameter) + add_hyperparameter(cs, max_batch_size, UniformIntegerHyperparameter) return cs diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index b380659da..e7f103f24 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -35,8 +35,10 @@ ) from autoPyTorch.pipeline.components.training.trainer.utils import Lookahead, update_model_state_dict_from_swa from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, 
get_device_from_fit_dictionary +from autoPyTorch.utils.early_stopping import AbstractEarlyStopper, SimpleEarlyStopper from autoPyTorch.utils.logging_ import get_named_client_logger + trainer_directory = os.path.split(__file__)[0] _trainers = find_components(__package__, trainer_directory, @@ -68,6 +70,7 @@ def __init__(self, self.run_summary: Optional[RunSummary] = None self.writer: Optional[SummaryWriter] = None self.early_stopping_split_type: Optional[str] = None + self.early_stopper: Optional[AbstractEarlyStopper] = None self._fit_requirements: Optional[List[FitRequirement]] = [ FitRequirement("lr_scheduler", (_LRScheduler,), user_defined=False, dataset_property=False), FitRequirement("num_run", (int,), user_defined=False, dataset_property=False), @@ -338,6 +341,9 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic additional_losses = X['additional_losses'] if 'additional_losses' in X else None labels = self._get_train_label(X) + # Allow to disable early stopping + if X['early_stopping'] is not None or X['early_stopping'] >= 0: + self.early_stopper = SimpleEarlyStopper(patience=X['early_stopping']) self.choice.prepare( model=X['network'], @@ -481,7 +487,7 @@ def _get_train_label(self, X: Dict[str, Any]) -> List[int]: Verifies and validates the labels from train split. """ # Ensure that the split is not missing any class. - labels: List[int] = X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]] + labels: List[int] = X['y_train'][X['backend'].load_datamanager().splits[X['repeat_id']][X['split_id']][0]] if STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']] in CLASSIFICATION_TASKS: unique_labels = len(np.unique(labels)) if unique_labels < X['dataset_properties']['output_shape']: @@ -527,7 +533,7 @@ def early_stop_handler(self, X: Dict[str, Any]) -> bool: assert self.early_stopping_split_type is not None # mypy # Allow to disable early stopping - if X['early_stopping'] is None or X['early_stopping'] < 0: + if self.early_stopper is None: return False # Store the best weights seen so far: @@ -536,14 +542,14 @@ def early_stop_handler(self, X: Dict[str, Any]) -> bool: last_epoch = self.run_summary.get_last_epoch() best_epoch = self.run_summary.get_best_epoch(split_type=self.early_stopping_split_type) - epochs_since_best = last_epoch - best_epoch + is_best = last_epoch == best_epoch # Save the checkpoint if there is a new best epoch best_path = os.path.join(self.checkpoint_dir, 'best.pth') - if epochs_since_best == 0: + if is_best: torch.save(X['network'].state_dict(), best_path) - return epochs_since_best > cast(int, X['early_stopping']) + return self.early_stopper.update(last_epoch, is_best) def eval_valid_each_epoch(self, X: Dict[str, Any]) -> bool: """ diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index 2e64a6944..376996ed3 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -18,6 +18,9 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import ( TabularColumnTransformer ) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.column_splitting.ColumnSplitter import ( + ColumnSplitter +) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( CoalescerChoice ) @@ -29,6 +32,7 @@ ) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer from 
autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.skew_transformer import SkewTransformerChoice from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \ VarianceThreshold import VarianceThreshold from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing @@ -288,9 +292,11 @@ def _get_pipeline_steps( steps.extend([ ("imputer", SimpleImputer(random_state=self.random_state)), ("variance_threshold", VarianceThreshold(random_state=self.random_state)), - ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), + # ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), + ("column_splitter", ColumnSplitter(random_state=self.random_state)), ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), + ("skew_transformer", SkewTransformerChoice(default_dataset_properties, random_state=self.random_state)), ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, random_state=self.random_state)), ("tabular_transformer", TabularColumnTransformer(random_state=self.random_state)), diff --git a/autoPyTorch/pipeline/traditional_tabular_classification.py b/autoPyTorch/pipeline/traditional_tabular_classification.py index 8cdfeaf39..675ad9b43 100644 --- a/autoPyTorch/pipeline/traditional_tabular_classification.py +++ b/autoPyTorch/pipeline/traditional_tabular_classification.py @@ -11,9 +11,8 @@ from autoPyTorch.pipeline.base_pipeline import BasePipeline, PipelineStepType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.pipeline.components.setup.traditional_ml import ModelChoice from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - +from autoPyTorch.pipeline.components.setup.traditional_ml.tabular_traditional_model import TabularTraditionalModel class TraditionalTabularClassificationPipeline(ClassifierMixin, BasePipeline): """ @@ -229,7 +228,8 @@ def _get_pipeline_steps( default_dataset_properties.update(dataset_properties) steps.extend([ - ("model_trainer", ModelChoice(default_dataset_properties, + ("model_trainer", TabularTraditionalModel( + # ModelChoice(default_dataset_properties, random_state=self.random_state)), ]) return steps @@ -257,14 +257,14 @@ def get_pipeline_representation(self) -> Dict[str, str]: Contains the pipeline representation in a short format """ estimator_name = 'TraditionalTabularClassification' - if self.steps[0][1].choice is not None: - if self.steps[0][1].choice.model is None: - estimator_name = self.steps[0][1].choice.__class__.__name__ - else: - estimator_name = cast( - str, - self.steps[0][1].choice.model.get_properties()['shortname'] - ) + # if self.steps[0][1].choice is not None: + if self.steps[0][1].model is None: + estimator_name = self.steps[0][1].model.__class__.__name__ + else: + estimator_name = cast( + str, + self.steps[0][1].model.get_properties()['shortname'] + ) return { 'Preprocessing': 'None', 'Estimator': estimator_name, diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index d37a0c182..ec8a03b06 100644 --- a/autoPyTorch/utils/common.py +++ 
b/autoPyTorch/utils/common.py @@ -1,7 +1,9 @@ +import copy from enum import Enum +from math import floor from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Sequence, Type, Union -from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.configuration_space import ConfigurationSpace, Configuration from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, Constant, @@ -22,6 +24,8 @@ HyperparameterValueType = Union[int, str, float] +ENSEMBLE_ITERATION_MULTIPLIER = 1e8 + def ispandas(X: Any) -> bool: """ Whether X is pandas.DataFrame or pandas.Series """ return hasattr(X, "iloc") @@ -283,3 +287,41 @@ def check_none(p: Any) -> bool: if p in ("None", "none", None): return True return False + + +def validate_config(config, search_space: ConfigurationSpace, n_numerical_in_incumbent_on_task_id, num_numerical, assert_autogluon_numerical_hyperparameters: bool=False): + modified_config = config.get_dictionary().copy() if isinstance(config, Configuration) else config.copy() + + if num_numerical > 0: + imputer_numerical_hyperparameter = "imputer:numerical_strategy" + if imputer_numerical_hyperparameter not in modified_config: + modified_config[imputer_numerical_hyperparameter] = search_space.get_hyperparameter(imputer_numerical_hyperparameter).default_value if not assert_autogluon_numerical_hyperparameters else 'median' + if assert_autogluon_numerical_hyperparameters: + quantile_hp_name = 'QuantileTransformer' + skew_transformer_choice = modified_config.get('skew_transformer:__choice__', None) + if skew_transformer_choice is not None: + if skew_transformer_choice != quantile_hp_name: + to_remove_hps = [hyp.name for hyp in search_space.get_children_of('skew_transformer:__choice__') if skew_transformer_choice in hyp.name] + [modified_config.pop(remove_hp, None) for remove_hp in to_remove_hps] + + to_add_hps = [hyp for hyp in search_space.get_children_of('skew_transformer:__choice__') if quantile_hp_name in hyp.name] + modified_config['skew_transformer:__choice__'] = quantile_hp_name + for add_hp in to_add_hps: + modified_config[add_hp.name] = add_hp.default_value + + feature_preprocessing_choice = modified_config['feature_preprocessor:__choice__'] + + to_adjust_hyperparams = ['n_clusters', 'n_components', 'target_dim'] + children_hyperparameters = [hyp for hyp in search_space.get_children_of('feature_preprocessor:__choice__') if feature_preprocessing_choice in hyp.name] + for hyp in children_hyperparameters: + children = search_space.get_children_of(hyp) + if len(children) > 0: + children_hyperparameters.extend(children) + children_hyperparameters = [hyp for hyp in children_hyperparameters if hyp.name in modified_config and any([ta_hyp in hyp.name for ta_hyp in to_adjust_hyperparams])] + + for child_hyperparam in children_hyperparameters: + modified_config[child_hyperparam.name] = floor(modified_config[child_hyperparam.name]/n_numerical_in_incumbent_on_task_id * num_numerical) + + return Configuration(search_space, modified_config) + + diff --git a/autoPyTorch/utils/data_classes.py b/autoPyTorch/utils/data_classes.py new file mode 100644 index 000000000..4d031253a --- /dev/null +++ b/autoPyTorch/utils/data_classes.py @@ -0,0 +1,27 @@ +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.constants import ( + STRING_TO_TASK_TYPES, + TABULAR_TASKS, + IMAGE_TASKS +) +from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.datasets.image_dataset import ImageDataset + +from 
autoPyTorch.data.base_validator import BaseInputValidator +from autoPyTorch.data.tabular_validator import TabularInputValidator + + +def get_dataset_class(task_type: str) -> BaseDataset: + if STRING_TO_TASK_TYPES[task_type] in TABULAR_TASKS: + return TabularDataset + elif STRING_TO_TASK_TYPES[task_type] in IMAGE_TASKS: + return ImageDataset + + +def get_data_validator_class(task_type: str) -> BaseInputValidator: + if STRING_TO_TASK_TYPES[task_type] in TABULAR_TASKS: + return TabularInputValidator + elif STRING_TO_TASK_TYPES[task_type] in IMAGE_TASKS: + return None diff --git a/autoPyTorch/utils/early_stopping.py b/autoPyTorch/utils/early_stopping.py new file mode 100644 index 000000000..3bb3d9c32 --- /dev/null +++ b/autoPyTorch/utils/early_stopping.py @@ -0,0 +1,47 @@ +""" +Implementation from autogluon early_stopping +""" +from abc import ABC + + +class AbstractEarlyStopper(ABC): + """ + Abstract class for early stopping + """ + def update(self, current_epoch, is_best=False) -> bool: + raise NotImplementedError + + def early_stop(self, current_epoch, is_best=False) -> bool: + raise NotImplementedError + + +class SimpleEarlyStopper(AbstractEarlyStopper): + """ + Implements early stopping with fixed patience + Args: + patience : int, default 10 + If no improvement occurs in `patience` epochs or greater, self.early_stop will return True. + """ + def __init__(self, patience=10): + self.patience = patience + self.best_epoch = 0 + + def update(self, current_epoch, is_best=False): + if is_best: + self.best_epoch = current_epoch + return self.early_stop(current_epoch, is_best=is_best) + + def early_stop(self, current_epoch, is_best=False): + if is_best: + return False + return current_epoch - self.best_epoch >= self.patience + + +def get_early_stopping_rounds(num_rows_train, min_patience=20, max_patience=300, min_rows=10000): + + modifier = 1 if num_rows_train <= min_rows else min_rows / num_rows_train + simple_early_stopping_rounds = max( + round(modifier * max_patience), + min_patience, + ) + return simple_early_stopping_rounds diff --git a/autoPyTorch/utils/parallel_model_runner.py b/autoPyTorch/utils/parallel_model_runner.py new file mode 100644 index 000000000..083296038 --- /dev/null +++ b/autoPyTorch/utils/parallel_model_runner.py @@ -0,0 +1,167 @@ +import time +import math +from typing import Any, Dict, List, Tuple, Union +import unittest + +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace + +import dask.distributed + +from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue +from smac.stats.stats import Stats +from smac.tae import StatusType + +from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.utils.common import dict_repr + + +def run_models_on_dataset( + time_left: int, + func_eval_time_limit_secs: int, + model_configs: List[Tuple[Union[str, Configuration]]], + logger, + logger_port, + metric, + dask_client: dask.distributed.Client, + backend: Backend, + memory_limit: int, + disable_file_output, + all_supported_metrics: bool, + ensemble_method, + include, + exclude, + search_space_updates, + pipeline_options, + seed: int, + multiprocessing_context, + n_jobs: int, + current_search_space: ConfigurationSpace, + smac_initial_run: int +) -> RunHistory: + starttime = time.time() + run_history = RunHistory() + memory_limit = memory_limit + if memory_limit is not None: + memory_limit = int(math.ceil(memory_limit)) + 
diff --git a/autoPyTorch/utils/early_stopping.py b/autoPyTorch/utils/early_stopping.py
new file mode 100644
index 000000000..3bb3d9c32
--- /dev/null
+++ b/autoPyTorch/utils/early_stopping.py
@@ -0,0 +1,47 @@
+"""
+Early-stopping utilities adapted from AutoGluon's early_stopping implementation.
+"""
+from abc import ABC
+
+
+class AbstractEarlyStopper(ABC):
+    """
+    Abstract class for early stopping.
+    """
+    def update(self, current_epoch: int, is_best: bool = False) -> bool:
+        raise NotImplementedError
+
+    def early_stop(self, current_epoch: int, is_best: bool = False) -> bool:
+        raise NotImplementedError
+
+
+class SimpleEarlyStopper(AbstractEarlyStopper):
+    """
+    Implements early stopping with a fixed patience.
+
+    Args:
+        patience : int, default 10
+            If no improvement occurs for `patience` or more epochs, self.early_stop will return True.
+    """
+    def __init__(self, patience: int = 10):
+        self.patience = patience
+        self.best_epoch = 0
+
+    def update(self, current_epoch: int, is_best: bool = False) -> bool:
+        # Record the epoch of the last improvement, then report whether to stop.
+        if is_best:
+            self.best_epoch = current_epoch
+        return self.early_stop(current_epoch, is_best=is_best)
+
+    def early_stop(self, current_epoch: int, is_best: bool = False) -> bool:
+        if is_best:
+            return False
+        return current_epoch - self.best_epoch >= self.patience
+
+
+def get_early_stopping_rounds(num_rows_train: int, min_patience: int = 20, max_patience: int = 300, min_rows: int = 10000) -> int:
+    """Shrink the patience proportionally for datasets larger than `min_rows`; the result lies in [min_patience, max_patience]."""
+    modifier = 1 if num_rows_train <= min_rows else min_rows / num_rows_train
+    simple_early_stopping_rounds = max(
+        round(modifier * max_patience),
+        min_patience,
+    )
+    return simple_early_stopping_rounds
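SimpleEarlyStopper only tracks the epoch of the last improvement, so the caller is expected to invoke update() once per epoch with an is_best flag. A minimal usage sketch with made-up validation scores (not part of the patch):

    from autoPyTorch.utils.early_stopping import SimpleEarlyStopper

    stopper = SimpleEarlyStopper(patience=3)
    best_score = float('-inf')
    for epoch, score in enumerate([0.70, 0.72, 0.71, 0.71, 0.71, 0.71]):
        is_best = score > best_score
        best_score = max(best_score, score)
        if stopper.update(epoch, is_best=is_best):
            # No improvement since epoch 1, so patience runs out at epoch 4.
            print(f"stopping at epoch {epoch}")
            break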
diff --git a/autoPyTorch/utils/parallel_model_runner.py b/autoPyTorch/utils/parallel_model_runner.py
new file mode 100644
index 000000000..083296038
--- /dev/null
+++ b/autoPyTorch/utils/parallel_model_runner.py
@@ -0,0 +1,167 @@
+import math
+import time
+import unittest.mock
+from typing import List, Optional, Tuple, Union
+
+from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
+
+import dask.distributed
+
+from smac.runhistory.runhistory import RunHistory
+from smac.stats.stats import Stats
+from smac.tae import StatusType
+
+from autoPyTorch.automl_common.common.utils.backend import Backend
+from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash
+from autoPyTorch.utils.common import dict_repr
+
+
+def run_models_on_dataset(
+    time_left: int,
+    func_eval_time_limit_secs: int,
+    model_configs: List[Tuple[Union[str, Configuration], float]],
+    logger,
+    logger_port,
+    metric,
+    dask_client: dask.distributed.Client,
+    backend: Backend,
+    memory_limit: Optional[int],
+    disable_file_output,
+    all_supported_metrics: bool,
+    ensemble_method,
+    include,
+    exclude,
+    search_space_updates,
+    pipeline_options,
+    seed: int,
+    multiprocessing_context,
+    n_jobs: int,
+    current_search_space: ConfigurationSpace,
+    smac_initial_run: int
+) -> Tuple[RunHistory, List[Optional[Tuple[int, int, float]]]]:
+    """Fit each (configuration, budget) pair on the dask client, collect the results in a
+    RunHistory and return it together with the (seed, num_run, budget) identifier of every
+    model (None for runs that did not succeed)."""
+    starttime = time.time()
+    run_history = RunHistory()
+    if memory_limit is not None:
+        memory_limit = int(math.ceil(memory_limit))
+
+    model_identifiers = []
+    total_models = len(model_configs)
+    dask_futures = []
+    for n_r, (config, budget) in enumerate(model_configs):
+
+        # Only launch a task if there is time
+        start_time = time.time()
+        if time_left >= func_eval_time_limit_secs:
+            logger.info(f"{n_r}: Started fitting {config} with cutoff={func_eval_time_limit_secs}")
+            scenario_mock = unittest.mock.Mock()
+            scenario_mock.wallclock_limit = time_left
+            # This stats object is a hack - maybe the SMAC stats object should
+            # already be generated here!
+            stats = Stats(scenario_mock)
+            stats.start_timing()
+
+            if isinstance(config, Configuration):
+                config.config_id = n_r
+                init_num_run = smac_initial_run
+            else:
+                init_num_run = smac_initial_run + n_r
+
+            ta = ExecuteTaFuncWithQueue(
+                pynisher_context=multiprocessing_context,
+                backend=backend,
+                seed=seed,
+                metric=metric,
+                multi_objectives=["cost"],
+                logger_port=logger_port,
+                pipeline_config=pipeline_options,
+                cost_for_crash=get_cost_of_crash(metric),
+                abort_on_first_run_crash=False,
+                initial_num_run=init_num_run,
+                stats=stats,
+                memory_limit=memory_limit,
+                disable_file_output=disable_file_output,
+                all_supported_metrics=all_supported_metrics,
+                ensemble_method=ensemble_method,
+                include=include,
+                exclude=exclude,
+                search_space_updates=search_space_updates
+            )
+            dask_futures.append([
+                config,
+                dask_client.submit(
+                    ta.run, config=config,
+                    cutoff=func_eval_time_limit_secs,
+                    budget=budget
+                )
+            ])
+ "Please consider increasing the run time to further improve performance.") + break + return run_history, model_identifiers diff --git a/examples/20_basics/example_autogluon_ensemble.py b/examples/20_basics/example_autogluon_ensemble.py new file mode 100644 index 000000000..8a8b692b7 --- /dev/null +++ b/examples/20_basics/example_autogluon_ensemble.py @@ -0,0 +1,105 @@ +""" +====================== +Tabular Classification +====================== + +The following example shows how to fit a sample classification model +with AutoPyTorch +""" +import os +import tempfile as tmp +import warnings +from autoPyTorch.datasets.resampling_strategy import RepeatedCrossValTypes + +from autoPyTorch.optimizer.utils import autoPyTorchSMBO + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import openml +import sklearn.model_selection + +from autoPyTorch.api.tabular_classification import TabularClassificationTask +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes + +############################################################################ +# Data Loading +# ============ +task = openml.tasks.get_task(task_id=3917) +dataset = task.get_dataset() +X, y, categorical_indicator, _ = dataset.get_data( + dataset_format='dataframe', + target=dataset.default_target_attribute, +) + +train_indices, test_indices = task.get_train_test_split_indices() +# AutoPyTorch fails when it is given a y DataFrame with False and True +# values and category as dtype. in its inner workings it uses sklearn +# which cannot detect the column type. 
diff --git a/examples/20_basics/example_autogluon_ensemble.py b/examples/20_basics/example_autogluon_ensemble.py
new file mode 100644
index 000000000..8a8b692b7
--- /dev/null
+++ b/examples/20_basics/example_autogluon_ensemble.py
@@ -0,0 +1,105 @@
+"""
+======================
+Tabular Classification
+======================
+
+The following example shows how to fit an AutoGluon-style stacking ensemble
+(stacking_autogluon) on a tabular classification task with AutoPyTorch.
+"""
+import os
+import tempfile as tmp
+import warnings
+
+from autoPyTorch.datasets.resampling_strategy import RepeatedCrossValTypes
+from autoPyTorch.optimizer.utils import autoPyTorchSMBO
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import openml
+import sklearn.model_selection
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+from autoPyTorch.ensemble.utils import EnsembleSelectionTypes
+
+############################################################################
+# Data Loading
+# ============
+task = openml.tasks.get_task(task_id=3917)
+dataset = task.get_dataset()
+X, y, categorical_indicator, _ = dataset.get_data(
+    dataset_format='dataframe',
+    target=dataset.default_target_attribute,
+)
+
+train_indices, test_indices = task.get_train_test_split_indices()
+# AutoPyTorch fails when it is given a y DataFrame with False and True
+# values and category as dtype: internally it uses sklearn, which cannot
+# detect the column type.
+if isinstance(y[1], bool):
+    y = y.astype('bool')
+
+# uncomment only for np.arrays
+
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
+
+feat_type = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator]
+
+############################################################################
+# Build and fit a classifier
+# ==========================
+api = TabularClassificationTask(
+    # The temporary and output directories are kept to preserve the logs of the run
+    temporary_directory='./tmp/stacking_autogluon_tmp_10',
+    output_directory='./tmp/stacking_autogluon_out_10',
+    delete_tmp_folder_after_terminate=False,
+    delete_output_folder_after_terminate=False,
+    seed=1,
+    ensemble_method=EnsembleSelectionTypes.stacking_autogluon,
+    resampling_strategy=RepeatedCrossValTypes.repeated_k_fold_cross_validation,
+    resampling_strategy_args={
+        'num_splits': 2,
+        'num_repeats': 1
+    },
+    ensemble_size=6,
+    num_stacking_layers=1,
+    feat_type=feat_type
+)
+
+############################################################################
+# Search for an ensemble of machine learning algorithms
+# =====================================================
+api.run_autogluon_stacking(
+    X_train=X_train,
+    y_train=y_train,
+    X_test=X_test.copy(),
+    y_test=y_test.copy(),
+    dataset_name='Australian',
+    optimize_metric='accuracy',
+    total_walltime_limit=600,
+    func_eval_time_limit_secs=130,
+    all_supported_metrics=False,
+    max_budget=10
+)
+
+############################################################################
+# Print the final ensemble performance
+# ====================================
+y_pred = api.predict(X_test)
+score = api.score(y_pred, y_test, metric='accuracy')
+print(score)
+# Print the final ensemble built by AutoPyTorch
+print(api.show_models())
+
+# Print statistics from search
+# print(api.sprint_statistics())
diff --git a/examples/20_basics/example_stacking_ensemble.py b/examples/20_basics/example_stacking_ensemble.py
index e3d7c308a..100341c84 100644
--- a/examples/20_basics/example_stacking_ensemble.py
+++ b/examples/20_basics/example_stacking_ensemble.py
@@ -8,6 +8,9 @@
 import os
 import tempfile as tmp
 import warnings
+from autoPyTorch.api.utils import get_autogluon_default_nn_config
+
+from autoPyTorch.datasets.resampling_strategy import RepeatedCrossValTypes
 
 os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
 os.environ['OMP_NUM_THREADS'] = '1'
@@ -17,8 +20,7 @@
 warnings.simplefilter(action='ignore', category=UserWarning)
 warnings.simplefilter(action='ignore', category=FutureWarning)
 
-import sklearn.datasets
-import sklearn.model_selection
+import openml
 
 from autoPyTorch.api.tabular_classification import TabularClassificationTask
 from autoPyTorch.ensemble.utils import EnsembleSelectionTypes
@@ -27,53 +29,82 @@
 ############################################################################
 # Data Loading
 # ============
-X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
-X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
-    X,
-    y,
-    random_state=1,
+task = openml.tasks.get_task(task_id=146821)
+dataset = task.get_dataset()
+X, y, categorical_indicator, _ = dataset.get_data(
+    dataset_format='dataframe',
+    target=dataset.default_target_attribute,
 )
+train_indices, test_indices = task.get_train_test_split_indices()
+# AutoPyTorch fails when it is given a y DataFrame with False and True
+# values and category as dtype: internally it uses sklearn, which cannot
+# detect the column type.
+if isinstance(y[1], bool):
+    y = y.astype('bool')
+
+# uncomment only for np.arrays
+
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
+
+feat_type = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator]
+
+search_space_updates = get_autogluon_default_nn_config(feat_type=feat_type)
 
 ############################################################################
 # Build and fit a classifier
 # ==========================
-api = TabularClassificationTask(
-    # To maintain logs of the run, you can uncomment the
-    # Following lines
-    temporary_directory='./tmp/autoPyTorch_example_tmp_02',
-    output_directory='./tmp/autoPyTorch_example_out_02',
-    delete_tmp_folder_after_terminate=False,
-    delete_output_folder_after_terminate=False,
-    seed=42,
-    ensemble_method=EnsembleSelectionTypes.stacking_ensemble,
-    ensemble_size=5
-)
+if __name__ == '__main__':
+    api = TabularClassificationTask(
+        # The temporary and output directories are kept to preserve the logs of the run
+        temporary_directory='./tmp/stacking_optimisation_ensemble_tmp_24',
+        output_directory='./tmp/stacking_optimisation_ensemble_out_24',
+        delete_tmp_folder_after_terminate=False,
+        delete_output_folder_after_terminate=False,
+        seed=4,
+        ensemble_method=EnsembleSelectionTypes.stacking_optimisation_ensemble,
+        resampling_strategy=RepeatedCrossValTypes.stratified_repeated_k_fold_cross_validation,
+        ensemble_size=5,
+        num_stacking_layers=1,
+        resampling_strategy_args={
+            'num_splits': 5,
+            'num_repeats': 2
+        },
+        search_space_updates=search_space_updates,
+        n_jobs=1
+    )
 
-############################################################################
-# Search for an ensemble of machine learning algorithms
-# =====================================================
-api.search(
-    X_train=X_train,
-    y_train=y_train,
-    X_test=X_test.copy(),
-    y_test=y_test.copy(),
-    dataset_name='Australian',
-    optimize_metric='accuracy',
-    total_walltime_limit=1000,
-    func_eval_time_limit_secs=50,
-    enable_traditional_pipeline=False,
-    smbo_class=autoPyTorchSMBO,
-    all_supported_metrics=False
-)
+    ############################################################################
+    # Search for an ensemble of machine learning algorithms
+    # =====================================================
+    api.search(
+        X_train=X_train,
+        y_train=y_train,
+        X_test=X_test.copy(),
+        y_test=y_test.copy(),
+        dataset_name='Australian',
+        optimize_metric='balanced_accuracy',
+        total_walltime_limit=500,
+        func_eval_time_limit_secs=70,
+        enable_traditional_pipeline=True,
+        smbo_class=autoPyTorchSMBO,
+        all_supported_metrics=False,
+        # use_ensemble_opt_loss=True,
+        posthoc_ensemble_fit_stacking_ensemble_optimization=True,
+        max_budget=10
+    )
 
-############################################################################
-# Print the final ensemble performance
-# ====================================
-y_pred = api.predict(X_test)
-score = api.score(y_pred, y_test, metric='accuracy')
-print(score)
-# Print the final ensemble built by AutoPyTorch
-print(api.show_models())
+    ############################################################################
+    # Print the final ensemble performance
+    # ====================================
+    y_pred = api.predict(X_test)
+    score = api.score(y_pred, y_test, metric='accuracy')
+    print(score)
+    # Print the final ensemble built by AutoPyTorch
+    print(api.show_models())
 
-# Print statistics from search
-# print(api.sprint_statistics())
\ No newline at end of file
+    # Print statistics from search
+    # print(api.sprint_statistics())
\ No newline at end of file
diff --git a/examples/20_basics/example_stacking_ensemble_selection_base.py b/examples/20_basics/example_stacking_ensemble_selection_base.py
new file mode 100644
index 000000000..58e6ad378
--- /dev/null
+++ b/examples/20_basics/example_stacking_ensemble_selection_base.py
@@ -0,0 +1,109 @@
+"""
+======================
+Tabular Classification
+======================
+
+The following example shows how to fit a stacking ensemble that reuses its
+base models across layers (stacking_repeat_models) with AutoPyTorch.
+"""
+import os
+import tempfile as tmp
+import warnings
+
+from autoPyTorch.api.utils import get_autogluon_default_nn_config
+from autoPyTorch.datasets.resampling_strategy import RepeatedCrossValTypes
+from autoPyTorch.optimizer.utils import autoPyTorchSMBO
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import openml
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+from autoPyTorch.ensemble.utils import EnsembleSelectionTypes
+
+############################################################################
+# Data Loading
+# ============
+task = openml.tasks.get_task(task_id=3917)
+dataset = task.get_dataset()
+X, y, categorical_indicator, _ = dataset.get_data(
+    dataset_format='dataframe',
+    target=dataset.default_target_attribute,
+)
+
+train_indices, test_indices = task.get_train_test_split_indices()
+# AutoPyTorch fails when it is given a y DataFrame with False and True
+# values and category as dtype: internally it uses sklearn, which cannot
+# detect the column type.
+if isinstance(y[1], bool):
+    y = y.astype('bool')
+
+# uncomment only for np.arrays
+
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
+
+feat_type = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator]
+
+search_space_updates = get_autogluon_default_nn_config(feat_type=feat_type)
+
+############################################################################
+# Build and fit a classifier
+# ==========================
+api = TabularClassificationTask(
+    # The temporary and output directories are kept to preserve the logs of the run
+    temporary_directory='./tmp/stacking_repeat_base_models_tmp_07',
+    output_directory='./tmp/stacking_repeat_base_models_out_07',
+    delete_tmp_folder_after_terminate=False,
+    delete_output_folder_after_terminate=False,
+    seed=1,
+    ensemble_method=EnsembleSelectionTypes.stacking_repeat_models,
+    resampling_strategy=RepeatedCrossValTypes.repeated_k_fold_cross_validation,
+    resampling_strategy_args={
+        'num_splits': 2,
+        'num_repeats': 1
+    },
+    ensemble_size=5,
+    num_stacking_layers=2,
+    search_space_updates=search_space_updates
+)
+
+############################################################################
+# Search for an ensemble of machine learning algorithms
+# =====================================================
+api.search(
+    X_train=X_train,
+    y_train=y_train,
+    X_test=X_test.copy(),
+    y_test=y_test.copy(),
+    dataset_name='Australian',
+    optimize_metric='accuracy',
+    total_walltime_limit=900,
+    func_eval_time_limit_secs=150,
+    enable_traditional_pipeline=True,
+    # smbo_class=autoPyTorchSMBO,
+    all_supported_metrics=False,
+    min_budget=5,
+    max_budget=10
+)
+
+############################################################################
+# Print the final ensemble performance
+# ====================================
+y_pred = api.predict(X_test)
+score = api.score(y_pred, y_test, metric='accuracy')
+print(score)
+# Print the final ensemble built by AutoPyTorch
+print(api.show_models())
+
+# Print statistics from search
+# print(api.sprint_statistics())
diff --git a/examples/20_basics/example_stacking_ensemble_selection_per_layer.py b/examples/20_basics/example_stacking_ensemble_selection_per_layer.py
new file mode 100644
index 000000000..37f40f850
--- /dev/null
+++ b/examples/20_basics/example_stacking_ensemble_selection_per_layer.py
@@ -0,0 +1,107 @@
+"""
+======================
+Tabular Classification
+======================
+The following example shows how to fit a stacking ensemble that runs ensemble
+selection per layer (stacking_ensemble_selection_per_layer) with AutoPyTorch.
+"""
+import os
+import tempfile as tmp
+import warnings
+
+from autoPyTorch.api.utils import get_autogluon_default_nn_config
+from autoPyTorch.datasets.resampling_strategy import RepeatedCrossValTypes
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import openml
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+from autoPyTorch.ensemble.utils import EnsembleSelectionTypes
+from autoPyTorch.optimizer.utils import autoPyTorchSMBO
+
+############################################################################
+# Data Loading
+# ============
+task = openml.tasks.get_task(task_id=3917)
+dataset = task.get_dataset()
+X, y, categorical_indicator, _ = dataset.get_data(
+    dataset_format='dataframe',
+    target=dataset.default_target_attribute,
+)
+
+train_indices, test_indices = task.get_train_test_split_indices()
+# AutoPyTorch fails when it is given a y DataFrame with False and True
+# values and category as dtype: internally it uses sklearn, which cannot
+# detect the column type.
+if isinstance(y[1], bool):
+    y = y.astype('bool')
+
+# uncomment only for np.arrays
+
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
+
+feat_type = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator]
+
+search_space_updates = get_autogluon_default_nn_config(feat_type=feat_type)
+
+############################################################################
+# Build and fit a classifier
+# ==========================
+if __name__ == '__main__':
+    api = TabularClassificationTask(
+        # The temporary and output directories are kept to preserve the logs of the run
+        temporary_directory='./tmp/stacking_ensemble_selection_per_layer_tmp_09',
+        output_directory='./tmp/stacking_ensemble_selection_per_layer_out_09',
+        delete_tmp_folder_after_terminate=False,
+        delete_output_folder_after_terminate=False,
+        seed=4,
+        ensemble_method=EnsembleSelectionTypes.stacking_ensemble_selection_per_layer,
+        resampling_strategy=RepeatedCrossValTypes.repeated_k_fold_cross_validation,
+        ensemble_size=5,
+        num_stacking_layers=2,
+        resampling_strategy_args={
+            'num_splits': 5,
+            'num_repeats': 2
+        },
+        search_space_updates=search_space_updates,
+    )
+
+    ############################################################################
+    # Search for an ensemble of machine learning algorithms
+    # =====================================================
+    api.search(
+        X_train=X_train,
+        y_train=y_train,
+        X_test=X_test.copy(),
+        y_test=y_test.copy(),
+        dataset_name='Australian',
+        optimize_metric='balanced_accuracy',
+        total_walltime_limit=900,
+        func_eval_time_limit_secs=150,
+        enable_traditional_pipeline=False,
+        all_supported_metrics=False,
+        min_budget=5,
+        max_budget=15
+    )
+
+    ############################################################################
+    # Print the final ensemble performance
+    # ====================================
+    y_pred = api.predict(X_test)
+    score = api.score(y_pred, y_test, metric='accuracy')
+    print(score)
+    # Print the final ensemble built by AutoPyTorch
+    print(api.show_models())
+
+    # Print statistics from search
+    # print(api.sprint_statistics())
\ No newline at end of file
diff --git a/examples/20_basics/example_tabular_classification.py b/examples/20_basics/example_tabular_classification.py
index 636281eff..d57a42bb3 100644
--- a/examples/20_basics/example_tabular_classification.py
+++ b/examples/20_basics/example_tabular_classification.py
@@ -10,6 +10,9 @@
 import tempfile as tmp
 import warnings
 
+from autoPyTorch.datasets.resampling_strategy import RepeatedCrossValTypes
+from autoPyTorch.api.utils import get_autogluon_default_nn_config
+
 os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
 os.environ['OMP_NUM_THREADS'] = '1'
 os.environ['OPENBLAS_NUM_THREADS'] = '1'
@@ -18,33 +21,52 @@
 warnings.simplefilter(action='ignore', category=UserWarning)
 warnings.simplefilter(action='ignore', category=FutureWarning)
 
-import sklearn.datasets
-import sklearn.model_selection
+import openml
 
 from autoPyTorch.api.tabular_classification import TabularClassificationTask
-
+from autoPyTorch.ensemble.utils import EnsembleSelectionTypes
 
 ############################################################################
 # Data Loading
 # ============
-X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
-X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
-    X,
-    y,
-    random_state=1,
+task = openml.tasks.get_task(task_id=146821)
+dataset = task.get_dataset()
+X, y, categorical_indicator, _ = dataset.get_data(
+    dataset_format='dataframe',
+    target=dataset.default_target_attribute,
 )
+train_indices, test_indices = task.get_train_test_split_indices()
+# AutoPyTorch fails when it is given a y DataFrame with False and True
+# values and category as dtype: internally it uses sklearn, which cannot
+# detect the column type.
+if isinstance(y[1], bool):
+    y = y.astype('bool')
+
+# uncomment only for np.arrays
+
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
+
+feat_type = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator]
+
+search_space_updates = get_autogluon_default_nn_config(feat_type=feat_type)
 
 ############################################################################
 # Build and fit a classifier
 # ==========================
 api = TabularClassificationTask(
     # To maintain logs of the run, you can uncomment the
     # Following lines
-    # temporary_directory='./tmp/autoPyTorch_example_tmp_01',
-    # output_directory='./tmp/autoPyTorch_example_out_01',
-    # delete_tmp_folder_after_terminate=False,
-    # delete_output_folder_after_terminate=False,
+    temporary_directory='./tmp/autoPyTorch_example_tmp_06',
+    output_directory='./tmp/autoPyTorch_example_out_06',
+    delete_tmp_folder_after_terminate=False,
+    delete_output_folder_after_terminate=False,
     seed=42,
+    ensemble_size=5,
+    resampling_strategy=RepeatedCrossValTypes.repeated_k_fold_cross_validation,
+    search_space_updates=search_space_updates
 )
 
 ############################################################################
@@ -57,8 +79,9 @@
     y_test=y_test.copy(),
     dataset_name='Australian',
     optimize_metric='accuracy',
-    total_walltime_limit=300,
-    func_eval_time_limit_secs=50
+    total_walltime_limit=1200,
+    func_eval_time_limit_secs=200,
+    enable_traditional_pipeline=True
 )
 
 ############################################################################
diff --git a/requirements.txt b/requirements.txt
index 1f2dd38b6..f458ef51a 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,10 +10,12 @@
 imgaug>=0.4.0
 ConfigSpace>=0.4.14,<0.5
 pynisher>=0.6.3
 pyrfr>=0.7,<0.9
-smac>=1.2
+smac==1.2
 dask
 distributed>=2.2.0
-catboost
-lightgbm
 flaky
-tabulate
\ No newline at end of file
+tabulate
+lightgbm>=3.3,<3.4
+catboost>=1.0,<1.1
+xgboost>=1.4,<1.5
+scikit-learn-intelex>=2021.5,<2021.6